In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from ml_summary import *

In [3]:
#graphics set up
pd.set_option('display.max_colwidth', 250)

In [4]:
int_remove_cols = ["carmaker", "transmission", "drive", "car_style",
                    "emission_class", "door_count", "color", "carmaker_volkswagen", "transmission_manual", "drive_4x2", "car_style_sedan", "door_count_4_5", "color_s", "country_origin_Germany", 'emission_class_old', 'fuel_type', 'fuel_type_petrol', 'country_origin']
int_cont_cols = ["power", "cubic_capacity", 
                 "fuel_consumption_combined", "co2_emission", "vehicle_age"]

In [5]:
df_comparison = pd.DataFrame(columns=["method", "MSE_train", "MAD_train",
                                       "R2_train", "MSE_test", "MAD_test", "R2_test"])

# Introduction

Following report is dedicated to finding a model and its specification that can be used for the most accurate price prediction in our data set. In the estimation of the individual models, we will use 9:1 train test ratio, and three fold cross validation when training the models. The best fits will be selected based on the value of mean squared error. In order to explore different model specifications I use GrindSearchCV function from sklearn package. Following models will be tried: OLS, KNN Regression, Decison Tree Regression, Random Forrest Regression, XGBoost Tree Regression.

In [6]:
#read data
scaler = StandardScaler()
data = pd.read_csv("../data/data_pred.csv")
data.head()

Unnamed: 0,power,cubic_capacity,fuel_consumption_combined,co2_emission,mileage,vehicle_age,carmaker_audi,carmaker_bmw,carmaker_citroen,carmaker_dacia,...,country_origin_Italy,country_origin_Luxembourg,country_origin_Netherlands,country_origin_Poland,country_origin_Romania,country_origin_Slovakia,country_origin_Spain,country_origin_Sweden,country_origin,price
0,52.0,1.193,4.5,104.0,0.0,113,False,False,False,False,...,False,False,False,False,False,False,False,False,Germany,361990
1,75.0,1.598,5.2,136.0,163.0,3188,False,False,False,False,...,False,False,False,False,False,False,False,False,Belgium,305490
2,66.0,1.197,4.7,107.0,78.56,2731,False,False,False,False,...,False,False,False,False,False,False,False,False,Germany,328990
3,120.0,1.997,6.9,179.0,124.516,4496,False,False,True,False,...,False,False,False,False,False,False,False,False,Germany,120490
4,92.0,1.0,6.0,134.0,43.8,2000,False,False,False,False,...,False,False,False,False,False,False,False,False,Germany,401490


In [7]:
#prepare data for estimation
columns =  data.columns
x_columns = list(data.columns)
x_columns.remove("price")
for column in data.select_dtypes(include=['bool']):
    data[column] = data[column].astype(int) 
for col in int_remove_cols:
    x_columns.remove(col)
X = data[x_columns]

#scale continous variables
X[int_cont_cols] = scaler.fit_transform(X[int_cont_cols])

#target variable - price  
y = data["price"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[int_cont_cols] = scaler.fit_transform(X[int_cont_cols])


In [8]:
#class used for storing results using several different ML algorithms
predict_num = PredictionSummary()
#create train/test data sets inside class for estimation
predict_num.load_data(X=X, y=y)

## OLS

In [9]:
param_grid = {
    'fit_intercept': [True],  # List of possible values for n_neighbors
}
grid_search = predict_num.find_best_model(method="ols", param_grid=param_grid)
predict_num.estimate_test(grid_search, {'fit_intercept': True})
(predict_num.summary_df[predict_num.summary_df['method'] == 'ols']
 .sort_values('R2_valid', ascending=False)
 .head())

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END fit_intercept=True; neg_mean_absolute_error: (train=-166672.529, test=-167423.210) neg_mean_squared_error: (train=-109348180718.714, test=-113905078532.683) r2: (train=0.731, test=0.729) total time=   0.5s
[CV 2/3] END fit_intercept=True; neg_mean_absolute_error: (train=-167552.192, test=-167864.856) neg_mean_squared_error: (train=-111240124873.198, test=-110135818432.688) r2: (train=0.730, test=0.731) total time=   0.5s
[CV 3/3] END fit_intercept=True; neg_mean_absolute_error: (train=-168258.056, test=-167359.024) neg_mean_squared_error: (train=-111917257955.487, test=-108809294284.678) r2: (train=0.730, test=0.730) total time=   0.5s


  self.summary_df = pd.concat([self.summary_df, pd.DataFrame(stat_row, index=[self.df_index])])


Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
0,ols,-110835200000.0,-167494.258849,0.730173,-110950100000.0,-167549.029815,0.729902,114249315327.1947,168154.31387,0.729155,"{""fit_intercept"": true}",0.1,3,1


Firstly, I attempted to predict the prices using the OLS. I tested only one model specification with all of the variables included in the estimation, nevertheless there seems to be very little difference in performacne between training validation and testing so the overfitting does not seem to be a problem in this case. The model was able to explain approximately 73 % of variance in all 3 phases and the mean absolute error was 167 500 which is obviously too much for the model to have any practictal use.

## K-nearest Neighbors

In [10]:
param_grid = {
    'n_neighbors': [3, 7, 11],  # List of possible values for n_neighbors
}
grid_search_knreg = predict_num.find_best_model(
    method="knreg", param_grid=param_grid, n_jobs=5
    )
(predict_num.summary_df[predict_num.summary_df['method'] == 'knreg']
 .sort_values('R2_valid', ascending=False)
 .head())

Fitting 3 folds for each of 3 candidates, totalling 9 fits


Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
1,knreg,-18746010000.0,-67738.789981,0.954359,-39978290000.0,-99507.146188,0.902656,,,,"{""n_neighbors"": 3}",0.1,3,1
2,knreg,-30089920000.0,-85513.642555,0.926742,-41277700000.0,-100128.487665,0.899485,,,,"{""n_neighbors"": 7}",0.1,3,1
3,knreg,-35653350000.0,-92638.659812,0.913194,-43677200000.0,-102389.694762,0.893654,,,,"{""n_neighbors"": 11}",0.1,3,1


In [11]:
predict_num.estimate_test(grid_search_knreg, {"n_neighbors": 3})
(predict_num.summary_df[predict_num.summary_df['method'] == 'knreg']
 .sort_values('R2_valid', ascending=False)
 .head())

Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
1,knreg,-18746010000.0,-67738.789981,0.954359,-39978290000.0,-99507.146188,0.902656,35755528274.0893,94426.014204,0.915236,"{""n_neighbors"": 3}",0.1,3,1
2,knreg,-30089920000.0,-85513.642555,0.926742,-41277700000.0,-100128.487665,0.899485,,,,"{""n_neighbors"": 7}",0.1,3,1
3,knreg,-35653350000.0,-92638.659812,0.913194,-43677200000.0,-102389.694762,0.893654,,,,"{""n_neighbors"": 11}",0.1,3,1


Secondly, I tried to predict the prices using K-nearest Neigbours regression, even though the method is better suited for analysis of smaller data set than the one I have available in this case. Given the long computation time, only 3 specifications were tried, different in number of neigbours that are used for calculation of the predicted values of which the version with smallest number of neighbours - 3 show the best results. In order to calculate distance between individual observations euclidean distance was used. The performance was improved significantly in comparison with the OLS, the KNN regression was able to explain over 90 % of the variation in the data but the mean absolute error was still quite high at almost 100000. 

## Decision Tree

In [12]:
param_grid = {
    'max_depth': [4, 8, 12, 18, 24, 30, 36],
      'min_samples_split': [8, 12, 14, 18, 24, 28]  # List of possible values for n_neighbors
}
grid_search_dtreg = predict_num.find_best_model(
    method="dtreg", param_grid=param_grid, n_jobs=5
    )
(predict_num.summary_df[predict_num.summary_df['method'] == 'dtreg']
 .sort_values('R2_valid', ascending=False)
 .head())

Fitting 3 folds for each of 42 candidates, totalling 126 fits


Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
31,dtreg,-10841950000.0,-52689.547218,0.973606,-27790570000.0,-78329.300057,0.932355,,,,"{""max_depth"": 24, ""min_samples_split"": 18}",0.1,3,1
39,dtreg,-13633000000.0,-56405.649794,0.966816,-27959250000.0,-78306.037783,0.931946,,,,"{""max_depth"": 30, ""min_samples_split"": 28}",0.1,3,1
45,dtreg,-13577790000.0,-55997.760464,0.966951,-28002580000.0,-78375.897537,0.931838,,,,"{""max_depth"": 36, ""min_samples_split"": 28}",0.1,3,1
33,dtreg,-14042700000.0,-58908.127457,0.965819,-28031030000.0,-78589.587371,0.931771,,,,"{""max_depth"": 24, ""min_samples_split"": 28}",0.1,3,1
32,dtreg,-13041100000.0,-56785.966241,0.968256,-28101660000.0,-78509.493425,0.931613,,,,"{""max_depth"": 24, ""min_samples_split"": 24}",0.1,3,1


In [13]:
predict_num.estimate_test(
    grid_search_dtreg, {"max_depth": 24, "min_samples_split": 18}
    )
(predict_num.summary_df[predict_num.summary_df['method'] == 'dtreg']
 .sort_values('R2_valid', ascending=False)
 .head())

Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
31,dtreg,-10841950000.0,-52689.547218,0.973606,-27790570000.0,-78329.300057,0.932355,26127710198.082478,74976.708088,0.93806,"{""max_depth"": 24, ""min_samples_split"": 18}",0.1,3,1
39,dtreg,-13633000000.0,-56405.649794,0.966816,-27959250000.0,-78306.037783,0.931946,,,,"{""max_depth"": 30, ""min_samples_split"": 28}",0.1,3,1
45,dtreg,-13577790000.0,-55997.760464,0.966951,-28002580000.0,-78375.897537,0.931838,,,,"{""max_depth"": 36, ""min_samples_split"": 28}",0.1,3,1
33,dtreg,-14042700000.0,-58908.127457,0.965819,-28031030000.0,-78589.587371,0.931771,,,,"{""max_depth"": 24, ""min_samples_split"": 28}",0.1,3,1
32,dtreg,-13041100000.0,-56785.966241,0.968256,-28101660000.0,-78509.493425,0.931613,,,,"{""max_depth"": 24, ""min_samples_split"": 24}",0.1,3,1


The using decision tree offers further improvement in the performance, the best models explains almost 94% of the variation in testing data set with mean absolute error approximately 74 700. Same as in the case of KNN regression there is quite a decrease in performance between training and testing data set but in comparison with the KNN offers more ways in which the overfitting can be dealt with. Only two parameters I tried to change were min sample split and max depth of the tree so there is definetly room for further imporvement but given that other method as random forrest or xgboost tree are to be expected to perform better than the simple decission tree I did not explore different model set ups further. 

# Random Forrest

In [14]:
param_grid = {
    'max_depth': [18, 24, 36],
    'min_samples_split': [9, 18],
    'n_estimators': [50, 100] 
}
grid_search_rfreg = predict_num.find_best_model(
    method="rfreg", param_grid=param_grid, n_jobs=7
    )

(predict_num.summary_df[predict_num.summary_df['method'] == 'rfreg']
 .sort_values('R2_valid', ascending=False)
 .head())

Fitting 3 folds for each of 12 candidates, totalling 36 fits


Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
55,rfreg,-6315973000.0,-39037.928055,0.984625,-18125180000.0,-64489.409338,0.955879,,,,"{""max_depth"": 36, ""min_samples_split"": 9, ""n_estimators"": 100}",0.1,3,1
51,rfreg,-6898501000.0,-43694.873722,0.983208,-18202360000.0,-65246.7168,0.955692,,,,"{""max_depth"": 24, ""min_samples_split"": 9, ""n_estimators"": 100}",0.1,3,1
54,rfreg,-6458061000.0,-39351.622645,0.98428,-18347200000.0,-64839.582846,0.955338,,,,"{""max_depth"": 36, ""min_samples_split"": 9, ""n_estimators"": 50}",0.1,3,1
50,rfreg,-7014289000.0,-43964.599537,0.982925,-18441040000.0,-65523.63882,0.955108,,,,"{""max_depth"": 24, ""min_samples_split"": 9, ""n_estimators"": 50}",0.1,3,1
56,rfreg,-9759756000.0,-48952.314318,0.976242,-19181000000.0,-66331.267064,0.953313,,,,"{""max_depth"": 36, ""min_samples_split"": 18, ""n_estimators"": 50}",0.1,3,1


In [15]:
predict_num.estimate_test(
grid_search_rfreg, {"max_depth": 36, "min_samples_split": 9, "n_estimators": 100}	)
(predict_num.summary_df[predict_num.summary_df['method'] == 'rfreg']
 .sort_values('R2_valid', ascending=False)
 .head())

Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
55,rfreg,-6315973000.0,-39037.928055,0.984625,-18125180000.0,-64489.409338,0.955879,16070733182.618862,61890.798323,0.961902,"{""max_depth"": 36, ""min_samples_split"": 9, ""n_estimators"": 100}",0.1,3,1
51,rfreg,-6898501000.0,-43694.873722,0.983208,-18202360000.0,-65246.7168,0.955692,,,,"{""max_depth"": 24, ""min_samples_split"": 9, ""n_estimators"": 100}",0.1,3,1
54,rfreg,-6458061000.0,-39351.622645,0.98428,-18347200000.0,-64839.582846,0.955338,,,,"{""max_depth"": 36, ""min_samples_split"": 9, ""n_estimators"": 50}",0.1,3,1
50,rfreg,-7014289000.0,-43964.599537,0.982925,-18441040000.0,-65523.63882,0.955108,,,,"{""max_depth"": 24, ""min_samples_split"": 9, ""n_estimators"": 50}",0.1,3,1
56,rfreg,-9759756000.0,-48952.314318,0.976242,-19181000000.0,-66331.267064,0.953313,,,,"{""max_depth"": 36, ""min_samples_split"": 18, ""n_estimators"": 50}",0.1,3,1


The best random forrest model explains over 95 % of variance in the testing data set and manages to squeze the mean absolute error under  67000. In comparision with the decision tree we can see that this model deals bettet with the overfitting. The performance on training data set is very similiar between these two models but random forrest performs better in validation and testing. 

## XGBOOST 

In [16]:
param_grid = {
    'max_depth': [3, 6, 9, 12, 15, 18, 13, 17, 21, 22, 24, 26, 28], 'min_samples_split': [18], 
    'n_estimators': [100, 50, 150, 200, 250], 
    'eta': [0.05, 0.1, 0.01, 0.15, 0.2], 
    'lambda': [1, 3, 5], 
    'gamma': [5, 10, 15], 
    'subsample': [0.25, 0.5, 0.75, 1], 
    'colsample_bytree': [0.7, 0.8, 0.5]
}
grid_search_xgboost = predict_num.find_best_model(
    method="xboostreg", param_grid=param_grid, n_jobs=7
    )

Fitting 3 folds for each of 1 candidates, totalling 3 fits


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [17]:
predict_num.estimate_test(
    grid_search_xgboost, {"colsample_bytree": 0.8, "eta": 0.15, "gamma": 5, "max_depth": 26, "n_estimators": 100, "subsample": 0.75}
    )
(predict_num.summary_df[predict_num.summary_df['method'] == 'xboostreg']
 .sort_values('R2_valid', ascending=False)
 .head())

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
58,xboostreg,-2224016000.0,-27997.13664,0.994586,-15984780000.0,-62445.494969,0.96108,14374140072.877008,59836.928617,0.965924,"{""colsample_bytree"": 0.8, ""eta"": 0.15, ""gamma"": 5, ""max_depth"": 26, ""n_estimators"": 100, ""subsample"": 0.75}",0.1,3,1


After lengthy search, I was able to achieve to eplain over 96 % of variation in validation and testing. The tree depth is quite high at 26 which contributes to an overfitting of the model and great difference in performance between training and validation, so there is possibly space for further improving the performance by tweaking of model parameters. Nevertheless after experimenting with different possible values of parameters used to deal with the overfitting (gamma, min_child_weight, max_depth, colsample_bytree, subsample) I was only able make the difference smaller by decreasing performance in the training but not improving it in the validation. Further, 96 % of explained variation or mean absolute error around 60 000 seem to be reasonable given the fact that quite a lot of information important for determining the price of used vehicle was not availiable or was not possible to implement for training of the model such as information about the gadgets that are part of the equipment, the model of car or simply the fact that even vehicles of similiar age and mileage can differ greatly between each other in their condition which is obviously also not controlled for in the model. Even though it has to be also mentioned that this last point might not be of major importance in this case given that there are mostly cars 6 years or younger in the data set whose mileage does not exceed 150 000 kilometres.    

## Summary Results

In [25]:
predict_num.summary_df[~predict_num.summary_df['R2_test'].isnull()].sort_values('R2_valid', ascending=False)

Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
58,xboostreg,-2224016000.0,-27997.13664,0.994586,-15984780000.0,-62445.494969,0.96108,14374140072.877008,59836.928617,0.965924,"{""colsample_bytree"": 0.8, ""eta"": 0.15, ""gamma"": 5, ""max_depth"": 26, ""n_estimators"": 100, ""subsample"": 0.75}",0.1,3,1
55,rfreg,-6315973000.0,-39037.928055,0.984625,-18125180000.0,-64489.409338,0.955879,16070733182.618862,61890.798323,0.961902,"{""max_depth"": 36, ""min_samples_split"": 9, ""n_estimators"": 100}",0.1,3,1
31,dtreg,-10841950000.0,-52689.547218,0.973606,-27790570000.0,-78329.300057,0.932355,26127710198.082478,74976.708088,0.93806,"{""max_depth"": 24, ""min_samples_split"": 18}",0.1,3,1
1,knreg,-18746010000.0,-67738.789981,0.954359,-39978290000.0,-99507.146188,0.902656,35755528274.0893,94426.014204,0.915236,"{""n_neighbors"": 3}",0.1,3,1
0,ols,-110835200000.0,-167494.258849,0.730173,-110950100000.0,-167549.029815,0.729902,114249315327.1947,168154.31387,0.729155,"{""fit_intercept"": true}",0.1,3,1


The final results show that as expected the model that was able to predict the best the used car prices is XGBoost. It was able to pushed the mean absolute error under 63 000 in validation and under 60 000 in testing. Nevertheless, it is very closely followed by  random forrest which was able to explain only 5% less of variation in prediction. Both decission tree and KNN Regression also were able to record R2 higher than 90 %, on the other hand OLS was able achieved only score slightly aboce 70 % showing that assumption of linearity in parameters does not fit the problem that well. Further, it seems that with further experimenting with tweaking of various parameters that both XGBoost and RF offer their performance could may be still marginally improved but there are obvious bounderies in what can be achieved with current data. So being able to predict 96 % of the variation in the data seems to be reasonably good result.