In [3]:
from sklearn.preprocessing import StandardScaler
from ml_summary import *

In [4]:
int_remove_cols = ["carmaker", "transmission", "drive", "car_style",
                    "emission_class", "door_count", "color", "carmaker_volkswagen", "transmission_manual", "drive_4x2", "car_style_sedan", "door_count_4_5", "color_s", "country_origin_Germany", 'emission_class_old', 'fuel_type', 'fuel_type_petrol', 'country_origin']
int_cont_cols = ["power", "cubic_capacity", 
                 "fuel_consumption_combined", "co2_emission", "vehicle_age"]

In [5]:
df_comparison = pd.DataFrame(columns=["method", "MSE_train", "MAD_train",
                                       "R2_train", "MSE_test", "MAD_test", "R2_test"])

# Introduction

Following report is dedicated to finding a model and its specification that can be used for the most accurate price prediction in our data set. In the estimation of the individual models, we will use 9:1 train test ratio, and three fold cross validation when training the models. The best fits will be selected based on the value of mean squared error. In order to explore different model specifications I use GrindSearchCV function from sklearn package. Following models will be tried: OLS, KNN Regression, Decison Tree Regression, Random Forrest Regression, XGBoost Tree Regression.

In [6]:
#read data
scaler = StandardScaler()
data = pd.read_csv("../data/data_pred.csv")
data.head()

Unnamed: 0,power,cubic_capacity,fuel_consumption_combined,co2_emission,mileage,vehicle_age,carmaker_audi,carmaker_bmw,carmaker_citroen,carmaker_dacia,...,country_origin_Italy,country_origin_Luxembourg,country_origin_Netherlands,country_origin_Poland,country_origin_Romania,country_origin_Slovakia,country_origin_Spain,country_origin_Sweden,country_origin,price
0,52.0,1.193,4.5,104.0,0.0,113,False,False,False,False,...,False,False,False,False,False,False,False,False,Germany,361990
1,75.0,1.598,5.2,136.0,163.0,3188,False,False,False,False,...,False,False,False,False,False,False,False,False,Belgium,305490
2,66.0,1.197,4.7,107.0,78.56,2731,False,False,False,False,...,False,False,False,False,False,False,False,False,Germany,328990
3,120.0,1.997,6.9,179.0,124.516,4496,False,False,True,False,...,False,False,False,False,False,False,False,False,Germany,120490
4,92.0,1.0,6.0,134.0,43.8,2000,False,False,False,False,...,False,False,False,False,False,False,False,False,Germany,401490


In [7]:
#prepare data for estimation
columns =  data.columns
x_columns = list(data.columns)
x_columns.remove("price")
for column in data.select_dtypes(include=['bool']):
    data[column] = data[column].astype(int) 
for col in int_remove_cols:
    x_columns.remove(col)
X = data[x_columns]

#scale continous variables
X[int_cont_cols] = scaler.fit_transform(X[int_cont_cols])

#target variable - price  
y = data["price"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[int_cont_cols] = scaler.fit_transform(X[int_cont_cols])


In [8]:
#class used for storing results using several different ML algorithms
predict_num = PredictionSummary()
#create train/test data sets inside class for estimation
predict_num.load_data(X=X, y=y)

## OLS

In [15]:
param_grid = {
    'fit_intercept': [True],  # List of possible values for n_neighbors
}
grid_search = predict_num.find_best_model(method="ols", param_grid=param_grid)
predict_num.estimate_test(grid_search, {'fit_intercept': True})
predict_num.summary_df[summary_df['method'] == 'ols'].sort_values('R2_valid', ascending=False).head()

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END fit_intercept=True; neg_mean_absolute_error: (train=-166756.743, test=-167410.949) neg_mean_squared_error: (train=-109060085958.908, test=-113584049932.714) r2: (train=0.732, test=0.729) total time=   0.6s
[CV 2/3] END fit_intercept=True; neg_mean_absolute_error: (train=-167603.662, test=-167914.343) neg_mean_squared_error: (train=-110950710209.075, test=-109818217618.593) r2: (train=0.730, test=0.732) total time=   0.5s
[CV 3/3] END fit_intercept=True; neg_mean_absolute_error: (train=-168295.525, test=-167501.953) neg_mean_squared_error: (train=-111597728869.786, test=-108551507666.278) r2: (train=0.731, test=0.731) total time=   0.5s


  self.summary_df = pd.concat([self.summary_df, pd.DataFrame(stat_row, index=[self.df_index])])


Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
0,ols,-110536200000.0,-167551.97663,0.730901,-110651300000.0,-167609.08205,0.730629,113990066862.7722,168211.863199,0.729769,"{""fit_intercept"": true}",0.1,3,1


Firstly, I attempted to predict the prices using the OLS. I tested only one model specification with all of the variables included in the estimation, nevertheless there seems to be very little difference in performacne between training validation and testing so the overfitting does not seem to be a problem in this case. The model was able to explain approximately 73 % of variance in all 3 phases and the mean absolute error was 167 500 which is obviously too much for the model to have any practictal use.

## K-nearest Neighbors

In [9]:
param_grid = {
    'n_neighbors': [3, 7, 11],  # List of possible values for n_neighbors
}
grid_search_knreg = predict_num.find_best_model(
    method="knreg", param_grid=param_grid, n_jobs=5
    )
predict_num.summary_df[summary_df['method'] == 'knreg'].sort_values('R2_valid', ascending=False).head()

{'n_neighbors': 3}
{'method': 'knreg', 'MSE_train': -18746012575.120937, 'MSE_valid': -39978290516.27935, 'MAD_train': -67738.78998082389, 'MAD_valid': -99507.14618756752, 'R2_valid': 0.9026563348313258, 'R2_train': 0.9543592446580654, 'params': '{"n_neighbors": 3}', 'iteration': 1, 'cv_n': 3, 'test_size': 0.09999999999999998}
  method     MSE_train      MAD_train  R2_train     MSE_valid      MAD_valid  \
0    ols -1.108352e+11 -167494.258849  0.730173 -1.109501e+11 -167549.029815   

   R2_valid             MSE_test      MAD_test   R2_test  \
0  0.729902  114249315327.194702  168154.31387  0.729155   

                    params  test_size cv_n iteration  
0  {"fit_intercept": true}        0.1    3         1  


Unnamed: 0,method,MSE_train,MAD_train,R2_train,MSE_valid,MAD_valid,R2_valid,MSE_test,MAD_test,R2_test,params,test_size,cv_n,iteration
0,ols,-110835200000.0,-167494.258849,0.730173,-110950100000.0,-167549.029815,0.729902,114249315327.1947,168154.31387,0.729155,"{""fit_intercept"": true}",0.1,3,1
1,knreg,-18746010000.0,-67738.789981,0.954359,-39978290000.0,-99507.146188,0.902656,35755528274.0893,94426.014204,0.915236,"{""n_neighbors"": 3}",0.1,3,1


In [None]:
predict_num.estimate_test(grid_search_knreg, {"n_neighbors": 3})
predict_num.summary_df[predict_num.summary_df['method'] == 'knreg'].sort_values('R2_valid', ascending=False).head()

Secondly, I tried to predict the prices using K-nearest Neigbours regression, even though the method is better suited for analysis of smaller data set than the one I have available in this case. Given the long computation time, only 3 specifications were tried, different in number of neigbours that are used for calculation of the predicted values of which the version with smallest number of neighbours - 3 show the best results. In order to calculate distance between individual observations euclidean distance was used. The performance was improved significantly in comparison with the OLS, the KNN regression was able to explain over 90 % of the variation in the data but the mean absolute error was still quite high at almost 100000. 

## Decision Tree

In [16]:
param_grid = {
    'max_depth': [4, 8, 12, 18, 24, 30, 36],
      'min_samples_split': [8, 12, 14, 18, 24, 28]  # List of possible values for n_neighbors
}
grid_search_dtreg = predict_num.find_best_model(
    method="dtreg", param_grid=param_grid, n_jobs=5
    )
predict_num.summary_df[predict_num.summary_df['method'] == 'dtreg'].sort_values('R2_valid', ascending=False).head()

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
0,ols,-110536200000.0,-167551.97663,0.730901,-110651300000.0,-167609.08205,0.730629,113990066862.7722,168211.863199,0.729769,"{""fit_intercept"": true}",0.1,3,1


In [None]:
predict_num.estimate_test(
    grid_search_dtreg, {"max_depth": 24, "min_samples_split": 18}
    )
predict_num.summary_df[predict_num.summary_df['method'] == 'dtreg'].sort_values('R2_valid', ascending=False).head()

The using decision tree offers further improvement in the performance, the best models explains almost 94% of the variation in testing data set with mean absolute error approximately 74 700. Same as in the case of KNN regression there is quite a decrease in performance between training and testing data set but in comparison with the KNN offers more ways in which the overfitting can be dealt with. Only two parameters I tried to change were min sample split and max depth of the tree so there is definetly room for further imporvement but given that other method as random forrest or xgboost tree are to be expected to perform better than the simple decission tree I did not explore different model set ups further. 

# Random Forrest

In [12]:
param_grid = {
    'max_depth': [36],
      'min_samples_split': [18],
        'n_estimators': [50] 
}
grid_search_dtreg = predict_num.find_best_model(
    method="rfreg", param_grid=param_grid, n_jobs=7
    )

predict_num.summary_df[predict_num.summary_df['method'] == 'rfreg'].sort_values('R2_valid', ascending=False).head()

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [None]:
predict_num.estimate_test(
grid_search_dtreg, {"learning_rate": 0.05, "max_depth": 32, "min_samples_split": 18, "n_estimators": 50})
predict_num.summary_df[predict_num.summary_df['method'] == 'rfreg'].sort_values('R2_valid', ascending=False).head()

NameError: name 'predict_num' is not defined

The best random forrest model explains over 95 % of variance in the testing data set and manages to squeze the mean absolute error under  67000. In comparision with the decision tree we can see that this model deals bettet with the overfitting. The performance on training data set is very similiar between these two models but random forrest performs better in validation and testing. 

## XGBOOST 

In [None]:
param_grid = {
{'max_depth': [3, 6, 9, 12, 15, 18, 13, 17, 21, 22, 24, 26, 28], 'min_samples_split': [18], 
'n_estimators': [100, 50, 150, 200, 250], 
'eta': [0.05, 0.1, 0.01, 0.15, 0.2], 
'lambda': [1, 3, 5], 
'gamma': [5, 10, 15], 
'subsample': [0.25, 0.5, 0.75, 1], 
'colsample_bytree': [0.7, 0.8, 0.5]}
}

In [11]:
param_grid = {
"colsample_bytree": [0.8], "eta": [0.15], "gamma": [5], "max_depth": [26], "n_estimators": [100], "subsample": [0.75], 'min_child_weight': [5]}
grid_search_xgboost = predict_num.find_best_model(
    method="xboostreg", param_grid=param_grid, n_jobs=7
    )

Fitting 3 folds for each of 1 candidates, totalling 3 fits


KeyboardInterrupt: 

In [10]:
predict_num.summary_df

Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
0,xboostreg,-15241700000.0,-60192.937427,0.962893,-20950830000.0,-69982.228267,0.948997,,,,"{""colsample_bytree"": 0.8, ""eta"": 0.15, ""gamma""...",0.1,3,1
1,xboostreg,-22990780000.0,-72190.650075,0.944025,-26646370000.0,-77266.730595,0.935142,,,,"{""colsample_bytree"": 0.8, ""eta"": 0.15, ""gamma""...",0.1,3,1


In [14]:
predict_num.estimate_test(
    grid_search_xgboost, {"colsample_bytree": 0.8, "eta": 0.15, "gamma": 5, "max_depth": 26, "n_estimators": 100, "subsample": 0.75}
    )

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [12]:
pd.set_option('display.max_colwidth', 250)

After lengthy search, I was able to achieve to eplain over 96 % of variation in validation and testing. The tree depth is quite hight at 26 which 

## Summary Results

In [43]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 250)

In [10]:
metrics[metrics['method'] == 'xboostreg'].sort_values('R2_valid', ascending=False).head()

Unnamed: 0,method,MSE_train,MAE_train,R2_train,MSE_valid,MAE_valid,R2_valid,MSE_test,MAE_test,R2_test,params,test_size,cv_n,iteration
205,xboostreg,-2224016000.0,-27997.13664,0.994586,-15984780000.0,-62445.494969,0.96108,,,,"{""colsample_bytree"": 0.8, ""eta"": 0.15, ""gamma""...",0.1,3,1
204,xboostreg,-2764802000.0,-31829.153063,0.993271,-16286750000.0,-64030.115164,0.960343,,,,"{""colsample_bytree"": 0.7, ""eta"": 0.15, ""gamma""...",0.1,3,1
198,xboostreg,-1888969000.0,-24772.127363,0.995402,-16974000000.0,-62923.111557,0.958678,,,,"{""eta"": 0.15, ""gamma"": 10, ""max_depth"": 26, ""n...",0.1,3,1
194,xboostreg,-1888969000.0,-24772.127363,0.995402,-16974000000.0,-62923.111557,0.958678,,,,"{""eta"": 0.15, ""gamma"": 5, ""max_depth"": 26, ""n_...",0.1,3,1
202,xboostreg,-1888969000.0,-24772.127363,0.995402,-16974000000.0,-62923.111557,0.958678,,,,"{""eta"": 0.15, ""gamma"": 15, ""max_depth"": 26, ""n...",0.1,3,1
