In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read data file
car_df = pd.read_csv("used_car_price.csv")

In [3]:
car_df.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945,3.5,6,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820,2.0,4,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,26990,2.4,4,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,33195,3.2,6,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755,3.5,6,225,18,24,3880,115,197


In [4]:
# Let's perform One-Hot Encoding for categorical features lie Make, Model, Type, Origin, and DriveTrain
car_df = pd.get_dummies(car_df, columns=["Make", "Model", "Type", "Origin", "DriveTrain"])

In [5]:
car_df

Unnamed: 0,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,Make_Acura,...,Type_Sedan,Type_Sports,Type_Truck,Type_Wagon,Origin_Asia,Origin_Europe,Origin_USA,DriveTrain_All,DriveTrain_Front,DriveTrain_Rear
0,36945,3.5,6,265,17,23,4451,106,189,1,...,0,0,0,0,1,0,0,1,0,0
1,23820,2.0,4,200,24,31,2778,101,172,1,...,1,0,0,0,1,0,0,0,1,0
2,26990,2.4,4,200,22,29,3230,105,183,1,...,1,0,0,0,1,0,0,0,1,0
3,33195,3.2,6,270,20,28,3575,108,186,1,...,1,0,0,0,1,0,0,0,1,0
4,43755,3.5,6,225,18,24,3880,115,197,1,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,40565,2.4,5,197,21,28,3450,105,186,0,...,1,0,0,0,0,1,0,0,1,0
424,42565,2.3,5,242,20,26,3450,105,186,0,...,1,0,0,0,0,1,0,0,1,0
425,45210,2.9,6,268,19,26,3653,110,190,0,...,1,0,0,0,0,1,0,0,1,0
426,26135,1.9,4,170,22,29,2822,101,180,0,...,0,0,0,1,0,1,0,0,1,0


In [11]:
# Feeding input features to X and output (MSRP) to y
X = car_df.drop("MSRP", axis = 1)
y = car_df["MSRP"]

In [12]:
X = np.array(X)

In [13]:
y = np.array(y)

In [14]:
# Doing a train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.2)

In [15]:
X_train.shape

(342, 483)

In [16]:
X_test.shape

(86, 483)

### Applying XGBoost Without Optimization

In [17]:
import xgboost as xgb
model = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 1, max_depth = 3, n_estimators = 500)
model.fit(X_train, y_train)

# predict the score of the trained model using the testing dataset
result = model.score(X_test, y_test)
y_predict = model.predict(X_test)

In [18]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE,'\nR2 =', r2) 

RMSE = 7936.377 
R2 = 0.7957875034739448


### XGBoost With Grid Search Optimization

In [19]:
from sklearn.model_selection import GridSearchCV
parameters_grid = { 'max_depth': [3, 10, 20], 
                   'learning_rate': [0.1, 0.5, 1],
                   'n_estimators': [100, 500],
                   'colsample_bytree': [0.3, 0.7]}

model = xgb.XGBRegressor(objective ='reg:squarederror')

xgb_gridsearch = GridSearchCV(estimator = model, 
                              param_grid = parameters_grid, 
                              scoring = 'neg_mean_squared_error',  
                              cv = 5, 
                              verbose = 5)

xgb_gridsearch.fit(X_train, y_train)
y_predict = xgb_gridsearch.predict(X_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-33533163.530 total time=   0.4s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-53107052.985 total time=   0.6s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-167915825.074 total time=   0.6s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-26810618.803 total time=   0.5s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-66690186.577 total time=   0.7s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=-26548315.032 total time=   2.9s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=-47939336.669 total time=   3.0s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.1, max_dept

[CV 1/5] END colsample_bytree=0.3, learning_rate=1, max_depth=3, n_estimators=500;, score=-100882478.405 total time=   0.9s
[CV 2/5] END colsample_bytree=0.3, learning_rate=1, max_depth=3, n_estimators=500;, score=-88694564.274 total time=   0.9s
[CV 3/5] END colsample_bytree=0.3, learning_rate=1, max_depth=3, n_estimators=500;, score=-202057938.073 total time=   0.9s
[CV 4/5] END colsample_bytree=0.3, learning_rate=1, max_depth=3, n_estimators=500;, score=-37111296.349 total time=   0.9s
[CV 5/5] END colsample_bytree=0.3, learning_rate=1, max_depth=3, n_estimators=500;, score=-81669841.103 total time=   1.0s
[CV 1/5] END colsample_bytree=0.3, learning_rate=1, max_depth=10, n_estimators=100;, score=-108581888.616 total time=   0.4s
[CV 2/5] END colsample_bytree=0.3, learning_rate=1, max_depth=10, n_estimators=100;, score=-114418603.912 total time=   0.4s
[CV 3/5] END colsample_bytree=0.3, learning_rate=1, max_depth=10, n_estimators=100;, score=-267789142.624 total time=   0.4s
[CV 4/5]

[CV 2/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=100;, score=-94949734.531 total time=   0.7s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=100;, score=-158611848.758 total time=   0.7s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=100;, score=-32735406.452 total time=   0.9s
[CV 5/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=100;, score=-92285053.141 total time=   0.7s
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=500;, score=-51779757.189 total time=   4.1s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=500;, score=-94913496.540 total time=   4.2s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=500;, score=-158623244.207 total time=   3.7s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=10, n_estimators=500;, score=-32740197.970 total tim

In [20]:
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE,'\nR2 =', r2) 

RMSE = 4897.023 
R2 = 0.9222497221060714


### XGBoost With Random Search

In [21]:
from sklearn.model_selection import RandomizedSearchCV

grid = {
    'n_estimators': [100, 500],
    'max_depth': [3, 10, 20],
    'learning_rate': [0.1, 0.5], 
    'colsample_bytree': [0.3, 0.7]}


import xgboost as xgb
model = xgb.XGBRegressor(objective ='reg:squarederror')

random_cv = RandomizedSearchCV(estimator = model,
                               param_distributions = grid,
                               cv = 5, 
                               n_iter = 100,
                               scoring = 'neg_mean_absolute_error',
                               verbose = 5, 
                               return_train_score = True)
random_cv.fit(X_train, y_train)

random_cv.best_estimator_
y_predict = random_cv.predict(X_test)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




[CV 1/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2732.912, test=-4270.703) total time=   0.1s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2495.782, test=-4787.527) total time=   0.2s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2593.123, test=-4918.865) total time=   0.2s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2700.076, test=-3700.005) total time=   0.2s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2490.215, test=-4627.835) total time=   0.3s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=(train=-1206.932, test=-3785.480) total time=   1.4s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=(train=-1171.053, test=-4483.688) tota

[CV 3/5] END colsample_bytree=0.3, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.001, test=-5450.257) total time=   5.0s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.000, test=-3572.442) total time=   4.6s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.000, test=-5320.435) total time=   4.7s
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2583.328, test=-4078.480) total time=   0.2s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2522.822, test=-4986.141) total time=   0.2s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2379.272, test=-5021.319) total time=   0.2s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=-2607.368, test=-3670.031) total time

[CV 5/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=20, n_estimators=100;, score=(train=-0.003, test=-5279.375) total time=   1.2s
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.001, test=-4470.376) total time=   6.6s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.001, test=-5780.966) total time=   6.9s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.001, test=-5850.443) total time=   7.2s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.001, test=-3710.585) total time=   6.8s
[CV 5/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=20, n_estimators=500;, score=(train=-0.001, test=-5279.379) total time=   6.7s


In [22]:
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE,'\nR2 =', r2) 

RMSE = 4897.023 
R2 = 0.9222497221060714


### XGBoost With Bayesian Optimization

In [23]:
from skopt import BayesSearchCV

search_space = {
        "max_depth": (4, 20, 'log-uniform'),
        "n_estimators": (2, 100, 'log-uniform'),
        'learning_rate': (0.01, 1.0, 'log-uniform')}            

model = xgb.XGBRegressor(objective ='reg:squarederror')

xgb_bayes_search = BayesSearchCV(model, 
                                    search_space, 
                                    n_iter = 100, 
                                    scoring = 'neg_mean_absolute_error', 
                                    cv = 5)

xgb_bayes_search.fit(X_train, y_train) 

y_predict = xgb_bayes_search.predict(X_test)

In [24]:
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE,'\nR2 =', r2)

RMSE = 6240.85 
R2 = 0.8737227242321206


In [25]:
# From the above experiment GridSearch and RandomSearch provides better result with almost same performance.