In [72]:
import pandas as pd
import numpy as np

In [73]:
cars = pd.read_csv('cars.csv',index_col=0)
cars.head()

Unnamed: 0,region,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,lat,long
13,denver,7995,2010.0,chevrolet,excellent,8 cylinders,gas,194050.0,clean,automatic,4wd,truck,white,co,39.8302,-105.037
25,syracuse,4000,1995.0,dodge,excellent,8 cylinders,gas,133000.0,clean,automatic,4wd,truck,grey,ny,43.0734,-76.1568
28,greensboro,16000,2011.0,bmw,excellent,6 cylinders,gas,85000.0,salvage,automatic,fwd,sedan,grey,nc,35.5895,-82.5671
29,syracuse,10950,2011.0,buick,excellent,6 cylinders,gas,43418.0,clean,automatic,fwd,sedan,red,ny,43.1226,-76.1284
31,syracuse,9400,2011.0,bmw,good,6 cylinders,gas,145000.0,clean,automatic,4wd,SUV,blue,ny,43.1707,-76.0962


In [74]:
cars.drop(['region','state','lat','long'], inplace=True, axis=1)
cars.head(3)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color
13,7995,2010.0,chevrolet,excellent,8 cylinders,gas,194050.0,clean,automatic,4wd,truck,white
25,4000,1995.0,dodge,excellent,8 cylinders,gas,133000.0,clean,automatic,4wd,truck,grey
28,16000,2011.0,bmw,excellent,6 cylinders,gas,85000.0,salvage,automatic,fwd,sedan,grey


In [75]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pprint import pprint

In [76]:
mapper = DataFrameMapper([
    (['condition','cylinders','title_status'], OrdinalEncoder(categories=[['salvage','fair','good','excellent','like new','new'],
                                                                   ['3 cylinders','4 cylinders','5 cylinders','6 cylinders','8 cylinders','10 cylinders','12 cylinders','other'],
                                                                   ['salvage','parts only','missing','rebuilt','lien','clean']])),
    (['manufacturer','fuel','transmission','drive','type','paint_color'], OneHotEncoder(handle_unknown='ignore')),
    (['year', 'odometer'], StandardScaler())
])

In [77]:
feature = cars.columns.drop(['price'])
xtrain, xtest, ytrain, ytest = train_test_split(cars[feature],cars['price'],test_size=.2)

In [78]:
pipelineRF = Pipeline(steps=[
    ('mapper', mapper),
    ('estimator', RandomForestRegressor())
])

In [79]:
pipelineRF.fit(xtrain, ytrain)

Pipeline(memory=None,
         steps=[('mapper',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[(['condition', 'cylinders',
                                             'title_status'],
                                            OrdinalEncoder(categories=[['salvage',
                                                                        'fair',
                                                                        'good',
                                                                        'excellent',
                                                                        'like '
                                                                        'new',
                                                                        'new'],
                                                                       ['3 '
                                                                        'cylinders',
                           

In [80]:
predRF = pipelineRF.predict(xtest)

In [81]:
maerf = mean_absolute_error(ytest, predRF)
rmserf = np.sqrt(mean_squared_error(ytest, predRF))

In [82]:
pipelineDT = Pipeline(steps=[
    ('mapper', mapper),
    ('model', DecisionTreeRegressor())
])

In [83]:
pipelineDT.fit(xtrain, ytrain)

Pipeline(memory=None,
         steps=[('mapper',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[(['condition', 'cylinders',
                                             'title_status'],
                                            OrdinalEncoder(categories=[['salvage',
                                                                        'fair',
                                                                        'good',
                                                                        'excellent',
                                                                        'like '
                                                                        'new',
                                                                        'new'],
                                                                       ['3 '
                                                                        'cylinders',
                           

In [84]:
predDT = pipelineDT.predict(xtest)

In [85]:
maedt = mean_absolute_error(ytest, predDT)
rmsedt = np.sqrt(mean_squared_error(ytest, predDT))

In [86]:
DecisionTreeRegressor().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [87]:
param = {
    'model__criterion':['mse','mae','friedman_mse'],
    'model__splitter':['best','random'],
    'model__min_samples_split':[2,4,6]
}

In [88]:
gs = GridSearchCV(pipelineDT,param_grid=param,cv=3)

In [89]:
gs.fit(xtrain,ytrain)
# pipelineDT.get_params()

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('mapper',
                                        DataFrameMapper(default=False,
                                                        df_out=False,
                                                        features=[(['condition',
                                                                    'cylinders',
                                                                    'title_status'],
                                                                   OrdinalEncoder(categories=[['salvage',
                                                                                               'fair',
                                                                                               'good',
                                                                                               'excellent',
                                                             

In [90]:
gs.best_params_

{'model__criterion': 'friedman_mse',
 'model__min_samples_split': 6,
 'model__splitter': 'random'}

In [91]:
predGS = gs.predict(xtest)

In [92]:
maegs = mean_absolute_error(ytest, predGS)
rmsegs = np.sqrt(mean_squared_error(ytest,predGS))

In [93]:
from xgboost import XGBRegressor

In [94]:
pipeXGB = Pipeline(steps=[
    ('mapper',mapper),
    ('model',XGBRegressor(objective='reg:squarederror'))
])

In [95]:
parameters = {
    'model__n_estimators': [60, 100, 120, 140],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [5, 7],
    'model__early_stopping_rounds':[5,10],
    'model__max_depth':[5,7]
}

In [96]:
pipeXGB.fit(xtrain,ytrain)

Pipeline(memory=None,
         steps=[('mapper',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[(['condition', 'cylinders',
                                             'title_status'],
                                            OrdinalEncoder(categories=[['salvage',
                                                                        'fair',
                                                                        'good',
                                                                        'excellent',
                                                                        'like '
                                                                        'new',
                                                                        'new'],
                                                                       ['3 '
                                                                        'cylinders',
                           

In [97]:
predXGB = pipeXGB.predict(xtest)

In [98]:
maexgb = mean_absolute_error(ytest, predXGB)
rmsexgb = np.sqrt(mean_squared_error(ytest, predXGB))

In [99]:
gs2 = GridSearchCV(pipeXGB,cv=3,param_grid=parameters)

In [100]:
gs2.fit(xtrain,ytrain)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('mapper',
                                        DataFrameMapper(default=False,
                                                        df_out=False,
                                                        features=[(['condition',
                                                                    'cylinders',
                                                                    'title_status'],
                                                                   OrdinalEncoder(categories=[['salvage',
                                                                                               'fair',
                                                                                               'good',
                                                                                               'excellent',
                                                             

In [101]:
gs2.best_params_

{'model__early_stopping_rounds': 5,
 'model__learning_rate': 0.1,
 'model__max_depth': 7,
 'model__n_estimators': 140}

In [102]:
predgs2 = gs2.predict(xtest)

In [103]:
maegs2 = mean_absolute_error(ytest, predgs2)
rmsegs2 = np.sqrt(mean_squared_error(ytest, predgs2))

In [104]:
mae = pd.DataFrame({
    'Model':['Random Forest Regressor','Decision Tree Regressor','Grid Search Decision Tree Regressor', 'XGBoost', 'Grid Search XGBoost'],
    'Mean Absolute Error':[maerf, maedt, maegs, maexgb, maegs2],
    'Rooted Mean Squared Error':[rmserf, rmsedt, rmsegs, rmsexgb, rmsegs2]
})
mae

Unnamed: 0,Model,Mean Absolute Error,Rooted Mean Squared Error
0,Random Forest Regressor,1495.733202,2634.623438
1,Decision Tree Regressor,1819.944615,3436.214886
2,Grid Search Decision Tree Regressor,1929.986825,3365.489352
3,XGBoost,2355.858559,3462.665792
4,Grid Search XGBoost,1957.874816,2981.481709


In [105]:
print("Grid Search Decision Tree Regressor Best Parameters:")
pprint(gs.best_params_)

Grid Search Decision Tree Regressor Best Parameters:
{'model__criterion': 'friedman_mse',
 'model__min_samples_split': 6,
 'model__splitter': 'random'}


In [106]:
print("Grid Search XGBoost Best Parameters:")
pprint(gs2.best_params_)

Grid Search XGBoost Best Parameters:
{'model__early_stopping_rounds': 5,
 'model__learning_rate': 0.1,
 'model__max_depth': 7,
 'model__n_estimators': 140}


In [1]:
# import joblib
# joblib.dump(pipelineRF, 'model')