In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
%matplotlib inline
from sklearn.model_selection import train_test_split,KFold,GridSearchCV,PredefinedSplit
from sklearn import linear_model,ensemble, tree, model_selection


# Render floats with thousands separators and three decimals.
pd.options.display.float_format = '{:,.3f}'.format
# Let pandas show up to 300 rows before truncating displayed output.
pd.options.display.max_rows = 300



In [None]:
# File name of the pickled listings DataFrame (the '.plk' extension is what the code reads).
truecar_file = 'truecar_clean_data.plk'
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
df = pd.read_pickle(truecar_file)
# Remove the '_clean' marker from every column name.
df = df.rename(columns={col: col.replace('_clean','') for col in df.columns})
# Keep only rows whose listPrice is below 300,000.
df = df[df.listPrice< 300000]

In [None]:
# Summarise the filtered frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Features are every column except listPrice; the target is kept as a one-column DataFrame.
X,y = df.drop('listPrice', axis=1),df[['listPrice']]

In [None]:
# Object-dtype columns are treated as the categorical (factor) variables.
df_factor_data = df.select_dtypes(include=['object'])
# Every non-object column except the target listPrice is used as a numeric feature.
df_number_data = df.select_dtypes(exclude=['object']).drop('listPrice',axis=1)

In [None]:
# one split of train and test

In [None]:
# Convert each categorical variable to the average training price of its level
# (mean target encoding), using a single train/test split.

X_train,X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=197)

# All encoding statistics come from the training rows only.
train_rows = df.loc[X_train.index]
train_mean_price = train_rows['listPrice'].mean()

df_copy = df.copy(deep = True)
for col in df_factor_data.columns:
    # Mean listPrice of each level of `col`, measured on the training rows.
    level_means = train_rows.groupby(col)['listPrice'].mean()
    # Levels that never occur in the training rows (or are missing) fall back to
    # the overall training mean. Filling with the row's own listPrice, as was done
    # before, puts the target itself into the feature matrix for those rows.
    df_copy[col+'_value'] = df_copy[col].map(level_means).fillna(train_mean_price)

# Encoded categorical columns first, then the numeric columns.
X_cont = pd.concat([df_copy.filter(regex=("value$"),axis=1),df_number_data], axis=1)
X_train_cont = X_cont.loc[X_train.index]
X_test_cont = X_cont.loc[X_test.index]


In [None]:
# Compare different models on the single train/test split.

models = {}
parameters = {}

models['linear_model'] = linear_model.LinearRegression()
models['ridge_model'] = linear_model.Ridge()
models['lasso_model'] = linear_model.Lasso(alpha=.5)
# SGDRegressor's `n_iter` argument was replaced by `max_iter`; tol=None turns off
# early stopping so exactly `max_iter` epochs are run, as `n_iter` used to do.
models['robust_regression'] = linear_model.SGDRegressor(loss='huber',max_iter=20,tol=None)
models['eps_insensitive'] = linear_model.SGDRegressor(loss='epsilon_insensitive',max_iter=20,tol=None)


models['cart'] = tree.DecisionTreeRegressor(max_depth=7)
models['extratrees'] = tree.ExtraTreeRegressor(max_depth=7)
models['randomForest'] = ensemble.RandomForestRegressor()
models['adaboostedTrees'] = ensemble.AdaBoostRegressor()
models['gradboostedTrees'] = ensemble.GradientBoostingRegressor()


# The feature selection (every column not starting with 'interior') does not
# depend on the model, so compute it once.
selected_columns2 = X_cont.filter(regex=("^(?!interior)"),axis=1).columns

# Y_train / Y_test are one-column DataFrames. Flatten them so every model is fit
# on a 1-D target and so the RMSE below subtracts two 1-D arrays; subtracting an
# (n,) prediction from an (n, 1) array broadcasts to an (n, n) matrix.
y_train_values = Y_train.values.ravel()
y_test_values = Y_test.values.ravel()

for name,model in models.items():
    model2 = model
    model2.fit(X_train_cont[selected_columns2],y_train_values)
    test_set_pred2 = model2.predict(X_test_cont[selected_columns2])
    scores = model2.score(X_test_cont[selected_columns2],y_test_values)
    rmse = np.sqrt(np.mean((test_set_pred2 - y_test_values)**2))
    print('Model: '+name)
    print("Score: " + str(scores))
    print("RMSE: " + str(rmse))
    print()

    

In [None]:
# five-fold cross-validation

In [None]:
# Split the data into k folds and convert the categorical variables into average prices.
def _tag_fold(frame, fold_name):
    """Append `fold_name` to the index of `frame` as a second level named 'fold_index'."""
    return frame.assign(fold_index=fold_name).set_index('fold_index', append=True)


def return_data(df):
    """Build the five-fold, mean-encoded train/test sets used by the model helpers.

    For each of the five shuffled folds, every object-dtype column of `df` is
    replaced by a `<column>_value` feature holding the mean `listPrice` of that
    level, measured on the fold's training rows only. Levels that do not occur in
    the training rows (or are missing) get the fold's overall training mean, so a
    row's own price never ends up among its features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric `listPrice` column. Object-dtype columns are
        treated as categorical and every other column as numeric. The row index
        is assumed to be unique.

    Returns
    -------
    X_train_all, X_test_all, y_train_all, y_test_all : pandas.DataFrame
        The per-fold frames stacked on top of each other. Each carries a second
        index level 'fold_index' with values 'fold_1' .. 'fold_5' identifying the
        fold a row belongs to. Feature columns are the encoded `<column>_value`
        columns followed by the numeric columns.
    """
    kf = KFold(n_splits=5, random_state =43, shuffle = True)
    X = df.drop('listPrice', axis=1)
    y = df[['listPrice']]

    # Derive the column groups from the argument rather than from notebook globals.
    factor_columns = df.select_dtypes(include=['object']).columns
    number_data = df.select_dtypes(exclude=['object']).drop('listPrice', axis=1)

    # Collect the per-fold pieces and concatenate once at the end.
    x_train_parts, x_test_parts = [], []
    y_train_parts, y_test_parts = [], []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        train_vin_index = X.iloc[train_index].index
        test_vin_index = X.iloc[test_index].index
        fold_name = 'fold_' + str(fold)

        # Encoding statistics are taken from this fold's training rows only.
        train_rows = df.loc[train_vin_index]
        train_mean_price = train_rows['listPrice'].mean()

        encoded = pd.DataFrame(index=df.index)
        for col in factor_columns:
            level_means = train_rows.groupby(col)['listPrice'].mean()
            # Unseen or missing levels fall back to the training mean instead of
            # the row's own listPrice, which would leak the target.
            encoded[col+'_value'] = df[col].map(level_means).fillna(train_mean_price)

        X_continuous = pd.concat([encoded, number_data], axis=1)

        x_train_parts.append(_tag_fold(X_continuous.loc[train_vin_index], fold_name))
        x_test_parts.append(_tag_fold(X_continuous.loc[test_vin_index], fold_name))
        y_train_parts.append(_tag_fold(y.loc[train_vin_index], fold_name))
        y_test_parts.append(_tag_fold(y.loc[test_vin_index], fold_name))

    X_train_all = pd.concat(x_train_parts)
    X_test_all = pd.concat(x_test_parts)
    y_train_all = pd.concat(y_train_parts)
    y_test_all = pd.concat(y_test_parts)

    assert X_train_all.shape[0] == y_train_all.shape[0]
    assert X_test_all.shape[0] == y_test_all.shape[0]

    return X_train_all, X_test_all, y_train_all, y_test_all


In [None]:
## First look at all models

In [None]:
def model_result(df):
    """Cross-validate a fixed panel of regressors and report their test metrics.

    The folds come from `return_data(df)`. Every model is refit on each of the
    five folds using all feature columns whose name does not start with
    'interior'. The mean and standard deviation of the per-fold test score
    (`model.score`) and test RMSE are printed for each model.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame accepted by `return_data`.

    Returns
    -------
    score_dict, rmse_dict : dict
        Map each model name to its list of five per-fold test scores and to its
        list of five per-fold test RMSEs respectively.
    """
    x_train, x_test, y_train, y_test = return_data(df)

    models = {}

    models['linear_model'] = linear_model.LinearRegression()
    models['ridge_model'] = linear_model.Ridge()
    models['lasso_model'] = linear_model.Lasso(alpha=2)
    # SGDRegressor's `n_iter` argument was replaced by `max_iter`; tol=None turns
    # off early stopping so exactly `max_iter` epochs are run, as `n_iter` used to do.
    models['robust_regression'] = linear_model.SGDRegressor(loss='huber',max_iter=5,tol=None)
    models['eps_insensitive'] = linear_model.SGDRegressor(loss='epsilon_insensitive',max_iter=20,tol=None)

    models['cart'] = tree.DecisionTreeRegressor(max_depth=7)
    models['extratrees'] = tree.ExtraTreeRegressor(max_depth=7)
    models['randomForest'] = ensemble.RandomForestRegressor()
    models['adaboostedTrees'] = ensemble.AdaBoostRegressor()
    models['gradboostedTrees'] = ensemble.GradientBoostingRegressor()

    # Use every feature column whose name does not start with 'interior'.
    selected_columns = x_train.filter(regex=("^(?!interior)"),axis=1).columns

    score_dict = {}
    rmse_dict = {}

    for name,model in models.items():
        test_scores = []
        test_rmse = []
        for i in range(1,6):
            # Rows of a fold are identified by the second index level.
            fold_name = 'fold_' + str(i)
            x_train_fold = x_train.loc[x_train.index.get_level_values(1)==fold_name,selected_columns]
            x_test_fold = x_test.loc[x_test.index.get_level_values(1)==fold_name,selected_columns]
            y_train_fold = y_train.loc[y_train.index.get_level_values(1)==fold_name]
            y_test_fold = y_test.loc[y_test.index.get_level_values(1)==fold_name]

            # The same estimator object is refit from scratch on every fold.
            first_model = model
            first_model.fit(x_train_fold, y_train_fold.values.ravel())
            test_scores.append(first_model.score(x_test_fold, y_test_fold))
            prediction = first_model.predict(x_test_fold)

            test_rmse.append(np.sqrt(np.mean((prediction- y_test_fold['listPrice'])**2)))
        print('Model: '+ name)

        print("Score mean: " + str(np.mean(test_scores)))
        print("Score std: " + str(np.std(test_scores)))
        print("RMSE mean: " + str(np.mean(test_rmse)))
        print("RMSE std: " + str(np.std(test_rmse)))
        print()
        score_dict[name] = test_scores
        rmse_dict[name] = test_rmse

    return score_dict,rmse_dict


In [None]:
# Run the cross-validated comparison; keeps the per-fold scores and RMSEs for each model.
model1_score, model1_rmse = model_result(df)

In [None]:
fig = plt.figure(figsize=(12,10))
# Mean score per model across folds: keep the seven highest, then re-sort ascending
# so the best model is drawn as the top bar. `kind` must be passed by keyword;
# current pandas raises TypeError for positional arguments to Series.plot.
pd.DataFrame.from_dict(model1_score,orient='index').mean(axis=1).sort_values(ascending = False)[:7].sort_values().plot(kind='barh')
fig.savefig('model_score_comparison.png')


In [None]:
# Mean cross-validated score per model, best first.
pd.DataFrame.from_dict(model1_score,orient='index').mean(axis=1).sort_values(ascending = False)

In [None]:
# Mean cross-validated RMSE per model, lowest (best) first.
pd.DataFrame.from_dict(model1_rmse,orient='index').mean(axis=1).sort_values()

In [None]:
# tune parameters

In [None]:
def plot_overfit(df,model_obj,param_ranges,param_static=None):
    """Plot train vs. test score while sweeping one hyperparameter at a time.

    For every entry of `param_ranges`, each candidate value is combined with
    `param_static` (whose entries win on a key clash), an estimator is built
    with `model_obj(**params)` and refit on each of the five folds returned by
    `return_data(df)`. One figure per swept parameter shows the mean training
    and testing score with the per-fold standard deviation as error bars.

    Parameters
    ----------
    df : data frame accepted by `return_data`.
    model_obj : callable returning an estimator with `fit` and `score`.
    param_ranges : dict mapping a parameter name to the values to try.
    param_static : optional dict of fixed keyword arguments for `model_obj`.
    """
    x_train, x_test, y_train, y_test = return_data(df)

    # Features: every column whose name does not start with 'interior'.
    feature_cols = x_train.filter(regex=("^(?!interior)"),axis=1).columns
    fold_names = ['fold_' + str(i) for i in range(1,6)]

    def _rows_of(frame, fold_name):
        # Boolean mask of the rows whose second index level equals fold_name.
        return frame.index.get_level_values(1)==fold_name

    for parameter,parameter_range in param_ranges.items():
        train_means, test_means = [],[]
        train_stds, test_stds = [],[]

        for param_val in parameter_range:
            kwargs = {parameter:param_val}
            if param_static:
                kwargs.update(param_static)

            estimator = model_obj(**kwargs)

            fold_train_scores, fold_test_scores = [],[]
            for fold_name in fold_names:
                fit_x = x_train.loc[_rows_of(x_train, fold_name),feature_cols]
                eval_x = x_test.loc[_rows_of(x_test, fold_name),feature_cols]
                fit_y = y_train.loc[_rows_of(y_train, fold_name)]
                eval_y = y_test.loc[_rows_of(y_test, fold_name)]

                estimator.fit(fit_x, fit_y.values.ravel())
                fold_train_scores.append(estimator.score(fit_x,fit_y))
                fold_test_scores.append(estimator.score(eval_x,eval_y))

            train_means.append(np.mean(fold_train_scores))
            test_means.append(np.mean(fold_test_scores))
            train_stds.append(np.std(fold_train_scores))
            test_stds.append(np.std(fold_test_scores))

        # One figure per swept parameter.
        fig,ax = plt.subplots()
        ax.errorbar(parameter_range,train_means,yerr=train_stds,label='training score')
        ax.errorbar(parameter_range,test_means,yerr=test_stds,label='testing score')

        ax.set_xlabel(parameter)
        ax.set_ylabel('score')
        ax.legend(loc=0)

In [None]:
# Random forest: sweep max_features over 10..19 with all other settings at their defaults.
plot_overfit(df,ensemble.RandomForestRegressor,{'max_features':range(10,20)}) # either 12 or 14

In [None]:
# Random forest: sweep min_samples_leaf over 1..9 with max_features fixed at 14.
plot_overfit(df, ensemble.RandomForestRegressor,{'min_samples_leaf': range(1,10,1)}\
            ,param_static={'max_features':14}) 

In [None]:
# Random forest: sweep the number of trees with the other three settings held fixed.
plot_overfit(df, ensemble.RandomForestRegressor,{'n_estimators': [1,5,10,20,30,50,100,200,300,500,1000]}\
            ,param_static={'max_features':14, 'min_samples_leaf':2,'min_samples_split':7}) 

In [None]:
# Gradient boosting: sweep max_features over 1..19 with all other settings at their defaults.
plot_overfit(df,ensemble.GradientBoostingRegressor,{'max_features':range(1,20)}) # set around 8 or 13

In [None]:
# Gradient boosting: sweep min_samples_leaf over 1..9 with all other settings at their defaults.
plot_overfit(df,ensemble.GradientBoostingRegressor,{'min_samples_leaf':range(1,10)})

In [None]:
# Gradient boosting: sweep max_depth over 1..9 with all other settings at their defaults.
plot_overfit(df,ensemble.GradientBoostingRegressor,{'max_depth':range(1,10)}) # max_depth around 7

In [None]:
# Gradient boosting: sweep the number of boosting stages with all other settings at their defaults.
plot_overfit(df,ensemble.GradientBoostingRegressor,{'n_estimators':[1,5,10,20,30,50,100,200,300,500,1000]}) # 500 looks good

In [None]:
# Gradient boosting: repeat the n_estimators sweep with the learning rate, tree-size limits,
# max_features='sqrt' and subsample=0.8 held fixed.
plot_overfit(df,ensemble.GradientBoostingRegressor,{'n_estimators':[1,5,10,20,30,50,100,200,300,400,500,1000]}\
             ,param_static={'learning_rate':0.15, 'min_samples_split':200, \
                            'min_samples_leaf' :40, 'max_depth' : 6, 'max_features' : 'sqrt',\
                            'subsample' : 0.8})

In [None]:
# Gradient boosting: sweep subsample from 0.2 in steps of 0.1 (stopping below 0.99)
# with the remaining hyperparameters held fixed.
plot_overfit(df,ensemble.GradientBoostingRegressor,{'subsample':np.arange(0.2,0.99,0.1)}\
             ,param_static={'learning_rate':0.15,'max_depth': 13, \
                             'max_features':12 \
                             ,'n_estimators':200,'min_samples_split':70,'min_samples_leaf':20})

In [None]:
# Model Result

In [None]:
# Helper function that cross-validates a model and plots its feature importances.
def plot_model_result(df, model):
    """Cross-validate `model`, print its test metrics and plot feature importances.

    The model is refit on each of the five folds returned by `return_data(df)`,
    using all feature columns whose name does not start with 'interior'. The
    mean and standard deviation of the per-fold test score and test RMSE are
    printed. The bar chart and the returned importances come from the fit on the
    last fold only, because the same estimator object is refit on every fold.

    Parameters
    ----------
    df : data frame accepted by `return_data`.
    model : estimator exposing `fit`, `predict`, `score` and, after fitting,
        `feature_importances_`.

    Returns
    -------
    (feature_importances, selected_columns)
        The last-fold `feature_importances_` array and the matching column index.
    """
    x_train, x_test, y_train, y_test = return_data(df)

    # Feature columns: every column whose name does not start with 'interior'.
    selected_columns = x_train.filter(regex=("^(?!interior)"),axis=1).columns


    test_scores = []
    test_rmse = []
    for i in range(1,6):
        # Rows of a fold are identified by the second index level.
        fold_name = 'fold_' + str(i)
        x_train_fold = x_train.loc[x_train.index.get_level_values(1)==fold_name,selected_columns]
        x_test_fold = x_test.loc[x_test.index.get_level_values(1)==fold_name,selected_columns]
        y_train_fold = y_train.loc[y_train.index.get_level_values(1)==fold_name]
        y_test_fold = y_test.loc[y_test.index.get_level_values(1)==fold_name]


        first_model = model
        first_model.fit(x_train_fold, y_train_fold.values.ravel())
        test_scores.append(first_model.score(x_test_fold, y_test_fold))
        prediction = first_model.predict(x_test_fold)

        test_rmse.append(np.sqrt(np.mean((prediction- y_test_fold['listPrice'])**2)))

    print("Score mean: " + str(np.mean(test_scores)))
    print("Score std: " + str(np.std(test_scores)))
    print("RMSE mean: " + str(np.mean(test_rmse)))
    print("RMSE std: " + str(np.std(test_rmse)))
    print()

    plt.figure(figsize=(15,10))
    feat_imp = pd.Series(first_model.feature_importances_, selected_columns).sort_values(ascending=True)
    feat_imp.plot(kind='barh', title='Feature Importances')
    # Horizontal bars put the importance values on the x-axis, so that is the axis to label.
    plt.xlabel('Feature Importance Score')
    return first_model.feature_importances_,selected_columns



In [None]:
# Cross-validate a random forest with the hyperparameters below and keep the
# importances and column names that plot_model_result returns.
randomforest_feature_importance, randomforest_columns = plot_model_result(df, ensemble.RandomForestRegressor(min_samples_split=7, \
                                           min_samples_leaf = 2, max_depth = 13, max_features = 14,\
                                           n_estimators=200))


In [None]:
# Redraw the random forest feature importances and save the chart to disk.
fig = plt.figure(figsize=(15,10))
feat_imp = pd.Series(randomforest_feature_importance, randomforest_columns).sort_values(ascending=True)
feat_imp.plot(kind='barh', title='Feature Importances')
# Horizontal bars put the importance values on the x-axis, so that is the axis to label.
plt.xlabel('Feature Importance Score')
fig.savefig('randomforest_score_comparison.png')



In [None]:
# Cross-validate gradient boosting with the learning rate divided by 3 and the
# number of estimators multiplied by 3 relative to 0.15 / 200.
grad_importance, grad_columns = plot_model_result(df, ensemble.GradientBoostingRegressor(learning_rate=0.15/3.0, min_samples_split=70, \
                                           min_samples_leaf = 20, max_depth = 13, max_features = 12,\
                                           subsample = 0.8,n_estimators=int(200*3.0)))


In [None]:
# Redraw the gradient boosting feature importances and save the chart to disk.
fig = plt.figure(figsize=(15,10))
feat_imp = pd.Series(grad_importance, grad_columns).sort_values(ascending=True)
feat_imp.plot(kind='barh', title='Feature Importances')
# Horizontal bars put the importance values on the x-axis, so that is the axis to label.
plt.xlabel('Feature Importance Score')
fig.savefig('gradient_score_comparison.png')



In [None]:
# Interpret the gradient boosting model

In [None]:
# Fit the tuned gradient boosting model on the single train/test split.
# Features: every column whose name does not start with 'interior'.
selected_columns3 = X_cont.filter(regex=("^(?!interior)"),axis=1).columns
model3 = ensemble.GradientBoostingRegressor(learning_rate=0.15/3.0, min_samples_split=70, \
                                           min_samples_leaf = 20, max_depth = 13, max_features = 12,\
                                           subsample = 0.8,n_estimators=int(200*3.0))

model3.fit(X_train_cont[selected_columns3],Y_train.values.ravel())
test_set_pred3 = model3.predict(X_test_cont[selected_columns3])
scores = model3.score(X_test_cont[selected_columns3],Y_test)
# Y_test is a one-column DataFrame. Flatten it before subtracting: an (n,)
# prediction minus an (n, 1) array broadcasts to an (n, n) matrix, which gives a
# wrong RMSE and allocates n*n floats.
rmse = np.sqrt(np.mean((test_set_pred3 - Y_test.values.ravel())**2))





In [None]:
# Predicted vs. listed price on the held-out split, with the identity line in red.
fig = plt.figure(figsize=(10,10))

identity_line = np.linspace(0,250000,1000)
plt.scatter(Y_test, test_set_pred3,alpha=.1)
plt.plot(identity_line,identity_line, color = 'red')
plt.xlabel('listed price')
plt.ylabel('predicted price')
fig.savefig('predicted_value_check.png')



In [None]:
# NOTE(review): this third-party import sits mid-notebook; consider moving it to the import cell.
import ml_insights as mli
# Wrap the fitted model3 together with the held-out feature matrix for inspection.
mxr = mli.ModelXRay(model3,X_test_cont[selected_columns3])


In [None]:
# NOTE(review): if feature_dependence_plots creates its own figure, `fig` stays empty
# and the saved PNG will be blank -- confirm against the ml_insights implementation.
fig = plt.figure()

indices_diag = mxr.feature_dependence_plots(num_pts=7) # give a sense of importance of different variables
fig.savefig('interpretation.png')
