# Decision Tree Regression - Used Car Price Predictor

In [35]:
import warnings
warnings.filterwarnings('ignore')

In [36]:
#Importing required modules
import pandas as pd
import numpy as np
from sqlalchemy import create_engine,inspect, func
from config import password

import seaborn as sns
import scipy.stats as ss
from collections import Counter
import math 
from scipy import stats
import datetime
import matplotlib
import pickle
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from spark_sklearn import GridSearchCV
from spark_sklearn.util import createLocalSparkSession
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

from statsmodels.graphics.tsaplots import plot_acf

In [37]:
# Connecting to the SQL database.
# The password is percent-encoded before being placed in the URL so that
# characters with a special meaning there (e.g. '@', ':', '/', ' ') cannot
# corrupt the connection string. Passwords without such characters are
# unchanged by the encoding.
from urllib.parse import quote

connection_string = "postgres:" + quote(password, safe="") + "@localhost:5432/hondadb"
engine = create_engine(f'postgresql://{connection_string}')

In [38]:
# Pull every row of the cleaned car table into a DataFrame.
df_cleaned = pd.read_sql(sql="SELECT * FROM cleanedcardb2", con=engine)

#### Label Encoding

In [39]:
# Columns to integer-encode; `les` keeps one fitted LabelEncoder per column,
# keyed by column name.
features = ['Car Type', 'Model']
les = {name: preprocessing.LabelEncoder().fit(df_cleaned[name]) for name in features}

# Replace each column in place with its encoded values.
for f in features:
  df_cleaned[f] = les[f].transform(df_cleaned[f])

#### Splitting Train and Test

In [40]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(df_cleaned, random_state=100, test_size=0.3)

# For each partition: features are every column except "Price"; the target is
# an independent copy of the "Price" column.
df_train, df_price_train = train_set.drop(columns="Price"), train_set["Price"].copy()
df_test, df_price_test = test_set.drop(columns="Price"), test_set["Price"].copy()

#### Best Score Function

In [41]:
# Largest absolute per-split test score recorded by the search, taken over the
# first `cv` splits and over every candidate.
def best_score(forest, cv):
  """Return the maximum |score| found in the per-split test scores.

  forest -- fitted search object exposing ``cv_results_``
  cv     -- number of splits to scan ('split0_test_score' .. 'split{cv-1}_test_score')

  The running maximum starts at 0, so 0 is returned when ``cv`` is 0.
  """
  top = 0
  for split_idx in range(cv):
    key = 'split' + str(split_idx) + '_test_score'
    magnitudes = [abs(value) for value in forest.cv_results_[key]]
    # Fold the current maximum in with this split's magnitudes.
    top = max(np.append(top, magnitudes))

  return top

#### Best Param Function

In [42]:
# This function returns the parameter combination that obtained the best
# (rank 1) cross-validated score.
def best_params(forest):
  """Return the parameter dict of the top-ranked grid-search candidate.

  forest -- fitted search object exposing ``cv_results_`` with the
            'params' and 'rank_test_score' entries.

  'rank_test_score' holds, for each candidate position, that candidate's rank
  (smaller is better). The best candidate is therefore the position holding the
  smallest rank. The previous code used the rank of candidate 0 as an index,
  which is only right by coincidence.
  """
  ranks = forest.cv_results_['rank_test_score']
  best_index = int(np.argmin(ranks))
  return forest.cv_results_['params'][best_index]

#### Performance Metric Function

In [43]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Score predictions against the true values.

    Delegates to ``r2_score(y_true, y_predict)`` and returns its result
    unchanged.
    """
    return r2_score(y_true, y_predict)

# Decision Trees

In [44]:
# Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer

def DT_SparkizedGridSearchCV(X, y, max_depths=(1, 5, 10, 15, 16, 17)):
    """Grid-search 'max_depth' for a DecisionTreeRegressor on [X, y].

    X          -- training features
    y          -- training target
    max_depths -- candidate 'max_depth' values; the default reproduces the
                  grid originally hard-coded here.

    Returns the fitted spark-sklearn GridSearchCV object.
    """
    # Imported locally: the notebook imports ShuffleSplit only in a later cell,
    # and this function must not depend on that cell having been run first.
    from sklearn.model_selection import ShuffleSplit

    # 10 random 80/20 train/validation splits with a fixed seed.
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=42)

    # Estimator whose depth is being tuned.
    regressor = DecisionTreeRegressor()

    # Candidate values for 'max_depth'.
    params = {'max_depth': list(max_depths)}

    # Wrap 'performance_metric' as a scorer usable by the grid search.
    scoring_fnc = make_scorer(performance_metric)

    # The spark-sklearn grid search takes a SparkContext as its first argument.
    sc = createLocalSparkSession().sparkContext
    grid = GridSearchCV(sc, estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the search and hand back the fitted search object.
    tree_reg = grid.fit(X, y)
    return tree_reg

In [45]:
from sklearn.model_selection import ShuffleSplit

# Run the Spark-parallelised grid search on the training data.
tree_reg = DT_SparkizedGridSearchCV(df_train, df_price_train)

# Taking the best parameters found by the search.
bp = best_params(tree_reg)

# Report the selected value of 'max_depth'.
optimal_depth_message = "Parameter 'max_depth' is {} for the optimal model.".format(bp['max_depth'])
print(optimal_depth_message)

Parameter 'max_depth' is 17 for the optimal model.


In [46]:
"""
Due to the limitation of the spark-sklearn library's implementation of
GridSearchCV, best_estimator_ parameter it's not available, so we need to
fit a DecisionTreeRegressor on the best parameters given to us by gridSearchCV
"""
tree_reg_model = DecisionTreeRegressor(
                              max_depth=bp['max_depth'])
%time tree_reg_model.fit(df_train, df_price_train)

Wall time: 298 ms


DecisionTreeRegressor(criterion='mse', max_depth=17, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [48]:
# Score summary for the refitted DecisionTreeRegressor on the price target.
train_score = tree_reg_model.score(df_train, df_price_train)  # score on the train set
test_score = tree_reg_model.score(df_test, df_price_test)     # score on the test set
cv_best_score = best_score(tree_reg, 10)                      # best per-split score from the grid search

print("Best Decision Tree Regressor parameters:")
print(bp)
print("\nDecision Tree Regressor score without CV on train set: %.3f" % train_score)
print("Decision Tree Regressor score without CV on test set: %.3f" % test_score)
print("Decision Tree Regressor Best score with CV=10: %.3f" % cv_best_score)

Best Decision Tree Regressor parameters:
{'max_depth': 17}

Decision Tree Regressor score without CV on train set: 0.975
Decision Tree Regressor score without CV on test set: 0.841
Decision Tree Regressor Best score with CV=10: 0.841


In [49]:
# Predict on the whole training set with the model refitted on the best CV parameters.
price_predictions_train = tree_reg_model.predict(df_train)

# Apply np.exp to predictions and targets (the original notes describe this as
# reversing an earlier np.log operation on the price).
price_predictions_train_normal, df_price_train_normal = (
    np.exp(price_predictions_train),
    np.exp(df_price_train),
)

# RMSE between the known target values and the predictions.
lin_mse = mean_squared_error(df_price_train_normal, price_predictions_train_normal)
lin_rmse = np.sqrt(lin_mse)
# NOTE(review): the original comment here compared this value with a linear
# regression RMSE and quoted best scores of "0.58 vs 0.89"; those figures do
# not appear anywhere in this notebook, so confirm or drop the comparison.
lin_rmse

2848.5583551414366

In [50]:
# Peek at positions 10-19: predicted training prices, then the known values.
preview_rows = slice(10, 20)
print(price_predictions_train_normal[preview_rows])
print('\n')
print(list(df_price_train_normal[preview_rows]))

[49487.         29994.99984163 13920.81628693 55230.
 32383.93179509 25037.47731682 35493.62252994 27898.
  5650.         25888.        ]


[49486.999999998035, 29997.99999999986, 14995.000000000002, 55229.99999999814, 34498.000000000866, 22495.000000000615, 40278.99999999843, 27898.000000000535, 5650.000000000024, 25887.99999999954]


In [51]:
# Predict on the held-out test set.
price_predictions_test = tree_reg_model.predict(df_test)

# Apply np.exp to predictions and targets (described in the original notes as
# reversing an earlier np.log operation on the price).
price_predictions_test_normal, df_price_test_normal = (
    np.exp(price_predictions_test),
    np.exp(df_price_test),
)

# RMSE between the known test targets and the predictions.
final_mse = mean_squared_error(df_price_test_normal, price_predictions_test_normal)
final_rmse = np.sqrt(final_mse)

final_rmse

7411.312200819733

In [52]:
# Peek at positions 10-19: predictions on the test set, then the known values.
preview_rows = slice(10, 20)
print(price_predictions_test_normal[preview_rows])
print('\n')
print(list(df_price_test_normal[preview_rows]))

[ 7498.33296283 44900.         17591.23531762 30887.88583873
 17949.817038   41495.         24437.9942026  23065.
  4600.         16880.        ]


[6899.999999999999, 28699.99999999964, 16669.000000000033, 29995.000000000175, 19395.99999999995, 28999.9999999997, 27891.00000000092, 20994.999999999916, 4073.9999999999986, 23994.000000000626]


In [53]:
from sklearn.metrics import r2_score
# R^2 between the exp-transformed test targets and the exp-transformed predictions.
# NOTE(review): `multioutput` presumably has no effect here because a single
# price column is scored - confirm against the r2_score documentation.
r2_score(df_price_test_normal, price_predictions_test_normal, multioutput='variance_weighted')

0.8122757141838736

In [54]:
# Saving the fitted model to disk. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("tree_reg_model_final.pkl", "wb") as model_file:
    pickle.dump(tree_reg_model, model_file)

In [55]:
# Loading the model back from disk. The context manager closes the file handle
# once unpickling is done.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickle files that come from a trusted source.
with open("tree_reg_model_final.pkl", "rb") as model_file:
    tree_reg_model = pickle.load(model_file)

#### Cross Validation

In [56]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    scores -- object providing ``.mean()`` and ``.std()`` (e.g. a NumPy array)
    """
    # Each value is computed lazily, right before its line is printed.
    report = (
        ("Scores:", lambda: scores),
        ("Mean:", lambda: scores.mean()),
        ("Standard deviation:", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())

In [57]:
# 10-fold cross-validation on the training set, scored with negative MSE
# against the exp-transformed prices.
train_scores = cross_val_score(
    tree_reg_model,
    df_train,
    np.exp(df_price_train),
    cv=10,
    scoring="neg_mean_squared_error",
)

# Flip the sign back to MSE and take the square root to get per-fold RMSE.
tree_rmse_scores = np.sqrt(np.negative(train_scores))

display_scores(tree_rmse_scores)

Scores: [7321.32369823 7055.88588058 7747.26488413 6511.75779834 7677.7623151
 6904.09914019 8489.69664859 8637.54255541 7643.18148139 7503.00532738]
Mean: 7549.151972935583
Standard deviation: 626.3765437099219


In [58]:
# Same 10-fold cross-validation, this time with no `scoring` argument, so the
# estimator's default scorer is used.
train_scores = cross_val_score(
    estimator=tree_reg_model,
    X=df_train,
    y=np.exp(df_price_train),
    cv=10,
)

display_scores(train_scores)

Scores: [0.81787627 0.83319383 0.8195794  0.85210479 0.78544239 0.82758622
 0.80826997 0.74480763 0.79530245 0.80303465]
Mean: 0.8087197598900777
Standard deviation: 0.02806795483854974
