# Random Forest-Used Car Price Predictor

In [40]:
import warnings
warnings.filterwarnings('ignore')

In [41]:
#Importing required modules
import pandas as pd
import numpy as np
from sqlalchemy import create_engine,inspect, func
from config import password

import seaborn as sns
import scipy.stats as ss
from collections import Counter
import math 
from scipy import stats
import datetime
import matplotlib
import pickle
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from spark_sklearn import GridSearchCV
from spark_sklearn.util import createLocalSparkSession
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

from statsmodels.graphics.tsaplots import plot_acf

In [42]:
#Connecting to the sql database
connection_string = "postgres:"+password+"@localhost:5432/hondadb"
engine = create_engine(f'postgresql://{connection_string}')

In [43]:
df_cleaned = pd.read_sql("SELECT * FROM cleanedcardb2",
                     con=engine)

In [44]:
df_cleaned = df_cleaned[['Price', 'Milage', 'Year', 'Model', 'Car Type']]

In [45]:
df_cleaned.describe()

Unnamed: 0,Price,Milage,Year
count,23677.0,23677.0,23677.0
mean,10.125652,83959.113359,2016.588588
std,0.600556,63801.946178,3.075055
min,7.309881,10.0,1995.0
25%,9.7283,37344.0,2015.0
50%,10.165275,69000.0,2017.0
75%,10.570984,116000.0,2019.0
max,12.042377,500000.0,2021.0


#### Label Encoding

In [46]:
features = ['Car Type', 'Model']
les = {}

for f in features:
  les[f] = preprocessing.LabelEncoder()
  les[f] = les[f].fit(df_cleaned[f])
  df_cleaned[f] = les[f].transform(df_cleaned[f])

#### Splitting Train and Test

In [47]:
# Splitting the dataset into train and test sets
train_set, test_set = train_test_split(df_cleaned, test_size = 0.3, random_state = 100)

# Separating target labels from the rest
df_train = train_set.drop("Price", axis=1) #train without target
df_price_train = train_set["Price"].copy() #target

df_test  = test_set.drop("Price", axis=1)
df_price_test = test_set["Price"].copy()

#### Best Score Function

In [48]:
# This function returns the best score achieved by the model over all the cv splits
def best_score(forest, cv):
  best_score = 0
  for i in range(0, cv):
    items = list(map(lambda x: abs(x), forest.cv_results_['split'+str(i)+'_test_score']))
    arr = np.append(best_score, items)
    best_score = max(arr)
  
  return best_score

#### Best Param Function

In [49]:
# This functions returns the best combination of parameters, which allows to
# get the best score
def best_params(forest):
  return forest.cv_results_['params'][forest.cv_results_['rank_test_score'][0]-1]

#### Performance Metric Function

In [50]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true (y_true) and predicted (y_predict) values based on the metric chosen. """
    
    score = r2_score(y_true, y_predict)
    
    # Return the score
    return score

# Random Forest

#### Model Training

In [51]:
# Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

def RF_SparkizedGridSearchCV(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 100)

    # Create a decision tree regressor object
    regressor = RandomForestRegressor()

    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth':[16, 17, 18]}

    # Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search cv object --> GridSearchCV()
    sc = createLocalSparkSession().sparkContext
    grid = GridSearchCV(sc, estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    tree_reg = grid.fit(X, y)
    
    # Return the best parameters after fitting the data
    return tree_reg

In [52]:
from sklearn.model_selection import ShuffleSplit

# Fit the training data to the model using spark parallelized grid search CV
forest_reg = RF_SparkizedGridSearchCV(df_train, df_price_train)

# Takign best parameters
bp = best_params(forest_reg)

# Produce the optimal value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(bp['max_depth']))

Parameter 'max_depth' is 17 for the optimal model.


In [53]:
# Fitting the forest

forest_reg_model = RandomForestRegressor(
                              max_depth=bp['max_depth']
                                 
)

%time forest_reg_model.fit(df_train, df_price_train)

Wall time: 663 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=17,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [54]:
# RandomForestRegressor score for price prediction

print(bp)
print("\nRandom Forest Regressor score without CV on train set: %.3f" % forest_reg_model.score(df_train, df_price_train)) #score on train set
print("Random Forest Regressor score without CV on test set: %.3f" % forest_reg_model.score(df_test, df_price_test)) #score on test set
print("Random Forest Regressor Best score with CV=4: %.3f" % best_score(forest_reg, 4)) # -> best score on test set is high

{'max_depth': 17}

Random Forest Regressor score without CV on train set: 0.967
Random Forest Regressor score without CV on test set: 0.872
Random Forest Regressor Best score with CV=4: 0.880


In [55]:
# Prediction on whole training set
price_predictions_train = forest_reg_model.predict(df_train) #using the whole training set for making prediction with the final model given by the best CV parameters

# Reversing np.log operation
price_predictions_train_normal = np.exp(price_predictions_train)
df_price_train_normal = np.exp(df_price_train)

# MSE between target values (i.e known) and predicted values
lin_mse = mean_squared_error(df_price_train_normal, price_predictions_train_normal)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

4047.5184930402106

In [56]:
print(price_predictions_train_normal[10:20])
print('\n')
print(list(df_price_train_normal[10:20]))

[50106.24367528 27406.55753769 13884.91642975 54675.18533109
 35696.83976081 24597.35452694 35642.10554864 26155.63078519
  6112.48003899 25640.28611567]


[49486.999999998035, 29997.99999999986, 14995.000000000002, 55229.99999999814, 34498.000000000866, 22495.000000000615, 40278.99999999843, 27898.000000000535, 5650.000000000024, 25887.99999999954]


In [57]:
# Prediction on test set
price_predictions_test = forest_reg_model.predict(df_test)

# Reversing np.log operation
price_predictions_test_normal = np.exp(price_predictions_test)
df_price_test_normal = np.exp(df_price_test)

final_mse = mean_squared_error(df_price_test_normal, price_predictions_test_normal)
final_rmse = np.sqrt(final_mse)

final_rmse

7042.024876508299

In [58]:
print(price_predictions_test_normal[10:20]) #predictions on test set
print('\n')
print(list(df_price_test_normal[10:20])) #known values in test set

[ 7451.3317218  38572.68546999 18225.74900844 31666.5499726
 18108.86354096 34336.29850096 20453.37758642 22429.39804302
  5527.14948323 22500.8997803 ]


[6899.999999999999, 28699.99999999964, 16669.000000000033, 29995.000000000175, 19395.99999999995, 28999.9999999997, 27891.00000000092, 20994.999999999916, 4073.9999999999986, 23994.000000000626]


In [59]:
# r2 score between hold out prices and predicted prices
r2_score(df_price_test_normal, price_predictions_test_normal, multioutput='variance_weighted')

0.8305173078026042

In [60]:
# Saving model for type prediction
pickle.dump(forest_reg_model, open("forest_reg_model_final.pkl", "wb"))

In [61]:
# Loading the model for type prediction
forest_reg_model = pickle.load(open("forest_reg_model_final.pkl", 'rb'))

#### Cross Validation

In [62]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [63]:
from sklearn.model_selection import KFold

# Cross val score on training set, although we already used grid search CV
train_scores = cross_val_score(forest_reg_model, df_train, np.exp(df_price_train),
                         scoring="neg_mean_squared_error", cv=KFold(10, shuffle=True))
forest_rmse_scores = np.sqrt(-train_scores)

display_scores(forest_rmse_scores)

Scores: [6812.21784983 6695.08213232 6415.75210658 6228.02862027 8872.18994685
 6083.23296031 6763.26311745 6288.19826871 6684.84074418 5923.02119675]
Mean: 6676.58269432636
Standard deviation: 786.5752777395389


In [64]:
train_scores = cross_val_score(forest_reg_model, df_train, np.exp(df_price_train),
                          cv=3)

display_scores(train_scores)

Scores: [0.80750046 0.8320772  0.8123264 ]
Mean: 0.817301353281489
Standard deviation: 0.010632236785498175


#### Predictions on this model

In [65]:
final_model = forest_reg_model

In [66]:
# Prediction on the hold out test set
final_predictions = final_model.predict(df_test)

final_mse = mean_squared_error(np.exp(df_price_test), np.exp(final_predictions))

final_rmse = np.sqrt(final_mse)

In [67]:
print("Test RMSE: %f " % final_rmse)
print("Score on held-out Test Set: %f " % final_model.score(df_test, df_price_test))
print("R2 Score: %f" % r2_score(np.exp(df_price_test), np.exp(final_predictions)))

Test RMSE: 7042.024877 
Score on held-out Test Set: 0.871974 
R2 Score: 0.830517


In [68]:
# Cross validation on the entire dataset, since we are good with out final model

features = df_cleaned.drop(['Price'], axis=1)
prices = df_cleaned['Price'].copy()

final_rmses= cross_val_score(final_model, features, np.exp(prices),
                          scoring="neg_mean_squared_error", cv=KFold(10, shuffle=True))

final_rmse_scores = np.sqrt(-final_rmses)
display_scores(final_rmse_scores)

Scores: [6480.53353471 6658.91437299 6594.26211655 6162.87540254 5849.15829075
 7100.12167884 6127.91702225 6413.90220655 5829.90310809 6044.71107526]
Mean: 6326.229880851052
Standard deviation: 378.4346576071268


In [69]:
final_scores = cross_val_score(forest_reg_model, features, np.exp(prices),
                          cv=KFold(10, shuffle=True))

display_scores(final_scores)

Scores: [0.86358949 0.88391264 0.86823539 0.88739441 0.85360268 0.86539923
 0.861056   0.86345341 0.83891033 0.8893061 ]
Mean: 0.8674859671606686
Standard deviation: 0.014930897222456116
