# Random Forest-Used Car Price Predictor

In [56]:
import warnings
warnings.filterwarnings('ignore')

In [57]:
#Importing required modules
import pandas as pd
import numpy as np
from sqlalchemy import create_engine,inspect, func
from config import password

import seaborn as sns
import scipy.stats as ss
from collections import Counter
import math 
from scipy import stats
import datetime
import matplotlib
import pickle
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from spark_sklearn import GridSearchCV
from spark_sklearn.util import createLocalSparkSession
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

from statsmodels.graphics.tsaplots import plot_acf

In [58]:
#Connecting to the sql database
connection_string = "postgres:"+password+"@localhost:5432/hondadb"
engine = create_engine(f'postgresql://{connection_string}')

In [59]:
df_cleaned = pd.read_sql("SELECT * FROM cleanedcardb2",
                     con=engine)

In [60]:
df_cleaned = df_cleaned[['Price', 'Milage', 'Year', 'Model', 'Car Type']]

In [61]:
df_cleaned.describe()

Unnamed: 0,Price,Milage,Year
count,1324.0,1324.0,1324.0
mean,10.659139,85167.126888,2017.731873
std,0.900439,59016.378417,2.525428
min,8.837536,90.0,2011.0
25%,9.790767,39400.0,2016.0
50%,10.819778,85000.0,2019.0
75%,11.460579,120689.0,2020.0
max,12.021007,265000.0,2020.0


#### Label Encoding

In [62]:
features = ['Car Type', 'Model']
les = {}

for f in features:
  les[f] = preprocessing.LabelEncoder()
  les[f] = les[f].fit(df_cleaned[f])
  df_cleaned[f] = les[f].transform(df_cleaned[f])

#### Splitting Train and Test

In [63]:
# Splitting the dataset into train and test sets
train_set, test_set = train_test_split(df_cleaned, test_size = 0.3, random_state = 100)

# Separating target labels from the rest
df_train = train_set.drop("Price", axis=1) #train without target
df_price_train = train_set["Price"].copy() #target

df_test  = test_set.drop("Price", axis=1)
df_price_test = test_set["Price"].copy()

#### Best Score Function

In [64]:
# This function returns the best score achieved by the model over all the cv splits
def best_score(forest, cv):
  best_score = 0
  for i in range(0, cv):
    items = list(map(lambda x: abs(x), forest.cv_results_['split'+str(i)+'_test_score']))
    arr = np.append(best_score, items)
    best_score = max(arr)
  
  return best_score

#### Best Param Function

In [65]:
# This functions returns the best combination of parameters, which allows to
# get the best score
def best_params(forest):
  return forest.cv_results_['params'][forest.cv_results_['rank_test_score'][0]-1]

#### Performance Metric Function

In [66]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true (y_true) and predicted (y_predict) values based on the metric chosen. """
    
    score = r2_score(y_true, y_predict)
    
    # Return the score
    return score

# Random Forest

#### Model Training

In [67]:
# Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

def RF_SparkizedGridSearchCV(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 100)

    # Create a decision tree regressor object
    regressor = RandomForestRegressor()

    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth':[16, 17, 18]}

    # Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search cv object --> GridSearchCV()
    sc = createLocalSparkSession().sparkContext
    grid = GridSearchCV(sc, estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    tree_reg = grid.fit(X, y)
    
    # Return the best parameters after fitting the data
    return tree_reg

In [68]:
from sklearn.model_selection import ShuffleSplit

# Fit the training data to the model using spark parallelized grid search CV
forest_reg = RF_SparkizedGridSearchCV(df_train, df_price_train)

# Takign best parameters
bp = best_params(forest_reg)

# Produce the optimal value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(bp['max_depth']))

Parameter 'max_depth' is 16 for the optimal model.


In [69]:
# Fitting the forest

forest_reg_model = RandomForestRegressor(
                              max_depth=bp['max_depth']
                                 
)

%time forest_reg_model.fit(df_train, df_price_train)

Wall time: 34 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [70]:
# RandomForestRegressor score for price prediction

print(bp)
print("\nRandom Forest Regressor score without CV on train set: %.3f" % forest_reg_model.score(df_train, df_price_train)) #score on train set
print("Random Forest Regressor score without CV on test set: %.3f" % forest_reg_model.score(df_test, df_price_test)) #score on test set
print("Random Forest Regressor Best score with CV=4: %.3f" % best_score(forest_reg, 4)) # -> best score on test set is high

{'max_depth': 16}

Random Forest Regressor score without CV on train set: 0.998
Random Forest Regressor score without CV on test set: 0.991
Random Forest Regressor Best score with CV=4: 0.994


In [71]:
# Prediction on whole training set
price_predictions_train = forest_reg_model.predict(df_train) #using the whole training set for making prediction with the final model given by the best CV parameters

# Reversing np.log operation
price_predictions_train_normal = np.exp(price_predictions_train)
df_price_train_normal = np.exp(df_price_train)

# MSE between target values (i.e known) and predicted values
lin_mse = mean_squared_error(df_price_train_normal, price_predictions_train_normal)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2394.066330868384

In [72]:
print(price_predictions_train_normal[10:20])
print('\n')
print(list(df_price_train_normal[10:20]))

[ 94900.          85740.          10500.          42984.
  94900.          85740.         143943.09828489  10988.
 110000.         107652.46867957]


[94899.99999999812, 85740.00000000272, 10499.999999999955, 42984.00000000184, 94899.99999999812, 85740.00000000272, 146995.00000000652, 10987.999999999942, 110000.0000000052, 100615.00000000323]


In [73]:
# Prediction on test set
price_predictions_test = forest_reg_model.predict(df_test)

# Reversing np.log operation
price_predictions_test_normal = np.exp(price_predictions_test)
df_price_test_normal = np.exp(df_price_test)

final_mse = mean_squared_error(df_price_test_normal, price_predictions_test_normal)
final_rmse = np.sqrt(final_mse)

final_rmse

5095.25710006018

In [74]:
print(price_predictions_test_normal[10:20]) #predictions on test set
print('\n')
print(list(df_price_test_normal[10:20])) #known values in test set

[ 17868.          85740.          57563.         105109.93060521
  22990.         108094.46687347  49926.45203077  37999.
  41158.          46314.09762755]


[17868.0, 85740.00000000272, 57563.000000001215, 113774.99999999517, 22989.999999999374, 100615.00000000323, 66702.99999999985, 37999.00000000067, 66702.99999999985, 51720.99999999993]


In [75]:
# r2 score between hold out prices and predicted prices
r2_score(df_price_test_normal, price_predictions_test_normal, multioutput='variance_weighted')

0.9876317601259191

In [76]:
# Saving model for type prediction
pickle.dump(forest_reg_model, open("forest_reg_model_final.pkl", "wb"))

In [77]:
# Loading the model for type prediction
forest_reg_model = pickle.load(open("forest_reg_model_final.pkl", 'rb'))

#### Cross Validation

In [78]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [79]:
from sklearn.model_selection import KFold

# Cross val score on training set, although we already used grid search CV
train_scores = cross_val_score(forest_reg_model, df_train, np.exp(df_price_train),
                         scoring="neg_mean_squared_error", cv=KFold(10, shuffle=True))
forest_rmse_scores = np.sqrt(-train_scores)

display_scores(forest_rmse_scores)

Scores: [5404.53108243 4232.7109551  5594.59711766 5108.41831935 5620.91217764
 5146.40928427 3414.18429    4442.85557482 5285.07558311 3920.32250003]
Mean: 4817.001688440033
Standard deviation: 725.3933345786228


In [80]:
train_scores = cross_val_score(forest_reg_model, df_train, np.exp(df_price_train),
                          cv=4)

display_scores(train_scores)

Scores: [0.99205952 0.99218875 0.98621446 0.98829996]
Mean: 0.9896906710293627
Standard deviation: 0.0025431290191954585


#### Predictions on this model

In [81]:
final_model = forest_reg_model

In [82]:
# Prediction on the hold out test set
final_predictions = final_model.predict(df_test)

final_mse = mean_squared_error(np.exp(df_price_test), np.exp(final_predictions))

final_rmse = np.sqrt(final_mse)

In [83]:
print("Test RMSE: %f " % final_rmse)
print("Score on held-out Test Set: %f " % final_model.score(df_test, df_price_test))
print("R2 Score: %f" % r2_score(np.exp(df_price_test), np.exp(final_predictions)))

Test RMSE: 5095.257100 
Score on held-out Test Set: 0.990676 
R2 Score: 0.987632


In [84]:
# Cross validation on the entire dataset, since we are good with out final model

features = df_cleaned.drop(['Price'], axis=1)
prices = df_cleaned['Price'].copy()

final_rmses= cross_val_score(final_model, features, np.exp(prices),
                          scoring="neg_mean_squared_error", cv=KFold(10, shuffle=True))

final_rmse_scores = np.sqrt(-final_rmses)
display_scores(final_rmse_scores)

Scores: [5461.89009207 3940.52957571 4983.5284795  4326.78472708 5846.92154438
 4582.70371696 4914.61870832 6184.56937835 4784.92486863 3441.64350825]
Mean: 4846.811459924036
Standard deviation: 795.6935338710726


In [85]:
final_scores = cross_val_score(forest_reg_model, features, np.exp(prices),
                          cv=KFold(10, shuffle=True))

display_scores(final_scores)

Scores: [0.98892818 0.98651726 0.99101885 0.98478435 0.98284822 0.98428597
 0.99313151 0.99272787 0.983935   0.99459796]
Mean: 0.9882775177028966
Standard deviation: 0.004133050242392499
