# Random Forest-Used Car Price Predictor

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Importing required modules
import pandas as pd
import numpy as np
from sqlalchemy import create_engine,inspect, func
from config import password

import seaborn as sns
import scipy.stats as ss
from collections import Counter
import math 
from scipy import stats
import datetime
import matplotlib
import pickle
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from spark_sklearn import GridSearchCV
from spark_sklearn.util import createLocalSparkSession
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

from statsmodels.graphics.tsaplots import plot_acf

In [3]:
#Connecting to the sql database
connection_string = "postgres:"+password+"@localhost:5432/hondadb"
engine = create_engine(f'postgresql://{connection_string}')

In [4]:
df_cleaned = pd.read_sql("SELECT * FROM cleanedcardb2",
                     con=engine)

In [5]:
df_cleaned = df_cleaned[['Price', 'Milage', 'Year', 'Model', 'Car Type']]

In [6]:
df_cleaned.describe()

Unnamed: 0,Price,Milage,Year
count,23678.0,23678.0,23678.0
mean,10.125652,83962.4304,2016.588563
std,0.600543,63802.640491,3.074993
min,7.309881,10.0,1995.0
25%,9.728315,37345.0,2015.0
50%,10.165275,69000.0,2017.0
75%,10.570977,116000.0,2019.0
max,12.042377,500000.0,2021.0


#### Label Encoding

In [7]:
features = ['Car Type', 'Model']
les = {}

for f in features:
  les[f] = preprocessing.LabelEncoder()
  les[f] = les[f].fit(df_cleaned[f])
  df_cleaned[f] = les[f].transform(df_cleaned[f])

#### Splitting Train and Test

In [8]:
# Splitting the dataset into train and test sets
train_set, test_set = train_test_split(df_cleaned, test_size = 0.3, random_state = 100)

# Separating target labels from the rest
df_train = train_set.drop("Price", axis=1) #train without target
df_price_train = train_set["Price"].copy() #target

df_test  = test_set.drop("Price", axis=1)
df_price_test = test_set["Price"].copy()

#### Best Score Function

In [9]:
# This function returns the best score achieved by the model over all the cv splits
def best_score(forest, cv):
  best_score = 0
  for i in range(0, cv):
    items = list(map(lambda x: abs(x), forest.cv_results_['split'+str(i)+'_test_score']))
    arr = np.append(best_score, items)
    best_score = max(arr)
  
  return best_score

#### Best Param Function

In [10]:
# This functions returns the best combination of parameters, which allows to
# get the best score
def best_params(forest):
  return forest.cv_results_['params'][forest.cv_results_['rank_test_score'][0]-1]

#### Performance Metric Function

In [11]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true (y_true) and predicted (y_predict) values based on the metric chosen. """
    
    score = r2_score(y_true, y_predict)
    
    # Return the score
    return score

# Random Forest

#### Model Training

In [12]:
# Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

def RF_SparkizedGridSearchCV(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 100)

    # Create a decision tree regressor object
    regressor = RandomForestRegressor()

    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth':[16, 17, 18]}

    # Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search cv object --> GridSearchCV()
    sc = createLocalSparkSession().sparkContext
    grid = GridSearchCV(sc, estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    tree_reg = grid.fit(X, y)
    
    # Return the best parameters after fitting the data
    return tree_reg

  from numpy.core.umath_tests import inner1d


In [13]:
from sklearn.model_selection import ShuffleSplit

# Fit the training data to the model using spark parallelized grid search CV
forest_reg = RF_SparkizedGridSearchCV(df_train, df_price_train)

# Takign best parameters
bp = best_params(forest_reg)

# Produce the optimal value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(bp['max_depth']))

Parameter 'max_depth' is 18 for the optimal model.


In [14]:
# Fitting the forest

forest_reg_model = RandomForestRegressor(
                              max_depth=bp['max_depth']
                                 
)

%time forest_reg_model.fit(df_train, df_price_train)

Wall time: 437 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=18,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:
# RandomForestRegressor score for price prediction

print(bp)
print("\nRandom Forest Regressor score without CV on train set: %.3f" % forest_reg_model.score(df_train, df_price_train)) #score on train set
print("Random Forest Regressor score without CV on test set: %.3f" % forest_reg_model.score(df_test, df_price_test)) #score on test set
print("Random Forest Regressor Best score with CV=4: %.3f" % best_score(forest_reg, 4)) # -> best score on test set is high

{'max_depth': 18}

Random Forest Regressor score without CV on train set: 0.970
Random Forest Regressor score without CV on test set: 0.875
Random Forest Regressor Best score with CV=4: 0.879


In [16]:
# Prediction on whole training set
price_predictions_train = forest_reg_model.predict(df_train) #using the whole training set for making prediction with the final model given by the best CV parameters

# Reversing np.log operation
price_predictions_train_normal = np.exp(price_predictions_train)
df_price_train_normal = np.exp(df_price_train)

# MSE between target values (i.e known) and predicted values
lin_mse = mean_squared_error(df_price_train_normal, price_predictions_train_normal)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

3993.895715057453

In [17]:
print(price_predictions_train_normal[10:20])
print('\n')
print(list(df_price_train_normal[10:20]))

[ 5919.86576954 50138.47248765 21746.48346395 13688.76778443
 55093.59984985 34423.28024938  7435.61469308 36935.01166107
 13454.88894071 23990.39604162]


[5899.999999999999, 49486.999999998035, 21990.000000000084, 14995.000000000002, 76998.99999999984, 34498.000000000866, 6999.999999999999, 40278.99999999843, 13994.999999999933, 20883.999999999953]


In [18]:
# Prediction on test set
price_predictions_test = forest_reg_model.predict(df_test)

# Reversing np.log operation
price_predictions_test_normal = np.exp(price_predictions_test)
df_price_test_normal = np.exp(df_price_test)

final_mse = mean_squared_error(df_price_test_normal, price_predictions_test_normal)
final_rmse = np.sqrt(final_mse)

final_rmse

6685.352257930515

In [19]:
print(price_predictions_test_normal[10:20]) #predictions on test set
print('\n')
print(list(df_price_test_normal[10:20])) #known values in test set

[13199.86120738 14375.22475689 14234.46850096 16070.70981831
 24756.91364163 32996.67045384 27061.19287004 21565.22020348
 39895.01200862 16471.83709271]


[17994.999999999993, 11499.999999999976, 11994.99999999993, 23500.00000000118, 26887.9999999989, 28999.9999999997, 29189.000000000244, 21488.00000000001, 38952.99999999941, 13994.999999999933]


In [20]:
# r2 score between hold out prices and predicted prices
r2_score(df_price_test_normal, price_predictions_test_normal, multioutput='variance_weighted')

0.8476138600755545

In [21]:
# Saving model for type prediction
pickle.dump(forest_reg_model, open("forest_reg_model_final.pkl", "wb"))

In [22]:
# Loading the model for type prediction
forest_reg_model = pickle.load(open("forest_reg_model_final.pkl", 'rb'))

#### Cross Validation

In [23]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [31]:
from sklearn.model_selection import KFold

# Cross val score on training set, although we already used grid search CV
train_scores = cross_val_score(forest_reg_model, df_train, np.exp(df_price_train),
                         scoring="neg_mean_squared_error", cv=KFold(10, shuffle=True))
forest_rmse_scores = np.sqrt(-train_scores)

display_scores(forest_rmse_scores)

Scores: [7321.1910044  6400.12488291 6814.59379631 7249.9377581  6296.58911481
 6960.40840939 7597.554986   7444.68600333 6193.49684415 6847.47943173]
Mean: 6912.606223112741
Standard deviation: 469.9322581786135


In [34]:
train_scores = cross_val_score(forest_reg_model, df_train, np.exp(df_price_train),
                          cv=3)

display_scores(train_scores)

Scores: [0.8037297  0.80599245 0.82237228]
Mean: 0.8106981419818449
Standard deviation: 0.008306385970368505


#### Predictions on this model

In [35]:
final_model = forest_reg_model

In [36]:
# Prediction on the hold out test set
final_predictions = final_model.predict(df_test)

final_mse = mean_squared_error(np.exp(df_price_test), np.exp(final_predictions))

final_rmse = np.sqrt(final_mse)

In [37]:
print("Test RMSE: %f " % final_rmse)
print("Score on held-out Test Set: %f " % final_model.score(df_test, df_price_test))
print("R2 Score: %f" % r2_score(np.exp(df_price_test), np.exp(final_predictions)))

Test RMSE: 6685.352258 
Score on held-out Test Set: 0.875275 
R2 Score: 0.847614


In [38]:
# Cross validation on the entire dataset, since we are good with out final model

features = df_cleaned.drop(['Price'], axis=1)
prices = df_cleaned['Price'].copy()

final_rmses= cross_val_score(final_model, features, np.exp(prices),
                          scoring="neg_mean_squared_error", cv=KFold(10, shuffle=True))

final_rmse_scores = np.sqrt(-final_rmses)
display_scores(final_rmse_scores)

Scores: [6212.33741784 5722.97694534 6262.58976859 5967.60706664 7595.54111088
 6644.65166539 6377.69507141 6034.16188404 6711.14783089 6999.71133168]
Mean: 6452.842009269651
Standard deviation: 524.5450007854373


In [39]:
final_scores = cross_val_score(forest_reg_model, features, np.exp(prices),
                          cv=KFold(10, shuffle=True))

display_scores(final_scores)

Scores: [0.88426254 0.88650585 0.85957527 0.8750024  0.87546395 0.82625944
 0.85570544 0.84598935 0.87191092 0.8521658 ]
Mean: 0.8632840965520614
Standard deviation: 0.017900211198537793
