In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [53]:
df = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv') #Import raw data
df = df.set_index('name') #Index each observation by the Car Model
df['owner'] = df['owner'].apply(lambda x:0 if x == "First Owner" else 1).astype(int) # 0 means one previous owner (car is on the newer side)
# 1 means two or more previous owner and car is on the older side. This definition seems clear and more simplified
df['fuel'] = df['fuel'].apply(lambda x:'Petrol' if x == 'Petrol' else ('Diesel' if x == 'Diesel' else 'Other'))
#Simplify fuel types into 3 categories. Any fuel type that isn't petrol/diesel make up <10% of the dataset and should be lumped as other instead
df = pd.get_dummies(df, columns = ['fuel','seller_type','transmission']) #Make dummy variables for categorical variables so we can model with the data
df['year'] = 2023 - df['year'] #Transform year column into a variable representing a car's age
#df.columns.values[0] = 'Age'
for i in ['selling_price','km_driven']:
    df[i] = np.log10(df[i]) #Apply log scale for a better fit
df = df.drop(columns = ['transmission_Automatic','seller_type_Trustmark Dealer','fuel_Other']) #Drop some dummy variables so we
#have k - 1 variables for k levels 
df.columns.values[0] = 'Age'
sp = df.pop('selling_price')
df.insert(0,'Selling_Price', sp)
df

Unnamed: 0_level_0,Selling_Price,Age,km_driven,owner,fuel_Diesel,fuel_Petrol,seller_type_Dealer,seller_type_Individual,transmission_Manual
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Maruti 800 AC,4.778151,16,4.845098,0,False,True,False,True,True
Maruti Wagon R LXI Minor,5.130334,16,4.698970,0,False,True,False,True,True
Hyundai Verna 1.6 SX,5.778151,11,5.000000,0,True,False,False,True,True
Datsun RediGO T Option,5.397940,6,4.662758,0,False,True,False,True,True
Honda Amaze VX i-DTEC,5.653213,9,5.149219,1,True,False,False,True,True
...,...,...,...,...,...,...,...,...,...
Hyundai i20 Magna 1.4 CRDi (Diesel),5.612783,9,4.903090,1,True,False,False,True,True
Hyundai i20 Magna 1.4 CRDi,5.612783,9,4.903090,1,True,False,False,True,True
Maruti 800 AC BSIII,5.041393,14,4.919078,1,False,True,False,True,True
Hyundai Creta 1.6 CRDi SX Option,5.937016,7,4.954243,0,True,False,False,True,True


In [54]:
X = df.drop("Selling_Price", axis = 1)
y = df["Selling_Price"]

In [18]:
rf = RandomForestRegressor() #Perform a grid search to tune the hyperparameters of rf
pt = {'n_estimators':list(range(90,111))+[200,250,300],'max_depth':[None,8,9,10,11,12,13],'max_features':[2,3,4],'n_jobs':[-1]}
grid = GridSearchCV(estimator = rf, param_grid = pt, n_jobs = -1, scoring = 'r2')
grid.fit(X, y)
print("Optimal Hyper-parameters : ", grid.best_params_)
print("Optimal Score : ", grid.best_score_)

Optimal Hyper-parameters :  {'max_depth': 10, 'max_features': 2, 'n_estimators': 94, 'n_jobs': -1}
Optimal Score :  0.717066727971553


In [55]:
from sklearn.model_selection import cross_validate

rf = RandomForestRegressor(max_depth = 10, max_features = 2, n_estimators = 94, n_jobs = -1, random_state = 0)
CV = cross_validate(rf, X, y, cv=10, scoring=['r2', 'neg_mean_squared_error'], return_estimator = True)
print('r2:\n', CV['test_r2'])
print('MSE:\n', -1*CV['test_neg_mean_squared_error'])

r2:
 [0.72649205 0.80100236 0.77217424 0.72722333 0.66448367 0.79199274
 0.68601119 0.66062374 0.64377804 0.68803908]
MSE:
 [0.03843884 0.02721953 0.03407762 0.03463867 0.04036444 0.03033651
 0.03707229 0.0430129  0.04658799 0.03689108]


In [56]:
print(len(CV["estimator"]))
print(CV["estimator"][1])

10
RandomForestRegressor(max_depth=10, max_features=2, n_estimators=94, n_jobs=-1,
                      random_state=0)


In [57]:
import pickle

# save the best model out of 10 from cross validation
pickle.dump(CV["estimator"][1], open("models/Regression_RF.pkl", "wb"))