In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [6]:
df = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv') #Import raw data
df = df.set_index('name') #Index each observation by the Car Model
df['selling_price'] =  df['selling_price']/82.41#Convert Rupees to USD
df['owner'] = df['owner'].apply(lambda x:0 if x == "First Owner" else 1).astype(int) # 0 means one previous owner (car is on the newer side)
# 1 means two or more previous owner and car is on the older side. This definition seems clear and more simplified
df['fuel'] = df['fuel'].apply(lambda x:'Petrol' if x == 'Petrol' else ('Diesel' if x == 'Diesel' else 'Other'))
#Simplify fuel types into 3 categories. Any fuel type that isn't petrol/diesel make up <10% of the dataset and should be lumped as other instead
df = pd.get_dummies(df, columns = ['fuel','seller_type','transmission']) #Make dummy variables for categorical variables so we can model with the data
df['year'] = 2023 - df['year'] #Transform year column into a variable representing a car's age
df = df.drop(columns = ['transmission_Automatic','seller_type_Trustmark Dealer','fuel_Other']) #Drop some dummy variables so we
#have k - 1 variables for k levels 
df.columns.values[0] = 'Age'
df['selling_price'] = np.log10(df['selling_price']) #Log transform for a better fit
df['km_driven'] = np.log10(df['km_driven']) 
sp = df.pop('selling_price')
df.insert(0,'Selling_Price', sp)
df

Unnamed: 0_level_0,Selling_Price,Age,km_driven,owner,fuel_Diesel,fuel_Petrol,seller_type_Dealer,seller_type_Individual,transmission_Manual
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Maruti 800 AC,2.862171,16,4.845098,0,0,1,0,1,1
Maruti Wagon R LXI Minor,3.214354,16,4.698970,0,0,1,0,1,1
Hyundai Verna 1.6 SX,3.862171,11,5.000000,0,1,0,0,1,1
Datsun RediGO T Option,3.481960,6,4.662758,0,0,1,0,1,1
Honda Amaze VX i-DTEC,3.737233,9,5.149219,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...
Hyundai i20 Magna 1.4 CRDi (Diesel),3.696803,9,4.903090,1,1,0,0,1,1
Hyundai i20 Magna 1.4 CRDi,3.696803,9,4.903090,1,1,0,0,1,1
Maruti 800 AC BSIII,3.125413,14,4.919078,1,0,1,0,1,1
Hyundai Creta 1.6 CRDi SX Option,4.021036,7,4.954243,0,1,0,0,1,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:],df.iloc[:,0], test_size = .2, random_state = 0) #Split data

In [13]:
rf = RandomForestRegressor() #Perform a grid search to tune the hyperparameters of rf
pt = {'n_estimators':list(range(90,105)),'max_depth':[None,8,9,10,11,12],'max_features':[2,3,4],'n_jobs':[-1]}
grid = GridSearchCV(estimator = rf, param_grid = pt, n_jobs = -1, scoring = 'r2')
grid.fit(X_train,y_train)
print("Optimal Hyper-parameters : ", grid.best_params_)
print("Optimal Score : ", grid.best_score_)
#Performed grid search three times suggests 100+ and 98. I'm going to try less to prevent overfitting

Optimal Hyper-parameters :  {'max_depth': 10, 'max_features': 2, 'n_estimators': 98, 'n_jobs': -1}
Optimal Score :  0.7168048628640387


In [18]:
for i in range(90,100):
    rf = RandomForestRegressor(max_depth = 10, max_features = 2, n_estimators = i, n_jobs = -1)
    rf.fit(X_train,y_train)
    y_pred = rf.predict(X_test)
    print(i,mean_squared_error(y_test,y_pred))
    print(i,r2_score(y_test,y_pred)) #91 seems to work the best

90 0.036092171836959286
90 0.7379388997549035
91 0.0356526070313764
91 0.7411305291503424
92 0.03635238333416621
92 0.7360495340619073
93 0.036263281284848174
93 0.7366964937733889
94 0.035948975937363535
94 0.7389786286791745
95 0.03568581594680803
95 0.7408894030425739
96 0.035659681519145905
96 0.7410791621099514
97 0.03611447333259439
97 0.7377769711652457
98 0.03579323206213591
98 0.7401094670084101
99 0.03623205258368316
99 0.7369232417735784


In [19]:
rf = RandomForestRegressor(max_depth = 10, max_features = 2, n_estimators = 91, n_jobs = -1)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [20]:
print(mean_squared_error(y_test,y_pred))
print(r2_score(y_test,y_pred)) #RF has the lowest bias and best fit out of all regression models!

0.035546389395941264
0.7419017631601246


In [21]:
df.max(axis = 0)

Selling_Price              5.033410
Age                       31.000000
km_driven                  5.906658
owner                      1.000000
fuel_Diesel                1.000000
fuel_Petrol                1.000000
seller_type_Dealer         1.000000
seller_type_Individual     1.000000
transmission_Manual        1.000000
dtype: float64

In [22]:
df.min(axis = 0)

Selling_Price             2.38505
Age                       3.00000
km_driven                 0.00000
owner                     0.00000
fuel_Diesel               0.00000
fuel_Petrol               0.00000
seller_type_Dealer        0.00000
seller_type_Individual    0.00000
transmission_Manual       0.00000
dtype: float64