In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [16]:
df = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv') #Import raw data
df = df.set_index('name') #Index each observation by the Car Model
df['owner'] = df['owner'].apply(lambda x:0 if x == "First Owner" else 1).astype(int) # 0 means one previous owner (car is on the newer side)
# 1 means two or more previous owner and car is on the older side. This definition seems clear and more simplified
df['fuel'] = df['fuel'].apply(lambda x:'Petrol' if x == 'Petrol' else ('Diesel' if x == 'Diesel' else 'Other'))
#Simplify fuel types into 3 categories. Any fuel type that isn't petrol/diesel make up <10% of the dataset and should be lumped as other instead
df = pd.get_dummies(df, columns = ['fuel','seller_type','transmission']) #Make dummy variables for categorical variables so we can model with the data
df['year'] = 2023 - df['year'] #Transform year column into a variable representing a car's age
#df.columns.values[0] = 'Age'
for i in ['selling_price','km_driven']:
    df[i] = np.log10(df[i]) #Apply log scale for a better fit
df = df.drop(columns = ['transmission_Automatic','seller_type_Trustmark Dealer','fuel_Other']) #Drop some dummy variables so we
#have k - 1 variables for k levels 
df.columns.values[0] = 'Age'
sp = df.pop('selling_price')
df.insert(0,'Selling_Price', sp)
df

Unnamed: 0_level_0,Selling_Price,Age,km_driven,owner,fuel_Diesel,fuel_Petrol,seller_type_Dealer,seller_type_Individual,transmission_Manual
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Maruti 800 AC,4.778151,16,4.845098,0,0,1,0,1,1
Maruti Wagon R LXI Minor,5.130334,16,4.698970,0,0,1,0,1,1
Hyundai Verna 1.6 SX,5.778151,11,5.000000,0,1,0,0,1,1
Datsun RediGO T Option,5.397940,6,4.662758,0,0,1,0,1,1
Honda Amaze VX i-DTEC,5.653213,9,5.149219,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...
Hyundai i20 Magna 1.4 CRDi (Diesel),5.612783,9,4.903090,1,1,0,0,1,1
Hyundai i20 Magna 1.4 CRDi,5.612783,9,4.903090,1,1,0,0,1,1
Maruti 800 AC BSIII,5.041393,14,4.919078,1,0,1,0,1,1
Hyundai Creta 1.6 CRDi SX Option,5.937016,7,4.954243,0,1,0,0,1,1


In [17]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:],df.iloc[:,0], test_size = .2, random_state = 0) #Split data

In [18]:
rf = RandomForestRegressor() #Perform a grid search to tune the hyperparameters of rf
pt = {'n_estimators':list(range(90,111))+[200,250,300],'max_depth':[None,8,9,10,11,12,13],'max_features':[2,3,4],'n_jobs':[-1]}
grid = GridSearchCV(estimator = rf, param_grid = pt, n_jobs = -1, scoring = 'r2')
grid.fit(X_train,y_train)
print("Optimal Hyper-parameters : ", grid.best_params_)
print("Optimal Score : ", grid.best_score_)

Optimal Hyper-parameters :  {'max_depth': 10, 'max_features': 2, 'n_estimators': 94, 'n_jobs': -1}
Optimal Score :  0.717066727971553


In [19]:
rf = RandomForestRegressor(max_depth = 10, max_features = 2, n_estimators = 94, n_jobs = -1)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [20]:
print(mean_squared_error(y_test,y_pred))
print(r2_score(y_test,y_pred)) #RF has the lowest bias and best fit out of all regression models!

0.036353799692862857
0.73603925004464


In [22]:
df.max(axis = 0)

Selling_Price              6.949390
Age                       31.000000
km_driven                  5.906658
owner                      1.000000
fuel_Diesel                1.000000
fuel_Petrol                1.000000
seller_type_Dealer         1.000000
seller_type_Individual     1.000000
transmission_Manual        1.000000
dtype: float64

In [23]:
df.min(axis = 0)

Selling_Price             4.30103
Age                       3.00000
km_driven                 0.00000
owner                     0.00000
fuel_Diesel               0.00000
fuel_Petrol               0.00000
seller_type_Dealer        0.00000
seller_type_Individual    0.00000
transmission_Manual       0.00000
dtype: float64