In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
warnings.simplefilter('ignore')

In [3]:
# Lift pandas' column-display limit so wide frames are rendered in full.
pd.set_option('display.max_columns', None)

# Read the prepared dataset and preview its first row.
data = pd.read_csv('cleaned_data.csv')
data.head(1)

Unnamed: 0,Price,Kilometers,Owner,Insurance_Type,Age,Rating_1,Rating_2,Rating_3,Rating_4,city_c10,city_c2,city_c3,city_c4,city_c5,city_c6,city_c7,city_c8,city_c9,Fuel_Type_Electric,Fuel_Type_Hybrid,Fuel_Type_Petrol,Fuel_Type_Petrol + CNG,Fuel_Type_Petrol + LPG,Transmission_MANUAL,Car_Age_Category_Moderately New,Car_Age_Category_Moderately Old,Car_Age_Category_Old,company_BMW,company_Chevrolet,company_Daewoo,company_Datsun,company_Fiat,company_Force,company_Ford,company_Hindustan,company_Honda,company_Hyundai,company_ICML,company_ISUZU,company_Jaguar,company_Jeep,company_KIA,company_Landrover,company_MG,company_MITSUBISHI,company_Mahindra,company_Maruti,company_Mercedes,company_Mitsubishi,company_Nissan,company_Opel,company_Porsche,company_Premier,company_Renault,company_Skoda,company_Ssangyong,company_Tata,company_Toyota,company_Volkswagen,company_Volvo,Category_Luxury,Category_Off-Road,Category_Standard,Kilometers_Category_Moderate,Kilometers_Category_High,Kilometers_Category_Very High,Price_Category_By_Name_Luxury,Price_Category_By_Name_Mid-Range,Price_Category_By_Name_Premium
0,174699,34854.0,1,0,14,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False


In [5]:
# 'Price' is the regression target; every other column is used as a predictor.
# NOTE(review): the Price_Category_By_Name_* columns may be derived from the
# target itself — confirm they do not leak Price into X.
y = data['Price']
X = data.drop('Price', axis=1)

**1. Linear Regression**

In [8]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on all rows and all predictors.
# fit() returns the estimator itself, so it can be chained onto construction.
lr = LinearRegression().fit(X, y)
lr

In [10]:
from sklearn.metrics import r2_score

# In-sample R²: predictions are scored on the same rows the model was fitted on.
ypred = lr.predict(X)
r2_score(y_true=y, y_pred=ypred)

0.8211230478766314

In [12]:
from sklearn.model_selection import cross_val_score

# Mean R² across 5 cross-validation folds for the linear baseline.
lr_cv_scores = cross_val_score(lr, X, y, cv=5, scoring='r2')
lr_cv_scores.mean()

0.7958674698439683

**2. Decision Tree**

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Hyper parameter tuning
# Explicit integer seed. A bool is an int subclass in Python (True == 1), so
# writing the seed as 1 keeps the same value while no longer reading like an
# on/off flag.
estimator = DecisionTreeRegressor(random_state=1)

# Search tree depths 1..9 with the squared-error split criterion.
param_grid = {'criterion': ['squared_error'],
              'max_depth': list(range(1, 10))}

# 3-fold cross-validation, candidates ranked by R².
dt_grid = GridSearchCV(estimator, param_grid, scoring='r2', cv=3)
dt_grid.fit(X, y)

# Winning configuration; with GridSearchCV's default refit it has been
# refitted on the full X, y.
dt = dt_grid.best_estimator_
dt

In [17]:
# Important features
# Read the importances from the grid-search winner itself (the search was
# fitted on X, so the values line up with X.columns) instead of going through
# the `dt` alias. The frame is named after the decision tree rather than
# reusing the AdaBoost name.
feats_dt = pd.DataFrame(data=dt_grid.best_estimator_.feature_importances_,
                        index=X.columns,
                        columns=['Importance'])

# Keep every feature with a strictly positive importance.
important_features_dt = feats_dt[feats_dt['Importance'] > 0].index.tolist()
important_features_dt

['Kilometers',
 'Owner',
 'Insurance_Type',
 'Age',
 'Rating_3',
 'Rating_4',
 'city_c3',
 'city_c4',
 'city_c6',
 'city_c7',
 'city_c8',
 'Fuel_Type_Hybrid',
 'Fuel_Type_Petrol',
 'Fuel_Type_Petrol + CNG',
 'Transmission_MANUAL',
 'company_BMW',
 'company_Chevrolet',
 'company_Honda',
 'company_Jaguar',
 'company_Jeep',
 'company_Landrover',
 'company_Maruti',
 'company_Mercedes',
 'company_Mitsubishi',
 'company_Porsche',
 'company_Tata',
 'company_Toyota',
 'Category_Luxury',
 'Price_Category_By_Name_Luxury',
 'Price_Category_By_Name_Mid-Range',
 'Price_Category_By_Name_Premium']

In [19]:
from sklearn.metrics import r2_score

# Selecting train & Test data
# Restrict the predictors to the features with non-zero importance.
X_dt = X[important_features_dt]

# Modeling
# Fit an unfitted copy of the tuned tree on the reduced matrix. Cloning keeps
# dt_grid.best_estimator_ exactly as the grid search fitted it on X, instead of
# silently refitting that shared object in place on fewer columns.
dt = clone(dt_grid.best_estimator_)
dt.fit(X_dt, y)

# Evaluation
ypred = dt.predict(X_dt)

print('Train r2:', r2_score(y, ypred))
# NOTE(review): the feature subset was chosen using all rows, so this CV score
# is likely optimistic; selecting features inside each fold would be stricter.
print('CV Score:', cross_val_score(dt, X_dt, y, cv=3, scoring='r2').mean())

Train r2: 0.8611708518413712
CV Score: 0.7895667207448865


**3. Random Forest**

In [22]:
# Hyper parameter tuning
from sklearn.ensemble import RandomForestRegressor

# Explicit integer seed. A bool is an int subclass in Python (True == 1), so
# writing the seed as 1 keeps the same value while no longer reading like an
# on/off flag.
estimator = RandomForestRegressor(random_state=1)

# Search forest sizes from 1 to 11 trees.
param_grid = {'n_estimators': list(range(1, 12))}

# 5-fold cross-validation, candidates ranked by R².
rf_grid = GridSearchCV(estimator, param_grid, scoring='r2', cv=5)
rf_grid.fit(X, y)

# Winning configuration; with GridSearchCV's default refit it has been
# refitted on the full X, y.
rf = rf_grid.best_estimator_
rf

In [24]:
# Important features
# Read the importances from the grid-search winner itself (the search was
# fitted on X, so the values line up with X.columns) instead of going through
# the `rf` alias. The frame is named after the random forest rather than
# reusing the AdaBoost name.
feats_rf = pd.DataFrame(data=rf_grid.best_estimator_.feature_importances_,
                        index=X.columns,
                        columns=['Importance'])

# Keep every feature with a strictly positive importance.
important_features_rf = feats_rf[feats_rf['Importance'] > 0].index.tolist()
important_features_rf

['Kilometers',
 'Owner',
 'Insurance_Type',
 'Age',
 'Rating_1',
 'Rating_2',
 'Rating_3',
 'Rating_4',
 'city_c10',
 'city_c2',
 'city_c3',
 'city_c4',
 'city_c5',
 'city_c6',
 'city_c7',
 'city_c8',
 'city_c9',
 'Fuel_Type_Electric',
 'Fuel_Type_Hybrid',
 'Fuel_Type_Petrol',
 'Fuel_Type_Petrol + CNG',
 'Fuel_Type_Petrol + LPG',
 'Transmission_MANUAL',
 'Car_Age_Category_Moderately New',
 'Car_Age_Category_Moderately Old',
 'Car_Age_Category_Old',
 'company_BMW',
 'company_Chevrolet',
 'company_Daewoo',
 'company_Datsun',
 'company_Fiat',
 'company_Force',
 'company_Ford',
 'company_Hindustan',
 'company_Honda',
 'company_Hyundai',
 'company_ISUZU',
 'company_Jaguar',
 'company_Jeep',
 'company_KIA',
 'company_Landrover',
 'company_MG',
 'company_MITSUBISHI',
 'company_Mahindra',
 'company_Maruti',
 'company_Mercedes',
 'company_Mitsubishi',
 'company_Nissan',
 'company_Opel',
 'company_Porsche',
 'company_Premier',
 'company_Renault',
 'company_Skoda',
 'company_Ssangyong',
 'company

In [26]:
# Selecting train & Test data
# Restrict the predictors to the features with non-zero importance.
X_rf = X[important_features_rf]

# Modeling
# Fit an unfitted copy of the tuned forest on the reduced matrix. Cloning keeps
# rf_grid.best_estimator_ exactly as the grid search fitted it on X, instead of
# silently refitting that shared object in place on fewer columns.
rf = clone(rf_grid.best_estimator_)
rf.fit(X_rf, y)

# Evaluation
ypred = rf.predict(X_rf)

print('Train r2:', r2_score(y, ypred))
# NOTE(review): the feature subset was chosen using all rows, so this CV score
# is likely optimistic; selecting features inside each fold would be stricter.
print('CV Score:', cross_val_score(rf, X_rf, y, cv=5, scoring='r2').mean())

Train r2: 0.9730862481573739
CV Score: 0.8055507268821458


**4. AdaBoost**

In [29]:
# Hyper parameter tuning
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Explicit integer seed. A bool is an int subclass in Python (True == 1), so
# writing the seed as 1 keeps the same value while no longer reading like an
# on/off flag.
estimator = AdaBoostRegressor(random_state=1)

param_grid = {
    'n_estimators': [10, 50, 100],        # Fewer estimators for faster tuning
    'learning_rate': [0.1, 0.5, 1.0],    # Key parameter to tune the contribution of each tree
    'loss': ['linear', 'square', 'exponential']  # Loss function for boosting
}

# 5-fold cross-validation ranked by R²; n_jobs=-1 runs the fits in parallel
# and verbose=3 logs progress.
ab_grid = GridSearchCV(estimator, param_grid, scoring='r2', cv=5, verbose=3, n_jobs=-1)
ab_grid.fit(X, y)

# Winning configuration; with GridSearchCV's default refit it has been
# refitted on the full X, y.
ab = ab_grid.best_estimator_
ab

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [31]:
# Important features
# One importance value per predictor column, taken from the tuned AdaBoost
# model selected by the grid search.
feats_ab = pd.DataFrame({'Importance': ab_grid.best_estimator_.feature_importances_},
                        index=X.columns)

# Names of the features whose importance is strictly positive.
has_weight = feats_ab['Importance'] > 0
important_features_ab = feats_ab.loc[has_weight].index.tolist()
important_features_ab

['Kilometers',
 'Age',
 'city_c3',
 'city_c8',
 'company_Jaguar',
 'company_Landrover',
 'company_Porsche',
 'Category_Luxury',
 'Category_Standard',
 'Price_Category_By_Name_Luxury',
 'Price_Category_By_Name_Mid-Range',
 'Price_Category_By_Name_Premium']

In [33]:
# Selecting train & Test data
# Restrict the predictors to the features with non-zero importance.
X_ab = X[important_features_ab]

# Modeling
# Fit an unfitted copy of the tuned model on the reduced matrix. Cloning keeps
# ab_grid.best_estimator_ exactly as the grid search fitted it on X, instead of
# silently refitting that shared object in place on fewer columns.
ab = clone(ab_grid.best_estimator_)
ab.fit(X_ab, y)

# Evaluation
ypred = ab.predict(X_ab)

print('Train r2:', r2_score(y, ypred))
# NOTE(review): the feature subset was chosen using all rows, so this CV score
# is likely optimistic; selecting features inside each fold would be stricter.
print('CV Score:', cross_val_score(ab, X_ab, y, cv=5, scoring='r2').mean())

Train r2: 0.7486734013571488
CV Score: 0.7277938403445818


**5. Gradient Boosting**

In [36]:
# Hyper parameter tuning
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Explicit integer seed. A bool is an int subclass in Python (True == 1), so
# writing the seed as 1 keeps the same value while no longer reading like an
# on/off flag.
estimator = GradientBoostingRegressor(random_state=1)

param_grid = {
    'n_estimators': [50, 100, 150],        # Number of boosting stages
    'learning_rate': [0.01, 0.05, 0.1],   # Learning rate
    # Currently disabled candidates:
    # 'max_depth': [3, 5, 7],               # Maximum depth of the trees
    # 'subsample': [0.6, 0.8, 1.0],         # Fraction of samples used for training
    # 'min_samples_split': [2, 5, 10]       # Minimum samples required to split
}

# NOTE(review): candidates are ranked by negative MSE here, whereas the
# evaluation cell for this model reports R² — confirm the mismatch is intended.
gb_grid = GridSearchCV(
    estimator,
    param_grid,
    cv=5,                                 # 5-fold cross-validation
    scoring='neg_mean_squared_error',     # Scoring metric
    verbose=2,                            # Log detailed output
    n_jobs=-1,                            # Run the fits in parallel
)
gb_grid.fit(X, y)

# Winning configuration; with GridSearchCV's default refit it has been
# refitted on the full X, y.
gb = gb_grid.best_estimator_
gb

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [38]:
# Important features
# Read the importances from the grid-search winner itself (the search was
# fitted on X, so the values line up with X.columns) instead of going through
# the `gb` alias. The frame is named after gradient boosting rather than
# reusing the AdaBoost name.
feats_gb = pd.DataFrame(data=gb_grid.best_estimator_.feature_importances_,
                        index=X.columns,
                        columns=['Importance'])

# Keep every feature with a strictly positive importance.
important_features_gb = feats_gb[feats_gb['Importance'] > 0].index.tolist()
important_features_gb

['Kilometers',
 'Owner',
 'Insurance_Type',
 'Age',
 'Rating_3',
 'Rating_4',
 'city_c10',
 'city_c2',
 'city_c3',
 'city_c4',
 'city_c5',
 'city_c6',
 'city_c7',
 'city_c8',
 'city_c9',
 'Fuel_Type_Electric',
 'Fuel_Type_Hybrid',
 'Fuel_Type_Petrol',
 'Fuel_Type_Petrol + CNG',
 'Fuel_Type_Petrol + LPG',
 'Transmission_MANUAL',
 'Car_Age_Category_Moderately New',
 'Car_Age_Category_Moderately Old',
 'company_BMW',
 'company_Chevrolet',
 'company_Datsun',
 'company_Fiat',
 'company_Ford',
 'company_Honda',
 'company_Hyundai',
 'company_Jaguar',
 'company_Jeep',
 'company_KIA',
 'company_Landrover',
 'company_MG',
 'company_Mahindra',
 'company_Maruti',
 'company_Mercedes',
 'company_Nissan',
 'company_Porsche',
 'company_Renault',
 'company_Skoda',
 'company_Tata',
 'company_Toyota',
 'company_Volkswagen',
 'company_Volvo',
 'Category_Luxury',
 'Category_Standard',
 'Kilometers_Category_Moderate',
 'Kilometers_Category_High',
 'Price_Category_By_Name_Luxury',
 'Price_Category_By_Name_Mi

In [40]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Selecting train & Test data
# Restrict the predictors to the features with non-zero importance.
X_gb = X[important_features_gb]

# Modeling
# Fit an unfitted copy of the tuned model on the reduced matrix. Cloning keeps
# gb_grid.best_estimator_ exactly as the grid search fitted it on X, instead of
# silently refitting that shared object in place on fewer columns.
gb = clone(gb_grid.best_estimator_)
gb.fit(X_gb, y)

# Evaluation
ypred = gb.predict(X_gb)

# The value printed is an R² score, so it is labelled as such (not "accuracy").
print('Train r2:', r2_score(y, ypred))
# NOTE(review): the feature subset was chosen using all rows, so this CV score
# is likely optimistic; selecting features inside each fold would be stricter.
print('CV Score:', cross_val_score(gb, X_gb, y, cv=5, scoring='r2').mean())

Train Accuracy: 0.8851686539057415
CV Score: 0.8335410411951978


**6. XGBoost**

In [43]:
# Hyper parameter tuning
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# XGBoost regressor with library defaults; the grid below supplies the values
# that are searched.
estimator = XGBRegressor()

param_grid = {
    'n_estimators': [50, 100, 150],        # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],    # Step size shrinkage
    'max_depth': [3, 5, 7],               # Maximum tree depth
    'subsample': [0.6, 0.8, 1.0],         # Fraction of samples used for training
}

# Exhaustive search over the grid above.
xgb_grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',     # Candidates ranked by negative MSE
    cv=5,                                 # 5-fold cross-validation
    verbose=2,                            # Log detailed output
    n_jobs=-1,                            # Run the fits in parallel
)
xgb_grid.fit(X, y)

# Best configuration found by the search.
xgb = xgb_grid.best_estimator_
xgb

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [45]:
# Important features
# One importance value per predictor column, taken from the tuned XGBoost
# model selected by the grid search.
feats_xgb = pd.DataFrame({'Importance': xgb_grid.best_estimator_.feature_importances_},
                         index=X.columns)

# Names of the features whose importance is strictly positive.
has_weight = feats_xgb['Importance'] > 0
important_features_xgb = feats_xgb.loc[has_weight].index.tolist()
important_features_xgb

['Kilometers',
 'Owner',
 'Insurance_Type',
 'Age',
 'Rating_3',
 'Rating_4',
 'city_c10',
 'city_c2',
 'city_c3',
 'city_c4',
 'city_c5',
 'city_c6',
 'city_c7',
 'city_c8',
 'city_c9',
 'Fuel_Type_Electric',
 'Fuel_Type_Hybrid',
 'Fuel_Type_Petrol',
 'Fuel_Type_Petrol + CNG',
 'Fuel_Type_Petrol + LPG',
 'Transmission_MANUAL',
 'company_BMW',
 'company_Chevrolet',
 'company_Datsun',
 'company_Fiat',
 'company_Ford',
 'company_Honda',
 'company_Hyundai',
 'company_Jaguar',
 'company_Jeep',
 'company_KIA',
 'company_Landrover',
 'company_MG',
 'company_Mahindra',
 'company_Maruti',
 'company_Mercedes',
 'company_Nissan',
 'company_Porsche',
 'company_Renault',
 'company_Skoda',
 'company_Tata',
 'company_Toyota',
 'company_Volkswagen',
 'company_Volvo',
 'Category_Luxury',
 'Category_Standard',
 'Kilometers_Category_Moderate',
 'Price_Category_By_Name_Luxury',
 'Price_Category_By_Name_Mid-Range',
 'Price_Category_By_Name_Premium']

In [47]:
# Selecting train & Test data
# Restrict the predictors to the features with non-zero importance.
X_xgb = X[important_features_xgb]

# Modeling
# Fit an unfitted copy of the tuned model on the reduced matrix. Cloning keeps
# xgb_grid.best_estimator_ exactly as the grid search fitted it on X, instead
# of silently refitting that shared object in place on fewer columns.
xgb = clone(xgb_grid.best_estimator_)
xgb.fit(X_xgb, y)

# Evaluation
ypred = xgb.predict(X_xgb)

print('Train r2:', r2_score(y, ypred))
# NOTE(review): the feature subset was chosen using all rows, so this CV score
# is likely optimistic; selecting features inside each fold would be stricter.
print('CV Score:', cross_val_score(xgb, X_xgb, y, cv=5, scoring='r2').mean())

Train r2: 0.8784924342460375
CV Score: 0.8383342091555704
