# Car Price Prediction from CarDekho.com

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [103]:
df_main = pd.read_csv("car data.csv")
df = df_main
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [105]:
df.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [106]:
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [107]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
num_features

['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner']

In [108]:
categ_features = [feature for feature in df.columns if df[feature].dtype == 'O']
categ_features

['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission']

In [109]:
#Independent and Dependent Features
X = df.drop('Selling_Price', axis=1)
Y = df['Selling_Price']

In [110]:
X.head()

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,6.87,42450,Diesel,Dealer,Manual,0


#### Apply LabelEncoder to 'Car_Name' before other Encodings

In [111]:
from sklearn.preprocessing import LabelEncoder
# Apply LabelEncoder to 'Car_Name' before splitting
le = LabelEncoder()
X['Car_Name'] = le.fit_transform(X['Car_Name'])

In [112]:
X.head()

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,90,2014,5.59,27000,Petrol,Dealer,Manual,0
1,93,2013,9.54,43000,Diesel,Dealer,Manual,0
2,68,2017,9.85,6900,Petrol,Dealer,Manual,0
3,96,2011,4.15,5200,Petrol,Dealer,Manual,0
4,92,2014,6.87,42450,Diesel,Dealer,Manual,0


In [113]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.23, random_state=40)

## Feature Encoding and Scaling
#### One Hot Encoding for columns which has lesser unique values <25
#### Label or Ordinal Encoding for hgher unique values >=25

In [114]:
# Identify numeric and categorical features
num_features = ['Year', 'Present_Price', 'Kms_Driven', 'Owner']
categ_features = ['Seller_Type', 'Fuel_Type', 'Transmission']

In [115]:
num_features

['Year', 'Present_Price', 'Kms_Driven', 'Owner']

In [116]:
X_train

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
267,69,2016,9.400,19434,Diesel,Dealer,Manual,0
208,84,2017,8.100,3435,Petrol,Dealer,Manual,0
222,84,2014,7.600,77632,Diesel,Dealer,Manual,0
175,22,2011,0.787,75000,Petrol,Individual,Manual,0
44,89,2012,2.690,50000,Petrol,Dealer,Manual,0
...,...,...,...,...,...,...,...,...
268,66,2017,5.800,19000,Petrol,Dealer,Manual,0
193,19,2013,0.650,24000,Petrol,Individual,Manual,1
165,1,2016,0.540,500,Petrol,Individual,Automatic,0
7,91,2015,8.610,33429,Diesel,Dealer,Manual,0


In [117]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define transformers
std_transformer = StandardScaler()
ohe_transformer = OneHotEncoder(drop='first', sparse_output=False)

# Create column transformer
preprocessor = ColumnTransformer([
    ("OneHotEncoder", ohe_transformer, categ_features),
    ("StandardScaler", std_transformer, num_features)
])

# Fit on training data and transform both training and testing data
X_train = preprocessor.fit_transform(X_train)  # Fit & Transform on Train Data
X_test = preprocessor.transform(X_test)        # Only Transform Test Data
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (231, 8)
X_test shape: (70, 8)


In [118]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,1.0,0.0,1.0,0.854423,0.159548,-0.421965,-0.200118
1,0.0,0.0,1.0,1.0,1.228233,0.016975,-0.812033,-0.200118
2,0.0,1.0,0.0,1.0,0.106803,-0.037861,0.996949,-0.200118
3,1.0,0.0,1.0,1.0,-1.014627,-0.785053,0.932779,-0.200118
4,0.0,0.0,1.0,1.0,-0.640817,-0.576348,0.323259,-0.200118


In [119]:
pd.DataFrame(X_test).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,0.0,1.0,0.480613,-0.313136,-0.030263,-0.200118
1,1.0,0.0,1.0,1.0,0.854423,-0.706857,-0.683668,-0.200118
2,0.0,1.0,0.0,1.0,0.480613,0.620168,0.762113,-0.200118
3,0.0,1.0,0.0,1.0,-0.640817,0.159548,0.835256,-0.200118
4,0.0,1.0,0.0,1.0,0.480613,0.564236,0.56892,-0.200118


In [120]:
X_train.shape

(231, 8)

In [121]:
X_test.shape

(70, 8)

## Model Training and Model Selection

In [122]:
# import Adaboost Regressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [123]:
# function to evaluate model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2 = r2_score(true, predicted)
    rmse = root_mean_squared_error(true, predicted)
    return mae, mse, rmse, r2

In [124]:
## Beginning with model training
models = {
    "Adaboost Regressor": AdaBoostRegressor(),
    "Gradientboost Regressor": GradientBoostingRegressor(),
    "XGBoost Regressor": XGBRegressor()
}

In [125]:
# Iterate over models, perform predictions and evaluate performance
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, Y_train)
    
    # Make predictions
    Y_pred = model.predict(X_test)
    
    # Evaluate the model
    mae, mse, rmse, r2 = evaluate_model(Y_test, Y_pred)
    
    # Print the results
    print(f"Model: {model_name}")
    print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    print("-" * 50)

Model: Adaboost Regressor
MAE: 0.8883, MSE: 1.1924, RMSE: 1.0920, R2: 0.9294
--------------------------------------------------
Model: Gradientboost Regressor
MAE: 0.3969, MSE: 0.3942, RMSE: 0.6278, R2: 0.9767
--------------------------------------------------
Model: XGBoost Regressor
MAE: 0.4942, MSE: 0.7108, RMSE: 0.8431, R2: 0.9579
--------------------------------------------------


## Perform Hyperparamtertuning using RandomizeSearchCV

In [126]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

In [127]:
ada_params = {
    "n_estimators": [50, 100, 150, 200, 250],
    "learning_rate": [1.0, 1.1, 0.1, 0.5]
}

grad_params = {
    "learning_rate": [1.0, 0.1, 0.5, 0.2,.03],
    "n_estimators": [50,100,150,200,250],
    "min_samples_split" : [2, 8, 12, 15],
    "max_depth": [5, 8, 10, 15, None],
}

xgboost_params = {
    "learning_rate": [1.0, 0.1, 0.5, 0.2,.03],
    "n_estimators": [50,100,150,200,250],
    "max_depth": [5, 8, 10, 15, None],
}

In [128]:
random_cv_models = [
    ("AdaBoostRegressor", AdaBoostRegressor(), ada_params),
    ("GradientboostRegressor", GradientBoostingRegressor(), grad_params),
    # ("XGBoostRegressor", XGBRegressor(), xgboost_params)
]

In [129]:
model_param = {}

for name, model, params in random_cv_models:
    random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=100, cv=3, verbose=2, n_jobs=-1)
    random.fit(X_train, Y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"----------Best Params for {model_name}--------------")
    print(model_param[model_name])


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
----------Best Params for AdaBoostRegressor--------------
{'n_estimators': 100, 'learning_rate': 0.5}
----------Best Params for GradientboostRegressor--------------
{'n_estimators': 250, 'min_samples_split': 12, 'max_depth': 15, 'learning_rate': 0.03}


### Now again train this model using Adaboost Classifier and check the score

In [130]:
models = [
    ("AdaBoostRegressor", AdaBoostRegressor(n_estimators=150, learning_rate=1.1)),
    ("GradientboostRegressor", GradientBoostingRegressor(n_estimators=150, min_samples_split=8, max_depth=15, learning_rate=0.5))
]

In [131]:
# Iterate over models, perform predictions and evaluate performance
for model_name, model in models:
    # Train the model
    model.fit(X_train, Y_train)
    
    # Make predictions
    Y_pred = model.predict(X_test)
    
    # Evaluate the model
    mae, mse, rmse, r2 = evaluate_model(Y_test, Y_pred)
    
    # Print the results
    print(f"Model: {model_name}")
    print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    print("-" * 50)

Model: AdaBoostRegressor
MAE: 0.9802, MSE: 1.3926, RMSE: 1.1801, R2: 0.9176
--------------------------------------------------
Model: GradientboostRegressor
MAE: 0.5017, MSE: 0.7131, RMSE: 0.8444, R2: 0.9578
--------------------------------------------------
