# Car Price Prediction from CarDekho.com

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [87]:
df_main = pd.read_csv("car data.csv")
df = df_main
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [89]:
df.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [90]:
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [91]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
num_features

['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner']

In [92]:
categ_features = [feature for feature in df.columns if df[feature].dtype == 'O']
categ_features

['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission']

In [93]:
#Independent and Dependent Features
X = df.drop('Selling_Price', axis=1)
Y = df['Selling_Price']

In [94]:
X.head()

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,6.87,42450,Diesel,Dealer,Manual,0


#### Apply LabelEncoder to 'Car_Name' before other Encodings

In [95]:
from sklearn.preprocessing import LabelEncoder
# Apply LabelEncoder to 'Car_Name' before splitting
le = LabelEncoder()
X['Car_Name'] = le.fit_transform(X['Car_Name'])

In [96]:
X.head()

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,90,2014,5.59,27000,Petrol,Dealer,Manual,0
1,93,2013,9.54,43000,Diesel,Dealer,Manual,0
2,68,2017,9.85,6900,Petrol,Dealer,Manual,0
3,96,2011,4.15,5200,Petrol,Dealer,Manual,0
4,92,2014,6.87,42450,Diesel,Dealer,Manual,0


In [97]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.23, random_state=42)

## Feature Encoding and Scaling
#### One Hot Encoding for columns which has lesser unique values <25
#### Label or Ordinal Encoding for hgher unique values >=25

In [98]:
# Identify numeric and categorical features
num_features = ['Year', 'Present_Price', 'Kms_Driven', 'Owner']
categ_features = ['Seller_Type', 'Fuel_Type', 'Transmission']

In [99]:
num_features

['Year', 'Present_Price', 'Kms_Driven', 'Owner']

In [100]:
X_train

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
30,90,2012,5.98,51439,Diesel,Dealer,Manual,0
22,93,2011,8.01,50000,Petrol,Dealer,Automatic,0
286,87,2016,7.90,28569,Petrol,Dealer,Manual,0
56,77,2015,7.27,40534,Petrol,Dealer,Manual,0
239,75,2012,4.43,23709,Petrol,Dealer,Manual,0
...,...,...,...,...,...,...,...,...
188,21,2013,0.57,18000,Petrol,Individual,Manual,0
71,71,2011,12.48,45000,Diesel,Dealer,Manual,0
106,40,2014,3.45,16500,Petrol,Individual,Manual,1
270,69,2011,10.00,69341,Petrol,Dealer,Manual,0


In [101]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define transformers
std_transformer = StandardScaler()
ohe_transformer = OneHotEncoder(drop='first', sparse_output=False)

# Create column transformer
preprocessor = ColumnTransformer([
    ("OneHotEncoder", ohe_transformer, categ_features),
    ("StandardScaler", std_transformer, num_features)
])

# Fit on training data and transform both training and testing data
X_train = preprocessor.fit_transform(X_train)  # Fit & Transform on Train Data
X_test = preprocessor.transform(X_test)        # Only Transform Test Data
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (231, 8)
X_test shape: (70, 8)


In [102]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,1.0,0.0,1.0,-0.589373,-0.165497,0.324884,-0.178303
1,0.0,0.0,1.0,0.0,-0.938463,0.065999,0.290917,-0.178303
2,0.0,0.0,1.0,1.0,0.806987,0.053455,-0.21496,-0.178303
3,0.0,0.0,1.0,1.0,0.457897,-0.018389,0.067473,-0.178303
4,0.0,0.0,1.0,1.0,-0.589373,-0.342256,-0.32968,-0.178303


In [103]:
pd.DataFrame(X_test).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.0,1.0,0.0,0.806987,-0.782441,-0.322811,-0.178303
1,0.0,0.0,1.0,1.0,0.806987,0.70347,-0.630147,-0.178303
2,0.0,1.0,0.0,1.0,-0.589373,0.224512,0.526966,-0.178303
3,1.0,0.0,1.0,1.0,-0.938463,-0.782441,-0.063157,3.566057
4,0.0,0.0,1.0,1.0,-0.240283,1.274799,0.054891,-0.178303


In [104]:
X_train.shape

(231, 8)

In [105]:
X_test.shape

(70, 8)

## Model Training and Model Selection

In [106]:
# import Adaboost Regressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [107]:
# function to evaluate model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2 = r2_score(true, predicted)
    rmse = root_mean_squared_error(true, predicted)
    return mae, mse, rmse, r2

In [108]:
## Beginning with model training
models = {
    "Adaboost Regressor": AdaBoostRegressor(),
    "Gradientboost Regressor": GradientBoostingRegressor()
}

In [109]:
# Iterate over models, perform predictions and evaluate performance
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, Y_train)
    
    # Make predictions
    Y_pred = model.predict(X_test)
    
    # Evaluate the model
    mae, mse, rmse, r2 = evaluate_model(Y_test, Y_pred)
    
    # Print the results
    print(f"Model: {model_name}")
    print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    print("-" * 50)

Model: Adaboost Regressor
MAE: 0.9134, MSE: 1.7384, RMSE: 1.3185, R2: 0.9410
--------------------------------------------------
Model: Gradientboost Regressor
MAE: 0.5582, MSE: 0.9679, RMSE: 0.9838, R2: 0.9671
--------------------------------------------------


## Perform Hyperparamtertuning using RandomizeSearchCV

In [110]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

In [114]:
ada_params = {
    "n_estimators": [50, 100, 150, 200, 250],
    "learning_rate": [1.0, 1.1, 0.1, 0.5]
}

grad_params = {
    "learning_rate": [1.0, 0.1, 0.5, 0.2,.03],
    "n_estimators": [50,100,150,200,250],
    "min_samples_split" : [2, 8, 12, 15],
    "max_depth": [5, 8, 10, 15, None],
}

In [115]:
random_cv_models = [
    ("AdaBoostRegressor", AdaBoostRegressor(), ada_params),
    ("GradientboostRegressor", GradientBoostingRegressor(), grad_params)
]

In [116]:
model_param = {}

for name, model, params in random_cv_models:
    random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=100, cv=3, verbose=2, n_jobs=-1)
    random.fit(X_train, Y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"----------Best Params for {model_name}--------------")
    print(model_param[model_name])


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
----------Best Params for AdaBoostRegressor--------------
{'n_estimators': 150, 'learning_rate': 1.1}
----------Best Params for GradientboostRegressor--------------
{'n_estimators': 150, 'min_samples_split': 8, 'max_depth': 15, 'learning_rate': 0.5}


### Now again train this model using Adaboost Classifier and check the score

In [118]:
random_cv_models = [
    ("AdaBoostRegressor", AdaBoostRegressor(n_estimators=150, learning_rate=1.1)),
    ("GradientboostRegressor", GradientBoostingRegressor(n_estimators=150, min_samples_split=8, max_depth=15, learning_rate=0.5))
]

In [119]:
# Iterate over models, perform predictions and evaluate performance
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, Y_train)
    
    # Make predictions
    Y_pred = model.predict(X_test)
    
    # Evaluate the model
    mae, mse, rmse, r2 = evaluate_model(Y_test, Y_pred)
    
    # Print the results
    print(f"Model: {model_name}")
    print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    print("-" * 50)

Model: Adaboost Regressor
MAE: 1.0095, MSE: 1.8420, RMSE: 1.3572, R2: 0.9375
--------------------------------------------------
Model: Gradientboost Regressor
MAE: 0.5596, MSE: 0.9646, RMSE: 0.9821, R2: 0.9673
--------------------------------------------------
