# Cardekho Price Prediction

###### Dataset: https://www.kaggle.com/datasets/manishkr1754/cardekho-used-car-data/data

The used car market in India is a dynamic and ever-changing landscape. Prices can fluctuate wildly based on a variety of factors including the make and model of the car, its mileage, its condition and the current market conditions. As a result, it can be difficult for sellers to accurately price their cars.

This dataset contains information about used cars listed for sale.
It can be used for many purposes, such as predicting used-car prices with different machine learning techniques.

#### Data Description (Feature Information)

- `car_name`: Car's Full name, which includes brand and specific model name.
- `brand`: Brand Name of the particular car.
- `model`: Exact model name of the car of a particular brand.
- `seller_type`: Type of seller offering the used car.
- `fuel_type`: Fuel used in the used car, which was put up on sale.
- `transmission_type`: Transmission used in the used car, which was put on sale.
- `vehicle_age`: Number of years since the car was bought.
- `km_driven`: Total number of kilometers the car has been driven.
- `mileage`: Number of kilometers the car runs per litre of fuel.
- `engine`: Engine capacity in cc (cubic centimeters).
- `max_power`: Max power it produces in BHP.
- `seats`: Total number of seats in car.
- `selling_price`: The sale price listed on the website.

In [None]:
import time
# Wall-clock timestamp taken at the top of the notebook; the final cell
# subtracts it from the end time to report the total execution time.
start_time = time.time()

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used later
# (e.g. deprecations or failed cross-validation fits), so problems can pass
# silently - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Dataset

In [None]:
# Load the zipped CSV. df0 keeps the frame exactly as read from disk, while df
# is an independent working copy that the cleaning cells modify in place.
df0 = pd.read_csv('cardekho_dataset.csv.zip')
df = df0.copy()

In [None]:
df.info()

In [None]:
df.head()

# Data Cleaning

In [None]:
df.isna().sum()

# Drop Irrelevant Columns

In [None]:
# Remove the 'Unnamed: 0' column from the working frame (in place).
df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
df.columns.tolist()

## car_name

In [None]:
# Remove the 'car_name' column from the working frame (in place).
df.drop(columns=['car_name'], inplace=True)

In [None]:
df.columns.tolist()

## brand

In [None]:
# Remove the 'brand' column from the working frame (in place).
df.drop(columns=['brand'], inplace=True)

In [None]:
df.columns.tolist()

## model

In [None]:
df['model'].value_counts()

In [None]:
df['model'].unique()

## vehicle_age

In [None]:
sns.histplot(data=df, x='vehicle_age', kde=True, bins=30);

## km_driven

In [None]:
sns.histplot(data=df,x='km_driven',kde=True);

## seller_type

In [None]:
sns.countplot(data=df,x='seller_type',palette='tab10');

## fuel_type

In [None]:
sns.countplot(data=df,x='fuel_type',palette='tab10');

## transmission_type

In [None]:
sns.countplot(data=df,x='transmission_type',palette='tab10');

## mileage

In [None]:
sns.histplot(data=df,x='mileage',kde=True,bins=35);

## engine

In [None]:
df['engine'].unique()

## max_power

In [None]:
sns.histplot(data=df,x='max_power',kde=True);

## seats

In [None]:
sns.countplot(data=df,x='seats',palette='tab10');

## selling_price

In [None]:
sns.histplot(data=df,x='selling_price',kde=True);

---

## Feature Engineering

In [None]:
df.head()

### Get Numerical Features

In [None]:
# Numerical features: every column of df whose dtype is not object.
num_features = [col for col in df.columns if df[col].dtype != 'O']
print('num_features: ', len(num_features))

### Get Categorical Features

In [None]:
# Categorical features: every column of df stored with the object dtype.
cat_features = [col for col in df.columns if df[col].dtype == 'O']
print('cat_features: ', len(cat_features))

### Get Discrete Features

In [None]:
# Discrete features: numerical columns with at most 25 distinct values
# (missing values, if any, count as one distinct value).
discrete_features = [
    col for col in num_features
    if df[col].nunique(dropna=False) <= 25
]
print('discrete_features: ', len(discrete_features))

### Get Continuous Features

In [None]:
# Continuous features: the numerical columns that were not classified as discrete.
continuous_features = [
    col for col in num_features
    if col not in discrete_features
]
print('continuous_features: ', len(continuous_features))

## Declare Dependent Feature & Independent Features

In [None]:
# Independent features: everything in df except the target column.
X = df.drop(columns=['selling_price'])

In [None]:
X.head()

In [None]:
# Dependent feature (regression target): the selling_price column of df.
y = df['selling_price']

In [None]:
y.head()

---

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train.shape, y_test.shape

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train.head()

In [None]:
y_test.head()

## Feature Transformation

In [None]:
# Columns to standardise: every non-object column of the training frame as it
# looks at this point. This rebinds num_features from a list to a pandas Index.
# NOTE(review): 'model' is target-encoded only further down, so it is presumably
# still an object column here and therefore not part of this selection - confirm.
num_features = X_train.select_dtypes(exclude='object').columns

In [None]:
num_features

In [None]:
# Categorical columns handed to the one-hot encoder.
oh_cols = [
    'seller_type',
    'fuel_type',
    'transmission_type',
]

In [None]:
len(X_train['seller_type'].unique())

In [None]:
len(X_train['fuel_type'].unique())

In [None]:
len(X_train['transmission_type'].unique())

### Feature Encoding

In [None]:
from sklearn.preprocessing import TargetEncoder

In [None]:
# Target encoder for the high-cardinality 'model' column.
# The target type is stated explicitly because this is a regression problem:
# with target_type='auto' scikit-learn infers the type from the values, and a
# price column stored as integers is inferred as 'multiclass' instead of
# continuous, which does not yield the single encoded column expected here.
# smooth='auto' and the 5-fold cross fitting are unchanged.
te = TargetEncoder(target_type='continuous', smooth='auto', cv=5)

In [None]:
len(X_train['model'].unique())

In [None]:
X_train.head()

In [None]:
# Fit the target encoder on the training rows only (using y_train) and
# overwrite the raw 'model' labels with their encoded values.
X_train['model'] = te.fit_transform(X_train[['model']],y_train)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
# Encode the test rows with the encoder already fitted on the training split
# (transform only - the encoder is not refitted and y_test is not used).
X_test['model'] = te.transform(X_test[['model']])

In [None]:
X_test.head()

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
X_train.head()

In [None]:
# Scaler applied to the numerical columns inside the ColumnTransformer.
numeric_transformer = StandardScaler()

In [None]:
# One-hot encoder for the low-cardinality categorical columns; drop='first'
# omits the first category of each feature from the output.
oh_transformer = OneHotEncoder(drop='first')

In [None]:
# num_features is collected before 'model' is target-encoded, so 'model' is not
# in it and would otherwise reach the estimators through remainder='passthrough'
# on its raw, target-derived scale while every other numeric column is
# standardised. Add it to the scaled set explicitly; dict.fromkeys removes the
# duplicate if num_features already contains it.
scaled_cols = list(dict.fromkeys([*num_features, 'model']))

# One-hot encode the categorical columns and standardise the numeric ones.
# remainder='passthrough' is kept so any column not listed is still forwarded.
preprocessor = ColumnTransformer([
    ('OneHotEncoder', oh_transformer, oh_cols),
    ('StandardScaler', numeric_transformer, scaled_cols)
], remainder='passthrough')

In [None]:
preprocessor

In [None]:
X_train.head()

### Apply Transformation

In [None]:
# Fit the preprocessing steps on the training split and rebind X_train to the
# transformer output (the original DataFrame is no longer referenced by this name).
X_train = preprocessor.fit_transform(X_train)

In [None]:
X_train

In [None]:
# Apply the preprocessing fitted on the training split to the test split
# (transform only, no refit) and rebind X_test to the result.
X_test = preprocessor.transform(X_test)

In [None]:
X_test

In [None]:
X_train

---

# Model Training

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
def eval_model(y_true, y_predicted, n_features=None):
    """Compute regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_predicted : array-like
        Predictions aligned with ``y_true``.
    n_features : int, optional
        Number of predictors the model was fitted on; used for adjusted R2.
        When omitted, the column count of the module-level ``X_test`` matrix
        is used, which is what this function has always relied on.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2, adj_r2)``. ``adj_r2`` is ``nan`` when
        ``len(y_true) - n_features - 1`` is not positive, because adjusted R2
        is undefined without residual degrees of freedom.
    """
    if n_features is None:
        # Backward-compatible default: fall back to the global feature matrix.
        n_features = X_test.shape[1]

    mae = mean_absolute_error(y_true, y_predicted)
    mse = mean_squared_error(y_true, y_predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_predicted)

    n_samples = len(y_true)
    residual_dof = n_samples - n_features - 1
    if residual_dof > 0:
        adj_r2 = 1 - (1 - r2) * (n_samples - 1) / residual_dof
    else:
        # Avoid a division by zero (or a meaningless negative denominator).
        adj_r2 = np.nan
    return mae, mse, rmse, r2, adj_r2

In [None]:
# Baseline candidates keyed by display name. Every estimator is created with
# its library-default hyperparameters; the dictionary's insertion order is the
# order in which the training loop fits and reports them.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'KNN Regressor': KNeighborsRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'AdaBoost Regressor': AdaBoostRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBoost Regressor': XGBRegressor()
}

In [None]:
# Baseline pass: fit every regressor in `models` and print its metrics on the
# training split and on the held-out test split.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_metrics = eval_model(y_train, y_train_pred)
    test_metrics = eval_model(y_test, y_test_pred)

    print(model_name)
    print('\n')

    # Both splits are reported in the same layout; only the heading and the
    # closing divider differ.
    for heading, metrics, divider in (
        ('Model Performance: Train Set', train_metrics, '-'*35),
        ('Model Performance: Test Set', test_metrics, '='*35),
    ):
        mae, mse, rmse, r2, adj_r2 = metrics
        print(heading)
        print('- RMSE: {:.4f}'.format(rmse))
        print('- MSE: {:.4f}'.format(mse))
        print('- MAE: {:.4f}'.format(mae))
        print('- R2: {:.4f}'.format(r2))
        print('- ADJ_R2: {:.4f}'.format(adj_r2))
        print(divider)

    print('\n')

---

# Hyperparameter Tuning

In [None]:
# Search space for AdaBoostRegressor.
adb_params = dict(
    n_estimators=[50, 60, 70, 80],
    loss=['linear', 'square', 'exponential'],
)

In [None]:
# Search space for GradientBoostingRegressor.
# Every entry must be a flat list of individual candidate values, because the
# randomized search passes each sampled element to the estimator as-is.
gdb_params = {
    'loss': ['squared_error', 'huber', 'absolute_error'],
    # 'mse' is not an accepted criterion name any more ('squared_error' is its
    # replacement and is already listed), so it is left out.
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_split': [2, 8, 15, 20],
    'n_estimators': [100, 200, 500, 1000],
    # Unpack the range so each depth 5..15 is its own candidate; a bare range
    # object would be handed to max_depth as a single (invalid) value.
    'max_depth': [None, *range(5, 16)],
    'learning_rate': [0.1, 0.01, 0.001, 0.2, 0.3]
}

In [None]:
# Search space for KNeighborsRegressor.
knn_params = dict(n_neighbors=[2, 3, 10, 20, 30, 40, 50])

In [None]:
# Search space for RandomForestRegressor.
# Each entry is a flat list of individual candidate values. The ranges are
# unpacked so that every integer is its own candidate; nesting a whole list
# would make that list one candidate, which is not a valid value for
# max_depth or max_features.
rf_params = {
    'max_depth': [None, *range(5, 11)],
    'max_features': [None, *range(5, 10)],
    'min_samples_split': [2, 8, 15, 20],
    'n_estimators': [100, 200, 500, 1000]
}

In [None]:
# Search space for XGBRegressor.
xgb_params = dict(
    learning_rate=[0.1, 0.01],
    max_depth=[5, 8, 12, 20, 30],
    n_estimators=[100, 200, 300],
    colsample_bytree=[0.5, 0.8, 1, 0.3, 0.4],
)

In [None]:
# (display name, fresh default estimator, search space) triples; the
# randomized-search loop unpacks each entry in this order.
rscv_models = [
    ('KNN Regressor', KNeighborsRegressor(), knn_params),
    ('Random Forest Regressor', RandomForestRegressor(), rf_params),
    ('Adaboost Regressor', AdaBoostRegressor(), adb_params),
    ('GradientBoostingRegressor', GradientBoostingRegressor(), gdb_params),
    ('XGB Regressor',XGBRegressor(), xgb_params)
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Best hyperparameters found for each model, keyed by display name.
model_param = {}

for name, model, params in rscv_models:

    # Randomized search with 3-fold cross-validation on the training split.
    # random_state seeds the candidate sampling so the same candidates are
    # drawn on every run, in line with the seeded train/test split.
    rscv = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=42
    )

    rscv.fit(X_train, y_train)

    model_param[name] = rscv.best_params_

In [None]:
# Report the best hyperparameters recorded for each searched model.
for model_name, best_params in model_param.items():
    print(f'------------------------ Best Params for {model_name} ------------------------------')
    print(best_params)
    print('='*40)
    print('\n')

## Training the Best Models with the Best Hyperparameters

In [None]:
# Tuned candidates keyed by display name. This rebinds `models`, replacing the
# baseline dictionary. The hyperparameters are written out as literals; they are
# not read from model_param.
# NOTE(review): the values were presumably copied from the best-params printout
# of an earlier search run - re-check them whenever the search is rerun.
models = {
    'Random Forest Regressor': RandomForestRegressor(
        n_estimators=200,
        min_samples_split=2,
        max_features=None,
        max_depth=None
    ),
    'KNN Regressor': KNeighborsRegressor(
        n_neighbors=10,
        n_jobs=-1
    ),
    'AdaBoostRegressor': AdaBoostRegressor(
        n_estimators=50,
        loss='linear'
    ),
    'GradientBoostingRegressor': GradientBoostingRegressor(
        n_estimators=100, 
        min_samples_split=2, 
        max_depth=None, 
        loss='absolute_error', 
        learning_rate=0.3, 
        criterion='squared_error'
    ),
    'XGB Regressor': XGBRegressor(
        n_estimators=300, 
        max_depth=5, 
        learning_rate=0.1, 
        colsample_bytree=0.4
    )
}

In [None]:
# Tuned pass: fit every regressor in `models` and print its metrics on the
# training split and on the held-out test split.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_metrics = eval_model(y_train, y_train_pred)
    test_metrics = eval_model(y_test, y_test_pred)

    print(model_name)

    # Both splits are reported in the same layout; only the heading and the
    # closing divider differ.
    for heading, metrics, divider in (
        ('Model Performance: Train Set', train_metrics, '-'*35),
        ('Model Performance: Test Set', test_metrics, '='*35),
    ):
        mae, mse, rmse, r2, adj_r2 = metrics
        print(heading)
        print('- RMSE: {:.4f}'.format(rmse))
        print('- MSE: {:.4f}'.format(mse))
        print('- MAE: {:.4f}'.format(mae))
        print('- R2: {:.4f}'.format(r2))
        print('- ADJ_R2: {:.4f}'.format(adj_r2))
        print(divider)

    print('\n')

---

In [None]:
# Total wall-clock time since start_time was recorded in the first cell,
# converted from seconds to minutes for the report.
end_time = time.time()
elapsed_seconds = end_time - start_time
elapsed_minutes = elapsed_seconds / 60

print(f"Total notebook execution time: {elapsed_minutes:.2f} minutes")