Imports

In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor



Load Dataset & Basic Info

In [42]:
df = pd.read_csv(r"C:\Users\Harish Raj\OneDrive\Documents\Car Price Pred\CAR DETAILS FROM CAR DEKHO.csv")
df.head()


Unnamed: 0,name,year,selling price,km driven,fuel,seller type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [None]:

df.shape
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling price  4340 non-null   int64 
 3   km driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


name             0
year             0
selling price    0
km driven        0
fuel             0
seller type      0
transmission     0
owner            0
dtype: int64

Data Preparation

In [44]:
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.head()


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [45]:
df.drop("name", axis=1, inplace=True)
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


Encode Categorical Columns

In [46]:
le = LabelEncoder()

cat_cols = ["fuel", "seller_type", "transmission", "owner"]

for col in cat_cols:
    df[col] = le.fit_transform(df[col])

df.head()


Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,4,1,1,0
1,2007,135000,50000,4,1,1,0
2,2012,600000,100000,1,1,1,0
3,2017,250000,46000,4,1,1,0
4,2014,450000,141000,1,1,1,2


Features & Target

In [47]:
X = df.drop("selling_price", axis=1)
y = df["selling_price"]


Train test split

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)


Evaluation

In [49]:
def evaluate(model, X_test, y_test):
    preds = model.predict(X_test)
    
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    
    return r2, mae, rmse


Linear Regression Model

In [50]:
lr = LinearRegression()
lr.fit(X_train, y_train)

lr_r2, lr_mae, lr_rmse = evaluate(lr, X_test, y_test)

lr_r2, lr_mae, lr_rmse


(0.48416664112403374, 241327.40180818044, np.float64(451468.22187366325))

Random Forest Model

In [51]:
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train, y_train)

rf_r2, rf_mae, rf_rmse = evaluate(rf, X_test, y_test)

rf_r2, rf_mae, rf_rmse


(0.7665314067739737, 157507.19690205628, np.float64(303729.312721147))

XGBoost Model

In [52]:
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb.fit(X_train, y_train)

xgb_r2, xgb_mae, xgb_rmse = evaluate(xgb, X_test, y_test)

xgb_r2, xgb_mae, xgb_rmse


(0.7738864421844482, 160842.421875, np.float64(298906.79109046684))

Comparing Models

In [53]:
results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest", "XGBoost"],
    "R2 Score": [lr_r2, rf_r2, xgb_r2],
    "MAE": [lr_mae, rf_mae, xgb_mae],
    "RMSE": [lr_rmse, rf_rmse, xgb_rmse]
})

results


Unnamed: 0,Model,R2 Score,MAE,RMSE
0,Linear Regression,0.484167,241327.401808,451468.221874
1,Random Forest,0.766531,157507.196902,303729.312721
2,XGBoost,0.773886,160842.421875,298906.79109


In [54]:
best_model = results.sort_values(by="R2 Score", ascending=False)
best_model


Unnamed: 0,Model,R2 Score,MAE,RMSE
2,XGBoost,0.773886,160842.421875,298906.79109
1,Random Forest,0.766531,157507.196902,303729.312721
0,Linear Regression,0.484167,241327.401808,451468.221874


In [55]:
importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": xgb.feature_importances_
}).sort_values(by="Importance", ascending=False)

importance


Unnamed: 0,Feature,Importance
4,transmission,0.631571
2,fuel,0.118736
0,year,0.102841
1,km_driven,0.069021
3,seller_type,0.056025
5,owner,0.021806


Prediction

In [56]:
new_car = pd.DataFrame({
    "year": [2018],
    "km_driven": [35000],
    "fuel": [1],          # petrol usually = 1 (based on encoding)
    "seller_type": [0],
    "transmission": [1],
    "owner": [0]
})

xgb.predict(new_car)


array([804264.], dtype=float32)