<a href="https://colab.research.google.com/github/jrDhiraj/machineLearning/blob/main/car_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [176]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

from sklearn.metrics import accuracy_score, classification_report, r2_score


In [177]:
df = pd.read_csv('quikr_car.csv')
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [179]:
backup = df.copy()

In [180]:
df = df[df['year'].str.isnumeric()]

In [181]:
df['year'] = df['year'].astype(int)

In [182]:
df = df[df['Price']!='Ask For Price']

In [183]:
df['Price'] =  df['Price'].str.replace(',','').astype(int)

In [184]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 819 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        819 non-null    object
 1   company     819 non-null    object
 2   year        819 non-null    int64 
 3   Price       819 non-null    int64 
 4   kms_driven  819 non-null    object
 5   fuel_type   816 non-null    object
dtypes: int64(2), object(4)
memory usage: 44.8+ KB


In [185]:
df['kms_driven'] = df['kms_driven'].str.split(' ').str.get(0).str.replace(',','')

In [186]:
df[df['kms_driven'].str.isnumeric()]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz VXI ABS,Maruti,2011,270000,50000,Petrol
885,Tata Indica V2 DLE BS III,Tata,2009,110000,30000,Diesel
886,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
888,Tata Zest XM Diesel,Tata,2018,260000,27000,Diesel


In [187]:
df = df[df['kms_driven'] != 'Petrol']

df['kms_driven'] = df['kms_driven'].astype(int)

In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 817 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        817 non-null    object
 1   company     817 non-null    object
 2   year        817 non-null    int64 
 3   Price       817 non-null    int64 
 4   kms_driven  817 non-null    int64 
 5   fuel_type   816 non-null    object
dtypes: int64(3), object(3)
memory usage: 44.7+ KB


In [189]:
df[~df['fuel_type'].isna()]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz VXI ABS,Maruti,2011,270000,50000,Petrol
885,Tata Indica V2 DLE BS III,Tata,2009,110000,30000,Diesel
886,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
888,Tata Zest XM Diesel,Tata,2018,260000,27000,Diesel


In [190]:
df['name']=df['name'].str.split(' ').str.slice(0,3).str.join(' ')

In [191]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 817 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        817 non-null    object
 1   company     817 non-null    object
 2   year        817 non-null    int64 
 3   Price       817 non-null    int64 
 4   kms_driven  817 non-null    int64 
 5   fuel_type   816 non-null    object
dtypes: int64(3), object(3)
memory usage: 44.7+ KB


In [192]:
df = df[df['Price']<6e6].reset_index(drop=True)

In [193]:
df.to_csv('car.csv')

In [194]:
X = df.drop(columns='Price')
y = df['Price']

In [195]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [196]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline


In [197]:
ohe = OneHotEncoder()
ohe.fit(X[['name','company','fuel_type']])

In [198]:
# Create column transformer with handle_unknown='ignore'
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['name','company','fuel_type']),
    remainder='passthrough'
)

In [199]:
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)

In [200]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [201]:
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")

R² Score: 0.6996883918714838


In [202]:
score = []
best_random_states = []

for i in range(652):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    r2_val = r2_score(y_test, y_pred)
    score.append(r2_val)
    best_random_states.append(i)
    if i % 100 == 0:  # Print progress every 100 iterations
        print(f"Iteration {i}: R² Score: {r2_val}")


Iteration 0: R² Score: 0.6667061889866989
Iteration 100: R² Score: 0.7410386153037278
Iteration 200: R² Score: 0.6928402743968838
Iteration 300: R² Score: 0.7839671530934953
Iteration 400: R² Score: 0.6425458860960517
Iteration 500: R² Score: 0.7846066482870314
Iteration 600: R² Score: 0.47839522162812087


In [203]:
# Find the best random state (the index, not the score value)
best_idx = np.argmax(score)
best_random_state = best_random_states[best_idx]
best_r2 = score[best_idx]


In [204]:
print(f"\nBest R² Score: {best_r2}")
print(f"Best Random State: {best_random_state}")



Best R² Score: 0.8383067568691951
Best Random State: 398


In [205]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=best_random_state)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
final_r2 = r2_score(y_test, y_pred)
print(f"Final R² Score with best random state: {final_r2}")


Final R² Score with best random state: 0.8383067568691951


# ***`XG_BOOST`***




In [206]:
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [207]:
df = pd.get_dummies(df, columns=['name'], drop_first=True, dtype=int)


In [208]:
df.head()

Unnamed: 0,company,year,Price,kms_driven,fuel_type,name_Audi A4 1.8,name_Audi A4 2.0,name_Audi A6 2.0,name_Audi A8,name_Audi Q3 2.0,...,name_Volkswagen Passat Diesel,name_Volkswagen Polo,name_Volkswagen Polo Comfortline,name_Volkswagen Polo Highline,name_Volkswagen Polo Highline1.2L,name_Volkswagen Polo Trendline,name_Volkswagen Vento Comfortline,name_Volkswagen Vento Highline,name_Volkswagen Vento Konekt,name_Volvo S80 Summum
0,Hyundai,2007,80000,45000,Petrol,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Mahindra,2006,425000,40,Diesel,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hyundai,2014,325000,28000,Petrol,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ford,2014,575000,36000,Diesel,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Ford,2012,175000,41000,Diesel,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [209]:
df = pd.get_dummies(df, columns=['fuel_type'], drop_first=True, dtype=int)

In [210]:
df = pd.get_dummies(df, columns=['company'], drop_first=True, dtype=int)

In [211]:
df.head()

Unnamed: 0,year,Price,kms_driven,name_Audi A4 1.8,name_Audi A4 2.0,name_Audi A6 2.0,name_Audi A8,name_Audi Q3 2.0,name_Audi Q5 2.0,name_Audi Q7,...,company_Mercedes,company_Mini,company_Mitsubishi,company_Nissan,company_Renault,company_Skoda,company_Tata,company_Toyota,company_Volkswagen,company_Volvo
0,2007,80000,45000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2006,425000,40,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2014,325000,28000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2014,575000,36000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2012,175000,41000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [212]:
import datetime

current_year = datetime.datetime.now().year
df['car_age'] = current_year - df['year']
df = df.drop(columns=['year'])

In [213]:
df.head()

Unnamed: 0,Price,kms_driven,name_Audi A4 1.8,name_Audi A4 2.0,name_Audi A6 2.0,name_Audi A8,name_Audi Q3 2.0,name_Audi Q5 2.0,name_Audi Q7,name_BMW 3 Series,...,company_Mini,company_Mitsubishi,company_Nissan,company_Renault,company_Skoda,company_Tata,company_Toyota,company_Volkswagen,company_Volvo,car_age
0,80000,45000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,18
1,425000,40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19
2,325000,28000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
3,575000,36000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
4,175000,41000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13


In [214]:

df['kms_driven_log'] = np.log1p(df['kms_driven'])
df.drop(columns=['kms_driven'], inplace=True)

In [215]:
df.head()

Unnamed: 0,Price,name_Audi A4 1.8,name_Audi A4 2.0,name_Audi A6 2.0,name_Audi A8,name_Audi Q3 2.0,name_Audi Q5 2.0,name_Audi Q7,name_BMW 3 Series,name_BMW 5 Series,...,company_Mitsubishi,company_Nissan,company_Renault,company_Skoda,company_Tata,company_Toyota,company_Volkswagen,company_Volvo,car_age,kms_driven_log
0,80000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18,10.71444
1,425000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19,3.713572
2,325000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,10.239996
3,575000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,10.491302
4,175000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,13,10.621352


In [216]:
X_scale = df.drop(columns='Price')
y = df['Price']

In [217]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [218]:

X_train_scale, X_test_scale, y_train_scale, y_test_scale = train_test_split(
    X_scale, y, test_size=0.25, random_state=42
)

# Create regression model
model_XG = XGBRegressor(
    n_estimators=200,      # trees
    max_depth=4,           # depth of each tree
    learning_rate=0.05,    # step size
    subsample=0.8,         # random sampling of rows
    colsample_bytree=0.8,  # random sampling of features
    random_state=42
)

# Train model
model_XG.fit(X_train_scale, y_train_scale)

# Predictions
y_pred = model_XG.predict(X_test_scale)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test_scale, y_pred))
r2 = r2_score(y_test_scale, y_pred)

print("RMSE:", rmse)
print("R² Score:", r2)



RMSE: 359774.80316442397
R² Score: 0.35993850231170654


In [219]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [220]:
param_dist = {
    'n_estimators': randint(10, 100),       # number of trees
    'max_depth': randint(2, 10),             # tree depth
    'learning_rate': uniform(0.01, 0.3),     # step size shrinkage
    'subsample': uniform(0.6, 0.4),          # row sampling
    'colsample_bytree': uniform(0.6, 0.4),   # feature sampling
    'gamma': uniform(0, 5),                  # min loss reduction
    'reg_alpha': uniform(0, 1),              # L1 regularization
    'reg_lambda': uniform(0, 1),             # L2 regularization
}

In [221]:
classifier = RandomizedSearchCV(model_XG, param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)

classifier.fit(X_train_scale, y_train_scale)

result = pd.DataFrame(classifier.cv_results_)

result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_n_estimators,param_reg_alpha,...,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.182045,0.006974,0.065058,0.002401,0.749816,4.753572,0.229598,6,30,0.156019,...,0.623233,"{'colsample_bytree': 0.749816047538945, 'gamma...",0.681406,0.730298,0.529372,0.503138,0.670829,0.623009,0.089827,45
1,0.271731,0.018098,0.062761,0.002656,0.946470,3.005575,0.222422,7,62,0.969910,...,0.684936,"{'colsample_bytree': 0.9464704583099741, 'gamm...",0.696357,0.782672,0.568126,0.531738,0.656050,0.646988,0.089924,26
2,0.340159,0.013715,0.062786,0.001716,0.672730,0.917023,0.101273,7,98,0.291229,...,0.655798,"{'colsample_bytree': 0.6727299868828402, 'gamm...",0.711059,0.758539,0.585412,0.526385,0.683749,0.653029,0.084917,17
3,0.303945,0.010437,0.068481,0.003832,0.716858,1.831809,0.146821,8,71,0.199674,...,0.836966,"{'colsample_bytree': 0.7168578594140873, 'gamm...",0.722443,0.791095,0.580315,0.527218,0.703978,0.665010,0.096824,6
4,0.203731,0.008507,0.088568,0.029035,0.618580,3.037724,0.061157,8,27,0.948886,...,0.923359,"{'colsample_bytree': 0.6185801650879991, 'gamm...",0.562951,0.613955,0.455275,0.398253,0.561775,0.518442,0.079268,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.219469,0.011940,0.063855,0.002768,0.867457,4.646880,0.177029,9,35,0.279979,...,0.674817,"{'colsample_bytree': 0.8674572879697724, 'gamm...",0.699981,0.746233,0.568447,0.527373,0.695723,0.647551,0.084271,24
96,0.217931,0.039179,0.107241,0.033074,0.729472,2.127182,0.162283,2,21,0.114837,...,0.715452,"{'colsample_bytree': 0.7294716945616975, 'gamm...",0.521137,0.540905,0.332069,0.310424,0.492644,0.439436,0.097953,93
97,0.280229,0.022831,0.143599,0.048095,0.832495,0.771814,0.154342,4,12,0.051824,...,0.653766,"{'colsample_bytree': 0.8324952885690449, 'gamm...",0.528833,0.549102,0.373692,0.348544,0.475289,0.455092,0.080823,91
98,0.508198,0.051337,0.104910,0.032814,0.625350,4.949801,0.106706,3,93,0.254641,...,0.904091,"{'colsample_bytree': 0.625349988189107, 'gamma...",0.658311,0.699630,0.492836,0.507021,0.670112,0.605582,0.087425,57


In [222]:
result['mean_test_score'].max()

0.6821628093719483

In [223]:
best_score = result['mean_test_score'].max()
print("Best CV score:", best_score)

Best CV score: 0.6821628093719483


In [224]:
best_rmse = -result['mean_test_score'].max()
print("Best CV RMSE:", best_rmse)

Best CV RMSE: -0.6821628093719483


***DEEP_LEARNING SEQUENTIAL_MODEL***

In [225]:
import tensorflow as tf
import keras
from keras import layers

In [226]:
model = keras.Sequential(
    [
        layers.Dense(16, activation="relu", name="layer1"),
        layers.Dense(8, activation="relu", name="layer2"),
        layers.Dense(4, activation="relu", name="layer3"),
        layers.Dense(1, activation ='linear', name="layer4"),
    ]
)

In [227]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.fit(X_train_scale, y_train_scale, epochs=100, batch_size=32, validation_split=0.2)



Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 259768385536.0000 - mae: 383492.0312 - val_loss: 261582929920.0000 - val_mae: 388208.3125
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 310580346880.0000 - mae: 404545.0000 - val_loss: 261582274560.0000 - val_mae: 388207.3438
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 232778694656.0000 - mae: 359899.4375 - val_loss: 261582192640.0000 - val_mae: 388207.2500
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 285057482752.0000 - mae: 397347.9688 - val_loss: 261582127104.0000 - val_mae: 388207.2500
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 314183090176.0000 - mae: 398964.8125 - val_loss: 261582110720.0000 - val_mae: 388207.1875
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

<keras.src.callbacks.history.History at 0x7cfe85e0bc20>

In [229]:
y_pred = model.predict(X_test_scale).ravel()

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [230]:
print("Shapes:")
print("X_test_scale:", X_test_scale.shape)
print("y_test_scale:", y_test_scale.shape)
print("y_pred:", y_pred.shape)


Shapes:
X_test_scale: (204, 281)
y_test_scale: (204,)
y_pred: (204,)


In [231]:
r2 = r2_score(y_test_scale, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_scale, y_pred))

print("R² Score:", r2)
print("RMSE:", rmse)

R² Score: -0.7419172525405884
RMSE: 593517.6844273472
