In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw advert table from a CSV in the current working directory.
file=pd.read_csv('all_car_adverts.csv')

In [3]:
# Columns that are not used as model inputs; everything else is kept.
columns_to_drop = [
    'Unnamed: 0',
    'make',
    'variant',
    'car_badges',
    'car_sub_title',
    'car_attention_grabber',
    'car_specs',
    'car_seller_rating',
    'reg',
    'body_type',
    'engine_size_unit',
    'ulez',
    'car_seller_location',
    'full_service',
    'part_service',
    'part_warranty',
    'full_dealership',
    'first_year_road_tax',
    'brand_new',
    'finance_available',
    'discounted',
]
# `columns=` already selects the column axis, so no `axis` argument is needed.
newfile = file.drop(columns=columns_to_drop)

In [4]:
# Number of distinct `model` values seen under each advert title.
title_counts = newfile.groupby('car_title')['model'].nunique()
# Titles attached to more than one model are ambiguous; their rows are removed.
titles_to_remove = title_counts[title_counts > 1].index
# Take an explicit copy of the filtered rows: the column assignments in the
# following cells would otherwise be made on a frame pandas still tracks as a
# slice of the unfiltered one, which triggers SettingWithCopyWarning.
newfile = newfile[~newfile['car_title'].isin(titles_to_remove)].copy()

In [6]:
# Fill missing advert titles with the most frequent title.
most_common_title = newfile['car_title'].mode()[0]
newfile['car_title'] = newfile['car_title'].fillna(most_common_title)

In [7]:
# Fill missing model names with the most frequent model.
most_common_model = newfile['model'].mode()[0]
newfile['model'] = newfile['model'].fillna(most_common_model)


In [8]:
# Missing seller -> most frequent seller; missing mileage -> average mileage.
most_common_seller = newfile['car_seller'].mode()[0]
average_miles = newfile['miles'].mean()
newfile['car_seller'] = newfile['car_seller'].fillna(most_common_seller)
newfile['miles'] = newfile['miles'].fillna(average_miles)


In [9]:
# Coerce `year` to numbers (unparseable entries become NaN), then replace
# every NaN with the mean of the values that did parse.
year_numeric = pd.to_numeric(newfile['year'], errors='coerce')
newfile['year'] = year_numeric.fillna(year_numeric.mean())


In [10]:
# Mean-impute the remaining numeric columns, one column at a time.
# NOTE(review): the means are taken over all rows, i.e. before the
# train/test split performed further down.
for numeric_col in ('engine_vol', 'engine_size', 'num_owner'):
    column_mean = newfile[numeric_col].mean()
    newfile[numeric_col] = newfile[numeric_col].fillna(column_mean)

In [11]:
# Mode-impute the low-cardinality categorical columns.
for categorical_col in ('transmission', 'fuel_type'):
    most_common_value = newfile[categorical_col].mode()[0]
    newfile[categorical_col] = newfile[categorical_col].fillna(most_common_value)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the high-cardinality text columns. Each fitted encoder is
# stored in `label_encoders` (keyed by column name) so the same
# string -> code mapping can be reapplied to, or inverted on, data seen after
# training; without this the mappings are lost once the loop moves on.
high_cardinality_cols = ['car_title', 'model', 'car_seller']
label_encoders = {}
for col in high_cardinality_cols:
    le = LabelEncoder()
    newfile[col] = le.fit_transform(newfile[col])
    label_encoders[col] = le

# One-hot encode the low-cardinality columns; drop_first=True omits the first
# category of each column from the generated indicator columns.
low_cardinality_cols = ['transmission', 'fuel_type']
newfile = pd.get_dummies(newfile, columns=low_cardinality_cols, drop_first=True)

In [13]:
# Target is the advert price; every other remaining column is a feature.
Y = newfile['car_price']
X = newfile.drop(columns=['car_price'])

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

In [17]:
# Gradient-boosted regressor with 250 boosting stages, fitted on the training
# split only.
model = GradientBoostingRegressor(
    n_estimators=250,
    learning_rate=0.1,
    random_state=25,
)
model.fit(X_train, Y_train)

In [18]:
# Predict prices for the held-out test features.
Y_pred=model.predict(X_test)

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Compare the held-out predictions against the true prices.
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')


Mean Absolute Error: 3124.0416661588774
Mean Squared Error: 93240215.95524149
R2 Score: 0.8447969550344181


In [37]:
from joblib import dump, load

# Write the fitted regressor to disk, then rebind `model` to the object read
# back from that file.
model_path = 'model.joblib'
dump(model, model_path)
model = load(model_path)


In [39]:
# Show the processed frame as the cell output (the rendering below is
# truncated to the first and last rows).
newfile

Unnamed: 0,model,car_price,car_title,car_seller,year,miles,engine_vol,engine_size,num_owner,transmission_manual,fuel_type_diesel,fuel_type_diesel hybrid,fuel_type_diesel plug-in hybrid,fuel_type_electric,fuel_type_hydrogen,fuel_type_petrol,fuel_type_petrol hybrid,fuel_type_petrol plug-in hybrid
0,372,89995,0,10152,2001.0,14400.0,4.900000,225.000000,5.0000,True,False,False,False,False,False,True,False,False
1,372,92500,0,1995,2019.0,650.0,1.856983,170.526853,2.0106,True,False,False,False,False,False,True,False,False
2,372,109995,0,9533,2000.0,21600.0,3.500000,170.526853,3.0000,True,False,False,False,False,False,True,False,False
3,372,124950,0,12543,1989.0,2750.0,1.856983,170.526853,2.0106,True,False,False,False,False,False,True,False,False
4,372,124950,0,12543,1989.0,15142.0,5.000000,170.526853,2.0106,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818451,1035,8595,3646,10152,1995.0,5000.0,1.856983,170.526853,4.0000,True,False,False,False,False,False,True,False,False
818452,14,3495,3647,10152,1962.0,56000.0,1.856983,170.526853,5.0000,True,False,False,False,False,False,True,False,False
818453,1232,22990,3648,9509,2022.0,480.0,1.856983,170.526853,2.0106,True,False,False,False,False,False,True,False,False
818454,178,22990,3648,9509,2022.0,480.0,1.856983,170.526853,2.0106,True,False,False,False,False,False,True,False,False
