In [101]:
from pathlib import Path

import gdown
import numpy as np
import pandas as pd

In [102]:
# Source of the raw adverts CSV (Google Drive) and the local file it is saved to.
url = "https://drive.google.com/uc?id=1VWcklFoYkOr2ZYqnZREBWum-2KtyM75Y&export=download"
csv_path = Path('all_car_adverts.csv')

# Only hit the network when the file is not already on disk, so re-running the
# notebook does not re-download the whole CSV every time.
if not csv_path.exists():
    gdown.download(url, str(csv_path), quiet=False)

file = pd.read_csv(csv_path)

# Columns that are not used as model inputs.
unused_columns = [
    'Unnamed: 0',
    'make',
    'variant',
    'car_badges',
    'car_sub_title',
    'car_attention_grabber',
    'car_specs',
    'car_seller_rating',
    'reg',
    'body_type',
    'engine_size_unit',
    'ulez',
    'car_seller_location',
    'full_service',
    'part_service',
    'part_warranty',
    'full_dealership',
    'first_year_road_tax',
    'brand_new',
    'finance_available',
    'discounted',
]
# `columns=` already selects the axis, so the redundant `axis=1` is omitted.
newfile = file.drop(columns=unused_columns)

Downloading...
From (original): https://drive.google.com/uc?id=1VWcklFoYkOr2ZYqnZREBWum-2KtyM75Y&export=download
From (redirected): https://drive.google.com/uc?id=1VWcklFoYkOr2ZYqnZREBWum-2KtyM75Y&export=download&confirm=t&uuid=79646b6b-c434-49bf-9362-fe6cc2a08bfb
To: C:\Users\HP\all_car_adverts.csv
100%|██████████| 273M/273M [00:08<00:00, 33.5MB/s] 


In [109]:
# Count how many distinct models each car_title is associated with.
title_counts = newfile.groupby('car_title')['model'].nunique()
# Titles linked to more than one model are removed from the data set.
titles_to_remove = title_counts[title_counts > 1].index
# .copy() makes the filtered result an independent frame, so the column
# assignments performed on `newfile` in the following cells do not raise
# pandas' SettingWithCopyWarning.
newfile = newfile[~newfile['car_title'].isin(titles_to_remove)].copy()

In [111]:
# Fill missing car titles with the most frequent title in the column.
most_common_title = newfile['car_title'].mode()[0]
newfile['car_title'] = newfile['car_title'].fillna(most_common_title)

In [113]:
# Fill missing model names with the most frequent model in the column.
most_common_model = newfile['model'].mode()[0]
newfile['model'] = newfile['model'].fillna(most_common_model)


In [115]:
# Seller is filled with its most frequent value, mileage with the column mean.
# NOTE(review): both statistics are computed on the full data set, before the
# train/test split performed later in the notebook.
most_common_seller = newfile['car_seller'].mode()[0]
average_miles = newfile['miles'].mean()
newfile['car_seller'] = newfile['car_seller'].fillna(most_common_seller)
newfile['miles'] = newfile['miles'].fillna(average_miles)


In [117]:
# Convert year to a numeric dtype; values that cannot be parsed become NaN.
numeric_year = pd.to_numeric(newfile['year'], errors='coerce')
newfile['year'] = numeric_year
# Missing (or unparseable) years are replaced by the mean of the parsed years.
newfile['year'] = newfile['year'].fillna(numeric_year.mean())


In [118]:
# Mean-impute the remaining numeric columns, each with its own column mean.
for numeric_col in ('engine_vol', 'engine_size', 'num_owner'):
    column_mean = newfile[numeric_col].mean()
    newfile[numeric_col] = newfile[numeric_col].fillna(column_mean)

In [121]:
# Mode-impute the categorical columns, each with its own most frequent value.
for categorical_col in ('transmission', 'fuel_type'):
    most_common_value = newfile[categorical_col].mode()[0]
    newfile[categorical_col] = newfile[categorical_col].fillna(most_common_value)

In [123]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the high-cardinality text columns, one LabelEncoder per column.
high_cardinality_cols = ['car_title', 'model', 'car_seller']
# Keep every fitted encoder, keyed by column name, so the same text-to-integer
# mapping can be re-applied to new data (or inverted) later. Previously each
# encoder was overwritten by the next loop iteration and lost.
label_encoders = {}
for col in high_cardinality_cols:
    le = LabelEncoder()
    newfile[col] = le.fit_transform(newfile[col])
    label_encoders[col] = le
# NOTE(review): the encoders are fitted on the full data set, before the
# train/test split performed later in the notebook.

# One-hot encode the low-cardinality columns; drop_first=True omits the first
# level of each column from the generated indicator columns.
low_cardinality_cols = ['transmission', 'fuel_type']
newfile = pd.get_dummies(newfile, columns=low_cardinality_cols, drop_first=True)

In [124]:
# Target: the advertised price. Features: every other remaining column.
Y = newfile['car_price']
X = newfile.drop(columns=['car_price'])

In [125]:
from sklearn.model_selection import train_test_split

In [126]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [127]:
from sklearn.ensemble import GradientBoostingRegressor

In [133]:
# Gradient-boosted regressor with 250 estimators and a fixed seed.
model = GradientBoostingRegressor(
    n_estimators=250,
    learning_rate=0.1,
    random_state=25,
)
# Train on the training split only.
model.fit(X_train, Y_train)

In [134]:
# Predicted prices for the held-out test rows.
Y_pred = model.predict(X_test)

In [137]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Evaluate the test-set predictions with three regression metrics.
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')


Mean Absolute Error: 3124.0416661588774
Mean Squared Error: 93240215.95524149
R2 Score: 0.8447969550344181


In [136]:
from joblib import dump, load
# Persist the fitted model to disk, then read it straight back so `model`
# refers to the object restored from the saved file.
# NOTE(review): joblib files are pickle-based as far as I know - only load
# files from a trusted source.
model_path = 'model.joblib'
dump(model, model_path)
model = load(model_path)
