In [1]:
import pandas as pd 
import numpy as np
import re
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('hyundi.csv')

In [3]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax(£),mpg,engineSize
0,I20,2017,7999,Manual,17307,Petrol,145,58.9,1.2
1,Tucson,2016,14499,Automatic,25233,Diesel,235,43.5,2.0
2,Tucson,2016,11399,Manual,37877,Diesel,30,61.7,1.7
3,I10,2016,6499,Manual,23789,Petrol,20,60.1,1.0
4,IX35,2015,10199,Manual,33177,Diesel,160,51.4,2.0


In [4]:
df["model"].value_counts()

 Tucson      1300
 I10         1092
 I30          536
 I20          496
 Kona         328
 Ioniq        284
 Santa Fe     245
 IX20         204
 I40          127
 IX35         118
 I800         117
 Getz           6
 Veloster       3
 Terracan       2
 Accent         1
 Amica          1
Name: model, dtype: int64

In [5]:
df['model'] = df['model'].str.strip()

In [6]:
df.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax(£)            int64
mpg             float64
engineSize      float64
dtype: object

In [7]:
df.describe()

Unnamed: 0,year,price,mileage,tax(£),mpg,engineSize
count,4860.0,4860.0,4860.0,4860.0,4860.0,4860.0
mean,2017.107613,12750.13107,21486.049588,121.147119,53.827798,1.457922
std,1.920456,5992.92231,17710.196964,58.003289,12.736042,0.400788
min,2000.0,1200.0,1.0,0.0,1.1,0.0
25%,2016.0,8000.0,8339.25,125.0,44.8,1.2
50%,2017.0,11990.0,17462.0,145.0,55.4,1.6
75%,2019.0,15733.75,30967.0,145.0,60.1,1.7
max,2020.0,92000.0,138000.0,555.0,256.8,2.9


In [8]:
df.rename(columns={'mileage': 'total_driven'}, inplace=True)

In [9]:
df['price'] = df['price'].replace(92000, 0)

In [10]:
mean_price = df[df['price'] != 0]['price'].mean()
df['price'] = df['price'].replace(0, mean_price)
round_mean_price = int(mean_price)
df['price'] = df['price'].replace(0, round_mean_price)

In [11]:
df['price'] = df['price'].astype('int64')

In [12]:
df['mpg'] = df['mpg'].replace(256.8, 0)
mean_price = df[df['mpg'] != 0]['mpg'].mean()
round_mean_price = int(mean_price)
df['mpg'] = df['mpg'].replace(0, round_mean_price)

In [13]:
df['mpg'] = df['mpg'].astype('int64')

In [14]:
# def clean_model(model):
#     cleaned_model = re.sub(r'[^a-zA-Z0-9]', '', model)
#     return cleaned_model
# df['model'] = df['model'].apply(clean_model)

In [15]:
X = df.drop(columns='price')
y=df['price']

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [17]:
OHE = OneHotEncoder()
OHE.fit(X[['model','transmission','fuelType']])

In [18]:
column_transform=make_column_transformer((OneHotEncoder(categories=OHE.categories_),['model','transmission','fuelType']),
                                    remainder='passthrough')

In [19]:
lr = LinearRegression()

In [20]:
pipe = make_pipeline(column_transform,lr)

In [21]:
pipe.fit(X_train,y_train)

In [22]:
y_pred = pipe.predict(X_test)

In [23]:
r2_score(y_test,y_pred)

0.8904497652028383

In [24]:
scores=[]
for i in range(10):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(column_transform,lr)
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))
    print(i,r2_score(y_test,y_pred))

0 0.8837766404978313
1 0.8999459013651621
2 0.8877330690841044
3 0.8958035329446126
4 0.9009742433027241
5 0.8893084700751422
6 0.8932049371017375
7 0.8813169278400329
8 0.8777438027627087
9 0.8870975162175979


In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
lr=LinearRegression()
pipe=make_pipeline(column_transform,lr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
scores.append(r2_score(y_test,y_pred))

In [26]:
print(np.argmax(scores))
print(scores[np.argmax(scores)])

4
0.9009742433027241


In [28]:
import pickle

In [29]:
pickle.dump(pipe,open('LinearRegressionModel.pkl','wb'))

In [30]:
pipe.predict(pd.DataFrame([["I10",2016,'Manual',23873,'Petrol',216,35.7,1.5]],
                          columns = ['model','year','transmission','total_driven','fuelType','tax(£)','mpg','engineSize']))

array([9850.72400733])