In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
mpl.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

In [2]:
car=pd.read_csv('/Users/hitaarthh/Documents/Amrita /Sem 5/ML Lab/Auto Value Pro/dataset/Cleaned_Car_data.csv')

### Extracting Training Data

In [3]:
X= car.drop('Price',axis=1)
y=car['Price']

In [4]:
X

Unnamed: 0,name,company,year,kms_driven,fuel_type,Transmission Type,No. of Previous Owners
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol,Manual,1
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel,Manual,2
2,Hyundai Grand i10,Hyundai,2014,28000,Petrol,Automatic,3
3,Ford EcoSport Titanium,Ford,2014,36000,Diesel,Manual,2
4,Ford Figo,Ford,2012,41000,Diesel,Automatic,2
...,...,...,...,...,...,...,...
810,Maruti Suzuki Ritz,Maruti,2011,50000,Petrol,Automatic,1
811,Tata Indica V2,Tata,2009,30000,Diesel,Manual,4
812,Toyota Corolla Altis,Toyota,2009,132000,Petrol,Automatic,4
813,Tata Zest XM,Tata,2018,27000,Diesel,Automatic,3


In [5]:
y.shape

(815,)

### Applying Train Test Split

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

#### Creating an OneHotEncoder object to contain all the possible categories

In [9]:
ohe=OneHotEncoder()
ohe.fit(X[['name','company','fuel_type','Transmission Type']])

#### Creating a column transformer to transform categorical columns

In [10]:
column_trans=make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type','Transmission Type']),
                                    remainder='passthrough')

#### Linear Regression Model

In [11]:
lr=LinearRegression()

#### Making a pipeline

In [12]:
pipe=make_pipeline(column_trans,lr)

#### Fitting the  model

In [13]:
pipe.fit(X_train,y_train)

In [14]:
y_pred=pipe.predict(X_test)

#### Checking R2 Score

In [15]:
r2_score(y_test,y_pred)

0.6376799710938499

#### Finding the model with a random state of TrainTestSplit where the model was found to give almost 0.92 as r2_score

In [16]:
scores=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(column_trans,lr)
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))

In [17]:
np.argmax(scores)

302

In [18]:
scores[np.argmax(scores)]

0.9006962659250415

In [19]:
pipe.predict(pd.DataFrame(columns=X_test.columns,data=np.array(['Maruti Suzuki Swift','Maruti',2019,100,'Petrol','Manual',1]).reshape(1,7)))

array([413773.01620775])

#### The best model is found at a certain random state 

In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=np.argmax(scores))
lr = LinearRegression()
pipe=make_pipeline(column_trans,lr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
lr_r2score= r2_score(y_test,y_pred)
lr_mse=mean_squared_error(y_test,y_pred)
lr_mae=mean_absolute_error(y_test,y_pred)
print("R2 score,mse,mae for best random state",lr_r2score,lr_mse,lr_mae)

R2 score,mse,mae for best random state 0.9006962659250415 19064844746.030865 97568.73645971752


In [21]:
import pickle

model_data = {'pipeline': pipe, 'r2score': lr_r2score, 'mse': lr_mse, 'mae': lr_mae}
with open('LinearRegression.pkl', 'wb') as file:
    pickle.dump(model_data, file)


In [22]:
pipe.predict(pd.DataFrame(columns=X_test.columns,data=np.array(['Maruti Suzuki Swift','Maruti',2019,100,'Petrol','Manual',1]).reshape(1,7)))

array([409848.94272572])