In [1]:
import pandas as pd
import numpy as np
import csv
import seaborn as sns
import os
import base64
from IPython.display import HTML
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
import pickle

In [2]:
car = pd.read_csv('car-data-set.csv')
car['model'] = ''
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7903 entries, 0 to 7902
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7903 non-null   object 
 1   year           7903 non-null   int64  
 2   selling_price  7903 non-null   int64  
 3   km_driven      7903 non-null   int64  
 4   fuel           7903 non-null   object 
 5   owner          7903 non-null   object 
 6   mileage        7903 non-null   float64
 7   engine         7903 non-null   int64  
 8   seats          7903 non-null   int64  
 9   model          7903 non-null   object 
dtypes: float64(1), int64(5), object(4)
memory usage: 617.5+ KB


In [3]:
car['model'] = car['name'].str.split(' ').str[0]
car.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,owner,mileage,engine,seats,model
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,First Owner,23.4,1248,5,Maruti
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Second Owner,21.14,1498,5,Skoda
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Third Owner,17.7,1497,5,Honda
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,First Owner,23.0,1396,5,Hyundai
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,First Owner,16.1,1298,5,Maruti


In [4]:
car = car.drop(['owner'], axis=1)
car.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,mileage,engine,seats,model
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,23.4,1248,5,Maruti
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,21.14,1498,5,Skoda
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,17.7,1497,5,Honda
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,23.0,1396,5,Hyundai
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,16.1,1298,5,Maruti


In [5]:
car = car[['name', 'model', 'year', 'selling_price', 'km_driven', 'fuel', 'mileage', 'engine', 'seats']]
car.head()

Unnamed: 0,name,model,year,selling_price,km_driven,fuel,mileage,engine,seats
0,Maruti Swift Dzire VDI,Maruti,2014,450000,145500,Diesel,23.4,1248,5
1,Skoda Rapid 1.5 TDI Ambition,Skoda,2014,370000,120000,Diesel,21.14,1498,5
2,Honda City 2017-2020 EXi,Honda,2006,158000,140000,Petrol,17.7,1497,5
3,Hyundai i20 Sportz Diesel,Hyundai,2010,225000,127000,Diesel,23.0,1396,5
4,Maruti Swift VXI BSIII,Maruti,2007,130000,120000,Petrol,16.1,1298,5


In [6]:
car['name'] = car['name'].str.split(' ').str.slice(0,3).str.join(' ')

In [7]:
car.head()

Unnamed: 0,name,model,year,selling_price,km_driven,fuel,mileage,engine,seats
0,Maruti Swift Dzire,Maruti,2014,450000,145500,Diesel,23.4,1248,5
1,Skoda Rapid 1.5,Skoda,2014,370000,120000,Diesel,21.14,1498,5
2,Honda City 2017-2020,Honda,2006,158000,140000,Petrol,17.7,1497,5
3,Hyundai i20 Sportz,Hyundai,2010,225000,127000,Diesel,23.0,1396,5
4,Maruti Swift VXI,Maruti,2007,130000,120000,Petrol,16.1,1298,5


In [8]:
car.shape

(7903, 9)

In [9]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7903 entries, 0 to 7902
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7903 non-null   object 
 1   model          7903 non-null   object 
 2   year           7903 non-null   int64  
 3   selling_price  7903 non-null   int64  
 4   km_driven      7903 non-null   int64  
 5   fuel           7903 non-null   object 
 6   mileage        7903 non-null   float64
 7   engine         7903 non-null   int64  
 8   seats          7903 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 555.8+ KB


In [11]:
def create_download_link( df, title = "Download CSV file", filename = "final-data-set.csv"):
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)
create_download_link(car)

In [12]:
X = car.drop(['selling_price'],axis=1)
Y = car['selling_price']

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2)

In [16]:
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','model','fuel']),remainder = 'passthrough')

In [17]:
linearRegressor = LinearRegression()
regressor = RandomForestRegressor()

In [18]:
pipeRf = make_pipeline(column_trans, regressor)
pipeLr = make_pipeline(column_trans, linearRegressor)

In [19]:
pipeRf.fit(X_train,Y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Ambassador CLASSIC 1500', 'Ambassador Classic 2000',
       'Ambassador Grand 1500', 'Ambassador Grand 2000',
       'Ashok Leyland Stile', 'Audi A3 35', 'Audi A3 40', 'Audi A4 1.8',
       'Audi A4 2.0', 'Audi A4 35', 'Audi A6 2.0', 'Audi A6 35',
       'Audi Q3 2.0', 'Au...
       'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
       'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
       'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Opel',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object)]),
                                             

In [20]:
pipeLr.fit(X_train,Y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Ambassador CLASSIC 1500', 'Ambassador Classic 2000',
       'Ambassador Grand 1500', 'Ambassador Grand 2000',
       'Ashok Leyland Stile', 'Audi A3 35', 'Audi A3 40', 'Audi A4 1.8',
       'Audi A4 2.0', 'Audi A4 35', 'Audi A6 2.0', 'Audi A6 35',
       'Audi Q3 2.0', 'Au...
       'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
       'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
       'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Opel',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object)]),
                                             

In [21]:
Y_predRf = pipeRf.predict(X_test)
Y_predLr = pipeLr.predict(X_test)

In [22]:
r2_score(Y_test,Y_predRf)

0.9557975680182506

In [23]:
r2_score(Y_test,Y_predLr)

0.9483541925358604

In [25]:
pickle.dump(pipeRf,open('RF-Model.pkl','wb'))
pickle.dump(pipeLr,open('LR-Model.pkl','wb'))

In [26]:
pipeRf.predict(pd.DataFrame(columns=['name','model','year','km_driven','fuel','mileage', 'engine', 'seats'],data=np.array(['Maruti Swift Dzire','Maruti',2014,145500,'Diesel',23.40,1248,5]).reshape(1,8)))

array([465629.84])

In [27]:
pipeLr.predict(pd.DataFrame(columns=['name','model','year','km_driven','fuel','mileage', 'engine', 'seats'],data=np.array(['Maruti Swift Dzire','Maruti',2014,145500,'Diesel',23.40,1248,5]).reshape(1,8)))

array([463041.09800106])