In [167]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
data = pd.read_csv('data/cars.csv')
data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [168]:
data.drop(['car_ID', 'symboling', 'CarName'], axis=1, inplace=True)

In [169]:
data.isnull().sum()

fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [170]:
data.select_dtypes(object)

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
0,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,gas,std,four,sedan,4wd,front,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...
200,gas,std,four,sedan,rwd,front,ohc,four,mpfi
201,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi
202,gas,std,four,sedan,rwd,front,ohcv,six,mpfi
203,diesel,turbo,four,sedan,rwd,front,ohc,six,idi


In [171]:
data['fuelsystem'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'spfi', '4bbl', 'idi', 'spdi'],
      dtype=object)

In [172]:
fueltype_mapping = {'gas':0, 'diesel':1}
aspiration_mapping = {'std':0, 'turbo':1}
doornumber_mapping = {'two':0, 'four':1}
carbody_mapping = {'convertible':0, 'hatchback':1, 'sedan':2, 'wagon':3, 'hardtop':4}
drivewheel_mapping = {'rwd':0, 'fwd':1, '4wd':2}
enginelocation_mapping = {'front':0, 'rear':1}
enginetype_mapping = {'dohc':0, 'ohcv':1, 'ohc':2, 'l':3, 'rotor':4, 'ohcf':5, 'dohcv':6}
cylindernumber_mapping = {'four':4, 'six':6, 'five':5, 'three':3, 'twelve':12, 'two':2, 'eight':8}
fuelsystem_mapping = {'mpfi':0, '2bbl':1, 'mfi':2, '1bbl':3, 'spfi':4, '4bbl':5, 'idi':6, 'spdi':7}
########################
data['fueltype'] = data['fueltype'].map(fueltype_mapping)
data['aspiration'] = data['aspiration'].map(aspiration_mapping)
data['doornumber'] = data['doornumber'].map(doornumber_mapping)
data['carbody'] = data['carbody'].map(carbody_mapping)
data['drivewheel'] = data['drivewheel'].map(drivewheel_mapping)
data['enginelocation'] = data['enginelocation'].map(enginelocation_mapping)
data['enginetype'] = data['enginetype'].map(enginetype_mapping)
data['cylindernumber'] = data['cylindernumber'].map(cylindernumber_mapping)
data['fuelsystem'] = data['fuelsystem'].map(fuelsystem_mapping)
data

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,0,0,0,0,0,0,88.6,168.8,64.1,48.8,...,130,0,3.47,2.68,9.0,111,5000,21,27,13495.0
1,0,0,0,0,0,0,88.6,168.8,64.1,48.8,...,130,0,3.47,2.68,9.0,111,5000,21,27,16500.0
2,0,0,0,1,0,0,94.5,171.2,65.5,52.4,...,152,0,2.68,3.47,9.0,154,5000,19,26,16500.0
3,0,0,1,2,1,0,99.8,176.6,66.2,54.3,...,109,0,3.19,3.40,10.0,102,5500,24,30,13950.0
4,0,0,1,2,2,0,99.4,176.6,66.4,54.3,...,136,0,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0,0,1,2,0,0,109.1,188.8,68.9,55.5,...,141,0,3.78,3.15,9.5,114,5400,23,28,16845.0
201,0,1,1,2,0,0,109.1,188.8,68.8,55.5,...,141,0,3.78,3.15,8.7,160,5300,19,25,19045.0
202,0,0,1,2,0,0,109.1,188.8,68.9,55.5,...,173,0,3.58,2.87,8.8,134,5500,18,23,21485.0
203,1,1,1,2,0,0,109.1,188.8,68.9,55.5,...,145,6,3.01,3.40,23.0,106,4800,26,27,22470.0


In [173]:
# standard correlation coefficient
corr_matrix = data.corr()
corr_matrix["price"].sort_values(ascending=False)

price               1.000000
enginesize          0.874145
curbweight          0.835305
horsepower          0.808139
carwidth            0.759325
cylindernumber      0.718305
carlength           0.682920
wheelbase           0.577816
boreratio           0.553173
enginelocation      0.324973
aspiration          0.177926
carbody             0.161294
carheight           0.119336
fueltype            0.105679
stroke              0.079443
compressionratio    0.067984
doornumber          0.031835
peakrpm            -0.085267
enginetype         -0.092841
fuelsystem         -0.122118
drivewheel         -0.577992
citympg            -0.685751
highwaympg         -0.697599
Name: price, dtype: float64

In [174]:
x_train, x_test, y_train, y_test = train_test_split(data.drop('price',axis=1), 
                                                    data['price'], test_size=0.2, random_state=29)

In [175]:
#Linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
prediction = lin_reg.predict(x_test)
acc = r2_score(y_test, prediction)
acc

0.78926931813699

In [176]:
#Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train,y_train)
prediction = tree_reg.predict(x_test)
acc = r2_score(y_test, prediction)
acc

0.8468749520457713

In [177]:
#Random forest Regression
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(x_train,y_train)
prediction = forest_reg.predict(x_test)
acc = r2_score(y_test,prediction)
acc

0.8934464181948054

In [178]:
#Random forest Regression 
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(max_features=2)
forest_reg.fit(x_train,y_train)
prediction = forest_reg.predict(x_test)
acc = r2_score(y_test,prediction)
acc

0.9063774936556784

In [179]:
#Random forest Regression 
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(max_features=8)
forest_reg.fit(x_train,y_train)
prediction = forest_reg.predict(x_test)
acc = r2_score(y_test,prediction)
acc

0.8945765355960285