In [247]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
data = pd.read_csv('data/cars.csv')
data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [248]:
data.drop(['car_ID', 'symboling', 'CarName'], axis=1, inplace=True)

In [249]:
data.isnull().sum()

fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [250]:
data.select_dtypes(object)

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
0,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,gas,std,four,sedan,4wd,front,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...
200,gas,std,four,sedan,rwd,front,ohc,four,mpfi
201,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi
202,gas,std,four,sedan,rwd,front,ohcv,six,mpfi
203,diesel,turbo,four,sedan,rwd,front,ohc,six,idi


In [251]:
data['fuelsystem'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'spfi', '4bbl', 'idi', 'spdi'],
      dtype=object)

In [252]:
enginetype_mapping = {'dohc':0, 'ohcv':1, 'ohc':2, 'l':3, 'rotor':4, 'ohcf':5, 'dohcv':6}
cylindernumber_mapping = {'four':4, 'six':6, 'five':5, 'three':3, 'twelve':12, 'two':2, 'eight':8}
fuelsystem_mapping = {'mpfi':0, '2bbl':1, 'mfi':2, '1bbl':3, 'spfi':4, '4bbl':5, 'idi':6, 'spdi':7}
########################
data['enginetype'] = data['enginetype'].map(enginetype_mapping)
data['cylindernumber'] = data['cylindernumber'].map(cylindernumber_mapping)
data['fuelsystem'] = data['fuelsystem'].map(fuelsystem_mapping)

In [253]:
#preparing data with pipelining method
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder

    
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

df = data.copy()
#data_label = df["price"].copy()
#df = df.drop("price", axis = 1)

cat_attrs = ["fueltype","aspiration","doornumber","carbody","drivewheel","enginelocation"]
df_num = df.drop(cat_attrs, axis = 1)
num_attrs = list(df_num)


num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attrs)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrs)),
    ('one_hot_encoder', OneHotEncoder(sparse=False)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

temp = full_pipeline.fit_transform(df)
data_prepared = pd.DataFrame(temp,columns=["wheelbase","carlength","carwidth","carheight","curbweight","enginetype","cylindernumber","enginesize",
        "fuelsystem","boreratio","stroke","compressionratio","horsepower","peakrpm","citympg","highwaympg","price",
        "fuel_gas","fuel_diesel","asp_std","asp_turbo","two_door","four_door","body_convertible","body_hatchback",
        "body_sedan","body_wagon","body_hardtop","drivewheel_rwd","drivewheel_fwd","drivewheel_4wd",
        "engine_front","engine_rear"])

data_prepared.head()


Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,...,body_convertible,body_hatchback,body_sedan,body_wagon,body_hardtop,drivewheel_rwd,drivewheel_fwd,drivewheel_4wd,engine_front,engine_rear
0,88.6,168.8,64.1,48.8,2548.0,0.0,4.0,130.0,0.0,3.47,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,88.6,168.8,64.1,48.8,2548.0,0.0,4.0,130.0,0.0,3.47,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,94.5,171.2,65.5,52.4,2823.0,1.0,6.0,152.0,0.0,2.68,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,99.8,176.6,66.2,54.3,2337.0,2.0,4.0,109.0,0.0,3.19,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,99.4,176.6,66.4,54.3,2824.0,2.0,5.0,136.0,0.0,3.19,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [254]:
# standard correlation coefficient
corr_matrix = data.corr()
corr_matrix["price"].sort_values(ascending=False)

price               1.000000
enginesize          0.874145
curbweight          0.835305
horsepower          0.808139
carwidth            0.759325
cylindernumber      0.718305
carlength           0.682920
wheelbase           0.577816
boreratio           0.553173
carheight           0.119336
stroke              0.079443
compressionratio    0.067984
peakrpm            -0.085267
enginetype         -0.092841
fuelsystem         -0.122118
citympg            -0.685751
highwaympg         -0.697599
Name: price, dtype: float64

In [255]:
x_train, x_test, y_train, y_test = train_test_split(data_prepared.drop('price',axis=1), 
                                                    data_prepared['price'], test_size=0.2, random_state=29)

In [256]:
#Linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
prediction = lin_reg.predict(x_test)
acc = r2_score(y_test, prediction)
acc

0.7776112332652065

In [257]:
#Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train,y_train)
prediction = tree_reg.predict(x_test)
acc = r2_score(y_test, prediction)
acc

0.8127492600767807

In [258]:
#Random forest Regression
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(x_train,y_train)
prediction = forest_reg.predict(x_test)
acc = r2_score(y_test,prediction)
acc

0.8910099459222078

In [259]:
#Random forest Regression 
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(max_features=2)
forest_reg.fit(x_train,y_train)
prediction = forest_reg.predict(x_test)
acc = r2_score(y_test,prediction)
acc

0.8912096355054437

In [260]:
#Random forest Regression 
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(max_features=8)
forest_reg.fit(x_train,y_train)
prediction = forest_reg.predict(x_test)
acc = r2_score(y_test,prediction)
acc

0.909001912738666

In [261]:
#XGB Regression
import xgboost as xgb
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(x_train, y_train)
prediction = xgb_reg.predict(x_test)
acc = r2_score(y_test,prediction)
acc

0.8656223492960714