In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# load the raw automobile records and preview the first rows
csv_path = 'Datasets/Automobile_data.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
# the file uses '?' as its missing-value placeholder; turn every '?' into NaN
dataset = dataset.replace(to_replace='?', value=np.nan)
dataset.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
# (rows, columns) of the dataset before any records are dropped
dataset.shape

(205, 26)

In [5]:
# number of missing (NaN) entries in each column
dataset.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [6]:
# drop all records with missing data; this modifies `dataset` in place
# (no separate dataframe is created), then show the remaining (rows, columns):
dataset.dropna(inplace=True)
dataset.shape

(159, 26)

In [8]:
# column labels of the dataset after dropping incomplete records
dataset.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [9]:
# dtype of each column; columns that held '?' placeholders (e.g. normalized-losses,
# bore, price) were read as text and therefore show up as object
dataset.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [10]:
# these columns hold numbers stored as text (dtype 'object'), so parse each
# one into a float column:
numeric_text_columns = ['normalized-losses', 'bore', 'stroke',
                        'horsepower', 'peak-rpm', 'price']

for column in numeric_text_columns:
    dataset[column] = pd.to_numeric(dataset[column], downcast="float")

In [11]:
# distinct manufacturers remaining in the dataset
pd.unique(dataset['make'])

array(['audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'jaguar', 'mazda',
       'mercedes-benz', 'mitsubishi', 'nissan', 'peugot', 'plymouth',
       'porsche', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'],
      dtype=object)

In [12]:
# door counts are spelled out as words; list the distinct spellings
pd.unique(dataset['num-of-doors'])

array(['four', 'two'], dtype=object)

In [13]:
# cylinder counts are spelled out as words; list the distinct spellings
pd.unique(dataset['num-of-cylinders'])

array(['four', 'five', 'six', 'three', 'eight'], dtype=object)

In [14]:
# now let's replace numerical data written in words with actual numbers:

word_to_number = {
    "num-of-doors": {"four": 4, "two": 2},
    "num-of-cylinders": {"three": 3, "four": 4, "five": 5, "six": 6, "eight": 8},
}
dataset = dataset.replace(word_to_number)
# cast explicitly instead of relying on replace() to infer an integer dtype
# for the former text columns (that implicit downcast is deprecated in recent
# pandas); the cast raises if a word is missing from the mapping above
dataset = dataset.astype({column: int for column in word_to_number})
dataset.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,1,158.0,audi,gas,std,4,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,1,158.0,audi,gas,turbo,4,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,bmw,gas,std,2,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


Let's look at all the columns with categorical data so we can decide how to encode them:

In [15]:
# distinct categories in 'fuel-type'
pd.unique(dataset['fuel-type'])

array(['gas', 'diesel'], dtype=object)

In [16]:
# distinct categories in 'aspiration'
pd.unique(dataset['aspiration'])

array(['std', 'turbo'], dtype=object)

In [17]:
# distinct categories in 'body-style'
pd.unique(dataset['body-style'])

array(['sedan', 'hatchback', 'wagon', 'hardtop', 'convertible'],
      dtype=object)

In [18]:
# distinct categories in 'drive-wheels'
pd.unique(dataset['drive-wheels'])

array(['fwd', '4wd', 'rwd'], dtype=object)

In [19]:
# distinct categories in 'engine-location'
pd.unique(dataset['engine-location'])

array(['front'], dtype=object)

In [20]:
# every remaining car has a front engine, so 'engine-location' is constant
# and carries no information for the models; remove the column:
dataset = dataset.drop(columns=['engine-location'])

In [21]:
# distinct categories in 'engine-type'
pd.unique(dataset['engine-type'])

array(['ohc', 'l', 'dohc', 'ohcv', 'ohcf'], dtype=object)

In [22]:
# distinct categories in 'fuel-system'
pd.unique(dataset['fuel-system'])

array(['mpfi', '2bbl', 'mfi', '1bbl', 'idi', 'spdi'], dtype=object)

In [23]:
# replace categorical data with dummy variables
# keys are the categorical source columns, values the prefix given to the
# dummy columns generated from each of them
dummy_prefixes = {
    'make': 'make',
    'fuel-type': 'fuel-type',
    'aspiration': 'aspiration',
    'body-style': 'body',
    'drive-wheels': 'drive',
    'engine-type': 'engine',
    'fuel-system': 'fuel-system',
}
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce True/False columns)
dataset = pd.get_dummies(dataset,
                         columns=list(dummy_prefixes),
                         prefix=dummy_prefixes,
                         dtype='uint8')
dataset.head()

Unnamed: 0,symboling,normalized-losses,num-of-doors,wheel-base,length,width,height,curb-weight,num-of-cylinders,engine-size,...,engine_l,engine_ohc,engine_ohcf,engine_ohcv,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi
3,2,164.0,4,99.8,176.6,66.2,54.3,2337,4,109,...,0,1,0,0,0,0,0,0,1,0
4,2,164.0,4,99.4,176.6,66.4,54.3,2824,5,136,...,0,1,0,0,0,0,0,0,1,0
6,1,158.0,4,105.8,192.7,71.4,55.7,2844,5,136,...,0,1,0,0,0,0,0,0,1,0
8,1,158.0,4,105.8,192.7,71.4,55.9,3086,5,131,...,0,1,0,0,0,0,0,0,1,0
10,2,192.0,2,101.2,176.8,64.8,54.3,2395,4,108,...,0,1,0,0,0,0,0,0,1,0


Now that we've prepared our dataset, we can move on to creating our ML models and choosing the best one.

In [24]:
# separate our predictor and result variables:
# X is every column except 'price'; it is forced to a float matrix so the
# estimators always receive a purely numeric array, even when the frame mixes
# integer, float and dummy (uint8/bool) columns
X = dataset.drop(columns='price').to_numpy(dtype=float)
# y is the target: the price column as a 1-D array
y = dataset['price'].to_numpy()

# Multiple Linear Regression model

In [25]:
# hold out 20% of the records as the Test set; random_state fixes the shuffle
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [26]:
# fit an ordinary least-squares linear model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
# predict Test set prices and print them next to the actual prices
# (left column: predicted, right column: actual)
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[21421.6  21105.  ]
 [ 9352.12 10595.  ]
 [16170.96 18150.  ]
 [29981.19 28248.  ]
 [ 6170.23  6338.  ]
 [ 7643.75  8358.  ]
 [ 6382.51  5499.  ]
 [18362.65 17669.  ]
 [ 6347.36  5195.  ]
 [16202.67 18150.  ]
 [10725.42  9989.  ]
 [10350.23  9988.  ]
 [ 5969.24  7099.  ]
 [10476.09  8449.  ]
 [ 7660.01  6849.  ]
 [ 7265.72  7395.  ]
 [29698.27 25552.  ]
 [ 8514.3   7995.  ]
 [ 9272.23  7957.  ]
 [ 8424.19  7895.  ]
 [ 6504.18  6649.  ]
 [ 7630.26  8499.  ]
 [ 7777.18  6229.  ]
 [ 9361.67  8495.  ]
 [ 7606.32  7349.  ]
 [ -768.5   5151.  ]
 [ 8153.63  7295.  ]
 [ 8484.02  8189.  ]
 [ 6352.52  7738.  ]
 [15843.96 12940.  ]
 [ 4507.33  6479.  ]
 [15111.4  22018.  ]]


In [28]:
# coefficient of determination (R^2) of the predictions on the Test set
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)

0.8815307673280642

# Polynomial Regression model

In [29]:
# hold out 20% of the records as the Test set; random_state fixes the shuffle
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [30]:
# expand the Training features into all polynomial terms up to degree 3,
# then fit a linear model on the expanded features
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit(X_train).transform(X_train)
regressor = LinearRegression().fit(X_poly, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
# apply the fitted polynomial expansion to the Test features, predict, and
# print predictions next to the actual prices (left: predicted, right: actual)
X_test_poly = poly_reg.transform(X_test)
y_pred = regressor.predict(X_test_poly)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[ 12075.48  21105.  ]
 [  8845.    10595.  ]
 [ 23389.76  18150.  ]
 [ 59471.71  28248.  ]
 [  9204.54   6338.  ]
 [  6739.19   8358.  ]
 [  3230.36   5499.  ]
 [ -8006.47  17669.  ]
 [ -5197.34   5195.  ]
 [-15304.99  18150.  ]
 [  8914.34   9989.  ]
 [ 10235.35   9988.  ]
 [ 49821.61   7099.  ]
 [  9732.24   8449.  ]
 [  5700.94   6849.  ]
 [  6536.46   7395.  ]
 [  9259.39  25552.  ]
 [ -2759.61   7995.  ]
 [ 10320.59   7957.  ]
 [  7163.24   7895.  ]
 [  5383.25   6649.  ]
 [  4611.46   8499.  ]
 [  2922.91   6229.  ]
 [ 10255.24   8495.  ]
 [  7214.     7349.  ]
 [-17724.07   5151.  ]
 [  1706.18   7295.  ]
 [  7049.98   8189.  ]
 [ 20803.49   7738.  ]
 [ 12314.3   12940.  ]
 [ 50508.97   6479.  ]
 [  4574.84  22018.  ]]


In [32]:
# coefficient of determination (R^2) of the predictions on the Test set
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)

-5.776398740426546

# Support Vector Regression (SVR) model

In [33]:
# y as a single-column 2-D array, the layout used for target scaling in the SVR cells
y1 = y.reshape(-1, 1)

In [34]:
# hold out 20% of the records as the Test set (target given as a column
# vector); random_state fixes the shuffle
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y1,
    test_size=0.2,
    random_state=0,
)

In [35]:
# feature scaling: standardise the Training features and the Training target
# with separate scalers (both are reused later when predicting)
from sklearn.preprocessing import StandardScaler
sc_X, sc_y = StandardScaler(), StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

In [36]:
# train our model on the Training set:
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
# the scaled y_train is a column vector (n_samples, 1) while SVR expects a
# 1-D target; flatten it so fit() no longer emits the column-vector warning
regressor.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [37]:
# predict Test set results:
# the model was trained on standardised data, so scale the Test features with
# the fitted feature scaler before predicting
y_pred_scaled = regressor.predict(sc_X.transform(X_test))
# predict() returns a 1-D array but the scaler's inverse_transform needs a 2-D
# (n_samples, 1) input; reshape to a column, map back to price units, then
# flatten to 1-D again
y_pred = sc_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
np.set_printoptions(precision=2)
# left column: predicted price, right column: actual price
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[19839.94 21105.  ]
 [ 9419.82 10595.  ]
 [16679.82 18150.  ]
 [22513.15 28248.  ]
 [ 5707.4   6338.  ]
 [ 7246.81  8358.  ]
 [ 7110.26  5499.  ]
 [15413.07 17669.  ]
 [ 7194.56  5195.  ]
 [16843.18 18150.  ]
 [11800.89  9989.  ]
 [10632.61  9988.  ]
 [ 9549.06  7099.  ]
 [10509.98  8449.  ]
 [ 7459.85  6849.  ]
 [ 7254.74  7395.  ]
 [24334.86 25552.  ]
 [ 8566.96  7995.  ]
 [10578.92  7957.  ]
 [ 8472.63  7895.  ]
 [ 7114.64  6649.  ]
 [ 7944.81  8499.  ]
 [ 6953.03  6229.  ]
 [ 8894.64  8495.  ]
 [ 7644.51  7349.  ]
 [11418.33  5151.  ]
 [ 8346.6   7295.  ]
 [ 7600.16  8189.  ]
 [ 6505.85  7738.  ]
 [16631.97 12940.  ]
 [ 8238.76  6479.  ]
 [14099.07 22018.  ]]


In [38]:
# coefficient of determination (R^2) of the predictions on the Test set
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)

0.8353689638903133

# Decision Tree Regression model

In [39]:
# rebuild the unscaled 80/20 Training/Test split; random_state fixes the shuffle
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [40]:
# fit a single decision tree on the Training set (seeded for repeatability)
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
regressor

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [41]:
# predict Test set prices and print them next to the actual prices
# (left column: predicted, right column: actual)
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[20970. 21105.]
 [ 8845. 10595.]
 [19699. 18150.]
 [32250. 28248.]
 [ 6938.  6338.]
 [ 7198.  8358.]
 [ 7299.  5499.]
 [11549. 17669.]
 [ 6095.  5195.]
 [13499. 18150.]
 [ 9639.  9989.]
 [11248.  9988.]
 [ 7799.  7099.]
 [ 9639.  8449.]
 [ 7499.  6849.]
 [ 6695.  7395.]
 [28176. 25552.]
 [ 7775.  7995.]
 [ 7957.  7957.]
 [ 9095.  7895.]
 [ 7299.  6649.]
 [ 6989.  8499.]
 [ 6229.  6229.]
 [ 7898.  8495.]
 [ 7999.  7349.]
 [ 6295.  5151.]
 [ 7295.  7295.]
 [ 6989.  8189.]
 [ 6575.  7738.]
 [13415. 12940.]
 [ 6295.  6479.]
 [ 8778. 22018.]]


In [42]:
# coefficient of determination (R^2) of the predictions on the Test set
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)

0.7668528654508213

# Random Forest Regression model

In [43]:
# hold out 20% of the records as the Test set; random_state fixes the shuffle
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [44]:
# fit a forest of 10 trees on the Training set (seeded for repeatability)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)
regressor

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [45]:
# predict Test set prices and print them next to the actual prices
# (left column: predicted, right column: actual)
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[18791.9  21105.  ]
 [ 9290.   10595.  ]
 [17806.4  18150.  ]
 [30428.3  28248.  ]
 [ 6353.05  6338.  ]
 [ 8202.    8358.  ]
 [ 7429.    5499.  ]
 [12657.2  17669.  ]
 [ 6242.7   5195.  ]
 [17676.3  18150.  ]
 [10663.6   9989.  ]
 [10648.1   9988.  ]
 [ 7356.5   7099.  ]
 [10663.6   8449.  ]
 [ 7448.6   6849.  ]
 [ 6781.2   7395.  ]
 [29336.1  25552.  ]
 [ 8019.8   7995.  ]
 [ 8415.5   7957.  ]
 [ 8675.5   7895.  ]
 [ 7429.    6649.  ]
 [ 8003.2   8499.  ]
 [ 6880.65  6229.  ]
 [ 8173.    8495.  ]
 [ 7708.6   7349.  ]
 [ 5747.2   5151.  ]
 [ 7307.    7295.  ]
 [ 8182.03  8189.  ]
 [ 6895.08  7738.  ]
 [15102.   12940.  ]
 [ 6445.3   6479.  ]
 [15858.7  22018.  ]]


In [46]:
# coefficient of determination (R^2) of the predictions on the Test set
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)

0.909969080480222

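# So, with the highest R² on this single 80/20 test split (≈0.91, vs ≈0.88 for Multiple Linear Regression), the Random Forest Regression model takes the win!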