In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Auto imports dataset https://archive.ics.uci.edu/ml/datasets/automobile

Missing attribute values denoted by "?"

In [8]:
# Number of Attributes: 26 total
#    -- 15 continuous
#    -- 1 integer
#    -- 10 nominal

#  Attribute Information:     
#      Attribute:                Attribute Range:
#      ------------------        -----------------------------------------------
#   1. symboling:                -3, -2, -1, 0, 1, 2, 3.
#   2. normalized-losses:        continuous from 65 to 256.
#   3. make:                     alfa-romero, audi, bmw, chevrolet, dodge, honda,
#                                isuzu, jaguar, mazda, mercedes-benz, mercury,
#                                mitsubishi, nissan, peugot, plymouth, porsche,
#                                renault, saab, subaru, toyota, volkswagen, volvo
#   4. fuel-type:                diesel, gas.
#   5. aspiration:               std, turbo.
#   6. num-of-doors:             four, two.
#   7. body-style:               hardtop, wagon, sedan, hatchback, convertible.
#   8. drive-wheels:             4wd, fwd, rwd.
#   9. engine-location:          front, rear.
#  10. wheel-base:               continuous from 86.6 120.9.
#  11. length:                   continuous from 141.1 to 208.1.
#  12. width:                    continuous from 60.3 to 72.3.
#  13. height:                   continuous from 47.8 to 59.8.
#  14. curb-weight:              continuous from 1488 to 4066.
#  15. engine-type:              dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
#  16. num-of-cylinders:         eight, five, four, six, three, twelve, two.
#  17. engine-size:              continuous from 61 to 326.
#  18. fuel-system:              1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
#  19. bore:                     continuous from 2.54 to 3.94.
#  20. stroke:                   continuous from 2.07 to 4.17.
#  21. compression-ratio:        continuous from 7 to 23.
#  22. horsepower:               continuous from 48 to 288.
#  23. peak-rpm:                 continuous from 4150 to 6600.
#  24. city-mpg:                 continuous from 13 to 49.
#  25. highway-mpg:              continuous from 16 to 54.
#  26. price:                    continuous from 5118 to 45400.

# "Symboling" corresponds to the degree to which the auto is more risky than its price indicates.
# Cars are initially assigned a risk factor symbol associated with its price. Then, if it is more risky (or less), this symbol is
# adjusted by moving it up (or down) the scale. A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.

# "Normalized-losses" is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a
# particular size classification (two-door small, station wagons, sports/speciality, etc...), and represents the average loss per car
# per year.

#  Missing Attribute Values: (denoted by "?")
#    Attribute #:   Number of instances missing a value:
#    2.             41
#    6.             2
#    19.            4
#    20.            4
#    22.            2
#    23.            2
#    26.            4

### Read data and replace "?" with nan

In [9]:
df = pd.read_csv('automobile_data.csv')
df = df.replace('?', np.nan)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       203 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [11]:
df = df[df['price'].notnull()]

### Impute missing values

In [12]:
columns = ['normalized-losses', 'stroke', 'bore', 'horsepower', 'peak-rpm', ]
for i in columns:
    m = df[i].median()
    df[i] = df[i].replace(np.nan, m)

In [13]:
df[df['num-of-doors'].isnull()]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
27,1,148,dodge,gas,turbo,,sedan,fwd,front,93.7,...,98,mpfi,3.03,3.39,7.6,102,5500,24,30,8558
63,0,115,mazda,diesel,std,,sedan,fwd,front,98.8,...,122,idi,3.39,3.39,22.7,64,4650,36,42,10795


In [14]:
df.loc[27,:]

symboling                1
normalized-losses      148
make                 dodge
fuel-type              gas
aspiration           turbo
num-of-doors           NaN
body-style           sedan
drive-wheels           fwd
engine-location      front
wheel-base            93.7
length               157.3
width                 63.8
height                50.6
curb-weight           2191
engine-type            ohc
num-of-cylinders      four
engine-size             98
fuel-system           mpfi
bore                  3.03
stroke                3.39
compression-ratio      7.6
horsepower             102
peak-rpm              5500
city-mpg                24
highway-mpg             30
price                 8558
Name: 27, dtype: object

In [15]:
df.loc[63,:]

symboling                 0
normalized-losses       115
make                  mazda
fuel-type            diesel
aspiration              std
num-of-doors            NaN
body-style            sedan
drive-wheels            fwd
engine-location       front
wheel-base             98.8
length                177.8
width                  66.5
height                 55.5
curb-weight            2443
engine-type             ohc
num-of-cylinders       four
engine-size             122
fuel-system             idi
bore                   3.39
stroke                 3.39
compression-ratio      22.7
horsepower               64
peak-rpm               4650
city-mpg                 36
highway-mpg              42
price                 10795
Name: 63, dtype: object

In [16]:
df[df['body-style'] == 'sedan']['num-of-doors'].value_counts()

four    78
two     14
Name: num-of-doors, dtype: int64

In [17]:
df['num-of-doors'] = df['num-of-doors'].replace(np.nan, 'four')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  201 non-null    object 
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       201 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non

### Encode categorical features

In [19]:
label_columns = ['symboling','make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']
df_label_columns = df[label_columns]

In [20]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder(dtype = 'int32')
enc_label_columns = enc.fit_transform(df_label_columns).toarray()
enc.categories_

[array([-2, -1,  0,  1,  2,  3], dtype=int64),
 array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
        'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
        'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'renault',
        'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object),
 array(['diesel', 'gas'], dtype=object),
 array(['std', 'turbo'], dtype=object),
 array(['four', 'two'], dtype=object),
 array(['convertible', 'hardtop', 'hatchback', 'sedan', 'wagon'],
       dtype=object),
 array(['4wd', 'fwd', 'rwd'], dtype=object),
 array(['front', 'rear'], dtype=object),
 array(['dohc', 'l', 'ohc', 'ohcf', 'ohcv', 'rotor'], dtype=object),
 array(['eight', 'five', 'four', 'six', 'three', 'twelve', 'two'],
       dtype=object),
 array(['1bbl', '2bbl', '4bbl', 'idi', 'mfi', 'mpfi', 'spdi', 'spfi'],
       dtype=object)]

In [21]:
enc_label_columns.shape

(201, 65)

In [22]:
all_columns = df.columns
all_columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [23]:
columns = list(set(all_columns) - set(label_columns))
columns

['normalized-losses',
 'engine-size',
 'bore',
 'height',
 'peak-rpm',
 'city-mpg',
 'compression-ratio',
 'wheel-base',
 'width',
 'stroke',
 'curb-weight',
 'horsepower',
 'length',
 'highway-mpg',
 'price']

In [24]:
X = df[columns].drop('price', axis = 1).values
X = np.concatenate([X, enc_label_columns], axis=1)
X.shape

(201, 79)

In [25]:
Y = df['price'].values
Y.shape

(201,)

### Split data into train and test

In [26]:
Rand = 1

In [27]:
from sklearn.model_selection import train_test_split
# Split to train and test
# 75% and 25% by default
Xtrain, Xtest, ytrain, ytest = train_test_split(X, Y, random_state=Rand)
print(Xtrain.shape, Xtest.shape)

(150, 79) (51, 79)


### Build models

#### RandomForestRegressor

In [28]:
from sklearn.ensemble import RandomForestRegressor

In [29]:
rfr = RandomForestRegressor(random_state = Rand, n_jobs = -1)
rfr.fit(Xtrain, ytrain)
print('train R^2', rfr.score(Xtrain, ytrain))
print('train R^2', rfr.score(Xtest, ytest))

train R^2 0.9904406656184902
train R^2 0.90491695858199


#### GradientBoostingRegressor

In [30]:
from sklearn.ensemble import GradientBoostingRegressor

In [31]:
gbr = GradientBoostingRegressor(random_state = Rand)
gbr.fit(Xtrain, ytrain)
print('train R^2', gbr.score(Xtrain, ytrain))
print('train R^2', gbr.score(Xtest, ytest))

train R^2 0.994308754677728
train R^2 0.8951186111265917


In [32]:
# One can try GridSearchCV for RF or GB to get better result