## Marek Ochocki (marcopolo97@vp.pl) i Łukasz Gosek (lukaszjgosek@gmail.com)

# Zadanie 3

Reading data from DB

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

columns = ['symboling',  'normalized-losses',  'make',  'fuel-type',  'aspiration',  'num-of-doors',  'body-style',  'drive-wheels',  'engine-location',  'wheel-base',  'length',  'width',  'height',  'curb-weight',  'engine-type',  'num-of-cylinders',  'engine-size',  'fuel-system',  'bore',  'stroke',  'compression-ratio',  'horsepower',  'peak-rpm',  'city-mpg',  'highway-mpg',  'price']

# function converting words denoting numbers into actual numbers
text_2_num_conv = lambda x: 8 if x=='eight' else (5 if x=='five' else (4 if x=='four' else (6 if x=='six' else (3 if x=='three' else (12 if x=='twelve' else (2 if x=='two' else np.nan))))))

data = pd.read_csv("imports-85.data", header=None, names=columns, converters={'num-of-doors': text_2_num_conv, 'num-of-cylinders': text_2_num_conv}, na_values='?')

# removing rows with missing values
data.dropna(inplace=True)

data.reset_index(inplace=True, drop=True)

display(data)

# separating target from features
target = data['price']
features = data.drop(labels=['price'], axis=1)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,2,164.0,audi,gas,std,4.0,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,4,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
1,2,164.0,audi,gas,std,4.0,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,5,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
2,1,158.0,audi,gas,std,4.0,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,5,136,mpfi,3.19,3.40,8.5,110.0,5500.0,19,25,17710.0
3,1,158.0,audi,gas,turbo,4.0,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,5,131,mpfi,3.13,3.40,8.3,140.0,5500.0,17,20,23875.0
4,2,192.0,bmw,gas,std,2.0,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,4,108,mpfi,3.50,2.80,8.8,101.0,5800.0,23,29,16430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,-1,95.0,volvo,gas,std,4.0,sedan,rwd,front,109.1,188.8,68.9,55.5,2952,ohc,4,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
155,-1,95.0,volvo,gas,turbo,4.0,sedan,rwd,front,109.1,188.8,68.8,55.5,3049,ohc,4,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
156,-1,95.0,volvo,gas,std,4.0,sedan,rwd,front,109.1,188.8,68.9,55.5,3012,ohcv,6,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
157,-1,95.0,volvo,diesel,turbo,4.0,sedan,rwd,front,109.1,188.8,68.9,55.5,3217,ohc,6,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


One Hot encoding categorical variables

In [2]:
from sklearn.preprocessing import OneHotEncoder

columns_for_one_hot = ['make',  'fuel-type',  'aspiration', 'body-style',  'drive-wheels',  'engine-location',  'engine-type', 'fuel-system']

one_hot_encoder = OneHotEncoder(categories=[
    ['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury', 'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'renault', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'],
    ['diesel', 'gas'],
    ['std', 'turbo'],
    ['hardtop', 'wagon', 'sedan', 'hatchback', 'convertible'],
    ['4wd', 'fwd', 'rwd'],
    ['front', 'rear'],
    ['dohc', 'dohcv', 'l', 'ohc', 'ohcf', 'ohcv', 'rotor'],
    ['1bbl', '2bbl', '4bbl', 'idi', 'mfi', 'mpfi', 'spdi', 'spfi']
], sparse=False)

one_hot_encoder.fit(features[columns_for_one_hot])
one_hot_encoded = pd.DataFrame(one_hot_encoder.transform(features[columns_for_one_hot]))
one_hot_encoded

X = pd.concat([one_hot_encoded, features.drop(labels=columns_for_one_hot, axis=1)], axis=1).astype(float)



Visualising our feature matrix

In [3]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,symboling,normalized-losses,num-of-doors,wheel-base,length,width,height,curb-weight,num-of-cylinders,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,164.0,4.0,99.8,176.6,66.2,54.3,2337.0,4.0,109.0,3.19,3.40,10.0,102.0,5500.0,24.0,30.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,164.0,4.0,99.4,176.6,66.4,54.3,2824.0,5.0,136.0,3.19,3.40,8.0,115.0,5500.0,18.0,22.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,158.0,4.0,105.8,192.7,71.4,55.7,2844.0,5.0,136.0,3.19,3.40,8.5,110.0,5500.0,19.0,25.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,158.0,4.0,105.8,192.7,71.4,55.9,3086.0,5.0,131.0,3.13,3.40,8.3,140.0,5500.0,17.0,20.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,192.0,2.0,101.2,176.8,64.8,54.3,2395.0,4.0,108.0,3.50,2.80,8.8,101.0,5800.0,23.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,95.0,4.0,109.1,188.8,68.9,55.5,2952.0,4.0,141.0,3.78,3.15,9.5,114.0,5400.0,23.0,28.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,95.0,4.0,109.1,188.8,68.8,55.5,3049.0,4.0,141.0,3.78,3.15,8.7,160.0,5300.0,19.0,25.0
156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,95.0,4.0,109.1,188.8,68.9,55.5,3012.0,6.0,173.0,3.58,2.87,8.8,134.0,5500.0,18.0,23.0
157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,95.0,4.0,109.1,188.8,68.9,55.5,3217.0,6.0,145.0,3.01,3.40,23.0,106.0,4800.0,26.0,27.0


Splitting data into train and test sets

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

Data standarization

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Function creating a model - parameters allow to control number of layers, their size and learning rate

In [7]:
from keras import models
from keras import layers
from keras import regularizers
from keras import optimizers

def create_model(num_layers=1, layer_size=20, lr=0.005):
    network = models.Sequential()
    network.add(layers.Dense(layer_size,
                             activation='relu',
                             input_shape=(68,),
                             kernel_regularizer=regularizers.l2(0.001),
                             activity_regularizer=regularizers.l1(0.001)))
    for i in range(num_layers-1):
        network.add(layers.Dense(layer_size,
                                 activation='relu',
                                 kernel_regularizer=regularizers.l2(0.001),
                                 activity_regularizer=regularizers.l1(0.001)))
    network.add(layers.Dense(1))

    adam = optimizers.Adam(lr=lr)

    network.compile(optimizer=adam,
                    loss='mean_absolute_error',
                    metrics=['accuracy'])
    
    return network

Performing grid search over model hyperparameters, and learning parameters (number of epochs, batch size) - 10-fold CV instead of Leave One Out due to extremely long computation time for the latter

In [8]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


hyperparam_grid = dict(
    num_layers=[1, 2, 3],
    layer_size=[40, 70, 100],
    lr=[0.001, 0.005, 0.01],
    epochs=[100, 200, 300, 500],
    batch_size=[20]
)

model = KerasClassifier(build_fn=create_model, verbose=0)
grid = GridSearchCV(estimator=model, param_grid=hyperparam_grid, cv=10, n_jobs=-1, verbose=10, refit=False)
grid_result = grid.fit(X_train, y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   51.2s
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

Parameters of the best estimator

In [9]:
grid_result.best_params_

{'batch_size': 20,
 'epochs': 300,
 'layer_size': 70,
 'lr': 0.01,
 'num_layers': 1}

Creating a model with optimal parameters and training on full training set

In [12]:
model = create_model(num_layers=grid_result.best_params_['num_layers'], layer_size=grid_result.best_params_['layer_size'], lr=grid_result.best_params_['lr'])
model.fit(X_train, y_train, epochs=grid_result.best_params_['epochs'], batch_size=grid_result.best_params_['batch_size'], verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.callbacks.History at 0x2646335f708>

Predicting and calculating MAE for test set

In [13]:
from sklearn.metrics import mean_absolute_error

preds = model.predict(X_test)
mean_absolute_error(y_test, preds)

1686.5008087158203

Comment: MAE of ~1670 is similair to most of the results of estimators for this dataset found online