In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split

from utilities import *

DATA = Path('.').resolve().parent / 'data'

Question 1 (10 points)
Fit a neural network model to the data. Use the single hidden layer with 2 nodes. Use predictors Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.  Remember to first scale the numerical predictor and outcome variables to a 0–1 scale and convert categorical predictors to dummies. Record the RMS error for the training data and the validation data.

In [2]:
car_df = pd.read_csv(DATA/'ToyotaCorolla.csv', encoding='utf-8')
car_df = pd.get_dummies(car_df, columns=['Fuel_Type'], drop_first=True)
car_df.columns
car_df.drop(columns = ['Id', 'Model', 'Mfg_Month', 'Mfg_Year', 'Met_Color', 'Color', 'CC', 
                       'Cylinders', 'Gears',  'Weight', 'BOVAG_Guarantee', 'ABS', 'Airbag_1', 'Airbag_2', 
                       'Boardcomputer', 'Central_Lock',  'Power_Steering', 'Radio', 'Mistlamps',  'Backseat_Divider', 
                       'Metallic_Rim', 'Radio_cassette', 'Parking_Assistant'], inplace = True)
columns = list(car_df.columns)
columns.remove('Price')
columns.append('Price')
car_df = car_df[columns]
car_df.head()

Unnamed: 0,Age_08_04,KM,HP,Automatic,Doors,Quarterly_Tax,Mfr_Guarantee,Guarantee_Period,Airco,Automatic_airco,CD_Player,Powered_Windows,Sport_Model,Tow_Bar,Fuel_Type_Diesel,Fuel_Type_Petrol,Price
0,23,46986,90,0,3,210,0,3,0,0,0,1,0,0,1,0,13500
1,23,72937,90,0,3,210,0,3,1,0,1,0,0,0,1,0,13750
2,24,41711,90,0,3,210,1,3,0,0,0,0,0,0,1,0,13950
3,26,48000,90,0,3,210,1,3,0,0,0,0,0,0,1,0,14950
4,30,38500,90,0,3,210,1,3,1,0,0,1,0,0,1,0,13750


In [3]:
scaleInput = MinMaxScaler(feature_range = (0,1))
X = scaleInput.fit_transform(car_df.drop(columns=['Price']))
scaleOutput = MinMaxScaler(feature_range = (0,1))
y = scaleOutput.fit_transform(car_df.iloc[:,[-1]])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=42)

In [4]:
car_nnet = MLPRegressor(hidden_layer_sizes=(2), activation='logistic', solver='lbfgs', random_state=1)
car_nnet.fit(X_train, y_train.ravel())
validPrediction = car_nnet.predict(np.array(X_valid))
trainPrediction = car_nnet.predict(X_train)
y_train_actual = scaleOutput.inverse_transform(y_train).ravel() 
y_train_pred = scaleOutput.inverse_transform([trainPrediction]).ravel()
y_valid_actual = scaleOutput.inverse_transform(y_valid).ravel() 
y_valid_pred = scaleOutput.inverse_transform([validPrediction]).ravel()

In [5]:
regressionSummary(y_train_actual, y_train_pred)


Regression statistics

                      Mean Error (ME) : -0.7023
       Root Mean Squared Error (RMSE) : 1133.0424
            Mean Absolute Error (MAE) : 846.2332
          Mean Percentage Error (MPE) : -1.1733
Mean Absolute Percentage Error (MAPE) : 8.4467


In [6]:
regressionSummary(y_valid_actual, y_valid_pred)


Regression statistics

                      Mean Error (ME) : -76.4120
       Root Mean Squared Error (RMSE) : 1110.5459
            Mean Absolute Error (MAE) : 820.3399
          Mean Percentage Error (MPE) : -1.8852
Mean Absolute Percentage Error (MAPE) : 8.0887


Question 2 (10 points)
Repeat the process, changing the number of hidden layers and nodes to {single layer with 5 nodes}, {two layers, 5 nodes in each layer}.
What happens to the RMS error for the training data as the number of layers and nodes increases?
What happens to the RMS error for the validation data?
Comment on the appropriate number of layers and nodes for this application

In [7]:
car_nnet = MLPRegressor(hidden_layer_sizes=(5), activation='logistic', solver='lbfgs', random_state=1)
car_nnet.fit(X_train, y_train.ravel())
validPrediction = car_nnet.predict(np.array(X_valid))
trainPrediction = car_nnet.predict(X_train)
y_train_actual = scaleOutput.inverse_transform(y_train).ravel() 
y_train_pred = scaleOutput.inverse_transform([trainPrediction]).ravel()
y_valid_actual = scaleOutput.inverse_transform(y_valid).ravel() 
y_valid_pred = scaleOutput.inverse_transform([validPrediction]).ravel()

In [8]:
regressionSummary(y_train_actual, y_train_pred)


Regression statistics

                      Mean Error (ME) : 2.3728
       Root Mean Squared Error (RMSE) : 1119.1412
            Mean Absolute Error (MAE) : 844.9400
          Mean Percentage Error (MPE) : -1.1184
Mean Absolute Percentage Error (MAPE) : 8.4337


In [9]:
regressionSummary(y_valid_actual, y_valid_pred)


Regression statistics

                      Mean Error (ME) : -69.6306
       Root Mean Squared Error (RMSE) : 1120.0035
            Mean Absolute Error (MAE) : 824.5876
          Mean Percentage Error (MPE) : -1.8070
Mean Absolute Percentage Error (MAPE) : 8.1049


In [10]:
car_nnet = MLPRegressor(hidden_layer_sizes=(5,5,), activation='logistic', solver='lbfgs', random_state=1)
car_nnet.fit(X_train, y_train.ravel())
validPrediction = car_nnet.predict(np.array(X_valid))
trainPrediction = car_nnet.predict(X_train)
y_train_actual = scaleOutput.inverse_transform(y_train).ravel() 
y_train_pred = scaleOutput.inverse_transform([trainPrediction]).ravel()
y_valid_actual = scaleOutput.inverse_transform(y_valid).ravel() 
y_valid_pred = scaleOutput.inverse_transform([validPrediction]).ravel()

In [11]:
regressionSummary(y_train_actual, y_train_pred)


Regression statistics

                      Mean Error (ME) : -0.3361
       Root Mean Squared Error (RMSE) : 1118.8443
            Mean Absolute Error (MAE) : 831.2972
          Mean Percentage Error (MPE) : -1.2113
Mean Absolute Percentage Error (MAPE) : 8.2754


In [12]:
regressionSummary(y_valid_actual, y_valid_pred)


Regression statistics

                      Mean Error (ME) : -81.3772
       Root Mean Squared Error (RMSE) : 1095.3985
            Mean Absolute Error (MAE) : 821.1226
          Mean Percentage Error (MPE) : -1.8895
Mean Absolute Percentage Error (MAPE) : 8.1330


The RMS error for the training data decreases as the number of layers and nodes increases.
the RMS error for the validation data increases when the nodes change from 2 to 5 and not much change if increase layers to 2.
So Single layer is good for this dataset, also there is no much use to increase nodes number.