# Deep Learning Fundamentals - LU04 Lab Exercise

## 4.1 Import the required packages and load the data file into a pandas DataFrame

In [1]:
import pandas as pd
import numpy as np
from keras import models
from keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Read the Ames housing sales CSV (comma-separated, the read_csv default)
# from the working directory into a DataFrame.
data = pd.read_csv('Ames_Housing_Sales.csv')

## 4.2 Data Preparation
Extract the label column containing the SalePrice and remove it from the dataset

In [2]:
# Name of the column holding the regression target.
y_col_name = 'SalePrice'

# Labels: the target column on its own.
y_data = data[y_col_name]

# Features: every remaining column once the target has been dropped.
x_data = data.drop(columns=y_col_name)

Perform One-Hot Encoding on all categorical data columns

In [3]:
# One-hot encode the categorical features: take every object-dtype column,
# cast each one to the pandas 'category' dtype, then expand every category
# into its own indicator column.
categorical_data = pd.get_dummies(
    x_data.select_dtypes(include=['object']).astype('category')
)
print(categorical_data)

      Alley_Grvl  Alley_None  Alley_Pave  BldgType_1Fam  BldgType_2fmCon  \
0              0           1           0              1                0   
1              0           1           0              1                0   
2              0           1           0              1                0   
3              0           1           0              1                0   
4              0           1           0              1                0   
...          ...         ...         ...            ...              ...   
1354           1           0           0              1                0   
1355           0           1           0              1                0   
1356           0           1           0              1                0   
1357           0           1           0              1                0   
1358           0           1           0              1                0   

      BldgType_Duplex  BldgType_Twnhs  BldgType_TwnhsE  BsmtCond_Fa  \
0               

Normalize the data scales: standardize the numerical feature columns and min-max scale the labels

In [4]:
# Standardize the numerical feature columns with StandardScaler.
numeric_columns = x_data.select_dtypes(include=['float64', 'int64']).columns
std_scaler = StandardScaler()
data_tmp = std_scaler.fit_transform(x_data[numeric_columns].values)  # numpy array in, numpy array out
numerical_data = pd.DataFrame(data_tmp, columns=numeric_columns)

# Min-max scale the labels with MinMaxScaler.
# The scaler needs a 2-D array, so the label Series goes in as a one-column
# frame and the result is flattened back to 1-D afterwards.
y_scaler = MinMaxScaler()
y_tmp = y_scaler.fit_transform(y_data.to_frame().values).reshape(-1)
y_data = pd.Series(y_tmp)

In [5]:
# Put the scaled numerical columns and the one-hot encoded categorical
# columns side by side to form the final feature matrix.
x_data = pd.concat([numerical_data, categorical_data], axis='columns')

# Column labels of the combined features, and how many there are
# (the count is used as the network's input width).
x_col_name = x_data.columns
x_col_count = x_data.shape[1]

## 4.3 Train, Validation, Test Dataset Split

### 4.3.1 Method 1

In [6]:
# Split data into train-test sets (30% held out for testing).
# random_state pins the shuffle so the same rows land in each set on every
# run; weight initialisation and batch shuffling during fit are still random.
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=40)
print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_test shape: {}'.format(y_test.shape))

network = models.Sequential()

# Simplest alternative: 1 input layer of x_col_count nodes and 1 output layer of 1 node
#network.add(layers.Dense(1, activation='sigmoid', input_shape=(x_col_count,)))

# >>>>>>>>> sample of a network with multiple layers >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# 1 input layer of x_col_count nodes
# 2 hidden layers of 5 and 3 nodes respectively
# 1 output layer of 1 node for the predicted (scaled) sale price
network.add(layers.Dense(5, activation='relu', input_shape=(x_col_count,)))
network.add(layers.Dense(3, activation='relu'))
network.add(layers.Dense(1, activation='sigmoid'))
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Observe the loss function used below. The 'mse' metric is the same quantity
# as the loss, so the two values printed at the end are expected to match.
network.compile(optimizer='sgd', loss='mean_squared_error', metrics=['mse'])

# Observe the setting of shuffle to True.
# validation_split=0.2 makes Keras set aside the last 20% of the training
# rows for validation; no separate validation set is built here.
# Play around with epochs and batch_size to see the effect
# (Try to achieve <0.001 loss)
network.fit(X_train, y_train, epochs=20, batch_size=8, validation_split=0.2, shuffle=True)

# Final score on the held-out test set: evaluate returns [loss, metric].
test_loss, test_error = network.evaluate(X_test, y_test)
print('Test loss: {:.4f}'.format(test_loss))
print('Test error: {:.4f}'.format(test_error))

X_train shape: (951, 294)
y_train shape: (951,)
X_test shape: (408, 294)
y_test shape: (408,)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.0039
Test error: 0.0039


### 4.3.2 Method 2

In [7]:
# Split data into train / validation / test sets (60% / 20% / 20%).
# Step 1 keeps 60% for training and sets 40% aside; step 2 cuts that 40% in
# half. random_state pins both shuffles so the same rows land in each set on
# every run; weight initialisation and batch shuffling are still random.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    x_data, y_data, test_size=0.4, random_state=40)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=40)
print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_val shape: {}'.format(X_val.shape))
print('y_val shape: {}'.format(y_val.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_test shape: {}'.format(y_test.shape))

network = models.Sequential()

# Simplest alternative: 1 input layer of x_col_count nodes and 1 output layer of 1 node
#network.add(layers.Dense(1, activation='sigmoid', input_shape=(x_col_count,)))

# >>>>>>>>> sample of a network with multiple layers >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# 1 input layer of x_col_count nodes
# 2 hidden layers of 5 and 3 nodes respectively
# 1 output layer of 1 node for the predicted (scaled) sale price
network.add(layers.Dense(5, activation='relu', input_shape=(x_col_count,)))
network.add(layers.Dense(3, activation='relu'))
network.add(layers.Dense(1, activation='sigmoid'))
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Observe the loss function used below; mean absolute error is tracked as an
# additional metric.
network.compile(optimizer='sgd', loss='mean_squared_error', metrics=['mae'])

# Observe the setting of shuffle to True.
# Unlike Method 1, validation uses the explicit (X_val, y_val) set.
# Play around with epochs and batch_size to see the effect
# (Try to achieve <0.001 loss)
history = network.fit(X_train, y_train, epochs=20, batch_size=8, validation_data=(X_val, y_val), shuffle=True)

# Final score on the held-out test set: evaluate returns [loss, metric].
test_loss, test_mae = network.evaluate(X_test, y_test)
print('Test loss: {:.4f}'.format(test_loss))
print('Test mae: {:.4f}'.format(test_mae))

X_train shape: (815, 294)
y_train shape: (815,)
X_val shape: (272, 294)
y_val shape: (272,)
X_test shape: (272, 294)
y_test shape: (272,)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.0032
Test mae: 0.0403


### 4.3.3 K-fold cross validation using scikit-learn

The following is a simple k-fold implementation. All of the data preparation above is still required before running this cell.

In [8]:
# Network used by the k-fold cell, built in one go from a list of layers:
#   input of x_col_count features
#   -> hidden layer of 5 nodes (relu)
#   -> hidden layer of 3 nodes (relu)
#   -> single sigmoid output node for the predicted (scaled) sale price
network2 = models.Sequential([
    layers.Dense(5, activation='relu', input_shape=(x_col_count,)),
    layers.Dense(3, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Train with SGD on mean squared error; report mean absolute error as a metric.
network2.compile(optimizer='sgd', loss='mean_squared_error', metrics=['mae'])

In [9]:
from numpy import array

# implement k-fold using scikit learn library. you can refer to the link below on the api
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
from sklearn.model_selection import KFold

# Prepare a 5-fold cross validation splitter that shuffles the rows first.
# random_state fixes the shuffle so the folds are the same on every run.
# You can modify these settings to see how the printed index sets change.
kf = KFold(n_splits=5, random_state=40, shuffle=True)

# NOTE(review): network2 is built once, outside this loop, and is never
# re-initialised. From the second fold onwards it has already been fitted on
# rows that are now in the test fold, so the per-fold errors are optimistic.
# Rebuild and recompile the model inside the loop for an unbiased estimate.
for i, (train_index, test_index) in enumerate(kf.split(x_data)):
    # kf.split yields positional row indices for each fold.
    firstTrain = train_index[0]
    totalTrainRec = len(train_index)
    firstTest = test_index[0]
    totalTestRec = len(test_index)

    print('********************************** Running fold '+ str(i))
    print('=====Training set=======')
    print('Train set from ' + str(firstTrain) + ' with total of  ' + str(totalTrainRec))
    print(train_index)

    print('=====Testing set=======')
    print('Test set from ' + str(firstTest) + ' with total of  ' + str(totalTestRec))
    print(test_index)

    # Positional lookup (.iloc) matches the positional indices from kf.split.
    K_train, K_label = x_data.iloc[train_index], y_data.iloc[train_index]
    K_test, Ktest_label = x_data.iloc[test_index], y_data.iloc[test_index]

    network2.fit(K_train, K_label, epochs=5, batch_size=8)

    # Score the model that was just trained on this fold. The original code
    # evaluated `network`, a different model from an earlier cell, so the
    # printed error did not describe network2 at all.
    test_loss, test_mae = network2.evaluate(K_test, Ktest_label)
    print('Test error: {}'.format(test_mae))

********************************** Running fold 0
Train set from 2 with total of  1087
[   2    3    4 ... 1354 1355 1357]
Test set from 0 with total of  272
[   0    1   14   26   28   30   34   35   42   49   50   56   62   71
   79   81   82   84   85   86   90   93   95  110  111  132  136  137
  141  143  144  152  154  163  168  169  181  188  192  198  203  204
  213  215  217  222  223  236  237  240  246  254  256  262  270  271
  275  277  279  283  284  287  304  307  316  323  325  337  345  360
  365  369  376  379  380  383  386  390  396  404  407  408  410  415
  417  423  426  435  445  449  450  464  466  467  468  471  481  482
  485  486  488  498  503  504  512  520  526  534  547  555  560  563
  564  569  571  581  591  607  609  619  625  628  629  631  635  637
  651  653  659  661  663  668  685  689  693  700  703  704  715  722
  729  730  735  739  741  743  746  747  750  767  776  786  796  807
  808  810  814  816  818  819  821  823  824  827  828  830 