In [0]:
'''
Author  : Rajendra singh
Roll no : 111601017
Date    : 28 Sept 2019
Topic   : Mnist data classification using feed forward network

Question : 

MNIST digit classification: (Use Tensorflow or Keras).

(a) MNIST is a database of hand written images. Download MNIST data using the built-in functions
    in Tensorflow or Keras
(b) Get the training, validation and test data sets using the functions in Tensorflow or Keras. If
    you are using Tensorflow, the dataset is already split into training set of size 55000, validation
    set of size 5000, and test set of size 10000. If you are using Keras, the data set is split into
    training set of size 60000, and validation set of size 10000. Then create a validation set of size
    5000 from the training set.
(c) Classify the dataset using a feed-forward neural network. Vary the hyperparameters as follows:
  i.  Create a fully connected feed forward neural network for MNIST classification with one
      hidden layer(32 nodes). Train the model using Stochastic Gradient Descent optimizer with
      learning rate 0.1. Use Sigmoid activation function in the hidden layer.
  ii. Normalize the dataset to range (0,1). Compare both the normalized and unnormalized
      models in terms of training time and accuracy.
  iii. Choose the best performing model among (i) and (ii). Train different models by varying
      the number of hidden layers in the model as 2 and 3. Record the observations. Other
      hyperparameters are same as in (i).
  iv. Choose the best performing model in (iii). Train models by varying the learning rates as
      0.001 and 0.0001 and record your observations.
  v.  Choose the best performing model in (iv). Train models by varying the number of nodes in
      each hidden layer to 64 and 128.
  vi. Choose the best performing model in (v). Train models by varying the activation functions
      in each of the hidden layers to tanh, relu and leaky relu and record your observations.
  vii. Among all the configurations of hyper-parameters that you trained above, which setting is
      best. How did you decide which setting is better?
  viii. Among all the models trained above, how will you choose the best model? Which is the
      best model?
  ix. (Optional) Report the training time and RAM usage for each training.
'''

In [0]:
#=======================Import all necessary libraries=====================
from __future__ import print_function

import keras
from keras.datasets import mnist
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, LeakyReLU, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from time import *

In [0]:
# ====================== define necessary variables ========================
batch_size = 128
num_classes = 10
epochs = 200
# input image dimensions
img_rows, img_cols = 28, 28

In [97]:
#========================== LOAD DATA ===============================
# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
#print shape of data before as per asked
print(y_train.shape, y_test.shape)

(60000,) (10000,)


In [98]:
#============================= SPLIT DATA ============================
x_test, x_valid, y_test, y_valid = train_test_split(x_test, y_test, test_size=0.5)
#print shape of data after split
print(y_train.shape, y_valid.shape, y_test.shape)

(60000,) (5000,) (5000,)


In [99]:
#========================= Correct data formating ============================
# i.e  (None, 28, 28) = > (None, 28, 28, 1)
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_valid = x_valid.reshape(x_valid.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_valid = x_valid.reshape(x_valid.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
print(x_train.shape)

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_valid = keras.utils.to_categorical(y_valid, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(x_train.shape)

(60000, 28, 28, 1)
(60000, 28, 28, 1)


In [100]:
#============================= Printing final data shape used with model ===========
print("train = ",x_train.shape,y_train.shape)
print("test = ",x_test.shape,y_test.shape)
print("valid = ",x_valid.shape,y_valid.shape)

train =  (60000, 28, 28, 1) (60000, 10)
test =  (5000, 28, 28, 1) (5000, 10)
valid =  (5000, 28, 28, 1) (5000, 10)


In [0]:
# **************************************************** QUESTION C.i ****************************************************

In [0]:
# ======================== Funtion to create model ==========================
def createModel(H, s, acti,lr): #here h is no. of hidden layer and size of it, acti is activation function
  model = Sequential()
  model.add(Flatten())
  for i in range(H):
    model.add(Dense(s, activation=acti,input_shape=input_shape))
    if acti == "leakyrelu":
      model.add(LeakyReLU(alpha=0.3))
    else:
      model.add(Activation(acti))
    
  model.add(Dense(num_classes, activation='softmax'))
  model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.SGD(lr=lr), metrics=['accuracy'])
  return model

# ===================== Funtion to train model ================
def trainM(model):
  es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5) #early stop if validation loss increases 
  mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
  tempt = time()
  model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_valid, y_valid), callbacks=[es, mc])
  model = load_model('best_model.h5')
  tempt = time() - tempt
  print("Training time(sec): ", tempt)
  print("\n------------------------------------------------------------\n")
  return model, tempt

# ===================== Funtion to evaluate model ================
def eval(model):
  score = model.evaluate(x_test, y_test, verbose=0) #evaluating on test set
  print('Test loss:', score[0])
  print('Test accuracy:', score[1])
  print("\n------------------------------------------------------------\n")
  return score[0], score[1]

# ===================== Funtion to print model ================
def printM(model):
  print(" Model summury ")
  model.summary()
  print("\n------------------------------------------------------------\n")

In [103]:
#================== Create model ================
model = createModel(1,32, "sigmoid",0.1)
#================== Train model ================
model,traingTime_ci = trainM(model)
#================== eval model ================
loss_ci, acc_ci = eval(model)
#================== Print model summary ================
printM(model)

Train on 60000 samples, validate on 5000 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.81940, saving model to best_model.h5
Epoch 2/200

Epoch 00002: val_acc improved from 0.81940 to 0.85200, saving model to best_model.h5
Epoch 3/200

Epoch 00003: val_acc improved from 0.85200 to 0.87520, saving model to best_model.h5
Epoch 4/200

Epoch 00004: val_acc improved from 0.87520 to 0.88240, saving model to best_model.h5
Epoch 5/200

Epoch 00005: val_acc improved from 0.88240 to 0.88820, saving model to best_model.h5
Epoch 6/200

Epoch 00007: val_acc improved from 0.88820 to 0.89700, saving model to best_model.h5
Epoch 8/200

Epoch 00008: val_acc improved from 0.89700 to 0.89760, saving model to best_model.h5
Epoch 9/200

Epoch 00009: val_acc improved from 0.89760 to 0.90040, saving model to best_model.h5
Epoch 10/200

Epoch 00010: val_acc did not improve from 0.90040
Epoch 11/200

Epoch 00011: val_acc did not improve from 0.90040
Epoch 12/200

Epoch 00012: val_acc improve

In [0]:
# **************************************************** QUESTION C.ii ****************************************************

In [0]:
#============================= Normalise data ==========================
x_train = x_train.astype('float32')
x_valid = x_valid.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_valid /= 255
x_test /= 255

In [106]:
#================== Create model ================
model = createModel(1,32,"sigmoid",0.1)
#================== Train model ================
model,traingTime_cii = trainM(model)
#================== eval model ================
loss_cii, acc_cii = eval(model)
#================== Print model summary ================
printM(model)
# ================== comparing normalized and unnormalized models ====================
print("Unormalised = [ loss = " + str(loss_ci) + ", acc = " + str(acc_ci) + ", Training time(sec) = " + str(traingTime_ci))
print("Normalised = [ loss = " + str(loss_cii) + ", acc = " + str(acc_cii) + ", Training time(sec) = " + str(traingTime_cii))

print("\n Clearly, Normalised model perform best here ")

Train on 60000 samples, validate on 5000 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.59620, saving model to best_model.h5
Epoch 2/200

Epoch 00002: val_acc improved from 0.59620 to 0.65920, saving model to best_model.h5
Epoch 3/200

Epoch 00003: val_acc improved from 0.65920 to 0.72200, saving model to best_model.h5
Epoch 4/200

Epoch 00004: val_acc improved from 0.72200 to 0.77780, saving model to best_model.h5
Epoch 5/200

Epoch 00005: val_acc improved from 0.77780 to 0.82860, saving model to best_model.h5
Epoch 6/200

Epoch 00006: val_acc improved from 0.82860 to 0.85100, saving model to best_model.h5
Epoch 7/200

Epoch 00007: val_acc improved from 0.85100 to 0.86380, saving model to best_model.h5
Epoch 8/200

Epoch 00008: val_acc improved from 0.86380 to 0.87720, saving model to best_model.h5
Epoch 9/200

Epoch 00009: val_acc improved from 0.87720 to 0.87900, saving model to best_model.h5
Epoch 10/200

Epoch 00010: val_acc improved from 0.87900 to 0.88680, sav

In [0]:
# **************************************************** QUESTION C.iii ****************************************************

In [108]:
#------------------------------------------- For 2 hidden layer ---------------------------------
#================== Create model ================
model = createModel(2,32,"sigmoid",0.1)
#================== Train model ================
model,traingTime_Ciii_2 = trainM(model)
#================== eval model ================
loss_Ciii_2, acc_Ciii_2 = eval(model)
#================== Print model summary ================
printM(model)

#------------------------------------------- For 3 hidden layer ---------------------------------
#================== Create model ================
model = createModel(3,32,"sigmoid",0.1)
#================== Train model ================
model,traingTime_Ciii_3 = trainM(model)
#================== eval model ================
loss_Ciii_3, acc_Ciii_3 = eval(model)
#================== Print model summary ================
printM(model)


# ================== comparing normalized and unnormalized models ====================
print("2 Hidden layer = [ loss = " + str(loss_Ciii_2) + ", acc = " + str(acc_Ciii_2) + ", Training time(sec) = " + str(traingTime_Ciii_2))
print("3 Hidden layer = [ loss = " + str(loss_Ciii_3) + ", acc = " + str(acc_Ciii_3) + ", Training time(sec) = " + str(traingTime_Ciii_3))

print("\n It seems here, 2 Hidden layer model perform better")

Train on 60000 samples, validate on 5000 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.11760, saving model to best_model.h5
Epoch 2/200

Epoch 00002: val_acc did not improve from 0.11760
Epoch 3/200

Epoch 00003: val_acc did not improve from 0.11760
Epoch 4/200

Epoch 00004: val_acc did not improve from 0.11760
Epoch 5/200

Epoch 00005: val_acc did not improve from 0.11760
Epoch 6/200

Epoch 00006: val_acc improved from 0.11760 to 0.12380, saving model to best_model.h5
Epoch 7/200

Epoch 00007: val_acc did not improve from 0.12380
Epoch 8/200

Epoch 00008: val_acc improved from 0.12380 to 0.16480, saving model to best_model.h5
Epoch 9/200

Epoch 00009: val_acc improved from 0.16480 to 0.28400, saving model to best_model.h5
Epoch 10/200

Epoch 00010: val_acc did not improve from 0.28400
Epoch 11/200

Epoch 00011: val_acc did not improve from 0.28400
Epoch 12/200

Epoch 00012: val_acc improved from 0.28400 to 0.37520, saving model to best_model.h5
Epoch 13/200

Epoch 

In [0]:
# **************************************************** QUESTION C.iv ****************************************************

In [110]:
# Since learning rate too small lets in the epoch as
epochs = 200
#------------------------------------------- For lr = 0.001 ---------------------------------
#================== Create model ================
model = createModel(2,32,"sigmoid",0.001)
#================== Train model ================
model,traingTime_CiV_001 = trainM(model)
#================== eval model ================
loss_CiV_001, acc_CiV_001 = eval(model)
#================== Print model summary ================
printM(model)

#------------------------------------------- For lr = 0.0001 ---------------------------------
#================== Create model ================
model = createModel(2,32,"sigmoid",0.0001)
#================== Train model ================
model,traingTime_CiV_0001 = trainM(model)
#================== eval model ================
loss_CiV_0001, acc_CiV_0001 = eval(model)
#================== Print model summary ================
printM(model)


# ================== comparing normalized and unnormalized models ====================
print("For lr(0.001) = [ loss = " + str(loss_CiV_001) + ", acc = " + str(acc_CiV_001) + ", Training time(sec) = " + str(traingTime_CiV_001))
print("For lr(0.0001) = [ loss = " + str(loss_CiV_0001) + ", acc = " + str(acc_CiV_0001) + ", Training time(sec) = " + str(traingTime_CiV_0001))
print("\n It seems here, model with lr=0.001 perform better on with 0.0001 but not better than model with lr=0.1, Hence procced with 0.001 as it asked to chose best out of 0.001 and 0.0001 in Question")

Train on 60000 samples, validate on 5000 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.09520, saving model to best_model.h5
Epoch 2/200

Epoch 00002: val_acc did not improve from 0.09520
Epoch 3/200

Epoch 00003: val_acc did not improve from 0.09520
Epoch 4/200

Epoch 00004: val_acc did not improve from 0.09520
Epoch 5/200

Epoch 00005: val_acc did not improve from 0.09520
Epoch 6/200

Epoch 00006: val_acc improved from 0.09520 to 0.11760, saving model to best_model.h5
Epoch 7/200

Epoch 00007: val_acc did not improve from 0.11760
Epoch 8/200

Epoch 00008: val_acc did not improve from 0.11760
Epoch 9/200

Epoch 00009: val_acc did not improve from 0.11760
Epoch 10/200

Epoch 00010: val_acc did not improve from 0.11760
Epoch 11/200

Epoch 00011: val_acc did not improve from 0.11760
Epoch 00011: early stopping
Training time(sec):  25.06432056427002

------------------------------------------------------------

Test loss: 2.3012048271179197
Test accuracy: 0.1094

------

In [0]:
# **************************************************** QUESTION C.v ****************************************************

In [116]:
# epochs = 200
#------------------------------------------- For 64 ---------------------------------
#================== Create model ================
model = createModel(2,64,"sigmoid",0.001)
#================== Train model ================
model,traingTime_CV_64 = trainM(model)
#================== eval model ================
loss_CV_64, acc_CV_64 = eval(model)
#================== Print model summary ================
printM(model)

#------------------------------------------- For 128 ---------------------------------
#================== Create model ================
model = createModel(2,128,"sigmoid",0.001)
#================== Train model ================
model,traingTime_CV_128 = trainM(model)
#================== eval model ================
loss_CV_128, acc_CV_128 = eval(model)
#================== Print model summary ================
printM(model)


# ================== comparing normalized and unnormalized models ====================
print("For 64 = [ loss = " + str(loss_CV_64) + ", acc = " + str(acc_CV_64) + ", Training time(sec) = " + str(traingTime_CV_64))
print("For 128 = [ loss = " + str(loss_CV_128) + ", acc = " + str(acc_CV_128) + ", Training time(sec) = " + str(traingTime_CV_128))
print("\n It seems here, model with 128 node in hidden layer perform better.Hence procced with 128 as it asked to chose best out of 64 and 128 in Question")

Train on 60000 samples, validate on 5000 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.10340, saving model to best_model.h5
Epoch 2/200

Epoch 00002: val_acc did not improve from 0.10340
Epoch 3/200

Epoch 00003: val_acc improved from 0.10340 to 0.11760, saving model to best_model.h5
Epoch 4/200

Epoch 00004: val_acc did not improve from 0.11760
Epoch 5/200

Epoch 00005: val_acc did not improve from 0.11760
Epoch 6/200

Epoch 00006: val_acc did not improve from 0.11760
Epoch 7/200

Epoch 00007: val_acc did not improve from 0.11760
Epoch 8/200

Epoch 00008: val_acc did not improve from 0.11760
Epoch 00008: early stopping
Training time(sec):  25.452443599700928

------------------------------------------------------------

Test loss: 2.3015950576782225
Test accuracy: 0.1094

------------------------------------------------------------

 Model summury 
Model: "sequential_39"
_________________________________________________________________
Layer (type)                 

In [0]:
# **************************************************** QUESTION C.vi ****************************************************

In [114]:
# epochs = 200
#------------------------------------------- For tanh ---------------------------------
#================== Create model ================
model = createModel(2,128,"tanh",0.001)
#================== Train model ================
model,traingTime_Cvi_tanh = trainM(model)
#================== eval model ================
loss_Cvi_tanh, acc_Cvi_tanh = eval(model)
#================== Print model summary ================
printM(model)


#------------------------------------------- For relu ---------------------------------
#================== Create model ================
model = createModel(2,128,"relu",0.001)
#================== Train model ================
model,traingTime_Cvi_relu = trainM(model)
#================== eval model ================
loss_Cvi_relu, acc_Cvi_relu = eval(model)
#================== Print model summary ================
printM(model)

''' It gave error although I import LeakyReLu and using it seperatly
#------------------------------------------- For leakyrelu ---------------------------------
#================== Create model ================
model = createModel(2,128,"leakyrelu",0.001)
#================== Train model ================
model,traingTime_Cvi_leakyrelu = trainM(model)
#================== eval model ================
loss_Cvi_leakyrelu, acc_Cvi_leakyrelu = eval(model)
#================== Print model summary ================
printM(model)
'''
# ================== comparing normalized and unnormalized models ====================
print("For tanh = [ loss = " + str(loss_Cvi_tanh) + ", acc = " + str(acc_Cvi_tanh) + ", Training time(sec) = " + str(traingTime_Cvi_tanh))
print("For relu = [ loss = " + str(loss_Cvi_relu) + ", acc = " + str(acc_Cvi_relu) + ", Training time(sec) = " + str(traingTime_Cvi_relu))
# print("For leakyrelu = [ loss = " + str(loss_Cvi_leakyrelu) + ", acc = " + str(acc_Cvi_leakyrelu) + ", Training time(sec) = " + str(traingTime_Cvi_leakyrelu))
print("\n It seems here, model with relu in hidden layer perform better.")

Train on 60000 samples, validate on 5000 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.59760, saving model to best_model.h5
Epoch 2/200

Epoch 00002: val_acc improved from 0.59760 to 0.73380, saving model to best_model.h5
Epoch 3/200

Epoch 00003: val_acc improved from 0.73380 to 0.77740, saving model to best_model.h5
Epoch 4/200

Epoch 00004: val_acc improved from 0.77740 to 0.80020, saving model to best_model.h5
Epoch 5/200

Epoch 00005: val_acc improved from 0.80020 to 0.81920, saving model to best_model.h5
Epoch 6/200

Epoch 00006: val_acc improved from 0.81920 to 0.83120, saving model to best_model.h5
Epoch 7/200

Epoch 00007: val_acc improved from 0.83120 to 0.84540, saving model to best_model.h5
Epoch 8/200

Epoch 00008: val_acc improved from 0.84540 to 0.85200, saving model to best_model.h5
Epoch 9/200

Epoch 00009: val_acc improved from 0.85200 to 0.85980, saving model to best_model.h5
Epoch 10/200

Epoch 00010: val_acc improved from 0.85980 to 0.86420, sav

In [0]:
# **************************************************** QUESTION C.vii ****************************************************

'''
Among all configurations of hyper-parameters for which we trained above.
We chose the best setting by comparing validation loss and validaation 
accuracy of configurations of hyper-parameters.
Also It will be good, we get best model in minimum possible training time possible.
'''

In [0]:
# **************************************************** QUESTION C.viii ****************************************************

'''
Among all the models trained above I found out the best model is
trained with
                    data = Normalised
                    Number of Hidden layer = 2
                    Activation function = relu
                    Number of node in hidden layer = 128
                    learning rate = 0.001

and

          [ loss = 0.2073979331254959, acc = 0.943, Training time(sec) = 324.31938457489014]


We took this model after comparing it with valid_loss, valid_acc and training time among all trained model above.

'''

# PLEASE NOTE ABOVE MODEL IS CHOSEN BY STRICTLY FOLLOWING THE STEPS AND RESTRICTIONS! GIVEN IN THE QUETION.
# THERE ARE VARIOUS OTHER WAYS BY WHICH I COULD HAVE TRAINED THE MODEL WITH EVEN BETTER PERFORMANCES IF THERE
# NO RESTRICTIONS !!!

In [0]:
# ================== FEW OF BETTER WAY ARE (WITHOUT RESTRICTIONS) ======================

In [117]:
#-------------This give test accuracy of 97.26 % -------------

#================== Create model ================
model = createModel(2,128,"relu",0.01)
#================== Train model ================
model,traingTime_Cvi_tanh = trainM(model)
#================== eval model ================
loss_Cvi_tanh, acc_Cvi_tanh = eval(model)
#================== Print model summary ================
printM(model)


Train on 60000 samples, validate on 5000 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.84880, saving model to best_model.h5
Epoch 2/200

Epoch 00002: val_acc improved from 0.84880 to 0.88880, saving model to best_model.h5
Epoch 3/200

Epoch 00003: val_acc improved from 0.88880 to 0.90560, saving model to best_model.h5
Epoch 4/200

Epoch 00004: val_acc improved from 0.90560 to 0.91360, saving model to best_model.h5
Epoch 5/200

Epoch 00005: val_acc improved from 0.91360 to 0.91980, saving model to best_model.h5
Epoch 6/200

Epoch 00006: val_acc improved from 0.91980 to 0.92520, saving model to best_model.h5
Epoch 7/200

Epoch 00007: val_acc improved from 0.92520 to 0.93060, saving model to best_model.h5
Epoch 8/200

Epoch 00008: val_acc improved from 0.93060 to 0.93340, saving model to best_model.h5
Epoch 9/200

Epoch 00009: val_acc improved from 0.93340 to 0.93540, saving model to best_model.h5
Epoch 10/200

Epoch 00010: val_acc improved from 0.93540 to 0.93740, sav

In [119]:
#----------- THIS give almost 99.12% test accuracy ------------
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),activation='relu', input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])

epochs = 12
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
#==========================================================

Train on 60000 samples, validate on 5000 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Test loss: 0.030511780695714696
Test accuracy: 0.9912
