# Keras neural network for MNIST classification

https://github.com/shibuiwilliam/keras_gpyopt/blob/master/bopt_nn.ipynb


This experiment defines a 2-layer Dense neural network whose hyperparameters are tuned via Bayesian optimization after a warm start.

That is, its performance is already quite good; BayesOpt here further fine-tunes the model while:
- Monitoring loss and accuracy (on a held-out test set, evaluated batch-wise); a later step will try to implement *aleatoric* and *epistemic* uncertainties too
- For a choice of *activation function*, these two are calculated based on the network's dropout rates, hidden layer sizes, # epochs, batch size, and validation split
- For each activation function, the result of each Bayesian optimization iteration is saved as a row in a dataframe, and this is then examined in a regression setting for statistical significance of hyperparameters


In [0]:
!pip install GPy

In [0]:
!pip install GPyOpt

In [0]:

import GPy, GPyOpt
import numpy as np
import pandas as pds
import random
from keras.layers import Activation, Dropout, BatchNormalization, Dense
from keras.models import Sequential
from keras.datasets import mnist
from keras.metrics import categorical_crossentropy
from keras.utils import np_utils
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

time: 240 ms


In [0]:
# MNIST class
class MNIST():
    """Two-hidden-layer dense Keras classifier for MNIST with tunable hyperparameters.

    Constructing an instance loads and preprocesses the MNIST data and builds
    (but does not train) the compiled model.

    Parameters
    ----------
    first_input : int
        Number of input features per flattened image (784 for MNIST).
    last_output : int
        Number of output classes.
    l1_out, l2_out : int
        Units in the first and second hidden Dense layers.
    l1_drop, l2_drop : float
        Dropout rates applied after the first and second hidden layers.
    batch_size : int
        Batch size used for both fitting and evaluation.
    epochs : int
        Maximum number of training epochs (early stopping may end sooner).
    validation_split : float
        Fraction of the training data held out for validation during fitting.
    """

    def __init__(self, first_input=784, last_output=10,
                 l1_out=512,
                 l2_out=512,
                 l1_drop=0.2,
                 l2_drop=0.2,
                 batch_size=100,
                 epochs=10,
                 validation_split=0.1):
        self.__first_input = first_input
        self.__last_output = last_output
        self.l1_out = l1_out
        self.l2_out = l2_out
        self.l1_drop = l1_drop
        self.l2_drop = l2_drop
        self.batch_size = batch_size
        self.epochs = epochs
        self.validation_split = validation_split
        self.__x_train, self.__x_test, self.__y_train, self.__y_test = self.mnist_data()
        self.__model = self.mnist_model()

    # load mnist data from keras dataset
    def mnist_data(self):
        """Load MNIST from the Keras dataset and prepare it for a dense network.

        Images are flattened to ``first_input`` features, cast to float32 and
        divided by 255; labels are one-hot encoded over ``last_output`` classes.

        Returns
        -------
        tuple
            ``(X_train, X_test, Y_train, Y_test)``.

        Raises
        ------
        ValueError
            From ``reshape`` if an image cannot be flattened to exactly
            ``first_input`` features.
        """
        (X_train, y_train), (X_test, y_test) = mnist.load_data()

        # Take the sizes from the constructor arguments instead of hard-coding
        # 60000/10000/784/10, so the data always agrees with the layer sizes
        # used in mnist_model().
        X_train = X_train.reshape(X_train.shape[0], self.__first_input)
        X_test = X_test.reshape(X_test.shape[0], self.__first_input)

        X_train = X_train.astype('float32')
        X_test = X_test.astype('float32')
        X_train /= 255
        X_test /= 255

        Y_train = np_utils.to_categorical(y_train, self.__last_output)
        Y_test = np_utils.to_categorical(y_test, self.__last_output)
        return X_train, X_test, Y_train, Y_test

    # mnist model
    def mnist_model(self):
        """Build and compile the network.

        Architecture: Dense(l1_out) -> relu -> Dropout(l1_drop) ->
        Dense(l2_out) -> relu -> Dropout(l2_drop) -> Dense(last_output) -> softmax,
        compiled with categorical cross-entropy, a default ``Adam`` optimizer and
        the accuracy metric.
        """
        model = Sequential()
        model.add(Dense(self.l1_out, input_shape=(self.__first_input,)))
        model.add(Activation('relu'))
        model.add(Dropout(self.l1_drop))
        model.add(Dense(self.l2_out))
        model.add(Activation('relu'))
        model.add(Dropout(self.l2_drop))
        model.add(Dense(self.__last_output))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])

        return model

    # fit mnist model
    def mnist_fit(self):
        """Train the model on the training split, stopping early when possible.

        Uses ``EarlyStopping`` with ``patience=0``, so training can end well
        before ``epochs`` is reached.
        """
        early_stopping = EarlyStopping(patience=0, verbose=1)

        self.__model.fit(self.__x_train, self.__y_train,
                         batch_size=self.batch_size,
                         epochs=self.epochs,
                         verbose=0,
                         validation_split=self.validation_split,
                         callbacks=[early_stopping])

    # evaluate mnist model
    def mnist_evaluate(self):
        """Train the model, then score it on the test set.

        Returns
        -------
        The value of ``model.evaluate`` on the test data; with the metrics
        compiled above this is indexed as ``[loss, accuracy]``.
        """
        self.mnist_fit()

        evaluation = self.__model.evaluate(self.__x_test, self.__y_test, batch_size=self.batch_size, verbose=0)
        return evaluation

In [0]:
# function to run mnist class
def run_mnist(first_input=784, last_output=10,
              l1_out=512, l2_out=512,
              l1_drop=0.2, l2_drop=0.2,
              batch_size=100, epochs=10, validation_split=0.1):
    """Build an ``MNIST`` network with the given settings, train it and score it.

    Every argument is forwarded by keyword, unchanged, to ``MNIST``.

    Returns
    -------
    Whatever ``MNIST.mnist_evaluate()`` returns for the freshly built model.
    """
    settings = dict(
        first_input=first_input,
        last_output=last_output,
        l1_out=l1_out,
        l2_out=l2_out,
        l1_drop=l1_drop,
        l2_drop=l2_drop,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
    )
    return MNIST(**settings).mnist_evaluate()

# Bayesian optimization for the model

The search space below lists the hyperparameters of the 2-layer neural network that are tuned, together with their domains.

In [0]:
# Search space for the hyper-parameters of the mnist model.
# Entries are listed with the continuous variables first and the discrete ones after;
# the objective function reads the candidate's columns in this same order.
_dropout_domain = (0.0, 0.5)
_layer_size_domain = (64, 128, 256, 512, 1024)

bounds = [
    dict(name='l1_drop', type='continuous', domain=_dropout_domain),
    dict(name='l2_drop', type='continuous', domain=_dropout_domain),
    dict(name='l1_out', type='discrete', domain=_layer_size_domain),
    dict(name='l2_out', type='discrete', domain=_layer_size_domain),
    dict(name='batch_size', type='discrete', domain=(32, 64)),
    dict(name='epochs', type='discrete', domain=(5, 10, 20)),
]

time: 4.71 ms


Define the objective function to be minimized: it returns the test loss, and also records the accuracy and the hyperparameters of every evaluation.

In [0]:
# function to optimize mnist model
# Per-evaluation history; f() appends one entry to each list on every call,
# so index i across the lists describes the i-th evaluated candidate.
accuracy_list = []
loss_list = []
l1_dropout_list = []
l2_dropout_list = []
l1_out_list = []
l2_out_list = []
batch_size_list = []
epochs_list = []


def f(x):
    """Objective function: train and evaluate one MNIST model for candidate ``x``.

    Parameters
    ----------
    x : array-like of shape (1, 6)
        A single candidate whose columns are, in order: l1_drop, l2_drop,
        l1_out, l2_out, batch_size, epochs.

    Returns
    -------
    The loss reported by ``run_mnist`` (``evaluation[0]``), i.e. the quantity
    to be minimized.

    Raises
    ------
    ValueError
        If ``x`` holds more than one candidate row.

    Side effects
    ------------
    Prints the candidate and its scores, and appends the hyperparameters, loss
    and accuracy to the module-level history lists.
    """
    print(x)

    candidates = np.atleast_2d(x)
    if candidates.shape[0] != 1:
        raise ValueError(
            "f expects exactly one candidate row, got {0}".format(candidates.shape[0]))

    # Index the single row explicitly: calling float()/int() on a 1-element
    # array slice such as x[:, 0] is deprecated since NumPy 1.25.
    row = candidates[0]
    l1_drop = float(row[0])
    l2_drop = float(row[1])
    l1_out = int(row[2])
    l2_out = int(row[3])
    batch_size = int(row[4])
    epochs = int(row[5])

    evaluation = run_mnist(
        l1_drop=l1_drop,
        l2_drop=l2_drop,
        l1_out=l1_out,
        l2_out=l2_out,
        batch_size=batch_size,
        epochs=epochs)
    loss, accuracy = evaluation[0], evaluation[1]

    # Record the evaluation only after it succeeded so the lists stay aligned.
    accuracy_list.append(accuracy)
    loss_list.append(loss)
    l1_dropout_list.append(l1_drop)
    l2_dropout_list.append(l2_drop)
    l1_out_list.append(l1_out)
    l2_out_list.append(l2_out)
    batch_size_list.append(batch_size)
    epochs_list.append(epochs)

    print("LOSS:\t{0} \t ACCURACY:\t{1}".format(loss, accuracy))
    return loss

time: 11.4 ms


In [0]:
# optimizer
import warnings
warnings.filterwarnings('ignore')

# Choice of acquisition function. GPyOpt's default is expected improvement ('EI');
# the other options experimented with here are 'MPI' and 'LCB'.
#acquisition = 'EI'
acquisition = 'MPI'
#acquisition = 'LCB'

# Matern 5/2 is the default kernel, shown to work well on most tasks [Snoek et al., 2012].
# The kernel must span every hyperparameter in `bounds`: with input_dim=1, GPy's
# default active_dims restrict it to the first input column, so the surrogate
# would only model the dependence on l1_drop.
kernel = GPy.kern.Matern52(input_dim=len(bounds), variance=1.0, lengthscale=2.5)

# Start from empty histories so that re-running this cell does not mix the
# evaluations of several optimization runs in the result lists.
for _history in (accuracy_list, loss_list, l1_dropout_list, l2_dropout_list,
                 l1_out_list, l2_out_list, batch_size_list, epochs_list):
    _history.clear()

# By default the optimizer evaluates 5 starting points before the
# model-driven iterations begin.
opt_mnist = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds, kernel=kernel, acquisition_type=acquisition, acquisition_jitter=0.01)

# optimize mnist model
maxiter = 30
# With max_iter = x there will be x + 5 results, because of the 5 initial samples.
opt_mnist.run_optimization(max_iter=maxiter)

[[  0.21310759   0.21746308 128.         128.          32.
   20.        ]]
Epoch 00007: early stopping
LOSS:	0.08014777007705706 	 ACCURACY:	0.9779
[[3.19652002e-01 3.53879220e-01 5.12000000e+02 2.56000000e+02
  6.40000000e+01 1.00000000e+01]]
Epoch 00004: early stopping
LOSS:	0.07589345325692556 	 ACCURACY:	0.9778
[[4.20974561e-01 5.51086560e-02 1.02400000e+03 1.28000000e+02
  3.20000000e+01 1.00000000e+01]]
Epoch 00004: early stopping
LOSS:	0.07176869309709873 	 ACCURACY:	0.9784
[[  0.37322592   0.24677989  64.         128.          32.
    5.        ]]
Epoch 00005: early stopping
LOSS:	0.11593333527562208 	 ACCURACY:	0.9642
[[3.67357246e-01 3.95679621e-01 1.02400000e+03 6.40000000e+01
  3.20000000e+01 1.00000000e+01]]
Epoch 00006: early stopping
LOSS:	0.08741464113570838 	 ACCURACY:	0.9768
The set cost function is ignored! LCB acquisition does not make sense with cost.
[[5.11609524e-04 4.78040378e-01 1.28000000e+02 6.40000000e+01
  6.40000000e+01 2.00000000e+01]]
Epoch 00005: early

Time taken for Bayesian optimization:

Acquisition:

EI

MPI

LCB - 25 min 38 sec 

In [0]:
# Sanity check that enough iterations took place: one loss is recorded per
# objective evaluation, so the length here should be maxiter + 5
# (the optimization iterations plus the 5 initial samples).
print(len(loss_list)) 
# Best hyperparameters found by the optimizer.
print(opt_mnist.x_opt)

# Plot the optimizer's convergence diagnostics.
opt_mnist.plot_convergence()

In [0]:
# Put all results into a dataframe to be exported to .csv format.
import pandas as pd

# Built once: the frame has the same columns whichever acquisition function was
# used, one row per objective evaluation.
bayesOpt_performance = pd.DataFrame(
    {'Dropout in layer 1': l1_dropout_list,
     'Dropout in layer 2': l2_dropout_list,
     'Layer 1 size': l1_out_list,
     'Layer 2 size': l2_out_list,
     'Batch size': batch_size_list,
     'Epochs': epochs_list,
     'Loss': loss_list,
     'Accuracy': accuracy_list
     })

# Bind the acquisition-specific name that the CSV export cell looks up.
if acquisition == 'EI':
    bayesOpt_performance_EI = bayesOpt_performance
elif acquisition == 'MPI':
    bayesOpt_performance_MPI = bayesOpt_performance
elif acquisition == 'LCB':
    bayesOpt_performance_LCB = bayesOpt_performance

In [0]:
# Write the results of the selected acquisition function to its own CSV file.
if acquisition == 'EI':
    csv_path = r'BayesOpt_Hyperparameters_keras_MNIST_EI.csv'
    bayesOpt_performance_EI.to_csv(csv_path, header=True)
elif acquisition == 'MPI':
    csv_path = r'BayesOpt_Hyperparameters_keras_MNIST_MPI.csv'
    bayesOpt_performance_MPI.to_csv(csv_path, header=True)
elif acquisition == 'LCB':
    csv_path = r'BayesOpt_Hyperparameters_keras_MNIST_LCB.csv'
    bayesOpt_performance_LCB.to_csv(csv_path, header=True)

In [0]:

# print optimized mnist model
# One "<name>: <value>" line per entry of `bounds`, paired with the matching
# component of the best point, so the report follows the search space instead
# of a hand-numbered list of format placeholders.
param_lines = "\n".join(
    "\t{0}:\t{1}".format(spec["name"], value)
    for spec, value in zip(bounds, opt_mnist.x_opt)
)
print("\nOptimized Parameters:\n{0}\n".format(param_lines))
print("optimized loss: {0}".format(opt_mnist.fx_opt))


Optimized Parameters:
	l1_drop:	0.2791510127474299
	l2_drop:	0.41382603699229603
	l1_out:	256.0
	l2_out:	128.0
	batch_size:	32.0
	epochs:	10.0

optimized loss: 0.07014503737912454


Retrieve best hyperparameters:

In [0]:
# Best hyperparameter vector found by the optimizer; its components are in the
# same order as the entries of `bounds`.
opt_mnist.x_opt


array([4.16108600e-01, 1.97594437e-01, 1.02400000e+03, 6.40000000e+01,
       6.40000000e+01, 2.00000000e+01])

In [0]:
# NOTE(review): GPyOpt's plot_acquisition appears to support only 1-D and 2-D
# domains; with the 6-D search space used here it may draw nothing — TODO confirm.
opt_mnist.plot_acquisition()
# Convergence diagnostics (the same plot is also drawn earlier in the notebook).
opt_mnist.plot_convergence()

# Random search

Now we can see the performance of the random search model;
its performance should look like white noise.

This is coded from scratch, which is trivial; using a Keras wrapper with scikit-learn's `RandomizedSearchCV` proved more cumbersome.



In [0]:
!pip install ipython-autotime
%load_ext autotime

In [0]:
import random
maxiter = 35

# Hyperparameters tried and the scores they achieved, one entry per iteration.
l1_drop_values = []
l2_drop_values = []
l1_out_values = []
l2_out_values = []
batch_size_values = []
epochs_values = []
accuracy_list = []
loss_list = []

# Sample from the same domains the Bayesian optimization explores (`bounds`),
# so both methods are compared on one search space. Previously the dropout
# rates were drawn from (0, 0.3) while `bounds` allows (0.0, 0.5).
_search_space = {spec['name']: spec for spec in bounds}


def _draw(name):
    """Draw one random value for hyperparameter `name` from its domain in `bounds`."""
    spec = _search_space[name]
    if spec['type'] == 'continuous':
        low, high = spec['domain']
        return random.uniform(low, high)
    return random.choice(spec['domain'])


for _ in range(maxiter):
    l1_drop = _draw('l1_drop')
    l2_drop = _draw('l2_drop')
    l1_out = _draw('l1_out')
    l2_out = _draw('l2_out')
    batch_size = _draw('batch_size')
    epochs = _draw('epochs')

    l1_drop_values.append(l1_drop)
    l2_drop_values.append(l2_drop)
    l1_out_values.append(l1_out)
    l2_out_values.append(l2_out)
    batch_size_values.append(batch_size)
    epochs_values.append(epochs)

    # Train and evaluate one network with the sampled configuration.
    evaluation = run_mnist(
        l1_drop=l1_drop,
        l2_drop=l2_drop,
        l1_out=l1_out,
        l2_out=l2_out,
        batch_size=batch_size,
        epochs=epochs)
    accuracy_list.append(evaluation[1])
    loss_list.append(evaluation[0])
    print("LOSS:\t{0} \t ACCURACY:\t{1}".format(evaluation[0], evaluation[1]))

Epoch 00004: early stopping
LOSS:	0.08393999278205447 	 ACCURACY:	0.9763
Epoch 00002: early stopping
LOSS:	0.12972128381417133 	 ACCURACY:	0.9561
Epoch 00005: early stopping
LOSS:	0.08095801948263252 	 ACCURACY:	0.9767
Epoch 00005: early stopping
LOSS:	0.0861539315050235 	 ACCURACY:	0.9766
Epoch 00005: early stopping
LOSS:	0.07943686439549784 	 ACCURACY:	0.9772
Epoch 00005: early stopping
LOSS:	0.0665393693373073 	 ACCURACY:	0.9798
Epoch 00004: early stopping
LOSS:	0.09553429095949978 	 ACCURACY:	0.9713
Epoch 00004: early stopping
LOSS:	0.069875609615969 	 ACCURACY:	0.9784
Epoch 00004: early stopping
LOSS:	0.07917471268810332 	 ACCURACY:	0.9766
Epoch 00003: early stopping
LOSS:	0.08126694226264954 	 ACCURACY:	0.9759
LOSS:	0.08353986749184551 	 ACCURACY:	0.9738
Epoch 00004: early stopping
LOSS:	0.1141093005700037 	 ACCURACY:	0.9688
Epoch 00006: early stopping
LOSS:	0.10217970528267324 	 ACCURACY:	0.9695
Epoch 00005: early stopping
LOSS:	0.07490481636805925 	 ACCURACY:	0.9779
Epoch 00004

In [0]:
# Save the random-search results to a pandas dataframe, then to csv.
# The printed count is the number of recorded evaluations.
print(len(loss_list))
import pandas as pd

rs_columns = {
    'Dropout in layer 1': l1_drop_values,
    'Dropout in layer 2': l2_drop_values,
    'Layer 1 size': l1_out_values,
    'Layer 2 size': l2_out_values,
    'Batch size': batch_size_values,
    'Epochs': epochs_values,
    'Loss': loss_list,
    'Accuracy': accuracy_list,
}
rs_performance_keras_mnist = pd.DataFrame(rs_columns)

rs_csv_path = r'Random_Search_Hyperparameters_keras_MNIST.csv'
rs_performance_keras_mnist.to_csv(rs_csv_path, header=True)

# Random search here took 18 min 46 seconds

35
time: 15.4 ms
