# **Step 1: Import required libraries**

In [1]:
import keras
import numpy as np
from keras.preprocessing import image
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Lambda, Flatten
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D, BatchNormalization
from keras.optimizers import Adam

Using TensorFlow backend.


# **Step 2: Define all helper functions**

**normalize** 

-> this method normalizes the entire dataset 

-> i.e. subtract mean value of the distribution from each data point and divide it by standard deviation of the distribution

-> this rescales the dataset to zero mean and unit standard deviation (the values end up centred around 0, not confined to the range 0 to 1), which keeps the inputs small and helps training stay numerically stable.

In [0]:
def normalize(x):
    """Standardize ``x`` with the mean and standard deviation of the training set.

    The statistics are read from the module-level ``x_train`` array, so the
    data-loading cell must have run before this function (or a model that
    wraps it in a ``Lambda`` layer) is called.
    """
    # The statistics always come from the training data, whatever ``x`` is.
    train_mean = x_train.mean().astype(np.float32)
    train_std = x_train.std().astype(np.float32)
    centered = x - train_mean
    return centered / train_std

**onehot** 

-> this method performs “binarization” of the category labels, so that each class becomes its own 0/1 feature used to train the model

-> Suppose you have ‘flower’ feature which can take values ‘daffodil’, ‘lily’, and ‘rose’. One hot encoding converts ‘flower’ feature to three features, ‘is_daffodil’, ‘is_lily’, and ‘is_rose’ which all are binary. 

![One-hot Encoding](https://cdn-images-1.medium.com/max/1200/1*Ac4z1rWWuU0TzxJRUM62WA.jpeg)
*Courtesy - hackernoon.com*

In [0]:
def onehot(y, num_classes=None):
    """One-hot encode a vector of integer class labels.

    Args:
        y: Integer class labels.
        num_classes: Optional total number of classes. When ``None`` the
            width of the encoding is inferred by Keras from the labels, so
            pass it explicitly when encoding a subset that may not contain
            the highest class.

    Returns:
        The encoded label matrix produced by ``keras.utils.to_categorical``.
    """
    # Use the public ``keras.utils`` entry point rather than the private
    # ``np_utils`` module path, which is not available in every Keras release.
    return keras.utils.to_categorical(y, num_classes=num_classes)

**get_data**

In [0]:
def get_data(sample=False):
    """Return ``(x, y, val_x, val_y)`` from the module-level MNIST arrays.

    With ``sample`` truthy the ``*_sample`` subsets are returned; otherwise
    the full training and test arrays are returned. The test split doubles
    as the validation data in both cases.
    """
    if not sample:
        return x_train, y_train, x_test, y_test
    return x_train_sample, y_train_sample, x_test_sample, y_test_sample

**basic_model**

-> Input Shape is 28 X 28 X 1

-> Normalize the input before feeding into your Neural Net

-> Lambda - wraps arbitrary expression as a Layer object.

In [0]:
def basic_model():
    """Create a ``Sequential`` model whose only layer normalizes the input.

    The ``Lambda`` layer wraps ``normalize`` and declares the input shape as
    28 x 28 x 1, so every model built on top of this one standardizes its
    input before any trainable layer sees it.
    """
    normalizing_input = Lambda(normalize, input_shape=(28,28,1))
    return Sequential([normalizing_input])

**add_output_layer**

-> Output units = 10 (one per digit class, 0–9)

-> Activation = Softmax. The softmax function computes a probability distribution over ‘n’ different events. In other words, it calculates the probability of each target class over all possible target classes. The calculated probabilities are then used to determine the target class for the given inputs.

The main advantage of using Softmax is the range of its output probabilities: each one lies between 0 and 1, and together they sum to one. When the softmax function is used in a multi-class classification model, it returns the probability of each class, and the target class is the one with the highest probability.

The formula computes the exponential (e-power) of each input value and the sum of the exponentials of all the input values. The ratio of the exponential of an input value to that sum is the output of the softmax function: $\mathrm{softmax}(z_i) = \frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}}$.

Properties:
1.  The calculated probabilities will be in the range of 0 to 1.
2.  The sum of all the probabilities equals 1.


Softmax Function Usage:
1.  Used in multiclass (multinomial) logistic regression models.
2.  When building neural networks, softmax functions are used at different layers, most commonly the output layer.





In [0]:
def add_output_layer(model):
    """Append the 10-unit softmax classification layer to ``model`` in place."""
    softmax_head = Dense(10, activation="softmax")
    model.add(softmax_head)

**convolutional_model**

In [0]:
def convolutional_model(conv_blocks=2, hidden_dense_activation="relu", pool_size=2, num_dense_neurons=512, starting_filter_size=32, batchnorm=False, zero_padding=True, dropout=False):
    """Build an (uncompiled) convolutional classifier on top of ``basic_model()``.

    Each block is ``[ZeroPadding2D] -> Conv 3x3 -> [BatchNorm] -> Conv 3x3 ->
    MaxPooling -> [BatchNorm]``; the blocks are followed by ``Flatten``, one
    hidden ``Dense`` layer and the softmax output layer.

    Args:
        conv_blocks: Number of convolutional blocks.
        hidden_dense_activation: Activation of the hidden ``Dense`` layer.
        pool_size: Pool size passed to every ``MaxPooling2D`` layer.
        num_dense_neurons: Width of the hidden ``Dense`` layer.
        starting_filter_size: Number of filters in the first block; block
            ``k`` (0-based) uses ``starting_filter_size * (k + 1)`` filters.
        batchnorm: When truthy, insert ``BatchNormalization`` layers.
        zero_padding: When truthy, zero-pad the input of each block.
        dropout: Falsy for no dropout; otherwise the value is passed to
            ``Dropout`` as its rate, so pass a float such as ``0.5``.

    Returns:
        The assembled ``Sequential`` model.
    """
    model = basic_model()

    # basic_model() declares a channels-last input of shape (28, 28, 1), so
    # the feature-map channels sit on the last axis. Normalizing over axis 1
    # would work on image rows instead of channels.
    channel_axis = -1
    last_block = conv_blocks - 1

    for block in range(conv_blocks):
        num_filters = starting_filter_size * (block + 1)

        if zero_padding:
            model.add(ZeroPadding2D())
        # The kernel size is given as one tuple: the legacy ``(n, 3, 3)``
        # positional form is deprecated and can be read as ``strides=3``.
        model.add(Convolution2D(num_filters, (3, 3), activation="relu"))

        if batchnorm:
            model.add(BatchNormalization(axis=channel_axis))
        model.add(Convolution2D(num_filters, (3, 3), activation="relu"))

        model.add(MaxPooling2D(pool_size=pool_size))

        # Skip this one after the final block: a BatchNormalization layer is
        # added right after Flatten below. Integers are compared by value.
        if batchnorm and block != last_block:
            model.add(BatchNormalization(axis=channel_axis))

    model.add(Flatten())

    # After Flatten the tensor is (batch, features), so axis 1 is the
    # feature axis.
    if batchnorm:
        model.add(BatchNormalization(axis=1))
    model.add(Dense(num_dense_neurons, activation=hidden_dense_activation))

    if batchnorm:
        model.add(BatchNormalization(axis=1))

    if dropout:
        model.add(Dropout(dropout))

    add_output_layer(model)

    return model

**compile_model**

In [0]:
def compile_model(model):
    """Compile ``model`` with Adam, categorical cross-entropy and accuracy.

    Returns whatever ``model.compile`` returns.
    """
    # MNIST has 10 classes (digits 0 - 9); categorical cross-entropy is the
    # loss to use when there are more than 2 classes to compare against.
    compile_options = {
        "optimizer": Adam(),
        "loss": "categorical_crossentropy",
        "metrics": ["accuracy"],
    }
    return model.compile(**compile_options)

**fit**

In [0]:
def fit(model, learning_rate=None, epochs=1, sample=False):
    """Train ``model`` on the data returned by ``get_data``.

    Args:
        model: A compiled Keras model.
        learning_rate: New optimizer learning rate. A falsy value (``None``
            or ``0``) leaves the optimizer's current rate untouched.
        epochs: Number of epochs to train for.
        sample: Forwarded to ``get_data`` to choose the sample subset or the
            full dataset.

    Returns:
        The object returned by ``model.fit`` (the Keras training history).
    """
    if learning_rate:
        # Update the optimizer's learning-rate variable in place. Rebinding
        # ``model.optimizer.lr`` to a Python float replaces the variable and
        # is not guaranteed to reach a training function Keras already built.
        keras.backend.set_value(model.optimizer.lr, learning_rate)

    x, y, val_x, val_y = get_data(sample=sample)

    return model.fit(x, y, batch_size=64, epochs=epochs, validation_data=(val_x, val_y))

**multi_fit**

In [0]:
def multi_fit(model, reset=True, augmentation=False, sample=False, epochs=1, runs=1):
    """Train ``model`` with learning rates 0.001, 0.01 and 0.1, ``runs`` times over.

    Args:
        model: A compiled Keras model.
        reset: When truthy, restore the weights the model had on entry after
            every fit, so each learning rate starts from the same point.
        augmentation: Currently unused; kept for interface compatibility.
        sample: Forwarded to ``fit`` to train on the sample subset.
        epochs: Epochs per individual fit.
        runs: Number of times the three-learning-rate sweep is repeated.
    """
    # ``reset_states()`` alone does not touch the trained weights, so keep a
    # snapshot of the starting weights to roll back to.
    # NOTE(review): optimizer internals (e.g. Adam moment estimates) are not
    # restored here — confirm whether that matters for the comparison.
    initial_weights = model.get_weights() if reset else None

    for run in range(runs):
        for learning_rate in [0.001, 0.01, 0.1]:
            print("Fitting with learning rate of: ", learning_rate)

            fit(model, learning_rate=learning_rate, epochs=epochs, sample=sample)

            if reset:
                # Resetting is nice here for comparing differences in learning rate,
                # without the compounding factor of model state across fits.
                model.set_weights(initial_weights)
                model.reset_states()

# **Step 3: Define all executor functions**

**Load Data** - load MNIST dataset

In [24]:
# Load the MNIST train/test split.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Append a trailing channel axis so the images match the model's
# (28, 28, 1) input shape.
x_train = x_train[..., np.newaxis]
x_test = x_test[..., np.newaxis]

# Encode the labels before slicing out the samples below.
y_train, y_test = onehot(y_train), onehot(y_test)

# Small subsets for quick experiments.
sample_size = 4000
x_train_sample, y_train_sample = x_train[:sample_size], y_train[:sample_size]
x_test_sample, y_test_sample = x_test[:sample_size], y_test[:sample_size]


print("Shape of Input Data : ", x_train_sample.shape)
print("Shape of Output Labels : ",y_train_sample.shape)

Shape of Input Data :  (4000, 28, 28, 1)
Shape of Output Labels :  (4000, 10)


**Define Model**

In [25]:
# A deliberately tiny network: one conv block with 3 filters, 4x4 pooling
# and an 8-unit hidden dense layer.
model = convolutional_model(
    conv_blocks=1,
    hidden_dense_activation="relu",
    pool_size=(4,4),
    num_dense_neurons=8,
    starting_filter_size=3,
)

  if __name__ == '__main__':
  del sys.path[0]


**Compile Model**

In [0]:
# Attach the Adam optimizer, categorical cross-entropy loss and accuracy metric.
compile_model(model)

**Optimize Model**

In [27]:
# 10 sweeps over the three learning rates, 2 epochs per fit, on the sample subset.
multi_fit(
    model,
    sample=True,
    epochs=2,
    runs=10,
)

Fitting with learning rate of:  0.001
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.01
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.1
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.001
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.01
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.1
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.001
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.01
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.1
Train on 4000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
Fitting with learning rate of:  0.001
Train on 4000 sam