## Importing the libraries

In [63]:
import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds

## Preprocessing the data

In [64]:
# Load MNIST a single time. `with_info=True` makes tfds.load return a
# (dataset, info) pair, and `as_supervised=True` yields each example as an
# (image, label) tuple, which is the form the `scale` function below expects.
# A plain `tfds.load(name='mnist')` beforehand is unnecessary: its result
# would be overwritten by this call.
mnist_dataset, mnist_info = tfds.load(name='mnist', with_info=True, as_supervised=True)

In [66]:
# Pull the predefined train and test splits out of the loaded dataset dict
mnist_train = mnist_dataset['train']
mnist_test = mnist_dataset['test']

In [67]:
# Reserve 10% of the training split for validation (a float at this point;
# it is cast to an integer tensor in the next cell)
num_validation_samples = mnist_info.splits['train'].num_examples * 0.1

In [68]:
# tf.cast converts a value to the requested dtype; the sample counts are
# turned into int64 tensors so they can be used as take/skip/batch sizes.
num_validation_samples = tf.cast(num_validation_samples, dtype=tf.int64)

num_test_samples = tf.cast(mnist_info.splits['test'].num_examples, dtype=tf.int64)

In [69]:
def scale(image, label):
    """Rescale an image to floats by dividing its values by 255.

    Args:
        image: image tensor; it is cast to tf.float32 before scaling.
        label: target belonging to the image; passed through untouched.

    Returns:
        A tuple (scaled_image, label).
    """
    scaled_image = tf.cast(image, tf.float32) / 255.
    return scaled_image, label

# build scale function as input for tensorflow map

# Apply the scaling to every (image, label) pair of both splits
scaled_train_and_validation_data = mnist_train.map(scale)

test_data = mnist_test.map(scale)

# A perfectly uniform shuffle needs a buffer at least as large as the dataset.
# 10000 is smaller than the training split, so the shuffle is only approximate.
BUFFER_SIZE = 10000
SHUFFLE_SEED = 42  # fixed seed so the train/validation split is reproducible

# Shuffle ONCE in a fixed order before splitting. By default `shuffle`
# reshuffles every time the dataset is iterated, which would make `take` and
# `skip` see a different order on each epoch and let validation samples leak
# into the training data. `reshuffle_each_iteration=False` keeps the two
# subsets disjoint and stable.
shuffled_train_and_validation_data = scaled_train_and_validation_data.shuffle(
    BUFFER_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=False
)

validation_data = shuffled_train_and_validation_data.take(num_validation_samples)
train_data = shuffled_train_and_validation_data.skip(num_validation_samples)  # everything after the validation samples

BATCH_SIZE = 100

# Reshuffle only the training portion on every epoch, then batch it
train_data = train_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# Validation and test sets are each served as one single batch
validation_data = validation_data.batch(num_validation_samples)
test_data = test_data.batch(num_test_samples)

# Pull that single validation batch out as (inputs, targets) tensors
validation_inputs, validation_targets = next(iter(validation_data))

In [70]:
# Architecture: flattened 28x28x1 input -> two hidden layers of
# `hidden_layer_size` units each -> `output_size` softmax outputs
input_size = 784  # 28 * 28 * 1 values per image once flattened
output_size = 10  # number of output units
hidden_layer_size = 200  # both hidden layers share this width

model = tf.keras.Sequential([
    # Flatten each (28, 28, 1) image into a vector
    tf.keras.layers.Flatten(input_shape=(28,28,1)),
    # Dense layers compute activation(inputs . weights + bias)
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    # Further hidden layers can be stacked here as needed
    tf.keras.layers.Dense(hidden_layer_size, activation='tanh'),
    # Softmax turns the outputs into a probability distribution
    tf.keras.layers.Dense(output_size, activation='softmax'),
])

## Optimization Algorithm and Loss Function

In [71]:
# Adam optimizer with an explicitly chosen learning rate
custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.02)

# 'sparse_categorical_crossentropy' takes integer class labels directly, so
# the targets do not have to be one-hot encoded by hand.
# `metrics` lists what is reported during training; here, accuracy.
model.compile(
    optimizer=custom_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


## Training the model

In [72]:
NUM_EPOCHS = 5

# What happens during fit:
# - each epoch iterates over every batch of train_data once
# - weights and biases are updated once per batch
# - the reported training loss/accuracy start afresh at the beginning of each epoch
# - validation loss/accuracy are evaluated once, at the end of each epoch
# verbose=2 prints one summary line per epoch.
model.fit(
    train_data,
    epochs=NUM_EPOCHS,
    validation_data=(validation_inputs, validation_targets),
    verbose=2,
)

Epoch 1/5
540/540 - 3s - loss: 0.3695 - accuracy: 0.8953 - val_loss: 0.2208 - val_accuracy: 0.9340
Epoch 2/5
540/540 - 3s - loss: 0.2586 - accuracy: 0.9283 - val_loss: 0.2197 - val_accuracy: 0.9377
Epoch 3/5
540/540 - 3s - loss: 0.2728 - accuracy: 0.9268 - val_loss: 0.3388 - val_accuracy: 0.8982
Epoch 4/5
540/540 - 3s - loss: 0.3026 - accuracy: 0.9173 - val_loss: 0.2741 - val_accuracy: 0.9248
Epoch 5/5
540/540 - 3s - loss: 0.3288 - accuracy: 0.9160 - val_loss: 0.2965 - val_accuracy: 0.9203


<tensorflow.python.keras.callbacks.History at 0x269bc9cc040>

Both the validation and training loss are decreasing, hence accuracy is increasing without overfitting. Doubling the hidden layer size from 50 to 100 increases the accuracy. Doubling it further increases the accuracy again, but not by as much. Adding a hidden layer increases the accuracy to 98.5%. With 5 hidden layers, the accuracy does not improve. Using two hidden layers, each with a sigmoid activation, drops the accuracy back to 97%. Using two hidden layers, one with a relu activation function and the other with a tanh activation function, increases the accuracy up to 98.9%. Changing the batch size from 100 to 10000 causes a loss of accuracy, which drops to 90.6%. Using a batch size of 1 significantly increases the training time: all previous trainings completed within 7-8 secs, whereas with a batch size of 1 each epoch takes approximately 53-58 secs and the accuracy is 96.8%, so using SGD is not the best option.

Using a custom learning rate of 0.0001 gives an accuracy of 95.7% with a total training time of 10s. Adjusting the learning rate to 0.02 gives an accuracy of 92% with a total training time of 12s. It is best to use the default learning rate in this case.