# Importing the relevant libraries

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Data

In [2]:
# Load MNIST through TensorFlow Datasets.
#   as_supervised=True -> every element is an (input, target) 2-tuple
#   with_info=True     -> also return a metadata object (version, features,
#                         number of samples per split, ...)
mnist_dataset, mnist_info = tfds.load(name='mnist', with_info=True, as_supervised=True)

# TFDS ships MNIST with a 'train' (60000 samples) and a 'test' (10000 samples)
# split only; there is NO ready-made validation split, so one is carved out of
# the training data further down.
mnist_train = mnist_dataset['train']
mnist_test = mnist_dataset['test']

# Reserve 10% of the training samples for validation. The split size is read
# from mnist_info instead of counting the samples by hand; the product is a
# float, hence the cast to an integer tensor.
num_validation_samples = tf.cast(0.1 * mnist_info.splits['train'].num_examples, tf.int64)

# The test-set size is needed later to batch the whole test set at once.
num_test_samples = tf.cast(mnist_info.splits['test'].num_examples, tf.int64)
 
# Scaling the inputs makes the training more numerically stable.
# Here we simply map the inputs into the range between 0 and 1.
def scale(image, label):
    """Cast `image` to float32 and divide it by 255; pass `label` through.

    Args:
        image: the input to rescale (pixel intensities).
        label: the target belonging to `image`; returned unchanged.

    Returns:
        A `(scaled_image, label)` tuple.
    """
    return tf.cast(image, tf.float32) / 255., label

# Apply the [0, 1] scaling to every (image, label) pair of both splits.
scaled_train_and_validation_data = mnist_train.map(scale)
test_data = mnist_test.map(scale)

# Since we'll be batching, the data should be shuffled first so that it is as
# randomly spread as possible. With unshuffled data each batch could be
# homogeneous inside but completely different from all other batches, which
# makes the loss jump around and confuses stochastic gradient descent.
#
# BUFFER_SIZE matters for datasets that are too large to fit in memory: the
# dataset keeps a buffer of BUFFER_SIZE elements, emits a randomly chosen
# element from that buffer and refills the freed slot with the next input
# element. A buffer at least as large as the dataset gives a uniform shuffle;
# a smaller buffer trades shuffle quality for memory.
BUFFER_SIZE = 10000

# reshuffle_each_iteration=False is essential for a correct split. By default
# a shuffled dataset is reshuffled on EVERY iteration, so take()/skip() would
# pick a different subset each time it is iterated: the validation tensors
# extracted below come from one ordering, while every training epoch would
# skip a different set of samples - leaking validation samples into training.
shuffled_train_and_validation_data = scaled_train_and_validation_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

validation_data = shuffled_train_and_validation_data.take(num_validation_samples)
train_data = shuffled_train_and_validation_data.skip(num_validation_samples)

# The split is now fixed, so it is safe to reshuffle the training samples
# (and only those) at the start of every epoch.
train_data = train_data.shuffle(BUFFER_SIZE)

# We train with mini-batch gradient descent, which offers a good trade-off
# between accuracy and speed.
BATCH_SIZE = 100

train_data = train_data.batch(BATCH_SIZE)

# During validation and testing we only forward propagate (no back
# propagation), so there is no need for mini-batches: we want the exact loss
# and accuracy over the whole set rather than an average over batches.
# The model still expects batched input, so each set is wrapped into a single
# batch holding all of its samples.
validation_data = validation_data.batch(num_validation_samples)
test_data = test_data.batch(num_test_samples)

# The dataset is iterable and yields (input, target) tuples because
# as_supervised=True was used when loading. Pull out the one and only
# validation batch as a pair of tensors to hand to model.fit().
validation_inputs, validation_targets = next(iter(validation_data))

# Model

### Outlining the model

In [3]:
# Network dimensions. input_size equals 28 * 28 * 1, the number of values the
# Flatten layer produces per image; it is not referenced by the layers below.
input_size = 784
output_size = 10
hidden_layer_size = 50

# Feed-forward classifier assembled layer by layer:
# flatten -> two ReLU hidden layers -> softmax output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28, 1)))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
# For a classifier, the output activation must turn the raw values into
# probabilities, hence softmax.
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

### Choosing the Optimizer and the Loss Function

In [4]:
# Adam optimizer with sparse categorical cross-entropy as the loss;
# accuracy is reported alongside the loss during training and evaluation.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

### Training

In [5]:
# Number of full passes over the training data.
NUM_EPOCHS = 10

# Train on the mini-batched training set and evaluate on the validation
# tensors after every epoch. verbose=2 prints one summary line per epoch.
model.fit(
    train_data,
    epochs=NUM_EPOCHS,
    validation_data=(validation_inputs, validation_targets),
    validation_steps=1,
    verbose=2,
)

Epoch 1/10
540/540 - 16s - loss: 0.4241 - accuracy: 0.8833 - val_loss: 0.2288 - val_accuracy: 0.9313
Epoch 2/10
540/540 - 8s - loss: 0.1892 - accuracy: 0.9448 - val_loss: 0.1535 - val_accuracy: 0.9545
Epoch 3/10
540/540 - 8s - loss: 0.1449 - accuracy: 0.9569 - val_loss: 0.1260 - val_accuracy: 0.9632
Epoch 4/10
540/540 - 8s - loss: 0.1193 - accuracy: 0.9641 - val_loss: 0.1133 - val_accuracy: 0.9642
Epoch 5/10
540/540 - 7s - loss: 0.1003 - accuracy: 0.9694 - val_loss: 0.1048 - val_accuracy: 0.9692
Epoch 6/10
540/540 - 6s - loss: 0.0874 - accuracy: 0.9740 - val_loss: 0.0857 - val_accuracy: 0.9728
Epoch 7/10
540/540 - 6s - loss: 0.0756 - accuracy: 0.9764 - val_loss: 0.0800 - val_accuracy: 0.9763
Epoch 8/10
540/540 - 9s - loss: 0.0686 - accuracy: 0.9787 - val_loss: 0.0792 - val_accuracy: 0.9742
Epoch 9/10
540/540 - 6s - loss: 0.0602 - accuracy: 0.9816 - val_loss: 0.0671 - val_accuracy: 0.9790
Epoch 10/10
540/540 - 10s - loss: 0.0553 - accuracy: 0.9834 - val_loss: 0.0652 - val_accuracy: 0.98

<tensorflow.python.keras.callbacks.History at 0x29f59ec2948>

### Testing the model

In [6]:
# Evaluate once on the test set, which was batched into a single batch above;
# evaluate() returns the loss followed by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(x=test_data)

      1/Unknown - 3s 3s/step - loss: 0.0922 - accuracy: 0.9746

In [7]:
# Report the test metrics; accuracy is converted from a fraction to a percentage.
summary_template = 'Test loss: {0:.2f}. Test accuracy: {1:.2f}%.'
print(summary_template.format(test_loss, test_accuracy * 100.))

Test loss: 0.09. Test accuracy: 97.46%.
