In [80]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
from keras.utils import to_categorical

import numpy as np
from tensorflow.keras.optimizers import RMSprop


In [40]:
#load the MNIST digits via the Keras dataset helper; it returns (images, labels) pairs for the train and test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [12]:
x_train.shape

(60000, 28, 28)

There are 60,000 training samples, each a 28 by 28 pixel image.

In [55]:
#flatten every image into a single row of pixels. the sample count is read from the array and -1 lets
#numpy infer the row length (28*28 = 784), so nothing about the dataset size is hard-coded.
x_train = x_train.reshape(x_train.shape[0], -1)
x_train.shape

(60000, 784)

This flattens each image into a 1-D vector, so the training data becomes a 2-D array of 60,000 rows, each of length 784. This simply makes the dataset easier to feed into an MLP. An example of one training digit is:

In [22]:
x_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  18,  18,  18,
       126, 136, 175,  26, 166, 255, 247, 127,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,  30,  36,  94, 154, 17

In [23]:
x_test.shape

(10000, 28, 28)

In [102]:
#flatten the test images the same way as the training images: one row per sample, with the row length
#inferred by numpy instead of hard-coding (10000, 784)
x_test = x_test.reshape(x_test.shape[0], -1)

The next step is to normalise the data by rescaling the pixel values from the range 0–255 to the range 0–1. Keeping the inputs on a small, consistent scale stops large raw values from dominating the weight updates.

In [54]:
#cast to float32 and rescale in one step so the pixel values (0-255) end up between 0 and 1
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

In [41]:
#show the label of the first training sample (still a plain integer digit at this point).
#only the last expression of a cell is displayed, so a bare `y_train` line before it would do nothing.
y_train[0]

5

The output needs to be converted into categorical data:

In [57]:
#the digits 0-9 give 10 output classes
num_of_classes = 10

#one-hot encode the integer labels so each label becomes a vector of length num_of_classes
y_train = to_categorical(y_train, num_classes=num_of_classes)
y_test = to_categorical(y_test, num_classes=num_of_classes)

In [38]:
y_train[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.], dtype=float32)

The above sample is a 5 as can be seen by the 6th number being a 1. Now to build the model:

In [64]:
#Using keras sequential model:
model = keras.Sequential()

#dropout is used as regularisation after each hidden layer. during training it randomly zeroes this
#fraction of the layer's outputs, so the model has to rely on strong features and cannot lean on any
#single neuron, which makes overfitting harder.
dropout_rate = 0.2
keep_prob = 1 - dropout_rate  #fraction of outputs kept (0.8); derived so the two values cannot drift apart

#add a basic dense layer (all neurons are connected to all neurons in each layer). 512 neurons are selected to reduce
#the dimensionality from 784. Relu is chosen due to its fast training.
model.add(layers.Dense(512, activation='relu', input_shape=(784,)))
model.add(layers.Dropout(dropout_rate))

#another hidden layer with the same width and regularisation:
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dropout(dropout_rate))

#output layer: one unit per class, with softmax so the outputs sum to 1 across the classes
model.add(layers.Dense(num_of_classes, activation='softmax'))

In [65]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 512)               401920    
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                5130      
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________


In [72]:
#compile the model with the categorical cross-entropy loss (matches the one-hot labels) and the RMSprop
#optimiser with its default settings; accuracy is tracked as an extra metric
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

In [83]:
#train for a fixed number of epochs (no batch_size is passed, so the Keras default is used) and
#report loss/accuracy on the test data after every epoch
epochs = 20

history = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    validation_data=(x_test, y_test),
    verbose=1,
)


Train on 60000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [88]:
#evaluate on the test set without progress output; score[0] is the loss and score[1] the accuracy metric
score = model.evaluate(x_test, y_test, verbose=0)

In [86]:
print("test loss:", score[0])

test loss: 0.12115572971787032


In [89]:
print("test accuracy", score[1])

test accuracy 0.9764


A test accuracy of 97.64% was achieved after just 20 epochs, showing the effectiveness of an MLP on the MNIST dataset.

In [104]:
#model.predict expects a batch, so slice out the first test image as a batch containing one sample
single_prediction = x_test[:1]
model.predict(single_prediction)

array([[8.9486201e-27, 8.2563786e-20, 5.9441172e-15, 1.6156085e-11,
        1.9260036e-26, 1.8088165e-18, 0.0000000e+00, 1.0000000e+00,
        3.0322613e-18, 2.2619429e-13]], dtype=float32)

As can be seen, this gives one value between 0 and 1 for each of the 10 classes, which can be viewed as the probability of that class. To see the predicted class we can use:

In [106]:
#the predicted class is the index of the largest softmax output. Sequential.predict_classes is not
#available in recent TensorFlow/Keras releases, and argmax over predict() gives the same result.
np.argmax(model.predict(single_prediction), axis=-1)

array([7])

In [109]:
#recover the true digit of the first test sample from its one-hot label (index of the 1)
y = np.argmax(y_test[0], axis=-1)
y

7

Let's print the results for 10 random predictions; later (hopefully) I will test on some out-of-sample images.

In [114]:
import random

#spot-check the model on 10 test digits chosen at random. the indices are drawn from the whole test
#set (0 .. len(x_test)-1): 784 is the number of pixels per image, not the number of test samples, so
#using it as the upper bound would skip index 0 and never look past the first few hundred samples.
num_to_show = 10
sample_indices = random.sample(range(len(x_test)), num_to_show)

#predict the whole batch in one call. argmax over the softmax outputs gives the predicted class
#(Sequential.predict_classes is not available in recent TensorFlow/Keras releases).
predicted = np.argmax(model.predict(x_test[sample_indices]), axis=-1)
actual = np.argmax(y_test[sample_indices], axis=-1)

for y_new, y_act in zip(predicted, actual):
    print("prediction:", y_new, "actual:", y_act)

prediction: 5 actual: 5
prediction: 4 actual: 4
prediction: 7 actual: 7
prediction: 6 actual: 6
prediction: 6 actual: 6
prediction: 7 actual: 7
prediction: 6 actual: 6
prediction: 1 actual: 1
prediction: 7 actual: 7
prediction: 9 actual: 9


Let's check whether any predictions are wrong:

In [116]:
#compare predictions with the true labels over the WHOLE test set. 784 is the number of pixels per
#image, not the number of samples, so looping over range(784) would only check the first 784 digits.
#one batched predict() call replaces a separate model call per sample, and argmax over the softmax
#outputs replaces predict_classes, which is not available in recent TensorFlow/Keras releases.
y_pred = np.argmax(model.predict(x_test), axis=-1)
y_true = np.argmax(y_test, axis=-1)

#flatnonzero returns the indices of the samples where prediction and label disagree
for i in np.flatnonzero(y_pred != y_true):
    print("test sample number ", i, " is incorrect, model predicted ", y_pred[i], "actual was", y_true[i])

test sample number  149  is incorrect, model predicted  3 actual was 2
test sample number  151  is incorrect, model predicted  8 actual was 9
test sample number  217  is incorrect, model predicted  5 actual was 6
test sample number  247  is incorrect, model predicted  2 actual was 4
test sample number  274  is incorrect, model predicted  3 actual was 9
test sample number  321  is incorrect, model predicted  7 actual was 2
test sample number  340  is incorrect, model predicted  3 actual was 5
test sample number  445  is incorrect, model predicted  0 actual was 6
test sample number  447  is incorrect, model predicted  9 actual was 4
test sample number  547  is incorrect, model predicted  8 actual was 2
test sample number  613  is incorrect, model predicted  8 actual was 2
test sample number  624  is incorrect, model predicted  8 actual was 2
test sample number  659  is incorrect, model predicted  1 actual was 2
test sample number  684  is incorrect, model predicted  3 actual was 7
test s

It is interesting to see how similar-looking many of the misclassified digits are. For example, a 6 was mistaken for a 0; these could be mistakes a human might also make, depending on the handwriting. Hopefully more complex models can improve on this.