In [None]:
import plaidml.keras
plaidml.keras.install_backend()

import keras

import numpy as np 
import matplotlib.pyplot as plt

The [Fashion MNIST dataset](https://github.com/zalandoresearch/fashion-mnist) is preferable to the handwritten-digit version because 1) it is harder to achieve near-perfect classification on it, and 2) it is a better real-world example.

There are multiple methods for loading the Fashion MNIST data. The Keras dataset loader is helpful in that it returns tuples of training and test data.

In [None]:
# Load Fashion MNIST through the Keras dataset helper, which hands back a
# (images, labels) pair for the training split and another for the test split.
fashion_mnist = keras.datasets.fashion_mnist

train_split, test_split = fashion_mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split
# Drop the temporaries so they do not keep the raw arrays alive once the
# image variables are rebound later on.
del train_split, test_split

Labels are the numbers 0-9, but we will sometimes want to map these to the articles of clothing they represent. 

In [None]:
# Human-readable clothing names; the list position of each entry is the
# integer label (0-9) it stands for.
class_names = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]


Each pixel in the input images is a value from 0-255, but these need to be normalised to the range 0-1 before they are used as input for the model.

In [None]:
# Rescale both image sets by 255 so pixel values become floats between 0 and 1.
# NOTE: this rebinds the names, so running the cell twice scales the data twice.
train_images, test_images = train_images / 255.0, test_images / 255.0

Visualise the data to ensure it is correct.

In [None]:
# Sanity check: show the first 25 training images in a 5x5 grid, each one
# captioned with the clothing name that its label maps to.
grid_rows, grid_cols = 5, 5
plt.figure(figsize=(10, 10))
for cell in range(grid_rows * grid_cols):
    # Subplot positions are 1-based, the data indices are 0-based.
    plt.subplot(grid_rows, grid_cols, cell + 1)
    # Ticks and grid lines carry no information for an image thumbnail.
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[cell], cmap=plt.cm.binary)
    plt.xlabel(class_names[train_labels[cell]])

Build a model without weight regularisation or dropout layers to serve as a baseline for the following models.

In [None]:
# Baseline network: flatten each 28x28 image, pass it through one 128-unit
# ReLU hidden layer, then a 10-way softmax output. No regulariser and no
# dropout are attached here.
baseline_layers = [
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=10, activation='softmax'),
]
baseline_model = keras.Sequential(baseline_layers)

# The cross-entropy is listed as a metric as well as the loss so that it is
# recorded under the 'sparse_categorical_crossentropy' key alongside accuracy.
baseline_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'sparse_categorical_crossentropy'],
)

In [None]:
# Train the baseline for 20 epochs and keep the returned history for plotting.
# NOTE(review): the test split doubles as the validation data here.
baseline_history = baseline_model.fit(
    x=train_images,
    y=train_labels,
    validation_data=(test_images, test_labels),
    epochs=20,
    verbose=2,
)

In this model we attempt to address overfitting by adding L2 weight regularisation to the hidden layer. The task of the regulariser is to penalise large weight values.

In [None]:
# Same architecture as the baseline, except the hidden layer's kernel carries
# an L2 regulariser with factor 0.001.
reg_layers = [
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(
        units=128,
        activation='relu',
        kernel_regularizer=keras.regularizers.l2(0.001),
    ),
    keras.layers.Dense(units=10, activation='softmax'),
]
reg_model = keras.Sequential(reg_layers)

# The cross-entropy is listed as a metric as well as the loss so that it is
# recorded under the 'sparse_categorical_crossentropy' key alongside accuracy.
reg_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'sparse_categorical_crossentropy'],
)

In [None]:
# Train the L2-regularised model for 20 epochs and keep the returned history.
# NOTE(review): the test split doubles as the validation data here.
reg_history = reg_model.fit(
    x=train_images,
    y=train_labels,
    validation_data=(test_images, test_labels),
    epochs=20,
    verbose=2,
)

In this model we again try to address overfitting but this time we add a dropout layer. 

In [None]:
# Same architecture as the baseline, with a Dropout layer (rate 0.5) inserted
# between the hidden layer and the softmax output.
drop_layers = [
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(units=10, activation='softmax'),
]
drop_model = keras.Sequential(drop_layers)

# The cross-entropy is listed as a metric as well as the loss so that it is
# recorded under the 'sparse_categorical_crossentropy' key alongside accuracy.
drop_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'sparse_categorical_crossentropy'],
)

In [None]:
# Train the dropout model for 20 epochs and keep the returned history.
# NOTE(review): the test split doubles as the validation data here.
drop_history = drop_model.fit(
    x=train_images,
    y=train_labels,
    validation_data=(test_images, test_labels),
    epochs=20,
    verbose=2,
)

In [None]:
def plot_history(histories, key='sparse_categorical_crossentropy'):
    """Plot training and validation curves of one metric for several runs.

    Args:
        histories: Iterable of ``(name, history)`` pairs. Each ``history``
            must expose ``epoch`` (the sequence of epoch indices) and
            ``history`` (a mapping from metric name to per-epoch values),
            as the objects returned by ``fit`` in this notebook do.
        key: Name of the metric to plot. The validation series is read
            from ``'val_' + key``.

    The x-axis is limited to ``[0, last epoch]`` of the longest run. When
    ``histories`` is empty, or no run recorded any epoch, the axis limits
    are left at matplotlib's defaults instead of raising.
    """
    plt.figure(figsize=(16, 10))

    last_epoch = None
    for name, history in histories:
        # Validation is drawn dashed; the training curve reuses its colour
        # so the two lines of one run read as a pair.
        val = plt.plot(history.epoch, history.history['val_' + key],
                       '--', label=name.title() + ' Val')
        plt.plot(history.epoch, history.history[key],
                 color=val[0].get_color(),
                 label=name.title() + ' Train')

        # Track the longest run rather than relying on whichever run
        # happened to be iterated last.
        run_end = max(history.epoch, default=None)
        if run_end is not None and (last_epoch is None or run_end > last_epoch):
            last_epoch = run_end

    plt.xlabel('Epochs')
    plt.ylabel(key.replace('_', ' ').title())
    plt.legend()

    if last_epoch is not None:
        plt.xlim([0, last_epoch])
    

# Overlay the three training runs on one chart; the first item of each pair
# is the label used in the legend.
histories_to_plot = [
    ('baseline', baseline_history),
    ('reg', reg_history),
    ('drop', drop_history),
]
plot_history(histories_to_plot)

From this chart I see that regularisation and dropout didn't improve the model's ability to generalise, and had the unexpected consequence of decreasing the accuracy with which the model fit the training data. Maybe the two strategies would be more effective if the model's capacity were increased.

In [None]:
# Score the dropout model on the held-out test images and labels.
drop_model.evaluate(x=test_images, y=test_labels)

In [None]:
# predict() is given a batch, so add a leading axis of length 1 to the first
# test image before passing it in.
predictions = drop_model.predict(test_images[0][np.newaxis])

# Index of the highest-scoring output for that image, i.e. the predicted label.
np.argmax(predictions[0])