# First neural network using MNIST data

In [1]:
from theano.sandbox import cuda

Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:23:00.0)


In [2]:
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential 
from keras.utils.np_utils import to_categorical

np.set_printoptions(precision=4, linewidth=100)

Using Theano backend.


In [3]:
from keras.layers.core import Flatten, Dense, Dropout, Lambda

from keras.optimizers import Adam

In [4]:
from keras.preprocessing import image

# Setup 

In [5]:
# Mini-batch size handed to the gen.flow(...) calls that build the train/test iterators
batch_size = 64

In [6]:
# Load MNIST, which arrives already split into (images, labels) pairs for train and test
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Last expression of the cell: display the four array shapes as a sanity check
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [7]:
# The loaded images are (samples, height, width) with no channel axis, while the models
# below declare input_shape = (1, 28, 28), so insert a length-1 channel axis at position 1.
# With Theano as the backend the per-image layout is channel x height x width.
# NOTE: this rebinds x_train/x_test, so re-running the cell would add yet another axis.
x_train = np.expand_dims(x_train,1)
x_test = np.expand_dims(x_test, 1)

In [8]:
# final shape of the images after expansion
x_train.shape, x_test.shape

((60000, 1, 28, 28), (10000, 1, 28, 28))

In [9]:
# the labels are not one hot encoded. 
y_train[:5]

array([5, 0, 4, 1, 9], dtype=uint8)

In [10]:
# One-hot encode the integer class labels (each label becomes a row containing a single 1).
# NOTE: this rebinds y_train/y_test, so the cell is meant to be run once per kernel session.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [11]:
y_train[:5]

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [12]:
# Normalizing the input helps with training.
# mean()/std() are called without an axis, so each is a single scalar over every pixel of
# the training set; both are cast to float32 and read as globals by norm_input.
mean_px = x_train.mean().astype(np.float32)
std_px = x_train.std().astype(np.float32)
print('The pixel mean is {0} and the standard deviation is {1}'.format(mean_px, std_px))

The pixel mean is 33.31842041015625 and the standard deviation is 78.56748962402344


In [13]:
def norm_input(x):
    """Standardize `x` with the training-set pixel statistics.

    Subtracts the global `mean_px` and divides by the global `std_px`.
    """
    centered = x - mean_px
    return centered / std_px

# Linear Model 

In [21]:
# Create a simple linear model, equivalent to multinomial logistic (softmax) regression.
# First, the input is normalized.
# Second, it is flattened into a single vector of length 1*28*28 = 784.
# Third, a single dense layer with a softmax activation maps it to the 10 class probabilities.
def get_lin_model():
    """Build and compile a softmax classifier over the flattened, normalized pixels.

    Layers, in order: Lambda(norm_input) -> Flatten -> Dense(10, softmax).

    Returns:
        The compiled Sequential model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    image_shape = (1, 28, 28)

    model = Sequential()
    model.add(Lambda(norm_input, input_shape=image_shape, output_shape=image_shape))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))

    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [22]:
# Fresh, untrained linear model as compiled by get_lin_model
lm = get_lin_model()

In [14]:
# Wrap the in-memory arrays in batch iterators for fit_generator.
# ImageDataGenerator() is constructed with no arguments, so no augmentation options are requested here.
# The same generator object serves both the training and the test arrays.
gen = image.ImageDataGenerator()
batches = gen.flow(x_train, y_train, batch_size = batch_size)
test_batches = gen.flow(x_test, y_test, batch_size = batch_size)

In [34]:
# Train the linear model for 5 epochs, validating against the test batches.
# batches.n / test_batches.n are passed as the per-epoch and validation sample counts.
lm.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch=5, 
                 validation_data=test_batches, nb_val_samples = test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb7c3a7b978>

In [36]:
# Reduce the learning rate since it looks like it is close to the minimum.
# NOTE(review): get_lin_model compiles with Adam() and no explicit lr; Keras documents Adam's
# default lr as 0.001, so this probably leaves the rate unchanged -- confirm the intended value.
# NOTE(review): lm has already been fit once at this point; verify that rebinding the attribute
# (rather than e.g. keras.backend.set_value on it) really changes the rate used by later fits.
lm.optimizer.lr=0.001

In [38]:
# Continue training the linear model for 3 more epochs.
# The generator and per-epoch sample count are passed positionally here.
lm.fit_generator(batches, batches.n, nb_epoch = 3, 
                 validation_data = test_batches, nb_val_samples=test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb7c3a7b080>

There is not much more we can do to improve the accuracy of a purely linear model.

# Single Dense Layer

In [17]:
def get_fc_model():
    """Build and compile a fully connected network with one 512-unit hidden layer.

    The hidden layer uses ReLU. Softmax is kept only on the 10-unit output
    layer: applied to a hidden layer it forces the 512 activations to sum to 1,
    which throttles the signal passed on to the classifier.

    Returns:
        The compiled Sequential model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric) for inputs of shape (1, 28, 28).
    """
    model = Sequential([
        Lambda(norm_input, input_shape = (1,28,28), output_shape = (1,28,28)),
        Flatten(),
        # Hidden layer: ReLU non-linearity (was softmax, which is meant for the output layer)
        Dense(512, activation = 'relu'),
        # Output layer: one probability per digit class
        Dense(10, activation = 'softmax')
    ])
    model.compile(Adam(), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

## Show that you should start with a small learning rate

In [26]:
# Fresh model trained with a deliberately large learning rate (0.1).
# The rate is assigned before this model's first fit_generator call.
fc_test = get_fc_model()
fc_test.optimizer.lr = 0.1
fc_test.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 5, 
                     validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4e03051c50>

As you can see, the result after 5 epochs with lr = 0.1 isn't even close to what 1 epoch with lr = 0.001 achieves.

In [23]:
# Fresh model left at the optimizer settings chosen in get_fc_model (Adam() with no arguments)
fc = get_fc_model()

In [24]:
# Train for 2 epochs with the optimizer exactly as constructed
fc.fit_generator(batches, samples_per_epoch = batches.n, nb_epoch = 2,
                validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4e037b3b70>

In [27]:
# NOTE(review): fc has already been fit once at this point; verify that rebinding the attribute
# (rather than e.g. keras.backend.set_value on it) really changes the rate used by later fits.
fc.optimizer.lr = 0.1

In [29]:
# Train for 3 more epochs following the lr = 0.1 assignment
fc.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 3,
                validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4e03057320>

In [30]:
# NOTE(review): fc has already been fit before this assignment; verify that rebinding the
# attribute actually changes the learning rate used by the fit that follows.
fc.optimizer.lr = 0.01

In [31]:
# Train for another 3 epochs following the lr = 0.01 assignment
fc.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 3,
                validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4e03057358>

It looks like the model is already overfitting with this network architecture.

# Basic VGG-Style CNN 

In [None]:
def get_vgg_model:
    model