# The Architecture
This notebook applies a LeNet-5-style convolutional architecture to the MNIST digit images. The original LeNet-5 took 32x32 inputs; here the images are fed at their native 28x28 size (see `IMAGE_SIZE` below).

For this architecture, the layers are:
1. Convolution
2. Max Pooling
3. Convolution 
4. Max Pooling
5. Fully connected
6. Fully connected
7. Output layer

Define the basic constants for the LeNet-5 model

In [1]:
import os
import argparse
import gzip
import sys
import time

import numpy
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

In [2]:
# Base URL of the MNIST archives; download() appends the archive name to it.
SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
# Local directory in which download() caches the archives.
WORK_DIRECTORY = 'data/mnist_data'
# Number of channels per image; last axis of the image tensors.
NUM_CHANNELS = 1
# Largest raw pixel value (pixels are read as unsigned bytes, 0-255).
PIXEL_DEPTH = 255
# Height and width, in pixels, of each square image.
IMAGE_SIZE = 28
# Number of classes; width of the final layer's output.
NUM_LABELS = 10
VALIDATION_SIZE = 5000  # Size of the validation set.
SEED = 66478  # Set to None for random seed.
# Number of examples fed to the graph per training step.
BATCH_SIZE = 64
# Number of passes over the training set.
NUM_EPOCHS = 10
# Number of examples fed to the graph per evaluation batch.
EVAL_BATCH_SIZE = 64
EVAL_FREQUENCY = 100  # Number of steps between evaluations.

# Helper Functions

In [3]:
def download(filename):
    """
    Download the data from Yann's website, unless it's already here.

    The archive is first fetched under a temporary '.part' name and only
    renamed to its final name once the transfer has finished, so an
    interrupted download is never mistaken for a complete cached file on
    the next run.

    Args:
        filename: name of the archive, relative to both SOURCE_URL and
            WORK_DIRECTORY.

    Returns:
        The local path of the archive inside WORK_DIRECTORY.
    """
    # create the local cache directory if it is missing
    if not os.path.exists(WORK_DIRECTORY):
        os.makedirs(WORK_DIRECTORY)
    # create target filename
    filepath = os.path.join(WORK_DIRECTORY, filename)
    # only hit the network when the archive is not cached yet
    if not os.path.exists(filepath):
        partial_path = filepath + '.part'
        try:
            urllib.request.urlretrieve(SOURCE_URL + filename, partial_path)
            # publish the file under its final name only after a full transfer
            os.rename(partial_path, filepath)
        finally:
            # a leftover partial file means the transfer failed; discard it
            if os.path.exists(partial_path):
                os.remove(partial_path)
    # get the size of the (downloaded or cached) file
    size = os.stat(filepath).st_size
    # print out the file name and its size
    print('Successfully downloaded', filename, size, 'bytes.')
    return filepath

In [4]:
def extract_data(filename, num_images):
    """
    Extract the images into a 4D tensor [image index, y, x, channels].
    Values are rescaled from [0, PIXEL_DEPTH] down to [-0.5, 0.5].

    Args:
        filename: path of the gzip-compressed image file.
        num_images: number of images to read from the file.

    Returns:
        A float32 numpy array of shape
        (num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS).
    """
    # print the file currently extracting
    print('Extracting', filename)
    # open the file
    with gzip.open(filename) as bytestream:
        # discard the 16-byte header
        bytestream.read(16)
        # read one byte per pixel for every requested image
        buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
        data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
    # The original data consists of pixels ranging from 0 to PIXEL_DEPTH.
    # Shift by half the range and divide by the range so values span [-0.5, 0.5].
    data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE,
                        NUM_CHANNELS)
    return data

In [5]:
def extract_labels(filename, num_images):
    """
    Read the label file and return the labels as a vector of int64 IDs.

    Args:
        filename: path of the gzip-compressed label file.
        num_images: number of label bytes to read after the header.

    Returns:
        A 1-D numpy array of dtype int64, one entry per byte read.
    """
    # announce the file being processed
    print('Extracting', filename)
    header_size = 8
    with gzip.open(filename) as label_stream:
        # skip over the header, then grab one byte per label
        label_stream.read(header_size)
        raw_labels = label_stream.read(num_images)
    # each byte is one label; widen it to int64
    label_bytes = numpy.frombuffer(raw_labels, dtype=numpy.uint8)
    return label_bytes.astype(numpy.int64)

In [6]:
def error_rate(predictions, labels):
    """
    Compute the percentage of misclassified examples.

    Args:
        predictions: 2-D array of per-class scores, one row per example.
        labels: class index of each example.

    Returns:
        100 minus the percentage of rows whose highest-scoring column
        equals the corresponding label.
    """
    predicted_classes = numpy.argmax(predictions, 1)
    num_correct = numpy.sum(predicted_classes == labels)
    num_examples = predictions.shape[0]
    return 100.0 - (100.0 * num_correct / num_examples)

In [7]:
def model( data, train=False ):
    """
    The definition of our CNN model.

    Applies convolution -> ReLU -> max pooling twice, flattens the pooled
    feature maps, runs them through a ReLU fully connected layer (with
    dropout while training) and finishes with a linear layer.

    Args:
        data: batch of images laid out as [image_index, y, x, depth]. The
            batch dimension must be statically known because it is used to
            flatten the pooled feature maps.
        train: when True, 50% dropout is applied to the hidden layer.

    Returns:
        The output of the final linear layer (softmax is applied by the
        callers, not here).
    """
    # we use SAME padding so that the output feature map has the same size as the input
    # stride is a 4D array whose shape matches the data layout of [image_index, y, x, depth]
    conv1 = tf.nn.conv2d( data, conv1_weights, strides=[1,1,1,1], padding="SAME" )
    # add the bias term and wrap the result in a rectified linear non-linearity
    relu1 = tf.nn.relu( tf.nn.bias_add( conv1, conv1_biases ) )
    # max pooling. we use a pooling window of 2 and a stride of 2.
    # ksize follows the layout of the data
    pool1 = tf.nn.max_pool( relu1, ksize=[1,2,2,1], strides=[1,2,2,1], padding="SAME" )
    # second convolution / ReLU / pooling stage, same settings as the first
    conv2 = tf.nn.conv2d( pool1, conv2_weights, strides=[1,1,1,1], padding="SAME" )
    relu2 = tf.nn.relu( tf.nn.bias_add( conv2, conv2_biases ) )
    pool2 = tf.nn.max_pool( relu2, ksize=[1,2,2,1], strides=[1,2,2,1], padding="SAME" )
    pool_shape = pool2.get_shape().as_list()
    # reshape the feature map cuboid into a 2D matrix to feed it to the fully connected layers
    reshape = tf.reshape( pool2, [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]] )
    # note that the "+" broadcasts the biases appropriately
    hidden = tf.nn.relu( tf.matmul(reshape, fc1_weights) + fc1_biases )
    # if we're training
    if train:
        # add a 50% dropout. dropout scales activations such that no rescaling is needed at evaluation time
        # `rate` is the fraction dropped (rate = 1 - keep_prob); passing it by
        # keyword avoids the deprecated positional `keep_prob` argument.
        hidden = tf.nn.dropout( hidden, rate=0.5, seed=SEED )
    return tf.matmul( hidden, fc2_weights ) + fc2_biases

# Get the Data

In [8]:
# Fetch the four MNIST archives: training images/labels, then test images/labels.
(train_data_filename,
 train_labels_filename,
 test_data_filename,
 test_labels_filename) = [
    download(archive_name)
    for archive_name in ('train-images-idx3-ubyte.gz',
                         'train-labels-idx1-ubyte.gz',
                         't10k-images-idx3-ubyte.gz',
                         't10k-labels-idx1-ubyte.gz')
]

# Decode the archives into numpy arrays (60000 training and 10000 test examples).
train_data = extract_data(train_data_filename, 60000)
train_labels = extract_labels(train_labels_filename, 60000)
test_data = extract_data(test_data_filename, 10000)
test_labels = extract_labels(test_labels_filename, 10000)

# Hold out the first VALIDATION_SIZE training examples as the validation
# set and keep the remainder for training.
validation_data, train_data = (train_data[:VALIDATION_SIZE, ...],
                               train_data[VALIDATION_SIZE:, ...])
validation_labels, train_labels = (train_labels[:VALIDATION_SIZE],
                                   train_labels[VALIDATION_SIZE:])

# Number of training examples left after the split, and epochs to train for.
train_size = train_labels.shape[0]
num_epochs = NUM_EPOCHS

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting data/mnist_data/train-images-idx3-ubyte.gz
Extracting data/mnist_data/train-labels-idx1-ubyte.gz
Extracting data/mnist_data/t10k-images-idx3-ubyte.gz
Extracting data/mnist_data/t10k-labels-idx1-ubyte.gz


Placeholders to feed data into CNN

In [9]:
# This is where training samples and labels are fed to the graph.
# These placeholder nodes will be fed a batch of training data at each
# training step using the {feed_dict} argument to the Run() call below.
# Both training placeholders have a fixed leading dimension of BATCH_SIZE,
# so every fed batch must contain exactly BATCH_SIZE examples.
train_data_node = tf.placeholder(tf.float32,shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
# Input node used for evaluation; it takes batches of exactly
# EVAL_BATCH_SIZE images.
eval_data = tf.placeholder(tf.float32,shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))

Define the learnable weights for the convolutional layers

In [10]:
# First convolution: 5x5 filters over NUM_CHANNELS input channels producing
# 32 output feature maps, initialised from a truncated normal (stddev 0.1).
conv1_weights = tf.Variable(tf.truncated_normal([5, 5, NUM_CHANNELS, 32],
                                                stddev=0.1,
                                                seed=SEED, dtype=tf.float32))
# One bias per output feature map of the first convolution, starting at zero.
conv1_biases = tf.Variable(tf.zeros([32], dtype=tf.float32))
# Second convolution: 5x5 filters mapping the 32 feature maps to 64.
conv2_weights = tf.Variable(tf.truncated_normal([5, 5, 32, 64], stddev=0.1, seed=SEED, dtype=tf.float32))
# One bias per output feature map of the second convolution, starting at 0.1.
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=tf.float32))

Define the learnable weights for the fully connected layers

In [11]:
# First fully connected layer. Its input is the flattened output of the
# second pooling layer: the two SAME-padded, stride-2 poolings shrink each
# spatial side to ceil(IMAGE_SIZE / 4), and the second convolution yields 64
# feature maps. Each factor is parenthesised explicitly: the unparenthesised
# form `IMAGE_SIZE // 4 * IMAGE_SIZE // 4` evaluates left to right and is
# only correct when IMAGE_SIZE is a multiple of 4.
fc1_weights = tf.Variable(tf.truncated_normal(
    [((IMAGE_SIZE + 3) // 4) * ((IMAGE_SIZE + 3) // 4) * 64, 512],
    stddev=0.1, seed=SEED, dtype=tf.float32))
# One bias per hidden unit, starting at 0.1.
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=tf.float32))
# Output layer: maps the 512 hidden units to one value per class.
fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], stddev=0.1, seed=SEED, dtype=tf.float32))
fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS], dtype=tf.float32))

# Create the Model

In [12]:
# Training graph: run the model with dropout enabled (train=True) and
# average the per-example cross-entropy between its output and the labels.
logits = model(train_data_node, True)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=train_labels_node, logits=logits))

# L2 regularization for the fully connected parameters.
regularizers = (tf.nn.l2_loss(fc1_weights)
                + tf.nn.l2_loss(fc1_biases)
                + tf.nn.l2_loss(fc2_weights)
                + tf.nn.l2_loss(fc2_biases))
# Add the regularization term to the loss.
loss += 5e-4 * regularizers

# Optimizer: set up a variable that's incremented once per batch and
# controls the learning rate decay.
batch = tf.Variable(0, dtype=tf.float32)
# Decay once per epoch, using an exponential schedule starting at 0.01.
learning_rate = tf.train.exponential_decay(
    0.01,                # Base learning rate.
    batch * BATCH_SIZE,  # Current index into the dataset.
    train_size,          # Decay step.
    0.95,                # Decay rate.
    staircase=True)
# Use simple momentum for the optimization.
# Passing `batch` as global_step makes each optimizer run increment it.
optimizer = tf.train.MomentumOptimizer(learning_rate,
                                       0.9).minimize(loss,
                                                     global_step=batch)

# Predictions for the current training minibatch.
train_prediction = tf.nn.softmax(logits)

# Predictions for the test and validation, which we'll compute less
# often.
# model() is called with its default train=False here, so no dropout is
# applied; it reuses the same weight variables as the training graph.
eval_prediction = tf.nn.softmax(model(eval_data))

W0801 16:50:08.032005 140709209909056 deprecation.py:506] From <ipython-input-7-1bd9a9572416>:24: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


# Train the Model

In [13]:
# Small utility function to evaluate a dataset by feeding batches of
# data to {eval_data} and pulling the results from {eval_prediction}.
# Saves memory and enables this to run on smaller GPUs.
def eval_in_batches(data, sess):
    """Get predictions for a dataset by running it in small batches.

    Args:
        data: array of examples, indexed by example along the first axis.
        sess: session whose run() evaluates `eval_prediction` for a batch
            fed through `eval_data`.

    Returns:
        A float32 numpy array of shape (number of examples, NUM_LABELS)
        holding the predictions for every example.

    Raises:
        ValueError: if `data` holds fewer than EVAL_BATCH_SIZE examples.
    """
    size = data.shape[0]
    if size < EVAL_BATCH_SIZE:
        raise ValueError("batch size for evals larger than dataset: %d"
                         % size)
    predictions = numpy.ndarray(shape=(size, NUM_LABELS),
                                dtype=numpy.float32)
    for begin in xrange(0, size, EVAL_BATCH_SIZE):
        end = begin + EVAL_BATCH_SIZE
        if end <= size:
            # a full batch fits: evaluate it and store it in place
            predictions[begin:end, :] = sess.run(
                eval_prediction,
                feed_dict={eval_data: data[begin:end, ...]})
        else:
            # Final, partial batch: the graph only accepts full batches, so
            # evaluate the last EVAL_BATCH_SIZE examples and keep only the
            # rows that have not been filled in yet.
            batch_predictions = sess.run(
                eval_prediction,
                feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
            predictions[begin:, :] = batch_predictions[begin - size:, :]
    return predictions

In [15]:
# Create a local session to run the training.
# start_time marks the beginning of the current timing interval; it is reset
# every time progress is reported.
start_time = time.time()
with tf.Session() as sess:
    # Run all the initializers to prepare the trainable parameters.
    tf.global_variables_initializer().run()
    # Loop through training steps.
    # The step count is the number of whole batches in num_epochs passes.
    for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
        # Compute the offset of the current minibatch in the data.
        # Note that we could use better randomization across epochs.
        # The modulo keeps offset + BATCH_SIZE within the training set, so
        # every slice below holds exactly BATCH_SIZE examples.
        offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
        batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
        batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
        # This dictionary maps the batch data (as a numpy array) to the
        # node in the graph it should be fed to.
        feed_dict = {train_data_node: batch_data,
                     train_labels_node: batch_labels}
        # Run the optimizer to update weights.
        sess.run(optimizer, feed_dict=feed_dict)
        # print some extra information once reach the evaluation frequency
        if step % EVAL_FREQUENCY == 0:
            # fetch some extra nodes' data
            # NOTE: this is a second run on the same batch, performed after
            # the weight update above.
            l, lr, predictions = sess.run([loss, learning_rate,
                                         train_prediction],
                                        feed_dict=feed_dict)
            elapsed_time = time.time() - start_time
            start_time = time.time()
            # The reported time is the interval averaged over EVAL_FREQUENCY
            # steps; at step 0 only one step has run, so that first figure
            # is not a true per-step time.
            print('Step %d (epoch %.2f), %.1f ms' %
                (step, float(step) * BATCH_SIZE / train_size,
                 1000 * elapsed_time / EVAL_FREQUENCY))
            print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
            print('Minibatch error: %.1f%%'
                % error_rate(predictions, batch_labels))
            print('Validation error: %.1f%%' % error_rate(
              eval_in_batches(validation_data, sess), validation_labels))
            # flush so progress shows up immediately
            sys.stdout.flush()
    # Finally print the result!
    # The test set is evaluated once, after all training steps have run.
    test_error = error_rate(eval_in_batches(test_data, sess),
                          test_labels)
    print('Test error: %.1f%%' % test_error)

Step 0 (epoch 0.00), 4.0 ms
Minibatch loss: 8.334, learning rate: 0.010000
Minibatch error: 85.9%
Validation error: 84.6%
Step 100 (epoch 0.12), 41.5 ms
Minibatch loss: 3.266, learning rate: 0.010000
Minibatch error: 7.8%
Validation error: 8.6%
Step 200 (epoch 0.23), 40.5 ms
Minibatch loss: 3.380, learning rate: 0.010000
Minibatch error: 10.9%
Validation error: 4.4%
Step 300 (epoch 0.35), 40.8 ms
Minibatch loss: 3.170, learning rate: 0.010000
Minibatch error: 3.1%
Validation error: 3.0%
Step 400 (epoch 0.47), 40.7 ms
Minibatch loss: 3.185, learning rate: 0.010000
Minibatch error: 7.8%
Validation error: 2.7%
Step 500 (epoch 0.58), 41.0 ms
Minibatch loss: 3.170, learning rate: 0.010000
Minibatch error: 3.1%
Validation error: 2.6%
Step 600 (epoch 0.70), 40.3 ms
Minibatch loss: 3.143, learning rate: 0.010000
Minibatch error: 3.1%
Validation error: 2.1%
Step 700 (epoch 0.81), 40.8 ms
Minibatch loss: 2.978, learning rate: 0.010000
Minibatch error: 1.6%
Validation error: 2.2%
Step 800 (epoch 

Validation error: 0.9%
Step 6700 (epoch 7.80), 40.6 ms
Minibatch loss: 1.781, learning rate: 0.006983
Minibatch error: 0.0%
Validation error: 0.9%
Step 6800 (epoch 7.91), 43.0 ms
Minibatch loss: 1.779, learning rate: 0.006983
Minibatch error: 0.0%
Validation error: 0.9%
Step 6900 (epoch 8.03), 37.4 ms
Minibatch loss: 1.759, learning rate: 0.006634
Minibatch error: 0.0%
Validation error: 0.9%
Step 7000 (epoch 8.15), 35.4 ms
Minibatch loss: 1.748, learning rate: 0.006634
Minibatch error: 0.0%
Validation error: 1.0%
Step 7100 (epoch 8.26), 35.4 ms
Minibatch loss: 1.734, learning rate: 0.006634
Minibatch error: 0.0%
Validation error: 0.9%
Step 7200 (epoch 8.38), 35.3 ms
Minibatch loss: 1.731, learning rate: 0.006634
Minibatch error: 0.0%
Validation error: 0.9%
Step 7300 (epoch 8.49), 34.9 ms
Minibatch loss: 1.742, learning rate: 0.006634
Minibatch error: 1.6%
Validation error: 0.9%
Step 7400 (epoch 8.61), 35.2 ms
Minibatch loss: 1.700, learning rate: 0.006634
Minibatch error: 0.0%
Validati