# Udacity Deep Learning Project - Digit Recognition - Part 2

Author: Lei Mao <br>
Last revised on: 12/23/2016 <br>

## Convolutional Neural Network (CNN) for multiMNIST Dataset

### Import libraries required

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os
from six.moves import cPickle as pickle

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

In [2]:
%autosave 0

Autosave disabled


### Load multiMNIST dataset

In [3]:
# Read the pre-built multiMNIST splits from multiMNIST.pickle
multiMNIST_folder = 'data/multiMNIST/'
pickle_file = 'multiMNIST_continuous_80000.pickle'

# SECURITY NOTE: unpickling can execute arbitrary code stored in the file.
# Only load pickle files that were generated locally or come from a trusted
# source.
with open(os.path.join(multiMNIST_folder, pickle_file), 'rb') as f:
    print('Loading multiMNIST data ...')
    print('This may consume a lot of memories.')
    save = pickle.load(f)

# The file handle is no longer needed once the dictionary has been loaded, so
# the splits are unpacked after the file is closed.
multiMNIST_train_dataset = save['train_dataset']
multiMNIST_train_labels = save['train_labels']
multiMNIST_valid_dataset = save['valid_dataset']
multiMNIST_valid_labels = save['valid_labels']
multiMNIST_test_dataset = save['test_dataset']
multiMNIST_test_labels = save['test_labels']
del save  # hint to help gc free up memory

print('multiMNIST training set', multiMNIST_train_dataset.shape, multiMNIST_train_labels.shape)
print('multiMNIST validation set', multiMNIST_valid_dataset.shape, multiMNIST_valid_labels.shape)
print('multiMNIST test set', multiMNIST_test_dataset.shape, multiMNIST_test_labels.shape)

Loading multiMNIST data ...
This may consume a lot of memories.
('multiMNIST training set', (64000, 28, 140), (64000, 5))
('multiMNIST validation set', (8000, 28, 140), (8000, 5))
('multiMNIST test set', (8000, 28, 140), (8000, 5))


Reformat into a TensorFlow-friendly shape:
* convolutions need the image data formatted as a cube (height by width by #channels)
* labels as 1-hot encodings: one matrix for the sequence length plus one matrix per digit position.

In [4]:
# Sequence-length classifier: one class for each length 0..length_limit plus
# one extra class for "more than length_limit".
length_limit = 5
num_length_class = 2 + length_limit

# Digit classifiers: classes 0-9 are the digits, class 10 marks a blank.
num_digit_class = 11

# Image geometry: 28 rows by 28 * 5 columns, one gray-scale channel.
num_image_rows = 28
num_image_columns = 5 * 28
num_channels = 1

def onehot_encoding(array, num_class):
    '''Turn a 1-D sequence of class indices into a one-hot encoded matrix.

    Args:
        array: 1-D sequence (list or numpy array) of integer class indices.
        num_class: number of columns of the result, i.e. number of classes.

    Returns:
        Integer numpy array of shape (len(array), num_class) in which row i
        holds a single 1 in column array[i] and 0 everywhere else.

    Raises:
        IndexError: if an index does not fit into num_class columns.
    '''
    indices = np.asarray(array)
    # The builtin int is what the removed np.int alias used to point to.
    onehot_encoded = np.zeros(shape=(len(indices), num_class), dtype=int)
    if len(indices):
        # Set one entry per row in a single vectorized assignment.
        onehot_encoded[np.arange(len(indices)), indices] = 1
    return onehot_encoded

def labels_reformat(dataset, labels):
    '''Reshape the images for convolution and one-hot encode their labels.

    Args:
        dataset: image array that can be reshaped to
            [num_images, num_image_rows, num_image_columns, num_channels].
        labels: 2-D integer array with one row per image and one column per
            digit position; the value 10 marks a blank position.

    Returns:
        A (dataset, labelset) pair. dataset is the reshaped image array.
        labelset is a list whose first element is the one-hot encoded
        sequence length (num_length_class columns) and whose remaining
        elements are the one-hot encoded digits of each position
        (num_digit_class columns), in column order.
    '''
    dataset = dataset.reshape((-1, num_image_rows, num_image_columns, num_channels))
    # The sequence length is the number of non-blank (!= 10) positions per row.
    length = np.sum(labels != 10, axis=1)
    labelset = [onehot_encoding(array=length, num_class=num_length_class)]
    labelset.extend(
        onehot_encoding(array=labels[:, position], num_class=num_digit_class)
        for position in range(labels.shape[1]))
    return dataset, labelset

In [5]:
# Reshape every split for the CNN and one-hot encode its labels.
train_dataset, train_labelset = labels_reformat(
    dataset=multiMNIST_train_dataset, labels=multiMNIST_train_labels)
valid_dataset, valid_labelset = labels_reformat(
    dataset=multiMNIST_valid_dataset, labels=multiMNIST_valid_labels)
test_dataset, test_labelset = labels_reformat(
    dataset=multiMNIST_test_dataset, labels=multiMNIST_test_labels)

In [6]:
def probabilities_to_label(probabilities):
    """Return the most likely class index for every row of *probabilities*.

    Args:
        probabilities: array-like with one row per sample and the classes
            along axis 1; rows may be one-hot encodings or probability
            distributions.

    Returns:
        Numpy array holding, for every row, the index of its largest entry.
    """
    return np.argmax(probabilities, axis=1)
def accuracy(prediction_digits, labels):
    """Return the fraction of samples whose digits are all predicted correctly.

    Args:
        prediction_digits: array-like indexed as [digit position, sample,
            class] holding the class probabilities of every sample at every
            digit position.
        labels: array with one row per sample and one column per digit
            position, holding the true class indices.

    Returns:
        Float: the number of samples whose whole predicted row equals the
        label row, divided by len(labels).

    Raises:
        ZeroDivisionError: if labels is empty.
    """
    # Arg-max over the class axis gives [position, sample]; transpose so each
    # row holds the complete predicted sequence of one sample.
    labels_predicted = np.argmax(prediction_digits, axis=2).T
    # A sample only counts as correct when every position matches.
    num_correct = sum(
        np.array_equal(labels_predicted[i], row) for i, row in enumerate(labels))
    return float(num_correct) / len(labels)

### Build Deep Convolutional Neural Network using Tensorflow

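Two convolutional layers, followed by one fully connected hidden layer and six softmax classifiers (one for the sequence length and five for the digits).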

In [7]:
# Hyper-parameters
batch_size = 64   # samples per training step
patch_size = 6    # side length of the square convolution kernels
depth = 16        # feature maps produced by each convolution
num_hidden = 128  # units of the fully connected hidden layer

graph = tf.Graph()

with graph.as_default():

    # tf.pack was renamed tf.stack; use whichever name this TensorFlow
    # release provides.
    _stack = tf.stack if hasattr(tf, 'stack') else tf.pack

    # Input data. The training batch and its targets are fed at run time. The
    # target list holds the sequence-length target first, followed by one
    # target per digit position.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, num_image_rows, num_image_columns, num_channels))
    tf_train_labelset = list()
    tf_train_labelset.append(tf.placeholder(tf.float32, shape=(batch_size, num_length_class)))
    for i in range(length_limit):
        tf_train_labelset.append(tf.placeholder(tf.float32, shape=(batch_size, num_digit_class)))

    # The whole validation and test sets are stored in the graph as constants.
    tf_valid_dataset = tf.constant(valid_dataset, dtype=tf.float32)
    tf_test_dataset = tf.constant(test_dataset, dtype=tf.float32)

    # Variables of the two convolutional layers
    cnn_1_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1),
        name='CNN1_W')
    cnn_1_biases = tf.Variable(tf.zeros([depth]), name='CNN1_B')

    cnn_2_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1),
        name='CNN2_W')
    cnn_2_biases = tf.Variable(tf.constant(1.0, shape=[depth]), name='CNN2_B')

    # Variables of the fully connected hidden layer. Its input is the
    # flattened output of the second convolution. Rows and columns are each
    # integer-divided by 4 on their own (two stride-2 convolutions) before
    # being multiplied, so the result does not depend on operator precedence.
    cnn_3_weights = tf.Variable(
        tf.truncated_normal(
            [(num_image_rows // 4) * (num_image_columns // 4) * depth, num_hidden],
            stddev=0.1),
        name='CNN3_W')
    cnn_3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]), name='CNN3_B')

    # Variables of the output classifiers: index 0 predicts the sequence
    # length, indices 1..length_limit predict the digit at each position.
    logistic_weights = list()
    logistic_biases = list()
    logistic_weights.append(
        tf.Variable(tf.truncated_normal([num_hidden, num_length_class], stddev=0.1), name='H0_W'))
    logistic_biases.append(tf.Variable(tf.zeros([num_length_class]), name='H0_B'))

    for i in range(length_limit):
        logistic_weights.append(tf.Variable(
            tf.truncated_normal([num_hidden, num_digit_class], stddev=0.1),
            name='H' + str(i + 1) + '_W'))
        logistic_biases.append(tf.Variable(
            tf.constant(1.0, shape=[num_digit_class]), name='H' + str(i + 1) + '_B'))

    # Model
    def model(data):
        """Return the logits of all classifiers for a batch of images.

        Args:
            data: float tensor of images whose static shape is fully known
                (the batch dimension is used to flatten the feature maps).

        Returns:
            List of length_limit + 1 logit tensors: element 0 belongs to the
            sequence-length classifier, elements 1..length_limit to the
            digit classifiers.
        """
        conv = tf.nn.conv2d(data, cnn_1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + cnn_1_biases)
        conv = tf.nn.conv2d(hidden, cnn_2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + cnn_2_biases)
        # Flatten the feature maps of every sample into one vector.
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, cnn_3_weights) + cnn_3_biases)

        logits = list()
        for i in range(length_limit + 1):
            logits.append(tf.matmul(hidden, logistic_weights[i]) + logistic_biases[i])

        return logits

    def _predictions(logits):
        """Return (length probabilities, stacked digit probabilities).

        Applies a softmax to every logit tensor returned by model(); the
        digit probabilities are stacked along a new leading axis, one slice
        per digit position.
        """
        prediction_length = tf.nn.softmax(logits[0])
        prediction_digits = list()
        for i in range(length_limit):
            prediction_digits.append(tf.nn.softmax(logits[i + 1]))
        return prediction_length, _stack(prediction_digits)

    # Training computation: the loss is the sum of the mean cross-entropies
    # of the length classifier and of every digit classifier.
    train_logits = model(tf_train_dataset)

    train_loss = 0
    for i in range(length_limit + 1):
        # logits/labels are passed by keyword because their positional order
        # is not the same in every TensorFlow release.
        train_loss = train_loss + tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=train_logits[i], labels=tf_train_labelset[i]))

    # Optimizer: Adagrad with a learning rate that starts at 0.05 and is
    # multiplied by 0.8 every 10000 steps (staircase decay).
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        learning_rate=0.05, global_step=global_step, decay_steps=10000, decay_rate=0.8,
        staircase=True)
    optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(
        train_loss, global_step=global_step)

    # Predictions (sequence length and digits) for the training batch
    train_prediction_length, train_prediction_digits = _predictions(train_logits)

    # Predictions for the validation set
    valid_logits = model(tf_valid_dataset)
    valid_prediction_length, valid_prediction_digits = _predictions(valid_logits)

    # Predictions for the test set
    test_logits = model(tf_test_dataset)
    test_prediction_length, test_prediction_digits = _predictions(test_logits)

    # Predictions for a single image fed at run time
    tf_test_single = tf.placeholder(
        tf.float32, shape=(1, num_image_rows, num_image_columns, num_channels))
    test_logits_single = model(tf_test_single)
    test_prediction_length_single, test_prediction_digits_single = _predictions(
        test_logits_single)

    saver = tf.train.Saver()

In [8]:
num_steps = 100001
model_path = 'model/'
if not os.path.exists(model_path):
    os.makedirs(model_path)

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        # Walk through the training set in consecutive mini-batches, starting
        # over once the offset would run past the end.
        offset = (step * batch_size) % (train_dataset.shape[0] - batch_size)
        batch_dataset = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labelset = [
            onehot_labels[offset:(offset + batch_size), :] for onehot_labels in train_labelset]
        # Pair every label placeholder with the matching slice of labels.
        feed_dict = {tf_train_dataset: batch_dataset}
        feed_dict.update(zip(tf_train_labelset, batch_labelset))
        _, l, prediction_length, prediction_digits = session.run(
            [optimizer, train_loss, train_prediction_length, train_prediction_digits],
            feed_dict=feed_dict)
        # Report progress every 500 steps; the validation accuracy is
        # computed on the whole validation set each time.
        if (step % 500 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%'
                  % (accuracy(prediction_digits=prediction_digits,
                              labels=multiMNIST_train_labels[offset:(offset + batch_size), :]) * 100))
            print('Validation accuracy: %.1f%%'
                  % (accuracy(prediction_digits=valid_prediction_digits.eval(),
                              labels=multiMNIST_valid_labels) * 100))

    print('Test accuracy: %.1f%%'
          % (accuracy(prediction_digits=test_prediction_digits.eval(),
                      labels=multiMNIST_test_labels) * 100))
    # Store the trained variables so the model can be restored later.
    save_path = saver.save(sess=session, save_path=os.path.join(model_path, 'CNN_multiMNIST'))
    print("Model saved in file: %s" % save_path)

Initialized
Minibatch loss at step 0: 1413.333740
Minibatch accuracy: 0.0%
Validation accuracy: 0.0%
Minibatch loss at step 500: 3.323476
Minibatch accuracy: 29.7%
Validation accuracy: 30.9%
Minibatch loss at step 1000: 2.593884
Minibatch accuracy: 42.2%
Validation accuracy: 43.3%
Minibatch loss at step 1500: 1.708480
Minibatch accuracy: 60.9%
Validation accuracy: 52.2%
Minibatch loss at step 2000: 1.626800
Minibatch accuracy: 57.8%
Validation accuracy: 59.5%
Minibatch loss at step 2500: 1.576269
Minibatch accuracy: 57.8%
Validation accuracy: 63.0%
Minibatch loss at step 3000: 1.253385
Minibatch accuracy: 70.3%
Validation accuracy: 66.0%
Minibatch loss at step 3500: 1.345247
Minibatch accuracy: 62.5%
Validation accuracy: 69.0%
Minibatch loss at step 4000: 1.102809
Minibatch accuracy: 67.2%
Validation accuracy: 71.3%
Minibatch loss at step 4500: 1.002624
Minibatch accuracy: 73.4%
Validation accuracy: 72.9%
Minibatch loss at step 5000: 0.940568
Minibatch accuracy: 71.9%
Validation accura

After some preliminary parameter exploration, the prediction accuracy on the test set reaches at least 86.0%. I did not spend much time exploring the hyper-parameters (for example adding more CNN layers, or tuning batch_size, patch_size, depth, num_hidden and num_steps), because my desktop does not have a GPU for the compute-intensive matrix calculations.

I saw overfitting in the later stages of training. We could employ regularization during training to mitigate this problem. But overall, the CNN works for this problem.