This Jupyter notebook was developed by Hang Zhang (@HangZhang6) for a toy project: classifying images as cars or trains.

Three CNN models are trained in this notebook. The first is a simple CNN with 2 convolution layers followed by a 2-layer fully connected neural network.

The second CNN model tries to improve it by adding two max pooling layers, one after each convolution layer. You can see the model accuracy on testing data improves slightly.

The third CNN model uses dropout and an exponentially decayed learning rate to improve the model. The accuracy of this model on the testing data improved to 93.3%.

## Load the data in the pickle file

In [None]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import os

In [None]:
# Location of the pickled train/validation/test splits.
data_root = 'C:\\Projects\\DX\\Data\\Images'
pickle_file = os.path.join(data_root, 'carsTrains.pickle')

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(pickle_file, 'rb') as pickle_handle:
  save = pickle.load(pickle_handle)

# Pull the six arrays out of the loaded object, one split per line.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the array shapes of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Testing set', test_dataset.shape, test_labels.shape)

## Scale the data to the range [-1, 1] if it is not scaled yet

In [None]:
pixel_depth = 255


def _rescale_pixels(dataset):
    """Return `dataset` mapped from raw pixel values onto [-1, 1].

    A dataset whose maximum is not greater than 1 is treated as already
    scaled and is returned unchanged. Raw values are assumed to lie in
    [0, pixel_depth].
    """
    if np.max(dataset) > 1:
        # Divide by a float so the midpoint is 127.5 even where `/` on two
        # ints is floor division (Python 2 without `division` imported from
        # __future__); with 127 the result would not be centred on zero.
        half_depth = pixel_depth / 2.0
        return (dataset - half_depth) / half_depth
    return dataset


train_dataset = _rescale_pixels(train_dataset)
valid_dataset = _rescale_pixels(valid_dataset)
test_dataset = _rescale_pixels(test_dataset)

## Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (height by width by #channels)
- labels as float 1-hot encodings.

In [None]:
# Image geometry and label-space constants used when reshaping the data.
width = 100
height = 80
num_labels = 2
num_channels = 3 # three values per pixel (not grayscale)

import numpy as np

def reformat(dataset, labels):
  """Reshape images to 4-D float32 and one-hot encode the labels.

  Args:
    dataset: array whose elements can be regrouped into
      (-1, height, width, num_channels).
    labels: 1-D array of integer class ids.

  Returns:
    A (images, one_hot_labels) tuple, both float32. A label outside
    range(num_labels) produces an all-zero row.
  """
  images = np.reshape(dataset, (-1, height, width, num_channels))
  class_ids = np.arange(num_labels)
  # Broadcasting a column of labels against the row of class ids yields one
  # boolean row per sample with True in the matching class position.
  one_hot = np.equal(labels[:, np.newaxis], class_ids)
  return images.astype(np.float32), one_hot.astype(np.float32)
# Rebind every split to its reformatted (4-D images, one-hot labels) version.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Print the new shapes of each split.
for _split_name, _images, _targets in (
    ('Training set', train_dataset, train_labels),
    ('Validation set', valid_dataset, valid_labels),
    ('Test set', test_dataset, test_labels)):
  print(_split_name, _images.shape, _targets.shape)

In [None]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label.

  Args:
    predictions: 2-D array of per-class scores, one row per sample.
    labels: 2-D array of the same layout holding the true classes
      (e.g. one-hot rows).

  Returns:
    Accuracy as a float in [0, 100].
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by a fully connected hidden layer and a fully connected output layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [None]:
batch_size = 16
patch_size = 4
depth = 32
num_hidden = 128

graph = tf.Graph()

with graph.as_default():

  # Input data. Training batches are fed at run time; the validation and test
  # sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, height, width, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  keep_prob = tf.placeholder(tf.float32)  # NOTE: never read by this model.

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  # Both convolutions use SAME padding with stride 2, so each one shrinks a
  # spatial dimension n to ceil(n / 2); after two of them it is ceil(n / 4).
  # Each dimension is computed on its own: chaining `//` and `*` in a single
  # expression evaluates left to right and mixes the two dimensions, which is
  # only correct when both divide evenly by 4.
  conv_height = -(-height // 4)  # ceiling division
  conv_width = -(-width // 4)
  layer3_weights = tf.Variable(tf.truncated_normal(
      [conv_height * conv_width * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Return the logits for a batch of images.

    Two stride-2 SAME convolutions with ReLU, a flatten, one ReLU hidden
    layer and a linear output layer.
    """
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    # Flatten each sample's feature maps into one row for the dense layers.
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.02).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [None]:
num_steps = 1001
num_epochs = 10
with tf.Session(graph=graph) as session:
  session.run(tf.global_variables_initializer())
  print('Initialized')
  for ite in range(num_epochs):
    # Draw a fresh random ordering of the training set for this pass.
    order = np.random.permutation(train_labels.shape[0])
    shuffled_dataset = train_dataset[order, :]
    shuffled_labels = train_labels[order]
    # Offsets wrap around so that a full batch always fits.
    wrap_at = shuffled_labels.shape[0] - batch_size
    for step in range(num_steps):
      start = (step * batch_size) % wrap_at
      stop = start + batch_size
      batch_data = shuffled_dataset[start:stop]
      batch_labels = shuffled_labels[start:stop]
      _, batch_loss, predictions = session.run(
        [optimizer, loss, train_prediction],
        feed_dict={tf_train_dataset: batch_data,
                   tf_train_labels: batch_labels})
      if step % 50 != 0:
        continue
      # Progress report every 50 steps.
      print('Minibatch loss at iteration %d step %d: %f' % (ite, step, batch_loss))
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.1f%%' % accuracy(
        session.run(valid_prediction), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(
    session.run(test_prediction), test_labels))

## Improve the CNN model by adding two max_pooling layers, one after each convolution layer

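The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. The exercise is to replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2. Note that the implementation below keeps the stride-2 convolutions and adds a max pooling operation of kernel size 2 and stride 1 after each of them.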

The accuracy increases from 85.7% to 89.3%. 

In [None]:
batch_size = 16
patch_size = 4
depth = 32
num_hidden = 128

graph = tf.Graph()

with graph.as_default():

  # Input data. Training batches are fed at run time; the validation and test
  # sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, height, width, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.2))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  # The two stride-2 SAME convolutions each shrink a spatial dimension n to
  # ceil(n / 2), i.e. ceil(n / 4) overall; the stride-1 SAME pools keep the
  # size. Each dimension is computed on its own: chaining `//` and `*` in a
  # single expression evaluates left to right and mixes the two dimensions,
  # which is only correct when both divide evenly by 4.
  conv_height = -(-height // 4)  # ceiling division
  conv_width = -(-width // 4)
  layer3_weights = tf.Variable(tf.truncated_normal(
      [conv_height * conv_width * depth, num_hidden], stddev=0.2))

  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Return the logits for a batch of images.

    Each of the two stride-2 SAME convolutions is followed by ReLU and a
    2x2 max pool; the result is flattened and passed through one ReLU
    hidden layer and a linear output layer.
    """
    # NOTE(review): the pools use stride 1 while the convolutions keep
    # stride 2, so the convolutions (not the pools) do the downsampling.
    # Swapping the strides would change the trained model, so it is left
    # as is here.
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    layer1_max_pooling = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 1, 1, 1], 'SAME', 'NHWC')
    conv = tf.nn.conv2d(layer1_max_pooling, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    layer2_max_pooling = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 1, 1, 1], 'SAME', 'NHWC')
    # Flatten each sample's feature maps into one row for the dense layers.
    shape = layer2_max_pooling.get_shape().as_list()
    reshape = tf.reshape(layer2_max_pooling, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [None]:
num_steps = 1001
num_epochs = 10
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for ite in range(num_epochs):
    # Draw a fresh random ordering of the training set for this pass.
    permutation = np.random.permutation(train_labels.shape[0])
    shuffled_dataset = train_dataset[permutation,:]
    shuffled_labels = train_labels[permutation]
    for step in range(num_steps):
      # Wrap the offset so that a full batch always fits.
      offset = (step * batch_size) % (shuffled_labels.shape[0] - batch_size)
      batch_data = shuffled_dataset[offset:(offset + batch_size), :, :, :]
      batch_labels = shuffled_labels[offset:(offset + batch_size), :]
      feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
      _, l, predictions = session.run(
        [optimizer, loss, train_prediction], feed_dict=feed_dict)
      if (step % 50 == 0):
        print('Minibatch loss at iteration %d step %d: %f' % (ite, step, l))
        print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
        print('Validation accuracy: %.1f%%' % accuracy(
          valid_prediction.eval(), valid_labels))
  # Run the test set through the network once and reuse the result for both
  # the accuracy report and the variable kept for later inspection.
  test_preds = test_prediction.eval()
  print('Test accuracy: %.1f%%' % accuracy(test_preds, test_labels))

## Show images that are misclassified

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
def show_misclassified_images(test_dataset, test_preds, test_labels):
    """Plot every test image whose predicted class differs from its label.

    Images are drawn two per figure, side by side. When the number of
    misclassified images is odd, the last figure shows the remaining image
    alone with the second panel hidden.

    Args:
        test_dataset: image array indexed as [image, row, column, channel];
            values are expected to be scaled to [-1, 1].
        test_preds: 2-D array of per-class scores, one row per image.
        test_labels: 2-D array of true classes in the same layout
            (e.g. one-hot rows).
    """
    wrong = np.argmax(test_preds, 1) != np.argmax(test_labels, 1)
    error_index = np.arange(test_preds.shape[0])[wrong]
    # Step through the errors two at a time; the final slice may hold one.
    for start in range(0, len(error_index), 2):
        pair = error_index[start:start + 2]
        _, axarr = plt.subplots(1, 2)
        for ax, image_idx in zip(axarr, pair):
            # Undo the [-1, 1] scaling so imshow receives floats in [0, 1].
            # The float divisor keeps the midpoint at pixel_depth / 2 even
            # where `/` on two ints is floor division.
            image_data = (test_dataset[image_idx,:,:,:] * pixel_depth / 2.0
                          + pixel_depth / 2.0) / pixel_depth
            ax.imshow(image_data, interpolation='none', aspect='auto')
        # Hide the empty panel left over when only one image remains.
        for ax in axarr[len(pair):]:
            ax.axis('off')

## Further improve the model by adding dropout and learning rate decay

The test accuracy increases to 93.3% by taking depth = 30, patch_size = 3, number of hidden neurons in the fully connected net = 128, keep_probability = 0.8, and start_lambda = 0.02

In [None]:
batch_size = 16
patch_size = 3
depth = 30
num_hidden = 128

graph = tf.Graph()

with graph.as_default():

  # Input data. Training batches, the dropout keep probability and the initial
  # learning rate are fed at run time; the validation and test sets are
  # embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, height, width, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  keep_prob = tf.placeholder(tf.float32)
  starting_learning_rate = tf.placeholder(tf.float32)
  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  # The two stride-2 SAME convolutions each shrink a spatial dimension n to
  # ceil(n / 2), i.e. ceil(n / 4) overall; the stride-1 SAME pools keep the
  # size. Each dimension is computed on its own: chaining `//` and `*` in a
  # single expression evaluates left to right and mixes the two dimensions,
  # which is only correct when both divide evenly by 4.
  conv_height = -(-height // 4)  # ceiling division
  conv_width = -(-width // 4)
  layer3_weights = tf.Variable(tf.truncated_normal(
      [conv_height * conv_width * depth, num_hidden], stddev=0.1))

  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data, keep_prob):
    """Return the logits for a batch of images.

    Same layout as the max-pooling model, with dropout applied to the
    hidden fully connected layer. Pass keep_prob=1 to disable dropout.
    """
    # NOTE(review): the pools use stride 1 while the convolutions keep
    # stride 2, so the convolutions (not the pools) do the downsampling.
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    layer1_max_pooling = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 1, 1, 1], 'SAME', 'NHWC')
    conv = tf.nn.conv2d(layer1_max_pooling, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    layer2_max_pooling = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 1, 1, 1], 'SAME', 'NHWC')
    # Flatten each sample's feature maps into one row for the dense layers.
    shape = layer2_max_pooling.get_shape().as_list()
    reshape = tf.reshape(layer2_max_pooling, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    drop_out = tf.nn.dropout(hidden, keep_prob)
    return tf.matmul(drop_out, layer4_weights) + layer4_biases

  # Training computation.
  logits = model(tf_train_dataset, keep_prob)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer.
  # The step counter is bookkeeping, not a model parameter.
  global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
  # NOTE(review): with decay_steps=100000 and staircase=True the rate first
  # drops after 100000 optimizer steps -- confirm this is intended for the
  # length of the training run.
  learning_rate = tf.train.exponential_decay(starting_learning_rate, global_step, 100000, 0.95, staircase = True)
  # Hand global_step to minimize() so it is incremented on every update;
  # without it the counter stays at 0 and the learning rate can never decay.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    loss, global_step=global_step)

  # Predictions for the training, validation, and test data.
  # Validation and test use keep_prob=1, i.e. no dropout.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset, 1))
  test_prediction = tf.nn.softmax(model(tf_test_dataset, 1))

In [None]:
num_steps = 1001
num_epochs = 10
keep_p = 0.8         # dropout keep probability fed during training
start_lambda = 0.02  # initial learning rate fed to the decay schedule

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for ite in range(num_epochs):
    # Draw a fresh random ordering of the training set for this pass.
    permutation = np.random.permutation(train_labels.shape[0])
    shuffled_dataset = train_dataset[permutation,:]
    shuffled_labels = train_labels[permutation]
    for step in range(num_steps):
      # Wrap the offset so that a full batch always fits.
      offset = (step * batch_size) % (shuffled_labels.shape[0] - batch_size)
      batch_data = shuffled_dataset[offset:(offset + batch_size), :, :, :]
      batch_labels = shuffled_labels[offset:(offset + batch_size), :]
      feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels,
                   keep_prob : keep_p, starting_learning_rate : start_lambda}
      _, l, predictions = session.run(
        [optimizer, loss, train_prediction], feed_dict=feed_dict)
      if (step % 50 == 0):
        print('Minibatch loss at iteration %d step %d: %f' % (ite, step, l))
        print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
        print('Validation accuracy: %.1f%%' % accuracy(
          valid_prediction.eval(), valid_labels))
  # Run the test set through the network once and reuse the result for both
  # the accuracy report and the variable kept for later inspection.
  test_preds = test_prediction.eval()
  print('Test accuracy: %.1f%%' % accuracy(test_preds, test_labels))

In [None]:
# Plot the misclassified test images using the most recently stored test_preds.
show_misclassified_images(test_dataset, test_preds, test_labels)