Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import time

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set:\t', train_dataset.shape, train_labels.shape)
  print('Validation set:\t', valid_dataset.shape, valid_labels.shape)
  print('Test set:\t', test_dataset.shape, test_labels.shape)

Training set:	 (400000, 28, 28) (400000,)
Validation set:	 (15000, 28, 28) (15000,)
Test set:	 (18000, 28, 28) (18000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  dataset = dataset.reshape(
    (-1, image_size, image_size, num_channels)).astype(np.float32)
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set:\t', train_dataset.shape, train_labels.shape)
print('Validation set:\t', valid_dataset.shape, valid_labels.shape)
print('Test set:\t', test_dataset.shape, test_labels.shape)

Training set:	 (400000, 28, 28, 1) (400000, 10)
Validation set:	 (15000, 28, 28, 1) (15000, 10)
Test set:	 (18000, 28, 28, 1) (18000, 10)


In [4]:
def accuracy(predictions, labels):
  #np.argmax() returns the indices of the maximum values along an axis (axis value starts at 0).
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

The code below constructs two convolutional netowrk layers, and one fully connected network layer, and one output layer. Each of the hidden layers is followed by a RELU layer.

In [5]:
batch_size = 16
patch_size = 5   # Convolutional network patch size.
depth = 16       # Convolutional network output depth.
num_hidden = 64  # Fully connectly hidden layer size. 

graph = tf.Graph()

with graph.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
  # Model.
  def model(data):
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases
  
  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) # model() is an internal function defined above.
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [6]:
num_steps = 1001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  for step in range(num_steps):
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 100 == 0):
      print('Minibatch loss at step %d: %f' % (step, l))
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3.287610
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibatch loss at step 100: 1.811163
Minibatch accuracy: 50.0%
Validation accuracy: 72.9%
Minibatch loss at step 200: 0.826625
Minibatch accuracy: 75.0%
Validation accuracy: 78.1%
Minibatch loss at step 300: 0.176615
Minibatch accuracy: 93.8%
Validation accuracy: 78.5%
Minibatch loss at step 400: 0.975691
Minibatch accuracy: 75.0%
Validation accuracy: 80.3%
Minibatch loss at step 500: 0.891770
Minibatch accuracy: 62.5%
Validation accuracy: 81.6%
Minibatch loss at step 600: 1.051130
Minibatch accuracy: 75.0%
Validation accuracy: 81.0%
Minibatch loss at step 700: 0.209371
Minibatch accuracy: 93.8%
Validation accuracy: 81.4%
Minibatch loss at step 800: 0.704229
Minibatch accuracy: 81.2%
Validation accuracy: 82.2%
Minibatch loss at step 900: 0.382092
Minibatch accuracy: 93.8%
Validation accuracy: 82.8%
Minibatch loss at step 1000: 0.345307
Minibatch accuracy: 87.5%
Validation accuracy: 82.6%


---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [7]:
batch_size = 32
patch_size = 5   # Convolutional network patch size.
depth = 16      # Convolutional network output depth.
num_hidden = 64  # Fully connectly hidden layer size. 

graph = tf.Graph()

with graph.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
  # Model.
  def model(data):
    conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
    conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
    shape = pool.get_shape().as_list()
    reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases
  
  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) # model() is an onternal function defined above.
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [8]:
num_steps = 10001
start_time = time.time()
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  for step in range(num_steps):
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print('Minibatch loss at step %d: %f' % (step, l))
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.1f%%.   Time elapsed:%.1f' % (accuracy(
        valid_prediction.eval(), valid_labels), time.time()-start_time))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
print("\nTime it takes to run the graph with %d:" % num_steps, time.time()-start_time)

Initialized
Minibatch loss at step 0: 4.763687
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%.   Time elapsed:1.9
Minibatch loss at step 500: 0.734541
Minibatch accuracy: 81.2%
Validation accuracy: 82.7%.   Time elapsed:9.7
Minibatch loss at step 1000: 0.415032
Minibatch accuracy: 87.5%
Validation accuracy: 84.5%.   Time elapsed:17.5
Minibatch loss at step 1500: 0.631829
Minibatch accuracy: 78.1%
Validation accuracy: 86.2%.   Time elapsed:25.2
Minibatch loss at step 2000: 0.258624
Minibatch accuracy: 90.6%
Validation accuracy: 86.8%.   Time elapsed:32.9
Minibatch loss at step 2500: 0.522479
Minibatch accuracy: 84.4%
Validation accuracy: 87.9%.   Time elapsed:40.7
Minibatch loss at step 3000: 0.387590
Minibatch accuracy: 93.8%
Validation accuracy: 87.8%.   Time elapsed:48.4
Minibatch loss at step 3500: 0.299874
Minibatch accuracy: 90.6%
Validation accuracy: 88.4%.   Time elapsed:56.1
Minibatch loss at step 4000: 0.501531
Minibatch accuracy: 81.2%
Validation accuracy: 88.4%.   Time 

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

In [12]:
batch_size = 32
patch_size = 5   # Convolutional network patch size.
depth = 16      # Convolutional network output depth.
num_hidden = 64  # Fully connectly hidden layer size. 

graph2 = tf.Graph()

with graph2.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
  # Model.
  def model(data):
    conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(tf.nn.dropout(conv + layer1_biases, keep_prob))
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
    conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(tf.nn.dropout(conv + layer2_biases, keep_prob))
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
    shape = pool.get_shape().as_list()
    reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.nn.dropout(tf.matmul(reshape, layer3_weights) + layer3_biases, keep_prob))
    return tf.matmul(hidden, layer4_weights) + layer4_biases
  
  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    
  # Optimizer.
  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.2, global_step, 200, 0.95, staircase=True, name="Learn_rate_decay")
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
#  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) # model() is an onternal function defined above.
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [13]:
num_steps = 10001
start_time = time.time()
with tf.Session(graph=graph2) as sess:
  tf.initialize_all_variables().run()
  print('Initialized')
  for step in range(num_steps):
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob:0.5}
    _, l, predictions = sess.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print('Minibatch loss at step %d: %f' % (global_step.eval(), l))
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.1f%%.   Time elapsed:%.1f' % (accuracy(
        valid_prediction.eval(), valid_labels), time.time()-start_time))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
print("\nTime it takes to run the graph with %d:" % num_steps, time.time()-start_time)

Initialized
Minibatch loss at step 1: 3.044292
Minibatch accuracy: 3.1%
Validation accuracy: 10.0%.   Time elapsed:1.9
Minibatch loss at step 501: 0.661945
Minibatch accuracy: 81.2%
Validation accuracy: 85.5%.   Time elapsed:9.8
Minibatch loss at step 1001: 0.303825
Minibatch accuracy: 90.6%
Validation accuracy: 86.8%.   Time elapsed:17.5
Minibatch loss at step 1501: 0.616289
Minibatch accuracy: 81.2%
Validation accuracy: 86.7%.   Time elapsed:25.4
Minibatch loss at step 2001: 0.239876
Minibatch accuracy: 90.6%
Validation accuracy: 88.3%.   Time elapsed:33.3
Minibatch loss at step 2501: 0.370436
Minibatch accuracy: 90.6%
Validation accuracy: 88.7%.   Time elapsed:41.1
Minibatch loss at step 3001: 0.400720
Minibatch accuracy: 93.8%
Validation accuracy: 89.1%.   Time elapsed:48.8
Minibatch loss at step 3501: 0.248917
Minibatch accuracy: 87.5%
Validation accuracy: 89.4%.   Time elapsed:56.6
Minibatch loss at step 4001: 0.437089
Minibatch accuracy: 87.5%
Validation accuracy: 89.6%.   Time 