Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in _notmist.ipynb_.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [23]:
print(tf.reduce_mean.__doc__)

Computes the mean of elements across dimensions of a tensor.

  Reduces `input_tensor` along the dimensions given in `reduction_indices`.
  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
  entry in `reduction_indices`. If `keep_dims` is true, the reduced dimensions
  are retained with length 1.

  If `reduction_indices` has no entries, all dimensions are reduced, and a
  tensor with a single element is returned.

  For example:

  ```python
  # 'x' is [[1., 1.]
  #         [2., 2.]]
  tf.reduce_mean(x) ==> 1.5
  tf.reduce_mean(x, 0) ==> [1.5, 1.5]
  tf.reduce_mean(x, 1) ==> [1.,  2.]
  ```

  Args:
    input_tensor: The tensor to reduce. Should have numeric type.
    reduction_indices: The dimensions to reduce. If `None` (the default),
      reduces all dimensions.
    keep_dims: If true, retains reduced dimensions with length 1.
    name: A name for the operation (optional).

  Returns:
    The reduced tensor.
  


In [37]:
learning_rate = 0.5 
hidden_layer_size = 1024 
batch_size = 128 
l2_loss_param = 5e-4

def construct_network(layers, dataset):
    result = dataset
    for (W, b) in layers[:-1]:
        result = tf.nn.relu(tf.matmul(result, W) + b) 
    (W, b) = layers[-1]
    result = tf.matmul(result, W) + b 
    return result 

graph = tf.Graph() 
with graph.as_default():
    print('Initializing graph...')
    
    tf_train_dataset = tf.placeholder(tf.float32,
                                     shape=(batch_size, image_size * image_size))
    tf_train_labels  = tf.placeholder(tf.float32,
                                     shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset  = tf.constant(test_dataset)
    
    W1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layer_size])) 
    b1 = tf.Variable(tf.zeros([hidden_layer_size])) 
    
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_size, num_labels])) 
    b2 = tf.Variable(tf.zeros([num_labels])) 
    
    layers = [
        (W1, b1),
        (W2, b2)
    ]
    
    logits = construct_network(layers, tf_train_dataset)
#     loss = tf.reduce_mean(
#         tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
#     loss = tf.nn.l2_loss(
#         tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 
    
    l2 = loss + l2_loss_param * (
        tf.nn.l2_loss(W2) + tf.nn.l2_loss(b2) + 
        tf.nn.l2_loss(W1) + tf.nn.l2_loss(b1))
#     l2 = loss + tf.nn.l2_loss(logits)
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(l2)
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(construct_network(layers, tf_valid_dataset))
    test_prediction  = tf.nn.softmax(construct_network(layers, tf_test_dataset))
    
    print('Graph has been constructed.')

Initializing graph...
Graph has been constructed.


In [38]:
num_steps = 3001 
with tf.Session(graph=graph) as session:
    print('Running Session...')
    tf.initialize_all_variables().run() 
    
    for i in range(num_steps):
        offset = (i * batch_size) % (train_labels.shape[0] - batch_size)
        
        batch_data   = train_dataset[offset:(offset + batch_size), :]
        batch_labels =  train_labels[offset:(offset + batch_size), :] 
        
        feed_dict = {
            tf_train_dataset: batch_data,
            tf_train_labels : batch_labels
        }
        
        _, l, predictions = session.run(
            [optimizer, l2, train_prediction],
            feed_dict=feed_dict
        )
        
        if (i % 500 == 0):
            print('Minibatch loss at step %d: %f' % (i, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
            
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
    print('Done.')

Running Session...
Minibatch loss at step 0: 475.666565
Minibatch accuracy: 13.3%
Validation accuracy: 25.2%
Minibatch loss at step 500: 136.505508
Minibatch accuracy: 77.3%
Validation accuracy: 80.9%
Minibatch loss at step 1000: 97.512383
Minibatch accuracy: 83.6%
Validation accuracy: 79.8%
Minibatch loss at step 1500: 77.550247
Minibatch accuracy: 80.5%
Validation accuracy: 79.6%
Minibatch loss at step 2000: 56.881653
Minibatch accuracy: 89.1%
Validation accuracy: 83.1%
Minibatch loss at step 2500: 44.061432
Minibatch accuracy: 82.8%
Validation accuracy: 83.6%
Minibatch loss at step 3000: 34.282406
Minibatch accuracy: 89.1%
Validation accuracy: 85.0%
Test accuracy: 90.7%
Done.


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [42]:
# num_steps = 100 
num_steps = 10 
batch_size = 128 

with tf.Session(graph=graph) as session:
    print('Starting training...')
    tf.initialize_all_variables().run() 
    
    for i in range(num_steps):
        offset = (i * batch_size) % (train_labels.shape[0] - batch_size)
        
        batch_data   = train_dataset[offset:(offset + batch_size), :]
        batch_labels =  train_labels[offset:(offset + batch_size), :] 
        
        feed_dict = {
            tf_train_dataset: batch_data,
            tf_train_labels : batch_labels
        }
        
        _, l, predictions = session.run(
            [optimizer, l2, train_prediction],
            feed_dict=feed_dict
        )
        
#         if (i % 10 == 0):
        print('Minibatch loss at step %d: %f' % (i, l))
        print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
        print('Validation accuracy: %.1f%%' % accuracy(
            valid_prediction.eval(), valid_labels))
            
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
    print('Done.')

Starting training...
Minibatch loss at step 0: 535.725525
Minibatch accuracy: 13.3%
Validation accuracy: 35.5%
Minibatch loss at step 1: 1335.718506
Minibatch accuracy: 38.3%
Validation accuracy: 27.1%
Minibatch loss at step 2: 1616.163940
Minibatch accuracy: 25.0%
Validation accuracy: 42.7%
Minibatch loss at step 3: 821.125732
Minibatch accuracy: 43.0%
Validation accuracy: 43.5%
Minibatch loss at step 4: 1148.818604
Minibatch accuracy: 38.3%
Validation accuracy: 56.0%
Minibatch loss at step 5: 592.169006
Minibatch accuracy: 46.9%
Validation accuracy: 58.2%
Minibatch loss at step 6: 489.319641
Minibatch accuracy: 54.7%
Validation accuracy: 68.0%
Minibatch loss at step 7: 330.089294
Minibatch accuracy: 66.4%
Validation accuracy: 70.1%
Minibatch loss at step 8: 311.330048
Minibatch accuracy: 62.5%
Validation accuracy: 69.1%
Minibatch loss at step 9: 380.531158
Minibatch accuracy: 62.5%
Validation accuracy: 77.0%
Test accuracy: 83.5%
Done.


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [52]:
learning_rate = 0.5 
# l2_loss_param = 5e-5 
l2_loss_param = 5e-4 
hidden_layer_size = 1024 

keep_prob = 0.5 
seed = 19584901239 

graph = tf.Graph() 
with graph.as_default():
    print('Constructing Graph...')
    
    tf_train_dataset = tf.placeholder(tf.float32, 
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels  = tf.placeholder(tf.float32,
                                      shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset) 
    tf_test_dataset  = tf.constant(test_dataset) 
    
    #####################
    # Initialize weight tensors 
    W1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layer_size]))
    b1 = tf.Variable(tf.zeros([hidden_layer_size]))
    
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_size, num_labels]))
    b2 = tf.Variable(tf.zeros([num_labels]))
    
    #####################
    # Create a new instance of the model 
    def model(dataset, training=False):
        h1 = tf.nn.relu(tf.matmul(dataset, W1) + b1)
        
        if training:
            h1 = tf.nn.dropout(h1, keep_prob=keep_prob, seed=seed) 
        
        y = tf.matmul(h1, W2) + b2
        return y 
    
    logits = model(tf_train_dataset, training=True)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
    
    regularizers = tf.nn.l2_loss(W1) + tf.nn.l2_loss(b1) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(b2) 
    l2 = loss + l2_loss_param * regularizers 
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(l2) 
    
    train_prediction = tf.nn.softmax(logits) 
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction  = tf.nn.softmax(model(tf_test_dataset)) 
    
    print('Done.')

Constructing Graph...
Done.


In [53]:
num_steps = 3001 

with tf.Session(graph=graph) as session:
    print('Starting training...')
    tf.initialize_all_variables().run() 
    
    for i in range(num_steps):
        offset = (i * batch_size) % (train_labels.shape[0] - batch_size)
        
        batch_data  = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {
            tf_train_dataset: batch_data,
            tf_train_labels : batch_labels 
        }
        
        _, l, predictions = session.run(
            [optimizer, l2, train_prediction],
            feed_dict=feed_dict
        )
        
        if i % 500 == 0:
            print("Minibatch loss at step %d: %f" % (i, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print('Done.')

Starting training...
Minibatch loss at step 0: 631.504395
Minibatch accuracy: 14.1%
Validation accuracy: 29.8%
Minibatch loss at step 500: 143.889221
Minibatch accuracy: 74.2%
Validation accuracy: 80.1%
Minibatch loss at step 1000: 112.561066
Minibatch accuracy: 75.0%
Validation accuracy: 79.7%
Minibatch loss at step 1500: 87.577209
Minibatch accuracy: 68.8%
Validation accuracy: 79.1%
Minibatch loss at step 2000: 59.714672
Minibatch accuracy: 79.7%
Validation accuracy: 81.9%
Minibatch loss at step 2500: 45.772717
Minibatch accuracy: 73.4%
Validation accuracy: 82.1%
Minibatch loss at step 3000: 34.943115
Minibatch accuracy: 85.2%
Validation accuracy: 83.3%
Test accuracy: 89.5%
Done.


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [6]:
keep_prob = 0.7
batch_size = 128 
seed = None
l2_loss_param = 5e-4 
init_learning_rate = 0.001
momentum = 0.001
decay_step = 100 
decay_rate = 0.96 

h1_size = 1024
h2_size = 1024
# h3_size = num_labels * 2

graph = tf.Graph() 
with graph.as_default(): 
    print('Constructing Graph...') 
    
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size)) 
    tf_train_labels  = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 
    tf_valid_dataset = tf.constant(valid_dataset) 
    tf_test_dataset  = tf.constant(test_dataset) 
    
    tf_keep_prob = tf.constant(keep_prob) 
    
    
    ############################
    # NN Layers 
    W1 = tf.Variable(tf.truncated_normal([image_size * image_size, h1_size])) 
    b1 = tf.Variable(tf.zeros([h1_size])) 
    
    W2 = tf.Variable(tf.truncated_normal([h1_size, h2_size])) 
    b2 = tf.Variable(tf.zeros([h2_size])) 
    
    W3 = tf.Variable(tf.truncated_normal([h2_size, num_labels])) 
    b3 = tf.Variable(tf.zeros([num_labels])) 
    
#     W4 = tf.Variable(tf.truncated_normal([h3_size, num_labels]))
#     b4 = tf.Variable(tf.zeros([num_labels]))
    
    
    ############################
    # Construct model 
    def model(dataset, training=False):
        h1 = tf.nn.relu(tf.matmul(dataset, W1) + b1)
#         if training:
#             h1 = tf.nn.dropout(h1, tf_keep_prob, seed=seed)
            
        h2 = tf.nn.relu(tf.matmul(h1, W2) + b2) 
#         if training:
#             h2 = tf.nn.dropout(h2, tf_keep_prob, seed=seed) 
            
#         h3 = tf.nn.relu(tf.matmul(h2, W3) + b3)
#         if training:
#             h3 = tf.nn.dropout(h3, tf_keep_prob, seed=seed)
        
#         h4 = tf.matmul(h3, W4) + b4
#         return h4 
        
        h3 = tf.matmul(h2, W3) + b3
        if training:
            h3 = tf.nn.dropout(h3, tf_keep_prob, seed=seed)
        return h3
    
    
    logits = model(tf_train_dataset, training=True) 
    
    regularizers = (tf.nn.l2_loss(W1) + tf.nn.l2_loss(b1) + 
                    tf.nn.l2_loss(W2) + tf.nn.l2_loss(b2) + 
                    tf.nn.l2_loss(W3) + tf.nn.l2_loss(b3)) 
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 
    l2 = loss + l2_loss_param * regularizers 
    
    
    # Create learning rate decay 
    global_step = tf.Variable(0) 
    learning_rate = tf.train.exponential_decay(
        init_learning_rate, global_step, decay_step, decay_rate) 
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(l2, global_step=global_step) 
#     optimizer = tf.train.MomentumOptimizer(learning_rate, momentum).minimize(l2) 
    
    train_prediction = tf.nn.softmax(logits) 
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) 
    test_prediction  = tf.nn.softmax(model(tf_test_dataset)) 
    
    print('Done.') 




Constructing Graph...
Done.


In [None]:

num_steps = 3001 
with tf.Session(graph=graph) as session:
    print('Training model...') 
    tf.initialize_all_variables().run() 
    
    for i in range(num_steps):
        offset = (i * batch_size) % (train_labels.shape[0] - batch_size)
        
        batch_data  = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {
            tf_train_dataset: batch_data,
            tf_train_labels : batch_labels 
        }
        
        _, l, predictions = session.run(
            [optimizer, l2, train_prediction],
            feed_dict=feed_dict
        )
        
        if i % 500 == 0:
            print("Minibatch loss at step %d: %f" % (i, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print('Done.')

Training model...
Minibatch loss at step 0: 8974.763672
Minibatch accuracy: 5.5%
Validation accuracy: 9.7%
Minibatch loss at step 500: 1042.474487
Minibatch accuracy: 64.8%
Validation accuracy: 78.3%
Minibatch loss at step 1000: 772.798828
Minibatch accuracy: 66.4%
Validation accuracy: 79.8%
Minibatch loss at step 1500: 836.063477
Minibatch accuracy: 64.1%
Validation accuracy: 80.6%
Minibatch loss at step 2000: 671.718994
Minibatch accuracy: 66.4%
Validation accuracy: 81.3%

In [176]:
print(tf.train.exponential_decay.__doc__)

Applies exponential decay to the learning rate.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies an exponential decay function
  to a provided initial learning rate.  It requires a `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  decayed_learning_rate = learning_rate *
                          decay_rate ^ (global_step / decay_steps)
  ```

  If the argument `staircase` is `True`, then `global_step /decay_steps` is an
  integer division and the decayed learning rate follows a staircase function.

  Example: decay every 100000 steps with a base of 0.96:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,