# Deep Learning

## Assignment 3

Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in 1_notmnist.ipynb.

In [3]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)



Reformat into a shape that's more adapted to the models we're going to train:

- data as a flat matrix,
- labels as float 1-hot encodings.

In [4]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [5]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

## Problem 1

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor t using nn.l2_loss(t). The right amount of regularization should improve your validation / test accuracy.

### Logistic model

In [6]:
# Inspo: https://markojerkic.com/build-a-multi-layer-neural-network-with-l2-regularization-with-tensorflow/
# Subset the training data for faster turnaround
train_subset = 10000

# Beta for L2 regularization
beta = 0.01

graph = tf.Graph()
with graph.as_default():

    # Input data
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation
    logits = tf.matmul(tf_train_dataset, weights) + biases
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))
    # L2 on weights
    reg = tf.nn.l2_loss(weights)
    # L2 loss
    loss = (tf.reduce_mean(loss + reg * beta))
    
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training, validation, and test data
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [7]:
num_steps = 801

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

with tf.Session(graph=graph) as session:
    # This is a one-time operation which ensures the parameters get initialized as
    # we described in the graph: random weights for the matrix, zeros for the
    # biases. 
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        # Run the computations. We tell .run() that we want to run the optimizer,
        # and get the loss value and the training predictions returned as numpy
        # arrays.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(
                predictions, train_labels[:train_subset, :]))
            # Calling .eval() on valid_prediction is basically like calling run(), but
            # just to get that one numpy array. Note that it recomputes all its graph
            # dependencies.
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 48.419266
Training accuracy: 11.8%
Validation accuracy: 13.1%
Loss at step 100: 11.957874
Training accuracy: 73.9%
Validation accuracy: 72.2%
Loss at step 200: 4.527286
Training accuracy: 79.1%
Validation accuracy: 76.8%
Loss at step 300: 2.001214
Training accuracy: 82.5%
Validation accuracy: 79.8%
Loss at step 400: 1.142560
Training accuracy: 83.7%
Validation accuracy: 81.4%
Loss at step 500: 0.847472
Training accuracy: 84.4%
Validation accuracy: 82.3%
Loss at step 600: 0.744908
Training accuracy: 84.5%
Validation accuracy: 82.5%
Loss at step 700: 0.708933
Training accuracy: 84.6%
Validation accuracy: 82.7%
Loss at step 800: 0.696218
Training accuracy: 84.6%
Validation accuracy: 82.8%
Test accuracy: 88.6%


### Neural Network

In [8]:
hidden_nodes = 1024 
batch_size = 128

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    weights_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes]))
    biases_1 = tf.Variable(tf.zeros([hidden_nodes]))
    weights_2 = tf.Variable(tf.truncated_normal([hidden_nodes, num_labels]))
    biases_2 = tf.Variable(tf.zeros([num_labels]))
    
    hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)
    
    
    # Training computation
    logits = tf.matmul(hidden_layer, weights_2) + biases_2
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))
    # L2 on weights
    regs = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2) 
    # L2 loss
    loss = (tf.reduce_mean(loss + regs * beta))
    
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1), weights_2) + biases_2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1), weights_2) + biases_2)

In [9]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")  
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.   
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.    
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
            
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3558.861572
Minibatch accuracy: 2.3%
Validation accuracy: 36.1%
Minibatch loss at step 500: 21.285677
Minibatch accuracy: 86.7%
Validation accuracy: 84.4%
Minibatch loss at step 1000: 1.116940
Minibatch accuracy: 73.4%
Validation accuracy: 83.6%
Minibatch loss at step 1500: 0.692980
Minibatch accuracy: 82.8%
Validation accuracy: 83.6%
Minibatch loss at step 2000: 0.724194
Minibatch accuracy: 85.2%
Validation accuracy: 84.1%
Minibatch loss at step 2500: 0.760707
Minibatch accuracy: 83.6%
Validation accuracy: 83.2%
Minibatch loss at step 3000: 0.815905
Minibatch accuracy: 81.2%
Validation accuracy: 84.3%
Test accuracy: 90.0%


## Problem 2

Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

In [10]:
train_dataset.shape

(200000, 784)

In [11]:
train_dataset_batch = train_dataset[:500, :]
train_labels_batch = train_labels[:500] 

In [12]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")  
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels_batch.shape[0] - batch_size)
        # Generate a minibatch.   
        batch_data = train_dataset_batch[offset:(offset + batch_size), :]
        batch_labels = train_labels_batch[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.    
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
            
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3478.993164
Minibatch accuracy: 10.9%
Validation accuracy: 34.2%
Minibatch loss at step 500: 21.043390
Minibatch accuracy: 100.0%
Validation accuracy: 78.0%
Minibatch loss at step 1000: 0.474626
Minibatch accuracy: 100.0%
Validation accuracy: 78.8%
Minibatch loss at step 1500: 0.305464
Minibatch accuracy: 100.0%
Validation accuracy: 78.6%
Minibatch loss at step 2000: 0.291283
Minibatch accuracy: 100.0%
Validation accuracy: 78.4%
Minibatch loss at step 2500: 0.282924
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Minibatch loss at step 3000: 0.273904
Minibatch accuracy: 100.0%
Validation accuracy: 78.6%
Test accuracy: 85.0%


Seems like the accuracy is really good on traning data but crappy on the validation set. Typical case of overfitting I would say....

## Problem 3

Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

### Neural Network

In [13]:
hidden_nodes = 1024 
batch_size = 128

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    weights_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes]))
    biases_1 = tf.Variable(tf.zeros([hidden_nodes]))
    weights_2 = tf.Variable(tf.truncated_normal([hidden_nodes, num_labels]))
    biases_2 = tf.Variable(tf.zeros([num_labels]))
    
    hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)
    
    
    # Training computation
    logits = tf.matmul(hidden_layer, weights_2) + biases_2
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))
    # L2 on weights
    regs = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2) 
    # L2 loss
    loss = (tf.reduce_mean(loss + regs * beta))
    # Dropout for training data
    keep_prob = tf.placeholder('float32')
    dropout = tf.nn.dropout(hidden_layer, keep_prob)
    
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1), weights_2) + biases_2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1), weights_2) + biases_2)

In [14]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")  
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.   
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.    
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
            
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3475.812012
Minibatch accuracy: 8.6%
Validation accuracy: 26.9%
Minibatch loss at step 500: 21.309294
Minibatch accuracy: 85.9%
Validation accuracy: 84.3%
Minibatch loss at step 1000: 1.111368
Minibatch accuracy: 75.8%
Validation accuracy: 83.6%
Minibatch loss at step 1500: 0.694707
Minibatch accuracy: 82.0%
Validation accuracy: 83.7%
Minibatch loss at step 2000: 0.722223
Minibatch accuracy: 84.4%
Validation accuracy: 84.2%
Minibatch loss at step 2500: 0.756295
Minibatch accuracy: 84.4%
Validation accuracy: 83.2%
Minibatch loss at step 3000: 0.818760
Minibatch accuracy: 81.2%
Validation accuracy: 84.2%
Test accuracy: 90.1%


### Extreme overfitting

In [15]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")  
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels_batch.shape[0] - batch_size)
        # Generate a minibatch.   
        batch_data = train_dataset_batch[offset:(offset + batch_size), :]
        batch_labels = train_labels_batch[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.    
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
            
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3442.605469
Minibatch accuracy: 7.8%
Validation accuracy: 35.5%
Minibatch loss at step 500: 21.038683
Minibatch accuracy: 100.0%
Validation accuracy: 78.1%
Minibatch loss at step 1000: 0.473515
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 1500: 0.304299
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 2000: 0.290945
Minibatch accuracy: 100.0%
Validation accuracy: 78.5%
Minibatch loss at step 2500: 0.282359
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 3000: 0.273758
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Test accuracy: 84.9%


## Problem 4

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.

One avenue you can explore is to add multiple layers.

In [17]:
hidden_nodes_1 = 500 
hidden_nodes_2 = 500
batch_size = 128

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    weights_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes_1]))
    biases_1 = tf.Variable(tf.zeros([hidden_nodes_1]))
    weights_2 = tf.Variable(tf.truncated_normal([hidden_nodes_1, num_labels]))
    biases_2 = tf.Variable(tf.zeros([hidden_nodes_2]))
    weights_3 = tf.Variable(tf.truncated_normal([hidden_nodes_2, num_labels]))
    biases_3 = tf.Variable(tf.zeros([num_labels]))
    
    hidden_layer_1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)
    hidden_layer_2 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_2) + biases_2)
    
    
    # Training computation
    logits = tf.matmul(hidden_layer_1, weights_2) + biases_2
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits))
    # L2 on weights
    regs = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2) 
    # L2 loss
    loss = (tf.reduce_mean(loss + regs * beta))
    # Dropout for training data
    keep_prob = tf.placeholder('float32')
    dropout = tf.nn.dropout(hidden_layer, keep_prob)
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    
    # Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss)
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1), weights_2) + biases_2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1), weights_2) + biases_2)

ValueError: Dimensions must be equal, but are 784 and 1024 for 'MatMul_1' (op: 'MatMul') with input shapes: [128,784], [1024,10].

In [None]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")  
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels_batch.shape[0] - batch_size)
        # Generate a minibatch.   
        batch_data = train_dataset_batch[offset:(offset + batch_size), :]
        batch_labels = train_labels_batch[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.    
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
            
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))