Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [21]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [22]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [23]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

Logistic regression 
--------

In [5]:
# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000

graph = tf.Graph()
with graph.as_default():

  # Input data.
  # Load the training, validation and test data into constants that are
  # attached to the graph.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  # These are the parameters that we are going to be training. The weight
  # matrix will be initialized using random values following a (truncated)
  # normal distribution. The biases get initialized to zero.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  # We multiply the inputs with the weight matrix, and add biases. We compute
  # the softmax and cross-entropy (it's one operation in TensorFlow, because
  # it's very common, and it can be optimized). We take the average of this
  # cross-entropy across all training examples: that's our loss.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits)) + 0.005*tf.nn.l2_loss(weights)
  
  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [6]:
num_steps = 801

def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

with tf.Session(graph=graph) as session:
  # This is a one-time operation which ensures the parameters get initialized as
  # we described in the graph: random weights for the matrix, zeros for the
  # biases. 
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the loss value and the training predictions returned as numpy
    # arrays.

    _, l, predictions = session.run([optimizer, loss, train_prediction])
    if (step % 100 == 0):
      print('Loss at step %d: %f' % (step, l))
      print('Training accuracy: %.1f%%' % accuracy(
        predictions, train_labels[:train_subset, :]))
      # Calling .eval() on valid_prediction is basically like calling run(), but
      # just to get that one numpy array. Note that it recomputes all its graph
      # dependencies.
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 34.431984
Training accuracy: 8.5%
Validation accuracy: 9.6%
Loss at step 100: 10.591539
Training accuracy: 72.2%
Validation accuracy: 71.3%
Loss at step 200: 6.278162
Training accuracy: 76.6%
Validation accuracy: 75.0%
Loss at step 300: 3.848575
Training accuracy: 79.4%
Validation accuracy: 76.9%
Loss at step 400: 2.471738
Training accuracy: 81.4%
Validation accuracy: 78.8%
Loss at step 500: 1.691110
Training accuracy: 83.0%
Validation accuracy: 80.1%
Loss at step 600: 1.246318
Training accuracy: 83.8%
Validation accuracy: 81.0%
Loss at step 700: 0.991098
Training accuracy: 84.6%
Validation accuracy: 81.4%
Loss at step 800: 0.843680
Training accuracy: 84.8%
Validation accuracy: 81.9%
Test accuracy: 88.0%


Neural network
--------

#### light version 


In [11]:
batch_size = 128
hidden_layers_size = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    weights = tf.Variable(tf.truncated_normal([hidden_layers_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    hidden_weights = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layers_size]))
    hidden_biases = tf.Variable(tf.truncated_normal([hidden_layers_size]))
    
    logit_hidden = tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases
    
    logit = tf.matmul(tf.nn.relu(logit_hidden), weights) + biases

    # Training computation.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logit)) \
            + 0.01*tf.nn.l2_loss(weights) + 0.01*tf.nn.l2_loss(hidden_weights) 

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logit)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases), weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases), weights) + biases)


In [12]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 3463.336426
Minibatch accuracy: 13.3%
Validation accuracy: 27.9%
Minibatch loss at step 500: 1153.818237
Minibatch accuracy: 81.2%
Validation accuracy: 78.3%
Minibatch loss at step 1000: 421.323700
Minibatch accuracy: 81.2%
Validation accuracy: 82.7%
Minibatch loss at step 1500: 154.846878
Minibatch accuracy: 82.8%
Validation accuracy: 83.9%
Minibatch loss at step 2000: 57.057449
Minibatch accuracy: 90.6%
Validation accuracy: 85.7%
Minibatch loss at step 2500: 21.505241
Minibatch accuracy: 82.8%
Validation accuracy: 85.9%
Minibatch loss at step 3000: 8.318549
Minibatch accuracy: 85.9%
Validation accuracy: 85.1%
Test accuracy: 90.7%


#### heavy version 

In [7]:
batch_size = 128
hidden_layers_size = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    input_weights = tf.Variable(tf.truncated_normal([image_size * image_size, batch_size]))
    input_biases = tf.Variable(tf.truncated_normal([batch_size]))
     
    hidden_weights = tf.Variable(tf.truncated_normal([batch_size, hidden_layers_size]))
    hidden_biases = tf.Variable(tf.truncated_normal([hidden_layers_size]))
    
    output_weights = tf.Variable(tf.truncated_normal([hidden_layers_size, num_labels]))
    output_biases = tf.Variable(tf.zeros([num_labels]))

    
    def network(input): 
        logit_input = tf.matmul(input, input_weights) + input_biases
        logit_hidden = tf.matmul(tf.nn.relu(logit_input), hidden_weights) + hidden_biases    
        logit_output = tf.matmul(logit_hidden, output_weights) + output_biases
        
        return logit_output + 0.01*tf.nn.l2_loss(input_weights) + 0.01*tf.nn.l2_loss(hidden_weights) \
                + 0.01*tf.nn.l2_loss(output_weights)

    # Training computation.
    predictions = network(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=predictions))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.0001).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(predictions)
    valid_prediction = tf.nn.softmax(network(tf_valid_dataset))
    test_prediction = tf.nn.softmax(network(tf_test_dataset))


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [8]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

    

Initialized
Minibatch loss at step 0: 3270.537109
Minibatch accuracy: 13.3%
Validation accuracy: 10.9%
Minibatch loss at step 500: 504.530029
Minibatch accuracy: 66.4%
Validation accuracy: 62.2%
Minibatch loss at step 1000: 423.991638
Minibatch accuracy: 66.4%
Validation accuracy: 68.4%
Minibatch loss at step 1500: 519.006836
Minibatch accuracy: 64.1%
Validation accuracy: 70.4%
Minibatch loss at step 2000: 278.909546
Minibatch accuracy: 71.9%
Validation accuracy: 71.7%
Minibatch loss at step 2500: 383.797058
Minibatch accuracy: 66.4%
Validation accuracy: 72.7%
Minibatch loss at step 3000: 214.970551
Minibatch accuracy: 75.8%
Validation accuracy: 73.5%
Test accuracy: 80.5%


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [24]:
print(train_dataset.shape)
print(train_labels.shape)
train_dataset1 = train_dataset[:256, :]
train_labels1 = train_labels[:256, :]
print(train_dataset1.shape)
print(train_labels.shape)


batch_size = 128
hidden_layers_size = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    weights = tf.Variable(tf.truncated_normal([hidden_layers_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    hidden_weights = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layers_size]))
    hidden_biases = tf.Variable(tf.truncated_normal([hidden_layers_size]))
    
    logit_hidden = tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases
    
    logit = tf.matmul(tf.nn.relu(logit_hidden), weights) + biases

    # Training computation.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logit)) \
            + 0.01*tf.nn.l2_loss(weights) + 0.01*tf.nn.l2_loss(hidden_weights) 

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logit)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases), weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases), weights) + biases)


(200000, 784)
(200000, 10)
(256, 784)
(256, 10)


In [26]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels1.shape[0] - batch_size)

        # Generate a minibatch.
        batch_data = train_dataset1[offset:(offset + batch_size), :]
        batch_labels = train_labels1[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3585.023193
Minibatch accuracy: 10.2%
Validation accuracy: 22.1%
Minibatch loss at step 500: 1159.299194
Minibatch accuracy: 100.0%
Validation accuracy: 57.5%
Minibatch loss at step 1000: 426.268768
Minibatch accuracy: 100.0%
Validation accuracy: 56.9%
Minibatch loss at step 1500: 156.739532
Minibatch accuracy: 100.0%
Validation accuracy: 57.5%
Minibatch loss at step 2000: 57.648376
Minibatch accuracy: 100.0%
Validation accuracy: 61.6%
Minibatch loss at step 2500: 21.250650
Minibatch accuracy: 100.0%
Validation accuracy: 67.5%
Minibatch loss at step 3000: 7.905726
Minibatch accuracy: 100.0%
Validation accuracy: 69.4%
Test accuracy: 75.9%


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

#### Normal case 

In [40]:
batch_size = 128
hidden_layers_size = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    weights = tf.Variable(tf.truncated_normal([hidden_layers_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    hidden_weights = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layers_size]))
    hidden_biases = tf.Variable(tf.truncated_normal([hidden_layers_size]))
    
    logit_hidden = tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases
    
    logit = tf.matmul(tf.nn.relu(tf.nn.dropout(logit_hidden, 0.8)), weights) + biases

    # Training computation.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logit)) \
            + 0.01*tf.nn.l2_loss(weights) + 0.01*tf.nn.l2_loss(hidden_weights) 

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logit)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases), weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases), weights) + biases)


In [41]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

    

Initialized
Minibatch loss at step 0: 3528.815674
Minibatch accuracy: 5.5%
Validation accuracy: 25.3%
Minibatch loss at step 500: 1156.549316
Minibatch accuracy: 78.1%
Validation accuracy: 82.1%
Minibatch loss at step 1000: 421.678162
Minibatch accuracy: 75.0%
Validation accuracy: 84.2%
Minibatch loss at step 1500: 154.540054
Minibatch accuracy: 79.7%
Validation accuracy: 83.0%
Minibatch loss at step 2000: 57.067448
Minibatch accuracy: 85.2%
Validation accuracy: 85.3%
Minibatch loss at step 2500: 21.497766
Minibatch accuracy: 81.2%
Validation accuracy: 85.4%
Minibatch loss at step 3000: 8.293703
Minibatch accuracy: 86.7%
Validation accuracy: 84.8%
Test accuracy: 90.5%


#### Overfitting case 

In [35]:
print(train_dataset.shape)
print(train_labels.shape)
train_dataset1 = train_dataset[:256, :]
train_labels1 = train_labels[:256, :]
print(train_dataset1.shape)
print(train_labels.shape)


batch_size = 128
hidden_layers_size = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    weights = tf.Variable(tf.truncated_normal([hidden_layers_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    hidden_weights = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layers_size]))
    hidden_biases = tf.Variable(tf.truncated_normal([hidden_layers_size]))
    
    logit_hidden = tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases
    
    logit = tf.matmul(tf.nn.relu(tf.nn.dropout(logit_hidden, 0.8)), weights) + biases

    # Training computation.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logit)) \
            + 0.01*tf.nn.l2_loss(weights) + 0.01*tf.nn.l2_loss(hidden_weights) 

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logit)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases), weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases), weights) + biases)


(200000, 784)
(200000, 10)
(256, 784)
(200000, 10)


In [36]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels1.shape[0] - batch_size)

        # Generate a minibatch.
        batch_data = train_dataset1[offset:(offset + batch_size), :]
        batch_labels = train_labels1[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3554.497314
Minibatch accuracy: 3.1%
Validation accuracy: 27.0%
Minibatch loss at step 500: 1154.585205
Minibatch accuracy: 100.0%
Validation accuracy: 64.0%
Minibatch loss at step 1000: 424.539520
Minibatch accuracy: 100.0%
Validation accuracy: 65.1%
Minibatch loss at step 1500: 156.103851
Minibatch accuracy: 100.0%
Validation accuracy: 64.9%
Minibatch loss at step 2000: 57.407085
Minibatch accuracy: 100.0%
Validation accuracy: 66.0%
Minibatch loss at step 2500: 21.156561
Minibatch accuracy: 100.0%
Validation accuracy: 67.8%
Minibatch loss at step 3000: 7.874798
Minibatch accuracy: 100.0%
Validation accuracy: 69.3%
Test accuracy: 75.6%


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [49]:
batch_size = 128
hidden_layers_size = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    weights = tf.Variable(tf.truncated_normal([hidden_layers_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    hidden_weights = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layers_size]))
    hidden_biases = tf.Variable(tf.truncated_normal([hidden_layers_size]))
    
    logit_hidden = tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases
    
    logit = tf.matmul(tf.nn.relu(tf.nn.dropout(logit_hidden, 0.8)), weights) + biases

    # Training computation.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logit)) \
            + 0.01*tf.nn.l2_loss(weights) + 0.01*tf.nn.l2_loss(hidden_weights) 

    # Optimizer.
    #optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
    
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, 500, 0.1)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logit)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases), weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases), weights) + biases)


In [53]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels }
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

    

Initialized
0
0
Minibatch loss at step 0: 3473.745117
Minibatch accuracy: 13.3%
Validation accuracy: 34.1%
128
1
256
2
384
3
512
4
640
5
768
6
896
7
1024
8
1152
9
1280
10
1408
11
1536
12
1664
13
1792
14
1920
15
2048
16
2176
17
2304
18
2432
19
2560
20
2688
21
2816
22
2944
23
3072
24
3200
25
3328
26
3456
27
3584
28
3712
29
3840
30
3968
31
4096
32
4224
33
4352
34
4480
35
4608
36
4736
37
4864
38
4992
39
5120
40
5248
41
5376
42
5504
43
5632
44
5760
45
5888
46
6016
47
6144
48
6272
49
6400
50
6528
51
6656
52
6784
53
6912
54
7040
55
7168
56
7296
57
7424
58
7552
59
7680
60
7808
61
7936
62
8064
63
8192
64
8320
65
8448
66
8576
67
8704
68
8832
69
8960
70
9088
71
9216
72
9344
73
9472
74
9600
75
9728
76
9856
77
9984
78
10112
79
10240
80
10368
81
10496
82
10624
83
10752
84
10880
85
11008
86
11136
87
11264
88
11392
89
11520
90
11648
91
11776
92
11904
93
12032
94
12160
95
12288
96
12416
97
12544
98
12672
99
12800
100
12928
101
13056
102
13184
103
13312
104
13440
105
13568
106
13696
107
13824
108
13952


104704
818
104832
819
104960
820
105088
821
105216
822
105344
823
105472
824
105600
825
105728
826
105856
827
105984
828
106112
829
106240
830
106368
831
106496
832
106624
833
106752
834
106880
835
107008
836
107136
837
107264
838
107392
839
107520
840
107648
841
107776
842
107904
843
108032
844
108160
845
108288
846
108416
847
108544
848
108672
849
108800
850
108928
851
109056
852
109184
853
109312
854
109440
855
109568
856
109696
857
109824
858
109952
859
110080
860
110208
861
110336
862
110464
863
110592
864
110720
865
110848
866
110976
867
111104
868
111232
869
111360
870
111488
871
111616
872
111744
873
111872
874
112000
875
112128
876
112256
877
112384
878
112512
879
112640
880
112768
881
112896
882
113024
883
113152
884
113280
885
113408
886
113536
887
113664
888
113792
889
113920
890
114048
891
114176
892
114304
893
114432
894
114560
895
114688
896
114816
897
114944
898
115072
899
115200
900
115328
901
115456
902
115584
903
115712
904
115840
905
115968
906
116096
907
116224
908

KeyboardInterrupt: 