Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [5]:
batch_size = 128

graph = tf.Graph()

with graph.as_default():
    
    # datasets
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    reg_param = tf.placeholder(tf.float32)
    
    # Variables
    weight = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    bias = tf.Variable(tf.zeros([num_labels]))
    
    # Computation for training
    logits = tf.matmul(tf_train_dataset, weight) + bias
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)) + reg_param * tf.nn.l2_loss(weight)
    
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions
    train_predictions = tf.nn.softmax(logits)
    valid_predictions = tf.nn.softmax(tf.matmul(tf_valid_dataset, weight) + bias)
    test_predictions = tf.nn.softmax(tf.matmul(tf_test_dataset, weight) + bias)
    

In [6]:
steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # generate minibatch using offset
        batch_dataset = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # prepare dict
        feed_dict = {tf_train_dataset: batch_dataset, tf_train_labels: batch_labels, reg_param: 1.26e-3}
        
        _, l, predictions = session.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            
            print("Validation accuracy: %.1f%%" % accuracy(valid_predictions.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_predictions.eval(), test_labels))

Initialized
Minibatch loss at step 0: 23.352020
Minibatch accuracy: 5.5%
Validation accuracy: 12.8%
Minibatch loss at step 500: 2.648945
Minibatch accuracy: 79.7%
Validation accuracy: 75.9%
Minibatch loss at step 1000: 1.886076
Minibatch accuracy: 72.7%
Validation accuracy: 78.8%
Minibatch loss at step 1500: 1.364893
Minibatch accuracy: 77.3%
Validation accuracy: 80.7%
Minibatch loss at step 2000: 0.964634
Minibatch accuracy: 78.9%
Validation accuracy: 81.3%
Minibatch loss at step 2500: 0.498364
Minibatch accuracy: 88.3%
Validation accuracy: 82.1%
Minibatch loss at step 3000: 0.697720
Minibatch accuracy: 82.8%
Validation accuracy: 81.7%
Test accuracy: 88.4%


To get the most out of the L2 regularization, we must tune the reg_param so that we can get the most accurate model

In [7]:
import matplotlib.pyplot as plt

reg_params = [pow(10, i) for i in np.arange(-4, -2, 0.1)]
accuracy_values = []

for curr_reg_param in reg_params:
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        for step in range(steps):
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # generate minibatch in offset
            batch_dataset = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            feed_dict = {tf_train_dataset: batch_dataset, tf_train_labels: batch_labels, reg_param: curr_reg_param}
            _, l, predictions = session.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
        accuracy_values.append(accuracy(test_predictions.eval(), test_labels))

# plot figure to measure accuracy
plt.semilogx(reg_params, accuracy_values)
plt.grid(True)
plt.title("Test accuracy by regularization parameter(logistic regression)")
plt.show()

The most accurate reg_param was 1.26e-3 thus we will go with that (I have also modified the above code with that param)

In [8]:
num_hidden_nodes = 1024

graph = tf.Graph()

with graph.as_default():
    
    # Datasets
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    reg_param = tf.placeholder(tf.float32)
    
    # Variables
    weights_first = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
    bias_first = tf.Variable(tf.zeros([num_hidden_nodes]))
    weights_second = tf.Variable(tf.truncated_normal([num_hidden_nodes, num_labels]))
    bias_second = tf.Variable(tf.zeros([num_labels]))
    
    # Computation for training
    first_layer = tf.nn.relu(tf.matmul(tf_train_dataset, weights_first) + bias_first)
    logits = tf.matmul(first_layer, weights_second) + bias_second
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)) + reg_param * (tf.nn.l2_loss(weights_first) + tf.nn.l2_loss(weights_second))
    
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions
    train_predictions = tf.nn.softmax(logits)
    first_layer_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_first) + bias_first)
    valid_predictions = tf.nn.softmax(tf.matmul(first_layer_valid, weights_second) + bias_second)
    first_layer_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights_first) + bias_first)
    test_predictions = tf.nn.softmax(tf.matmul(first_layer_test, weights_second) + bias_second)

In [9]:
steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        # generate minibatch using offset
        batch_dataset = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_dataset, tf_train_labels: batch_labels, reg_param: 1.26e-3}
        
        _,l, predictions = session.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_predictions.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_predictions.eval(), test_labels))

Initialized
Minibatch loss at step 0: 730.159668
Minibatch accuracy: 11.7%
Validation accuracy: 28.0%
Minibatch loss at step 500: 213.082825
Minibatch accuracy: 81.2%
Validation accuracy: 78.2%
Minibatch loss at step 1000: 110.844536
Minibatch accuracy: 76.6%
Validation accuracy: 81.8%
Minibatch loss at step 1500: 58.453274
Minibatch accuracy: 84.4%
Validation accuracy: 84.1%
Minibatch loss at step 2000: 31.075710
Minibatch accuracy: 82.0%
Validation accuracy: 85.9%
Minibatch loss at step 2500: 16.554005
Minibatch accuracy: 94.5%
Validation accuracy: 86.8%
Minibatch loss at step 3000: 9.147886
Minibatch accuracy: 85.2%
Validation accuracy: 87.2%
Test accuracy: 93.0%


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [10]:
steps = 100
batch_count = 3

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(steps):
        offset = step % batch_count
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, reg_param: 1.26e-3}
        
        _, l, predictions = session.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
        if(step % 2 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_predictions.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_predictions.eval(), test_labels))

Initialized
Minibatch loss at step 0: 750.919128
Minibatch accuracy: 12.5%
Validation accuracy: 37.3%
Minibatch loss at step 2: 1367.725220
Minibatch accuracy: 32.0%
Validation accuracy: 45.2%
Minibatch loss at step 4: 722.323120
Minibatch accuracy: 65.6%
Validation accuracy: 48.4%
Minibatch loss at step 6: 435.970306
Minibatch accuracy: 86.7%
Validation accuracy: 67.7%
Minibatch loss at step 8: 396.874420
Minibatch accuracy: 99.2%
Validation accuracy: 68.0%
Minibatch loss at step 10: 391.745026
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 12: 390.758789
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 14: 389.775055
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 16: 388.793762
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 18: 387.814911
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 20: 386.838684
Minibatch accuracy: 100.0%
Validation a

---
Problem 3
---------

Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.


What happens to our extreme overfitting case?

---

In [11]:
graph = tf.Graph()

with graph.as_default():
    
    # get datasets and labels
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    weights_first = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden_nodes]))
    bias_first = tf.Variable(tf.zeros([num_hidden_nodes]))
    weights_second = tf.Variable(tf.truncated_normal([num_hidden_nodes, num_labels]))
    bias_second = tf.Variable(tf.zeros([num_labels]))
    
    # Computation for training (includes dropout)
    first_layer = tf.nn.relu(tf.matmul(tf_train_dataset, weights_first) + bias_first)
    first_dropout = tf.nn.dropout(first_layer, 0.5)
    logits = tf.matmul(first_dropout, weights_second) + bias_second
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for datasets
    train_predictions = tf.nn.softmax(logits)
    first_layer_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_first) + bias_first)
    valid_predictions = tf.nn.softmax(tf.matmul(first_layer_valid, weights_second) + bias_second)
    first_layer_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights_first) + bias_first)
    test_predictions = tf.nn.softmax(tf.matmul(first_layer_test, weights_second) + bias_second)

In [12]:
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(steps):
        offset = step % batch_count
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        
        _, l, predictions = session.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
        if(step % 2 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_predictions.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_predictions.eval(), test_labels))

Initialized
Minibatch loss at step 0: 544.800903
Minibatch accuracy: 8.6%
Validation accuracy: 25.8%
Minibatch loss at step 2: 1137.412720
Minibatch accuracy: 46.1%
Validation accuracy: 51.5%
Minibatch loss at step 4: 57.004860
Minibatch accuracy: 86.7%
Validation accuracy: 66.4%
Minibatch loss at step 6: 18.200998
Minibatch accuracy: 89.1%
Validation accuracy: 65.7%
Minibatch loss at step 8: 2.077314
Minibatch accuracy: 96.9%
Validation accuracy: 67.7%
Minibatch loss at step 10: 4.589981
Minibatch accuracy: 96.1%
Validation accuracy: 66.8%
Minibatch loss at step 12: 4.512859
Minibatch accuracy: 95.3%
Validation accuracy: 68.2%
Minibatch loss at step 14: 3.183395
Minibatch accuracy: 98.4%
Validation accuracy: 69.3%
Minibatch loss at step 16: 2.326460
Minibatch accuracy: 98.4%
Validation accuracy: 68.4%
Minibatch loss at step 18: 1.475852
Minibatch accuracy: 99.2%
Validation accuracy: 68.4%
Minibatch loss at step 20: 0.103189
Minibatch accuracy: 99.2%
Validation accuracy: 68.8%
Minibatc

There is a slight improvement in accuracy when using dropout, indicating better generalization

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


First try: 2 layers

In [13]:
batch_size = 128
num_hidden_nodes_first = 1024
num_hidden_nodes_second = 100
reg_param = 1.26e-3

graph = tf.Graph()
with graph.as_default():
    
    # Datasets
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    global_step = tf.Variable(0)
    weights_first = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_hidden_nodes_first], stddev=np.sqrt(2.0 / (image_size * image_size))))
    bias_first = tf.Variable(tf.zeros([num_hidden_nodes_first]))
    weights_second = tf.Variable(tf.truncated_normal([num_hidden_nodes_first, num_hidden_nodes_second], stddev=np.sqrt(2.0 / num_hidden_nodes_first)))
    bias_second = tf.Variable(tf.zeros([num_hidden_nodes_second]))
    weights_third = tf.Variable(tf.truncated_normal([num_hidden_nodes_second, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes_second)))
    bias_third = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation
    first_layer = tf.nn.relu(tf.matmul(tf_train_dataset, weights_first) + bias_first)
    second_layer = tf.nn.relu(tf.matmul(first_layer, weights_second) + bias_second)
    logits = tf.matmul(second_layer, weights_third) + bias_third
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels)) + reg_param * (tf.nn.l2_loss(weights_first) + tf.nn.l2_loss(weights_second) + tf.nn.l2_loss(weights_third))
    
    # Optimizer and learning rate (alpha)
    alpha = tf.train.exponential_decay(0.5, global_step, 1000, 0.7, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(loss, global_step=global_step)
    
    # Predictions for datasets
    train_predictions = tf.nn.softmax(logits)
    
    first_layer_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_first) + bias_first)
    second_layer_valid = tf.nn.relu(tf.matmul(first_layer_valid, weights_second) + bias_second)
    valid_predictions = tf.nn.softmax(tf.matmul(second_layer_valid, weights_third) + bias_third)
    
    first_layer_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights_first) + bias_first)
    second_layer_test = tf.nn.relu(tf.matmul(first_layer_test, weights_second) + bias_second)
    test_predictions = tf.nn.softmax(tf.matmul(second_layer_test, weights_third) + bias_third)

In [14]:
steps = 9001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized...")
    
    for step in range(steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        
        _, l, predictions = session.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_predictions.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_predictions.eval(), test_labels))

Initialized...
Minibatch loss at step 0: 3.468110
Minibatch accuracy: 12.5%
Validation accuracy: 36.1%
Minibatch loss at step 500: 1.139329
Minibatch accuracy: 88.3%
Validation accuracy: 85.2%
Minibatch loss at step 1000: 1.077731
Minibatch accuracy: 73.4%
Validation accuracy: 86.6%
Minibatch loss at step 1500: 0.736350
Minibatch accuracy: 85.9%
Validation accuracy: 88.0%
Minibatch loss at step 2000: 0.612993
Minibatch accuracy: 88.3%
Validation accuracy: 88.3%
Minibatch loss at step 2500: 0.352820
Minibatch accuracy: 96.1%
Validation accuracy: 88.6%
Minibatch loss at step 3000: 0.529827
Minibatch accuracy: 89.8%
Validation accuracy: 89.0%
Minibatch loss at step 3500: 0.519320
Minibatch accuracy: 88.3%
Validation accuracy: 89.2%
Minibatch loss at step 4000: 0.466587
Minibatch accuracy: 89.8%
Validation accuracy: 89.5%
Minibatch loss at step 4500: 0.411180
Minibatch accuracy: 93.0%
Validation accuracy: 89.5%
Minibatch loss at step 5000: 0.479356
Minibatch accuracy: 90.6%
Validation accu

In [44]:
num_hidden_nodes_second = 512
num_hidden_nodes_third = 256
num_hidden_nodes_fourth = 64
keep_probability = 0.5

graph = tf.Graph()

with graph.as_default():
    
    # Datasets
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables
    global_step = tf.Variable(0)
    
    weights_first = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden_nodes_first], stddev=np.sqrt(2.0 / (image_size * image_size))))
    bias_first = tf.Variable(tf.zeros([num_hidden_nodes_first]))
    
    weights_second = tf.Variable(tf.truncated_normal([num_hidden_nodes_first, num_hidden_nodes_second], stddev=np.sqrt(2.0 / num_hidden_nodes_first)))
    bias_second = tf.Variable(tf.zeros([num_hidden_nodes_second]))
    
    weights_third = tf.Variable(tf.truncated_normal([num_hidden_nodes_second, num_hidden_nodes_third], stddev=np.sqrt(2.0 / num_hidden_nodes_second)))
    bias_third = tf.Variable(tf.zeros([num_hidden_nodes_third]))
    
    weights_fourth = tf.Variable(tf.truncated_normal([num_hidden_nodes_third, num_hidden_nodes_fourth], stddev=np.sqrt(2.0 / num_hidden_nodes_third)))
    bias_fourth = tf.Variable(tf.zeros([num_hidden_nodes_fourth]))
    
    weights_fifth = tf.Variable(tf.truncated_normal([num_hidden_nodes_fourth, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes_fourth)))
    bias_fifth = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation
    first_layer = tf.nn.relu(tf.matmul(tf_train_dataset, weights_first) + bias_first)
    second_layer = tf.nn.relu(tf.matmul(first_layer, weights_second) + bias_second)
    third_layer = tf.nn.relu(tf.matmul(second_layer, weights_third) + bias_third)
    third_dropout = tf.nn.dropout(third_layer, keep_probability)
    fourth_layer = tf.nn.relu(tf.matmul(third_dropout, weights_fourth) + bias_fourth)
    logits = tf.matmul(fourth_layer, weights_fifth) + bias_fifth
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    
    #Optimizer and learning rate alpha
    alpha = tf.train.exponential_decay(0.5, global_step, 4000, 0.7, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(loss, global_step=global_step)
    
    # Predictions
    train_predictions = tf.nn.softmax(logits)
    
    first_layer_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_first) + bias_first)
    second_layer_valid = tf.nn.relu(tf.matmul(first_layer_valid, weights_second) + bias_second)
    third_layer_valid = tf.nn.relu(tf.matmul(second_layer_valid, weights_third) + bias_third)
    fourth_layer_valid = tf.nn.relu(tf.matmul(third_layer_valid, weights_fourth) + bias_fourth)
    valid_predictions = tf.nn.softmax(tf.matmul(fourth_layer_valid, weights_fifth) + bias_fifth)
    
    
    first_layer_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights_first) + bias_first)
    second_layer_test = tf.nn.relu(tf.matmul(first_layer_test, weights_second) + bias_second)
    third_layer_test = tf.nn.relu(tf.matmul(second_layer_test, weights_third) + bias_third)
    fourth_layer_test = tf.nn.relu(tf.matmul(third_layer_test, weights_fourth) + bias_fourth)
    test_predictions = tf.nn.softmax(tf.matmul(fourth_layer_test, weights_fifth) + bias_fifth)

In [45]:
steps = 18000

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(steps):
        # get offset
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        #generate batch using offset
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        
        _, l, predictions = session.run([optimizer, loss, train_predictions], feed_dict=feed_dict)
        
        if step %  500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_predictions.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_predictions.eval(), test_labels))

Initialized
Minibatch loss at step 0: 2.433242
Minibatch accuracy: 10.2%
Validation accuracy: 20.2%
Minibatch loss at step 500: 0.540473
Minibatch accuracy: 85.2%
Validation accuracy: 85.2%
Minibatch loss at step 1000: 0.736138
Minibatch accuracy: 75.0%
Validation accuracy: 86.7%
Minibatch loss at step 1500: 0.466213
Minibatch accuracy: 82.0%
Validation accuracy: 88.2%
Minibatch loss at step 2000: 0.419045
Minibatch accuracy: 85.2%
Validation accuracy: 88.9%
Minibatch loss at step 2500: 0.142423
Minibatch accuracy: 96.1%
Validation accuracy: 88.9%
Minibatch loss at step 3000: 0.310257
Minibatch accuracy: 90.6%
Validation accuracy: 89.2%
Minibatch loss at step 3500: 0.373479
Minibatch accuracy: 87.5%
Validation accuracy: 89.1%
Minibatch loss at step 4000: 0.340930
Minibatch accuracy: 89.1%
Validation accuracy: 89.7%
Minibatch loss at step 4500: 0.199712
Minibatch accuracy: 92.2%
Validation accuracy: 90.0%
Minibatch loss at step 5000: 0.294791
Minibatch accuracy: 92.2%
Validation accurac

The above code demonstrates a 4 layer network with dropout on the third layer.