Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

# Load the dictionary of arrays saved by `1_notmnist.ipynb`.
# NOTE: unpickling can execute arbitrary code, so only open a pickle file
# that you generated yourself.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three (dataset, labels) splits out of the dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shapes of each split as a sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28   # images are image_size x image_size pixels
num_labels = 10   # number of classes, i.e. width of the 1-hot encoding

def reformat(dataset, labels):
  """Flatten the images and 1-hot encode the labels.

  Args:
    dataset: array that can be reshaped to rows of image_size * image_size
      values (e.g. shape (N, image_size, image_size)).
    labels: 1-D array of integer class ids.

  Returns:
    A (dataset, labels) tuple of float32 arrays: the dataset reshaped to
    (N, image_size * image_size) and the labels 1-hot encoded with
    num_labels columns.
  """
  flat_dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]: compare every
  # label (as a column vector) against the row of class ids.
  class_ids = np.arange(num_labels)
  one_hot_labels = (class_ids == labels[:, None]).astype(np.float32)
  return flat_dataset, one_hot_labels
# Reformat every split. The original names are rebound to the reformatted
# arrays, so this cell is meant to run once after the loading cell; re-run
# the loading cell first if it needs to be executed again.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the new shapes of each split.
for split_name, split_data, split_labels in (
    ('Training set', train_dataset, train_labels),
    ('Validation set', valid_dataset, valid_labels),
    ('Test set', test_dataset, test_labels)):
  print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose predicted class matches the label.

  Args:
    predictions: 2-D array of per-class scores, one row per example.
    labels: 2-D array of the same layout (e.g. 1-hot labels).

  Returns:
    100 * (number of rows where the argmax of `predictions` equals the
    argmax of `labels`) / (number of rows in `predictions`).
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

### Simple GD Logistic Regression (linear model and softmax) with L2 regularisation

In [32]:
# Full-batch gradient descent on the whole training set is prohibitive, so
# train on a subset of it for faster turnaround.
train_subset = 10000
beta = 0.005  # L2 regularisation strength to tune - best value is 0.005 = 88.4

graph = tf.Graph()
with graph.as_default():

  # Input data.
  # The training subset, the validation set and the test set are attached to
  # the graph as constants.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  # The parameters being trained: a weight matrix initialised from a
  # (truncated) normal distribution and biases initialised to zero.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))

  # Training computation.
  # Linear model (inputs x weights + biases) followed by the fused
  # softmax / cross-entropy op.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf_train_labels, logits=logits)

  # L2 penalty on the parameters, scaled by beta.
  regularizers = tf.nn.l2_loss(weights) + tf.nn.l2_loss(biases)

  # Loss = mean cross-entropy over the training examples + L2 term.
  loss = tf.reduce_mean(cross_entropy) + beta * regularizers

  # Optimizer.
  # Plain gradient descent on the regularised loss.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training, validation, and test data.
  # Not part of training; only used to report accuracy figures as we train.
  train_prediction = tf.nn.softmax(logits)
  valid_logits = tf.matmul(tf_valid_dataset, weights) + biases
  valid_prediction = tf.nn.softmax(valid_logits)
  test_logits = tf.matmul(tf_test_dataset, weights) + biases
  test_prediction = tf.nn.softmax(test_logits)

In [33]:
num_steps = 801

# `accuracy` is defined once in the helper cell near the top of the notebook
# (after the reformat cell). It used to be redefined here with an identical
# body; the duplicate was removed so there is a single definition to maintain.

with tf.Session(graph=graph) as session:
  # This is a one-time operation which ensures the parameters get initialized as
  # we described in the graph: random weights for the matrix, zeros for the
  # biases.
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Run one optimisation step. We ask .run() for the optimizer op plus the
    # loss value and the training predictions, which come back as numpy
    # arrays.
    _, loss_value, predictions = session.run([optimizer, loss, train_prediction])
    if step % 100 == 0:
      print('Loss at step %d: %f' % (step, loss_value))
      print('Training accuracy: %.1f%%' % accuracy(
        predictions, train_labels[:train_subset, :]))
      # Calling .eval() on valid_prediction is basically like calling run(), but
      # just to get that one numpy array. Note that it recomputes all its graph
      # dependencies.
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
  # Final evaluation on the test set with the trained parameters.
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 30.772821
Training accuracy: 10.1%
Validation accuracy: 11.9%
Loss at step 100: 10.482635
Training accuracy: 73.3%
Validation accuracy: 72.2%
Loss at step 200: 6.207876
Training accuracy: 77.1%
Validation accuracy: 75.1%
Loss at step 300: 3.802901
Training accuracy: 79.7%
Validation accuracy: 76.9%
Loss at step 400: 2.437117
Training accuracy: 81.6%
Validation accuracy: 78.8%
Loss at step 500: 1.662688
Training accuracy: 83.1%
Validation accuracy: 79.9%
Loss at step 600: 1.222661
Training accuracy: 84.4%
Validation accuracy: 80.7%
Loss at step 700: 0.971148
Training accuracy: 84.9%
Validation accuracy: 81.1%
Loss at step 800: 0.826370
Training accuracy: 85.2%
Validation accuracy: 81.2%
Test accuracy: 88.4%


### Stochastic Gradient Descent with linear activation function and softmax and L2 regularisation

In [44]:

batch_size = 128
beta = 0.001  # L2 regularisation strength to tune - best value is 0.001 = 88.5

graph = tf.Graph()
with graph.as_default():

  # Input data. The training minibatch is fed at run time through
  # placeholders; validation and test data are graph constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(
    tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables: random (truncated normal) weights and zero biases.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))

  # Training computation: linear model + softmax cross-entropy.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf_train_labels, logits=logits)

  # L2 penalty on the parameters, scaled by beta.
  regularizers = tf.nn.l2_loss(weights) + tf.nn.l2_loss(biases)

  # Loss = mean cross-entropy over the minibatch + L2 term.
  loss = tf.reduce_mean(cross_entropy) + beta * regularizers

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_logits = tf.matmul(tf_valid_dataset, weights) + biases
  valid_prediction = tf.nn.softmax(valid_logits)
  test_logits = tf.matmul(tf_test_dataset, weights) + biases
  test_prediction = tf.nn.softmax(test_logits)

In [45]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Walk through the (already shuffled) training data one minibatch at a
    # time, wrapping around near the end.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end, :]
    batch_labels = train_labels[offset:batch_end, :]
    # Map each placeholder node of the graph to the numpy array fed to it.
    minibatch_feed = {
      tf_train_dataset: batch_data,
      tf_train_labels: batch_labels,
    }
    _, batch_loss, batch_predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=minibatch_feed)
    # Only report progress every 500 steps.
    if step % 500 != 0:
      continue
    print("Offset -- %d Minibatch loss at step %d: %f" % (offset, step, batch_loss))
    print("Minibatch accuracy: %.1f%%" % accuracy(batch_predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Offset -- 0 Minibatch loss at step 0: 20.464306
Minibatch accuracy: 6.2%
Validation accuracy: 13.3%
Offset -- 64000 Minibatch loss at step 500: 3.278008
Minibatch accuracy: 67.2%
Validation accuracy: 76.5%
Offset -- 128000 Minibatch loss at step 1000: 1.809902
Minibatch accuracy: 74.2%
Validation accuracy: 79.0%
Offset -- 192000 Minibatch loss at step 1500: 0.973770
Minibatch accuracy: 86.7%
Validation accuracy: 80.4%
Offset -- 56128 Minibatch loss at step 2000: 0.912485
Minibatch accuracy: 81.2%
Validation accuracy: 80.9%
Offset -- 120128 Minibatch loss at step 2500: 0.847945
Minibatch accuracy: 80.5%
Validation accuracy: 81.6%
Offset -- 184128 Minibatch loss at step 3000: 0.782698
Minibatch accuracy: 81.2%
Validation accuracy: 81.4%
Test accuracy: 88.5%


### One Hidden Layer NN (4096 units) with ReLU, scaled weight initialisation (stddev = sqrt(2/n)), SGD and L2
The accuracy doesn't get better when L2 is added in this case. Conversely, in the plain Logistic Regression scenario with simple GD it improved.

In [83]:
# Network with one ReLU hidden layer, trained with SGD and L2 regularisation.
beta = 0.0001    # L2 regularisation strength to tune - best result with beta 0.0001
batch_size = 128
L1 = 4096  # width of the hidden layer
L2 = 10    # width of the output layer; must match num_labels

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  # Weights use stddev = sqrt(2 / n), n being the number of inputs to the
  # layer. The numerator is written as 2.0 so that the division is a true
  # division under Python 2 as well (2 / n with integers would be 0 there,
  # giving stddev = 0 and all-zero initial weights).
  weights1 = tf.Variable(tf.truncated_normal(
    [image_size * image_size, L1],
    stddev=np.sqrt(2.0 / (image_size * image_size))))
  biases1 = tf.Variable(tf.ones([L1])/10)  # small positive bias in front of the ReLU
  weights2 = tf.Variable(tf.truncated_normal(
    [L1, L2], stddev=np.sqrt(2.0 / L1)))
  biases2 = tf.Variable(tf.zeros([L2]))

  # Training computation.
  Y1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
  logits = tf.matmul(Y1, weights2) + biases2
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Adding L2 regularisation over all weights and biases.
  regularizers = (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(biases1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(biases2))
  # Add the regularization term to the loss.
  loss += beta * regularizers

  # Validation computation (same network, validation inputs).
  Y1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
  logits_valid = tf.matmul(Y1_valid, weights2) + biases2
  # Test computation (same network, test inputs).
  Y1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
  logits_test = tf.matmul(Y1_test, weights2) + biases2

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(logits_valid)
  test_prediction = tf.nn.softmax(logits_test)

# Experiment notes:
# lr = 0.1
# without touching weights, the best option for biases as per accuracy (90.6%) is to use tf.Variable(tf.ones([L1])/10) for biases for Relu
# and tf.Variable(tf.zeros([L2])) for non Relu functions
# accuracy jumps from 90.6% to 92.3% only by adding to weights1 stddev=np.sqrt(2/n) where n = # of neurons in prev
# layer. In case of Relu here I used n = # of features since that's what Relu has as input in this case.
# Accuracy jumps from 92.3% to 93.1% by adding stddev=np.sqrt(2/L1) to weights2
# acc goes to 94.2% when using lr = 0.5 with GD

# above data is for case w/o L2 regularisation. Unfortunately when I add it, the accuracy doesn't change

In [82]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Walk through the (already shuffled) training data one minibatch at a
    # time, wrapping around near the end.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Problem 2 experiment: the line below makes the loop run only over the
    # first 5 batches of the training dataset, which leads to overfitting as
    # training accuracy becomes 100% and test accuracy goes down to 86.3%
    # offset = (step %5 * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end, :]
    batch_labels = train_labels[offset:batch_end, :]
    # Map each placeholder node of the graph to the numpy array fed to it.
    minibatch_feed = {
      tf_train_dataset: batch_data,
      tf_train_labels: batch_labels,
    }
    _, batch_loss, batch_predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=minibatch_feed)
    # Only report progress every 500 steps.
    if step % 500 != 0:
      continue
    print("Minibatch loss at step %d: %f" % (step, batch_loss))
    print("Minibatch accuracy: %.1f%%" % accuracy(batch_predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 2.792274
Minibatch accuracy: 10.2%
Validation accuracy: 31.9%
Minibatch loss at step 500: 0.306184
Minibatch accuracy: 100.0%
Validation accuracy: 79.2%
Minibatch loss at step 1000: 0.290982
Minibatch accuracy: 100.0%
Validation accuracy: 79.2%
Minibatch loss at step 1500: 0.276988
Minibatch accuracy: 100.0%
Validation accuracy: 79.3%
Minibatch loss at step 2000: 0.263764
Minibatch accuracy: 100.0%
Validation accuracy: 79.3%
Minibatch loss at step 2500: 0.251213
Minibatch accuracy: 100.0%
Validation accuracy: 79.3%
Minibatch loss at step 3000: 0.239283
Minibatch accuracy: 100.0%
Validation accuracy: 79.3%
Test accuracy: 86.3%


### Interestingly, L2 improved the results only for Logistic Regression with simple GD, but not for SGD or the one-hidden-layer network. Maybe further tuning would help.

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---
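Answer: look at the commented-out `offset` line in the training loop above (`step % 5` restricts training to only the first 5 batches). With it, training accuracy becomes 100% while test accuracy goes down to 86.3%.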

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

### Adding dropout to the training computation only

In [107]:
# Network with one ReLU hidden layer and dropout on the hidden activations.
# NOTE: beta is kept from the L2 experiments but is not used by this graph
# (no L2 term is added to the loss in this cell).
beta = 0.0001    # L2 regularisation strength - best result with beta 0.0001
batch_size = 400

# Experiment log (acc = test accuracy, dp = dropout keep probability):
# L1 = 4096
# default 128, with batch_size = 200 and dp 0.9, acc  went up from 94 to 94.6%
# with 400 and dp 0.9, acc  went up from 94.6 to 95.2%
# with 800 and dp 0.9, acc  = 95.2%
# with 500 and dp 0.95, acc  = 95.1%
# steps 7000, batch_size = 300, pkeep = 0.8 = test acc = 95%
# L1 = 2048 with 400 and dp 0.9, acc  = 95%
# L1 = 784 with 400 and dp 0.9, acc  = 95%
# L1 = 784, steps = 3001 with 800 and dp 0.9, acc  = 94.9%
# L1 = 784, steps = 9001 with 400 and dp 0.9, acc  = 95.3% (highest)
# L1 = 784, steps = 18001 with 400 and dp 0.9, acc  = 95.3% (highest)

L1 = 784  # width of the hidden layer
L2 = 10   # width of the output layer; must match num_labels


graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  # Weights use stddev = sqrt(2 / n), n being the number of inputs to the
  # layer. The numerator is written as 2.0 so that the division is a true
  # division under Python 2 as well (2 / n with integers would be 0 there,
  # giving stddev = 0 and all-zero initial weights).
  weights1 = tf.Variable(tf.truncated_normal(
    [image_size * image_size, L1],
    stddev=np.sqrt(2.0 / (image_size * image_size))))
  biases1 = tf.Variable(tf.ones([L1])/10)  # small positive bias in front of the ReLU
  weights2 = tf.Variable(tf.truncated_normal(
    [L1, L2], stddev=np.sqrt(2.0 / L1)))
  biases2 = tf.Variable(tf.zeros([L2]))
  # Dropout keep probability, fed at run time by the training loop.
  pkeep = tf.placeholder(tf.float32)

  # Training computation with dropout applied to the hidden activations.
  Y1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
  Y1d = tf.nn.dropout(Y1, pkeep)
  logits = tf.matmul(Y1d, weights2) + biases2
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))


  # Validation computation: same weights, no dropout.
  Y1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
  logits_valid = tf.matmul(Y1_valid, weights2) + biases2
  # Test computation: same weights, no dropout.
  Y1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
  logits_test = tf.matmul(Y1_test, weights2) + biases2

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training, validation, and test data.
  # train_prediction is built from the dropout logits, so it depends on pkeep.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(logits_valid)
  test_prediction = tf.nn.softmax(logits_test)

# Experiment notes:
# lr = 0.1
# without touching weights, the best option for biases as per accuracy (90.6%) is to use tf.Variable(tf.ones([L1])/10) for biases for Relu
# and tf.Variable(tf.zeros([L2])) for non Relu functions
# accuracy jumps from 90.6% to 92.3% only by adding to weights1 stddev=np.sqrt(2/n) where n = # of neurons in prev
# layer. In case of Relu here I used n = # of features since that's what Relu has as input in this case.
# Accuracy jumps from 92.3% to 93.1% by adding stddev=np.sqrt(2/L1) to weights2
# acc goes to 94.2% when using lr = 0.5 with GD

# above data is for case w/o L2 regularisation. Unfortunately when I add it, the accuracy doesn't change

In [109]:
num_steps = 18001  # 3001 in the shorter runs

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Walk through the (already shuffled) training data one minibatch at a
    # time, wrapping around near the end.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Problem 2 experiment: the line below makes the loop run only over the
    # first 5 batches of the training dataset, which leads to overfitting as
    # training accuracy becomes 100% and test accuracy goes down to 86.3%
    # offset = (step %5 * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end, :]
    batch_labels = train_labels[offset:batch_end, :]
    # Map each placeholder node of the graph to the value fed to it; pkeep is
    # the dropout keep probability used for this training step.
    minibatch_feed = {
      tf_train_dataset: batch_data,
      tf_train_labels: batch_labels,
      pkeep: 0.9,
    }
    _, batch_loss, batch_predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=minibatch_feed)
    # Only report progress every 500 steps.
    if step % 500 != 0:
      continue
    print("Minibatch loss at step %d: %f" % (step, batch_loss))
    print("Minibatch accuracy: %.1f%%" % accuracy(batch_predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 2.535658
Minibatch accuracy: 10.0%
Validation accuracy: 37.9%
Minibatch loss at step 500: 0.507536
Minibatch accuracy: 85.8%
Validation accuracy: 87.4%
Minibatch loss at step 1000: 0.380261
Minibatch accuracy: 87.8%
Validation accuracy: 88.8%
Minibatch loss at step 1500: 0.361023
Minibatch accuracy: 90.8%
Validation accuracy: 89.3%
Minibatch loss at step 2000: 0.365382
Minibatch accuracy: 88.8%
Validation accuracy: 89.8%
Minibatch loss at step 2500: 0.339212
Minibatch accuracy: 90.2%
Validation accuracy: 89.7%
Minibatch loss at step 3000: 0.269138
Minibatch accuracy: 91.2%
Validation accuracy: 90.0%
Minibatch loss at step 3500: 0.339402
Minibatch accuracy: 88.5%
Validation accuracy: 89.9%
Minibatch loss at step 4000: 0.236547
Minibatch accuracy: 93.8%
Validation accuracy: 90.4%
Minibatch loss at step 4500: 0.210617
Minibatch accuracy: 92.8%
Validation accuracy: 90.3%
Minibatch loss at step 5000: 0.200006
Minibatch accuracy: 93.5%
Validation accurac

### the case of the overfitted model and dropout

In [None]:
# Network with one ReLU hidden layer and dropout, used for the extreme
# overfitting experiment.
# NOTE: beta is only referenced by the commented-out L2 block below.
beta = 0.0001    # L2 regularisation strength - best result with beta 0.0001
batch_size = 200
L1 = 4096  # width of the hidden layer
L2 = 10    # width of the output layer; must match num_labels


graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  # Weights use stddev = sqrt(2 / n), n being the number of inputs to the
  # layer. The numerator is written as 2.0 so that the division is a true
  # division under Python 2 as well (2 / n with integers would be 0 there,
  # giving stddev = 0 and all-zero initial weights).
  weights1 = tf.Variable(tf.truncated_normal(
    [image_size * image_size, L1],
    stddev=np.sqrt(2.0 / (image_size * image_size))))
  biases1 = tf.Variable(tf.ones([L1])/10)  # small positive bias in front of the ReLU
  weights2 = tf.Variable(tf.truncated_normal(
    [L1, L2], stddev=np.sqrt(2.0 / L1)))
  biases2 = tf.Variable(tf.zeros([L2]))
  # Dropout keep probability, fed at run time by the training loop.
  pkeep = tf.placeholder(tf.float32)

  # Training computation with dropout applied to the hidden activations.
  Y1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
  Y1d = tf.nn.dropout(Y1, pkeep)
  logits = tf.matmul(Y1d, weights2) + biases2
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))



  # L2 regularisation is disabled for this experiment; uncomment to enable it.
#   # Adding regularisation l2
#   regularizers = (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(biases1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(biases2))
#   # Add the regularization term to the loss.
#   loss += beta * regularizers

  # Validation computation: same weights, no dropout.
  Y1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
  logits_valid = tf.matmul(Y1_valid, weights2) + biases2
  # Test computation: same weights, no dropout.
  Y1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
  logits_test = tf.matmul(Y1_test, weights2) + biases2

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

  # Predictions for the training, validation, and test data.
  # train_prediction is built from the dropout logits, so it depends on pkeep.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(logits_valid)
  test_prediction = tf.nn.softmax(logits_test)

# Experiment notes:
# lr = 0.1
# without touching weights, the best option for biases as per accuracy (90.6%) is to use tf.Variable(tf.ones([L1])/10) for biases for Relu
# and tf.Variable(tf.zeros([L2])) for non Relu functions
# accuracy jumps from 90.6% to 92.3% only by adding to weights1 stddev=np.sqrt(2/n) where n = # of neurons in prev
# layer. In case of Relu here I used n = # of features since that's what Relu has as input in this case.
# Accuracy jumps from 92.3% to 93.1% by adding stddev=np.sqrt(2/L1) to weights2
# acc goes to 94.2% when using lr = 0.5 with GD

# above data is for case w/o L2 regularisation. Unfortunately when I add it, the accuracy doesn't change

In [None]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Regular schedule over the whole training set (disabled for this
    # experiment):
#     offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Overfitting experiment: cycle over the first 5 batches of the training
    # dataset only, which leads to overfitting as training accuracy becomes
    # 100% and test accuracy goes down to 86.3%
    offset = ((step % 5) * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end, :]
    batch_labels = train_labels[offset:batch_end, :]
    # Map each placeholder node of the graph to the value fed to it; pkeep is
    # the dropout keep probability used for this training step.
    minibatch_feed = {
      tf_train_dataset: batch_data,
      tf_train_labels: batch_labels,
      pkeep: 0.90,
    }
    _, batch_loss, batch_predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=minibatch_feed)
    # Only report progress every 500 steps.
    if step % 500 != 0:
      continue
    print("Minibatch loss at step %d: %f" % (step, batch_loss))
    print("Minibatch accuracy: %.1f%%" % accuracy(batch_predictions, batch_labels))
    print("Validation accuracy: %.1f%%" % accuracy(
      valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---
