## Deep Learning
### Assignment 3
<hr />
Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
import os

#### First reload the data we generated in 1_notmnist.ipynb.

In [2]:
data_path = '/Users/heany/code/AI-For-NLP/dataset'
pickle_file = 'notMNIST.pickle'

with open(os.path.join(data_path, pickle_file),'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save # hint to help gc free up memory
    
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


#### Reformat into a shape that's more adapted to the models we're going to train:
<hr />

+ data as flat matrix
+ labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labes = 10

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size* image_size)).astype(np.float32) # shape(-1, 784)
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    labels = (np.arange(num_labes) == labels[:,None]).astype(np.float32)
    return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)    

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
    return ( 100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels,1)) / predictions.shape[0])

### Problem 1

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor t using nn.l2_loss(t). The right amount of regularization should improve your validation / test accuracy.



In [10]:
batch_size = 128


graph = tf.Graph()
with graph.as_default():
    
    # Input data. For the training data, we use a placeholder that will be fed 
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                     shape = (batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape = (batch_size, num_labes))
    
    lmbda = tf.placeholder(tf.float32) # add placeholder
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables.
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labes])
    ) 
    biases = tf.Variable(tf.zeros([num_labes]))
    
#     weights = [
#         tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes])),
#         tf.Variable(tf.truncated_normal([hidden_nodes, num_labes]))
#     ]
#     biases = [tf.Variable(tf.zeros([hidden_nodes])), tf.Variable(tf.zeros([num_labes]))]
    
    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)
    ) + lmbda * tf.nn.l2_loss(weights)  # ==> add regularized item
    
#     logits = computation(tf_train_dataset, weights, biases)
#     loss = tf.add(tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)) 
#     , lmbda * (tf.nn.l2_loss(weights[0]) + tf.nn.l2_loss(weights[1]))) # add regularization
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
    
num_steps = 3001

with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomied.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        # Generate a minibatch
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, lmbda: 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict
        )
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print("Test Accuracy : {}".format(accuracy(test_prediction.eval(), test_labels)))
        
        


Initialized
Minibatch loss at step 0: 21.926792
Minibatch accuracy: 6.2%
Validation accuracy: 10.6%
Minibatch loss at step 500: 2.691623
Minibatch accuracy: 79.7%
Validation accuracy: 74.4%
Minibatch loss at step 1000: 1.863580
Minibatch accuracy: 73.4%
Validation accuracy: 77.3%
Minibatch loss at step 1500: 1.230054
Minibatch accuracy: 75.0%
Validation accuracy: 79.5%
Minibatch loss at step 2000: 1.020983
Minibatch accuracy: 75.8%
Validation accuracy: 80.4%
Minibatch loss at step 2500: 0.928662
Minibatch accuracy: 78.1%
Validation accuracy: 80.5%
Minibatch loss at step 3000: 0.906563
Minibatch accuracy: 77.3%
Validation accuracy: 81.3%
Test accuracy: 88.5%
Test Accuracy : 88.55


<h2 style ="color : red">Neural Network (1-layer ReLU) with Regularization</h2>

In [7]:
hidden_nodes = 1024
batch_size = 128

def computation(dataset, weights, biases):
    weight_sum = tf.add(tf.matmul(dataset, weights[0]), biases[0])
    hidden_layer = tf.nn.relu(weight_sum)
    logits = tf.add(tf.matmul(hidden_layer, weights[1]), biases[1])
    return logits

graph = tf.Graph()
with graph.as_default():
    
    # Input data. For the training data, we use a placeholder that will be fed 
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                     shape = (batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape = (batch_size, num_labes))
    
    lmbda = tf.placeholder(tf.float32) # add placeholder
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables.
    weights = [
        tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes])),
        tf.Variable(tf.truncated_normal([hidden_nodes, num_labes]))
    ]
    biases = [tf.Variable(tf.zeros([hidden_nodes])), tf.Variable(tf.zeros([num_labes]))]
    
    # Training computation.
    # logits = tf.matmul(tf_train_dataset, weights) + biases
    # loss = tf.reduce_mean(
    #     tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)
    # )
    
    logits = computation(tf_train_dataset, weights, biases)
    loss = tf.add(tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)) 
    , lmbda * (tf.nn.l2_loss(weights[0]) + tf.nn.l2_loss(weights[1]))) # add regularization
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(computation(tf_valid_dataset, weights, biases))
    test_prediction = tf.nn.softmax(computation(tf_test_dataset, weights, biases))
    
num_steps = 3001

with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomied.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        # Generate a minibatch
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, lmbda: 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict
        )
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print("Test Accuracy : {}".format(accuracy(test_prediction.eval(), test_labels)))
        
        


Initialized
Minibatch loss at step 0: 574.232239
Minibatch accuracy: 18.0%
Validation accuracy: 37.1%
Minibatch loss at step 500: 194.966507
Minibatch accuracy: 83.6%
Validation accuracy: 78.0%
Minibatch loss at step 1000: 114.550072
Minibatch accuracy: 82.0%
Validation accuracy: 81.6%
Minibatch loss at step 1500: 68.761139
Minibatch accuracy: 81.2%
Validation accuracy: 82.5%
Minibatch loss at step 2000: 41.435272
Minibatch accuracy: 85.9%
Validation accuracy: 84.7%
Minibatch loss at step 2500: 25.322062
Minibatch accuracy: 86.7%
Validation accuracy: 86.0%
Minibatch loss at step 3000: 15.607914
Minibatch accuracy: 78.9%
Validation accuracy: 86.3%
Test accuracy: 92.7%
Test Accuracy : 92.71


### Problem 2

Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

In [11]:
batch_size = 128


graph = tf.Graph()
with graph.as_default():
    
    # Input data. For the training data, we use a placeholder that will be fed 
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                     shape = (batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape = (batch_size, num_labes))
    
    lmbda = tf.placeholder(tf.float32) # add placeholder
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables.
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labes])
    ) 
    biases = tf.Variable(tf.zeros([num_labes]))
    
#     weights = [
#         tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes])),
#         tf.Variable(tf.truncated_normal([hidden_nodes, num_labes]))
#     ]
#     biases = [tf.Variable(tf.zeros([hidden_nodes])), tf.Variable(tf.zeros([num_labes]))]
    
    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)
    ) + lmbda * tf.nn.l2_loss(weights)  # ==> add regularized item
    
#     logits = computation(tf_train_dataset, weights, biases)
#     loss = tf.add(tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)) 
#     , lmbda * (tf.nn.l2_loss(weights[0]) + tf.nn.l2_loss(weights[1]))) # add regularization
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
    
num_steps = 3001

with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomied.
        # Note: we could use better randomization across epochs.
        offset = ((step %10) * batch_size) % (train_labels.shape[0] - batch_size)
        # reuse the data of 10 batch => step % 10
        
        # Generate a minibatch
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, lmbda: 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict
        )
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print("Test Accuracy : {}".format(accuracy(test_prediction.eval(), test_labels)))
        

Initialized
Minibatch loss at step 0: 17.669857
Minibatch accuracy: 13.3%
Validation accuracy: 11.8%
Minibatch loss at step 500: 1.989387
Minibatch accuracy: 92.2%
Validation accuracy: 67.8%
Minibatch loss at step 1000: 1.171489
Minibatch accuracy: 99.2%
Validation accuracy: 69.8%
Minibatch loss at step 1500: 0.761966
Minibatch accuracy: 100.0%
Validation accuracy: 71.3%
Minibatch loss at step 2000: 0.526063
Minibatch accuracy: 100.0%
Validation accuracy: 72.6%
Minibatch loss at step 2500: 0.386867
Minibatch accuracy: 100.0%
Validation accuracy: 73.3%
Minibatch loss at step 3000: 0.304362
Minibatch accuracy: 100.0%
Validation accuracy: 74.1%
Test accuracy: 82.6%
Test Accuracy : 82.57


### Problem 3

Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

In [13]:
hidden_nodes = 1024
batch_size = 128

def computation(dataset, weights, biases, is_dropout = False, keep_prob=0.5):
    weight_sum = tf.add(tf.matmul(dataset, weights[0]), biases[0])
    hidden_layer = tf.nn.relu(weight_sum)
    if is_dropout: # ==> add dropout
        hidden_layer = tf.nn.dropout(hidden_layer, keep_prob)
    logits = tf.add(tf.matmul(hidden_layer, weights[1]), biases[1])
    return logits

graph = tf.Graph()
with graph.as_default():
    
    # Input data. For the training data, we use a placeholder that will be fed 
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                     shape = (batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape = (batch_size, num_labes))
    
    lmbda = tf.placeholder(tf.float32) # add placeholder
    keep_prob = tf.placeholder(tf.float32) # ==> add placeholder
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # Variables.
    weights = [
        tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes])),
        tf.Variable(tf.truncated_normal([hidden_nodes, num_labes]))
    ]
    biases = [tf.Variable(tf.zeros([hidden_nodes])), tf.Variable(tf.zeros([num_labes]))]
    
    # Training computation.
    # logits = tf.matmul(tf_train_dataset, weights) + biases
    # loss = tf.reduce_mean(
    #     tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)
    # )
    
    logits = computation(tf_train_dataset, weights, biases, is_dropout=True, keep_prob=keep_prob)
    loss = tf.add(tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)) 
    , lmbda * (tf.nn.l2_loss(weights[0]) + tf.nn.l2_loss(weights[1]))) # add regularization
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(computation(tf_valid_dataset, weights, biases))
    test_prediction = tf.nn.softmax(computation(tf_test_dataset, weights, biases))
    
num_steps = 3001

with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomied.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        # Generate a minibatch
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, lmbda: 1e-3, keep_prob : 0.5}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict
        )
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print("Test Accuracy : {}".format(accuracy(test_prediction.eval(), test_labels)))
        

Initialized
Minibatch loss at step 0: 734.326538
Minibatch accuracy: 13.3%
Validation accuracy: 36.4%
Minibatch loss at step 500: 204.507080
Minibatch accuracy: 76.6%
Validation accuracy: 79.6%
Minibatch loss at step 1000: 115.815300
Minibatch accuracy: 70.3%
Validation accuracy: 80.4%
Minibatch loss at step 1500: 69.262787
Minibatch accuracy: 69.5%
Validation accuracy: 81.0%
Minibatch loss at step 2000: 42.340229
Minibatch accuracy: 78.9%
Validation accuracy: 83.3%
Minibatch loss at step 2500: 25.335667
Minibatch accuracy: 78.1%
Validation accuracy: 84.1%
Minibatch loss at step 3000: 15.744453
Minibatch accuracy: 80.5%
Validation accuracy: 84.9%
Test accuracy: 92.0%
Test Accuracy : 91.98


### Problem 4

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:
```
global_step = tf.Variable(0)  # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
```

In [24]:
hidden_nodes_1 = 1024
hidden_nodes_2 = 500
hidden_nodes_3 = 100
batch_size = 128


def computation(dataset, weights, biases, is_dropout = False):
    weight_sum_1 = tf.add(tf.matmul(dataset, weights[0]), biases[0])
    hidden_layer_1 = tf.nn.relu(weight_sum_1)
    if is_dropout: # ==> add dropout
        hidden_layer_1 = tf.nn.dropout(hidden_layer_1,  keep_prob=0.7)
    
    # Hidden Layer # 2
    weight_sum_2 = tf.add(tf.matmul(hidden_layer_1, weights[1]), biases[1])
    hidden_layer_2 = tf.nn.relu(weight_sum_2)
    if is_dropout:
        hidden_layer_2 = tf.nn.dropout(hidden_layer_2, keep_prob = 0.7)
    
    # Hidden Layer # 3
    weight_sum_3 = tf.add(tf.matmul(hidden_layer_2, weights[2]), biases[2])
    hidden_layer_3 = tf.nn.relu(weight_sum_3)
    if is_dropout:
        hidden_layer_3 = tf.nn.dropout(hidden_layer_3, keep_prob=0.7)
    
    # Fully-connected Layer
    logits = tf.add(tf.matmul(hidden_layer_3, weights[3]), biases[3])
    return logits

graph = tf.Graph()
with graph.as_default():
    
    # Input data. For the training data, we use a placeholder that will be fed 
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                     shape = (batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape = (batch_size, num_labes))
    
    lmbda = tf.placeholder(tf.float32) # add placeholder
#     keep_prob = tf.placeholder(tf.float32) # ==> add placeholder
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    global_step = tf.Variable(0) # ==> Add for learning rate decay. count the number of steps taken.
    
    # Variables.
    weights = [
        tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes_1],
                                       stddev=np.sqrt(2.0 / (image_size * image_size)))),
        tf.Variable(tf.truncated_normal([hidden_nodes_1, hidden_nodes_2],
                                       stddev=np.sqrt(2.0 / hidden_nodes_1))),
        tf.Variable(tf.truncated_normal([hidden_nodes_2, hidden_nodes_3],
                                       stddev=np.sqrt(2.0 / hidden_nodes_2))),
        tf.Variable(tf.truncated_normal([hidden_nodes_3, num_labes],
                                       stddev=np.sqrt(2.0 / hidden_nodes_3)))
    ]
    biases = [
        tf.Variable(tf.zeros([hidden_nodes_1])), 
        tf.Variable(tf.zeros([hidden_nodes_2])),
        tf.Variable(tf.zeros([hidden_nodes_3])),
        tf.Variable(tf.zeros([num_labes])),
    ]

    
    # Training computation.
    # logits = tf.matmul(tf_train_dataset, weights) + biases
    # loss = tf.reduce_mean(
    #     tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)
    # )
    
    logits = computation(tf_train_dataset, weights, biases, is_dropout=True)
    loss = tf.add(tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)
    ) 
    ,lmbda * (
        tf.nn.l2_loss(weights[0]) 
        + tf.nn.l2_loss(weights[1]) 
        + tf.nn.l2_loss(weights[2])
        + tf.nn.l2_loss(weights[1])
    )) # add regularization
    
    # Optimizer.
    learning_rate = tf.train.exponential_decay(0.5, global_step, 500, 0.9)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
#
    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(computation(tf_valid_dataset, weights, biases))
    test_prediction = tf.nn.softmax(computation(tf_test_dataset, weights, biases))
    
num_steps = 8001

with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomied.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        # Generate a minibatch
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, lmbda: 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict
        )
        
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print("Test Accuracy : {} %".format(accuracy(test_prediction.eval(), test_labels)))
        

Initialized
Minibatch loss at step 0: 4.258118
Minibatch accuracy: 3.1%
Validation accuracy: 25.3%
Minibatch loss at step 500: 1.364112
Minibatch accuracy: 87.5%
Validation accuracy: 84.3%
Minibatch loss at step 1000: 1.143685
Minibatch accuracy: 83.6%
Validation accuracy: 85.9%
Minibatch loss at step 1500: 0.831119
Minibatch accuracy: 87.5%
Validation accuracy: 87.0%
Minibatch loss at step 2000: 0.774338
Minibatch accuracy: 86.7%
Validation accuracy: 86.8%
Minibatch loss at step 2500: 0.745686
Minibatch accuracy: 82.8%
Validation accuracy: 87.4%
Minibatch loss at step 3000: 0.730782
Minibatch accuracy: 82.8%
Validation accuracy: 87.6%
Minibatch loss at step 3500: 0.573340
Minibatch accuracy: 86.7%
Validation accuracy: 87.8%
Minibatch loss at step 4000: 0.471787
Minibatch accuracy: 90.6%
Validation accuracy: 88.4%
Minibatch loss at step 4500: 0.499786
Minibatch accuracy: 89.1%
Validation accuracy: 88.4%
Minibatch loss at step 5000: 0.542042
Minibatch accuracy: 89.8%
Validation accuracy

In [20]:
global_step.shape

TensorShape([])

In [23]:
global_step.eval

<bound method RefVariable.eval of <tf.Variable 'Variable:0' shape=() dtype=int32_ref>>