Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [0]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

## Check whether GPU is available

In [0]:
# Helper module used here only to enumerate the devices TensorFlow can see.
from tensorflow.python.client import device_lib

# Kept as a bare final expression so the notebook displays the returned list;
# a "/device:GPU:0" entry in that list means a GPU is available.
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 6548502576229029977, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11288962663
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16407767318875256520
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"]

In [0]:
pickle_file = 'notMNIST.pickle'

# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that comes from a trusted source (e.g. one generated by the earlier
# assignments).
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three (images, labels) splits out of the loaded dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [0]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  """Reshape images into 4-D cubes and turn integer labels into one-hot rows.

  Args:
    dataset: array whose elements can be regrouped into images of
      image_size x image_size x num_channels values.
    labels: 1-D array of integer class ids.

  Returns:
    A tuple (images, one_hot), both float32: images has shape
    (-1, image_size, image_size, num_channels) and one_hot has one row per
    label with num_labels columns.
  """
  cube_shape = (-1, image_size, image_size, num_channels)
  images = dataset.reshape(cube_shape).astype(np.float32)
  # Broadcasting a column of labels against the row of class ids yields a
  # boolean matrix with exactly one True per row (for labels in range).
  class_ids = np.arange(num_labels)
  one_hot = (class_ids == labels[:, None]).astype(np.float32)
  return images, one_hot
# Convert each split in turn and report its new shape right away; the
# printed lines appear in the same order as before (train, validation, test).
train_dataset, train_labels = reformat(train_dataset, train_labels)
print('Training set', train_dataset.shape, train_labels.shape)

valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
print('Validation set', valid_dataset.shape, valid_labels.shape)

test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [0]:
# Short aliases used by the graph and training cells below. They are derived
# from the values defined earlier instead of being re-typed, so the two sets
# of names cannot silently disagree.
img_size = image_size                  # 28
num_classes = num_labels               # 10
train_size = train_dataset.shape[0]    # 200000 for the pickle loaded above

In [0]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label.

  Args:
    predictions: 2-D array of per-class scores, one row per example.
    labels: 2-D array of the same layout (e.g. one-hot rows).

  Returns:
    100 * (number of rows where both arg-max positions agree) / number of rows.
  """
  predicted_class = np.argmax(predictions, 1)
  true_class = np.argmax(labels, 1)
  num_correct = np.sum(predicted_class == true_class)
  return 100.0 * num_correct / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [0]:
# implementation 1 using method tf.nn.conv2d

## Important points to remember

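In TensorFlow, "VALID" padding means the input is never padded and the filter is only applied where it fits entirely inside the input, while "SAME" padding pads the input (as evenly as possible on each side) so that

$$\text{output\_size} = \left\lceil \frac{\text{input\_size}}{\text{stride}} \right\rceil$$

(TODO: the link to the illustrative GIF is missing here.)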


## Things to remember

For random initialization of weights you can try stddev = $\sqrt{1/n}$ for ordinary layers and $\sqrt{2/n}$ for ReLU layers, where $n$ is the number of inputs to the layer.

I observed large changes in the results when I switched the weight-initialization stddev from 1 to 0.1, and then again when I switched to the values mentioned above.

In [0]:
# Hyper-parameters of the convolutional network built in this cell.
batch_size = 4096
kernel_size_1 = 4  # not referenced in this cell; kept in case other cells rely on it
kernel_size = 4    # side length of the square convolution filters
num_channels = 1   # grayscale input
depth_1 = 16       # output channels of the first convolution
depth_2 = 32       # output channels of the second convolution
fc_2_nodes = 512   # width of the first fully connected hidden layer
fc_3_nodes = 512   # width of the second fully connected hidden layer


graph_conv1 = tf.Graph()

with graph_conv1.as_default():

    # Mini-batches are fed at run time; the placeholder shapes fix the batch size.
    X_train = tf.placeholder(tf.float32, shape=[batch_size, img_size, img_size, num_channels])
    y_train = tf.placeholder(tf.float32, shape=[batch_size, num_classes])

    # Evaluation sets are embedded in the graph as constants.
    X_valid = tf.constant(valid_dataset)
    X_test = tf.constant(test_dataset)

    # Standard deviations used for weight initialisation. Float literals keep
    # this a true division on Python 2 as well (1/batch_size would be 0 there).
    # NOTE(review): n is batch_size here, exactly as in the original cell. The
    # sqrt(1/n) / sqrt(2/n) rule is normally stated with n = number of inputs
    # to the layer; the recorded results were obtained with batch_size, so the
    # values are left unchanged - confirm before switching.
    stddev_plain = tf.sqrt(1.0 / batch_size)
    stddev_relu = tf.sqrt(2.0 / batch_size)

    # Two stride-2 poolings with 'same' padding shrink each spatial side to
    # ceil(img_size / 4) (7 for 28-pixel images).
    pooled_size = (img_size + 3) // 4
    flat_size = pooled_size * pooled_size * depth_2

    # First convolution.
    # NOTE(review): conv_1_biases / conv_2_biases are created but model() never
    # adds them; kept as-is so the trained model matches the recorded results.
    conv_1_weights = tf.Variable(tf.truncated_normal(
        [kernel_size, kernel_size, num_channels, depth_1], stddev=stddev_plain))
    conv_1_biases = tf.Variable(tf.zeros(depth_1))

    # Second convolution.
    conv_2_weights = tf.Variable(tf.truncated_normal(
        [kernel_size, kernel_size, depth_1, depth_2], stddev=stddev_plain))
    conv_2_biases = tf.Variable(tf.zeros(depth_2))

    # First fully connected layer (flattened feature map -> fc_2_nodes).
    fc1_weights = tf.Variable(tf.truncated_normal(
        [flat_size, fc_2_nodes], stddev=stddev_relu))
    fc1_biases = tf.Variable(tf.zeros(fc_2_nodes))

    # Second fully connected layer.
    fc2_weights = tf.Variable(tf.truncated_normal(
        [fc_2_nodes, fc_3_nodes], stddev=stddev_relu))
    fc2_biases = tf.Variable(tf.zeros(fc_3_nodes))

    # Output layer producing the logits.
    fc3_weights = tf.Variable(tf.truncated_normal(
        [fc_3_nodes, num_classes], stddev=stddev_plain))
    fc3_biases = tf.Variable(tf.zeros(num_classes))

    def model(data, is_training=False):
        """Build the forward pass and return the logits tensor.

        Args:
            data: 4-D image tensor (batch, height, width, channels).
            is_training: when True, dropout is inserted before each hidden
                fully connected layer. It must stay False for validation and
                test predictions so that they are deterministic.

        Returns:
            A (batch, num_classes) tensor of unnormalised class scores.
        """
        # conv -> 2x2 max-pool -> ReLU, twice.
        conv1 = tf.nn.conv2d(data, conv_1_weights, [1, 1, 1, 1], "SAME", name="convolution_1")
        pool1 = tf.layers.max_pooling2d(conv1, 2, 2, padding='same', data_format='channels_last')
        pool1_relu = tf.nn.relu(pool1)

        conv2 = tf.nn.conv2d(pool1_relu, conv_2_weights, [1, 1, 1, 1], "SAME", name="convolution_2")
        pool2 = tf.layers.max_pooling2d(conv2, 2, 2, padding='same')
        pool2_relu = tf.nn.relu(pool2)

        # Flatten the feature map so it can feed the fully connected layers.
        p2_shape = pool2_relu.get_shape().as_list()
        hidden = tf.reshape(pool2_relu, [p2_shape[0], p2_shape[1] * p2_shape[2] * p2_shape[3]])

        if is_training:
            hidden = tf.nn.dropout(hidden, keep_prob=0.8)
        hidden = tf.nn.relu(tf.nn.xw_plus_b(hidden, fc1_weights, fc1_biases))

        if is_training:
            hidden = tf.nn.dropout(hidden, keep_prob=0.9)
        hidden = tf.nn.relu(tf.nn.xw_plus_b(hidden, fc2_weights, fc2_biases))

        return tf.nn.xw_plus_b(hidden, fc3_weights, fc3_biases)

    # Training loss: mean softmax cross-entropy over the mini-batch.
    logits = model(X_train, is_training=True)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_train))

    # Optimizer.
    optimizer = tf.train.AdamOptimizer(0.002).minimize(loss)

    # Predictions. The evaluation graphs reuse the same variables but are
    # built without dropout.
    y_pred = tf.nn.softmax(logits)
    y_valid = tf.nn.softmax(model(X_valid))
    y_test = tf.nn.softmax(model(X_test))
        
        
        

    
    
    

In [0]:
# Let's try interesting architectures for this problem.
#1 - LeNet-5

graph_lenet = tf.Graph()
with graph_lenet.as_default():
    # TODO: the LeNet-5 layers were never written; `pass` keeps this cell
    # syntactically valid until the model is added.
    pass

In [0]:
num_epochs = 40

with tf.Session(graph=graph_conv1) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(1, num_epochs + 1):
        # Progress is only reported on every fourth epoch.
        report_epoch = (epoch % 4 == 0)

        # Walk over the training set in consecutive full-size batches; the
        # trailing train_size % batch_size examples are never visited because
        # the placeholders require exactly batch_size rows.
        for start_batch in range(0, train_size - batch_size + 1, batch_size):
            stop_batch = start_batch + batch_size
            batch_data = train_dataset[start_batch:stop_batch]
            batch_labels = train_labels[start_batch:stop_batch]

            _, l, predictions = sess.run(
                [optimizer, loss, y_pred],
                feed_dict={X_train: batch_data, y_train: batch_labels})

            if report_epoch and start_batch % 51200 == 0:
                print('Minibatch loss at epoch %d: %f' % (epoch, l))
                print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))

        if report_epoch:
            valid_accuracy = accuracy(sess.run(y_valid), valid_labels)
            print("\nValidation accuracy after epoch {} is {} \n".format(epoch, valid_accuracy))

    test_accuracy = accuracy(sess.run(y_test), test_labels)
    print("\nTest accuracy is {} \n".format(test_accuracy))
                
            
            

NameError: ignored

## Results

lr = 0.0001 -> 83.64(76.21) with 75.8

with 512 nodes in fc1
lr = 0.001 ->87.91(81.58) with 83.6


## Weight Initialization is extremely important
Ok! An amazing result
The Google authors set the stddev of the conv weights to 0.1 instead of 1 in their original code.
I tried it and, amazingly, the results are very different:

lr = 0.001 -> 94.12(87.84!!) with mb_acc = 91.2 in 16 epochs
lr = 0.001 -> 95.44(89.52!!!) with mb_acc = 93.4 in 32 epochs
lr = 0.0015-> 95.34(89.89!!!) with mb_acc = 95.7 in 64 epochs with dropout on last layer with keep_prob = 0.
The same network was able to reach 90% val accuracy at the 68th epoch

for fc_2_nodes = 256
lr = 0.001 -> 92.31(86.24) with 89.1 in 8 epochs. This is very different from the earlier results with the same network structure and the same lr, where the accuracy started falling pretty quickly.


Now, a research paper said that stddev should be sqrt(2/n) for relu layers and sqrt(1/n) for the others. I don't know whether they meant it for conv layer weights as well, but anyhow I set stddev = sqrt(1/n) for all layers except the relu ones and....

By 4TH EPOCH ONLY Val. accuracy was at 91.19% !!!!(Yes by 4th epoch itself)
final results after 80 epochs->

97.01(92.12!!!!!!!) with 99.0

With a modified network structure, bs = 4096, with two fully connected layers between conv and logits each with 512 nodes

keep_prob for both layers kept at 0.9
lr = 0.001 -> 96.94(92.25) with 94.8




In [0]:
num_steps = 1001

# Step-based training loop. It runs on graph_conv1 and its tensors
# (X_train, y_train, y_pred, y_valid, y_test); the names used previously
# (graph, tf_train_dataset, ...) are not defined anywhere in this notebook.
with tf.Session(graph=graph_conv1) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Slide a batch_size window over the training data, wrapping around so
    # that the slice always contains exactly batch_size rows.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {X_train : batch_data, y_train : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, y_pred], feed_dict=feed_dict)
    if (step % 50 == 0):
      print('Minibatch loss at step %d: %f' % (step, l))
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.1f%%' % accuracy(
        y_valid.eval(), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(y_test.eval(), test_labels))