Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [5]:
pickle_file = 'notMNIST.pickle'

# Read the pickled dictionary once, then unpack the six arrays it holds
# (train / validation / test images and their labels) into module-level names.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [6]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  """Return (images, one_hot_labels) in the layout the conv nets below consume.

  Images are reshaped to [N, image_size, image_size, num_channels] and cast
  to float32; integer labels are expanded into float32 one-hot rows of
  length num_labels.
  """
  images = dataset.reshape(-1, image_size, image_size, num_channels)
  images = images.astype(np.float32)
  # Comparing an (N, 1) column of labels against the row 0..num_labels-1
  # broadcasts to an (N, num_labels) boolean one-hot matrix.
  class_ids = np.arange(num_labels)
  one_hot = (labels[:, None] == class_ids).astype(np.float32)
  return images, one_hot
# Convert every split; the `_c` suffix marks the convolution-ready copies so
# the raw arrays loaded from the pickle keep their original names.
(
  (train_dataset_c, train_labels_c),
  (valid_dataset_c, valid_labels_c),
  (test_dataset_c, test_labels_c),
) = (
  reformat(train_dataset, train_labels),
  reformat(valid_dataset, valid_labels),
  reformat(test_dataset, test_labels),
)

# Show the new shapes of each split.
print('Training set', train_dataset_c.shape, train_labels_c.shape)
print('Validation set', valid_dataset_c.shape, valid_labels_c.shape)
print('Test set', test_dataset_c.shape, test_labels_c.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [7]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label.

  Both arguments are 2-D arrays with one row per example; the class of a row
  is the index of its largest entry along axis 1.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

Let's build a small network with three convolutional layers, followed by two fully connected layers. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [283]:
batch_size = 200
num_hidden = 200  # not referenced by this graph; L4 below sets the hidden size
# neural network structure for this sample:
#
# · · · · · · · · · ·    (input data, 1-deep)                 X [batch, 28, 28, 1]
# @ @ @ @ @ @ @ @ @ @ -- conv. layer 6x6x1=>6 stride 1        W1 [6, 6, 1, 6]        B1 [6]
# ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶                                         Y1 [batch, 28, 28, 6]
#   @ @ @ @ @ @ @ @   -- conv. layer 5x5x6=>12 stride 2       W2 [5, 5, 6, 12]       B2 [12]
#   ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶                                           Y2 [batch, 14, 14, 12]
#     @ @ @ @ @ @     -- conv. layer 4x4x12=>24 stride 2      W3 [4, 4, 12, 24]      B3 [24]
#     ∶∶∶∶∶∶∶∶∶∶∶                                             Y3 [batch, 7, 7, 24] => reshaped to YY [batch, 7*7*24]
#      \x/x\x\x/ ✞    -- fully connected layer (relu+dropout) W4 [7*7*24, 200]       B4 [200]
#       · · · ·                                               Y4 [batch, 200]
#       \x/x\x/       -- fully connected layer (softmax)      W5 [200, 10]           B5 [10]
#        · · ·                                                Y [batch, 10]

graph = tf.Graph()

with graph.as_default():

  # Input data. Training batches are fed at run time; the validation and test
  # sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset_c)
  tf_test_dataset = tf.constant(test_dataset_c)

  # Variables.
  # learning-rate placeholder; not referenced by the optimizer below, which
  # uses the decayed `learning_rate` tensor instead
  lr = tf.placeholder(tf.float32)
  # keep probability for dropout, fed at run time
  pkeep = tf.placeholder(tf.float32)
  # three convolutional layers with their channel counts, and a
  # fully connected layer (the last layer has num_labels softmax neurons)
  L1 = 6  # first convolutional layer output depth
  L2 = 12  # second convolutional layer output depth
  L3 = 24  # third convolutional layer output depth
  L4 = 200  # fully connected layer

  # 6x6 patch, 1 input channel, L1 output channels.
  # The scale was written as np.sqrt(2/28*28), which evaluates left to right
  # to sqrt(2) on Python 3 (and to 0 with Python 2 integer division); the
  # parentheses give the evidently intended sqrt(2 / (28 * 28)).
  W1 = tf.Variable(tf.truncated_normal([6, 6, 1, L1], stddev=np.sqrt(2.0 / (28 * 28))))
  B1 = tf.Variable(tf.constant(0.1, tf.float32, [L1]))
  W2 = tf.Variable(tf.truncated_normal([5, 5, L1, L2], stddev=0.1))
  B2 = tf.Variable(tf.constant(0.1, tf.float32, [L2]))
  W3 = tf.Variable(tf.truncated_normal([4, 4, L2, L3], stddev=0.1))
  B3 = tf.Variable(tf.constant(0.1, tf.float32, [L3]))

  W4 = tf.Variable(tf.truncated_normal([7 * 7 * L3, L4], stddev=0.1))
  B4 = tf.Variable(tf.constant(0.1, tf.float32, [L4]))
  W5 = tf.Variable(tf.truncated_normal([L4, num_labels], stddev=0.1))
  B5 = tf.Variable(tf.constant(0.1, tf.float32, [num_labels]))

  # Model.
  def model(data, dropout=True):
    """Build the forward pass on `data` and return the un-normalised logits.

    Args:
      data: image tensor laid out as [batch, 28, 28, 1], as in the diagram
        at the top of this cell.
      dropout: when True, dropout with keep probability `pkeep` is applied
        to the hidden fully connected layer (training path); when False the
        layer output is used unchanged (validation / test path), so `pkeep`
        does not have to be fed.

    Returns:
      Tensor of logits with one row per example and num_labels columns.
    """
    # stride 1: output is 28x28
    Y1 = tf.nn.relu(tf.nn.conv2d(data, W1, strides=[1, 1, 1, 1], padding='SAME') + B1)
    # stride 2: output is 14x14
    Y2 = tf.nn.relu(tf.nn.conv2d(Y1, W2, strides=[1, 2, 2, 1], padding='SAME') + B2)
    # stride 2: output is 7x7
    Y3 = tf.nn.relu(tf.nn.conv2d(Y2, W3, strides=[1, 2, 2, 1], padding='SAME') + B3)

    # reshape the output from the third convolution for the fully connected layer
    YY = tf.reshape(Y3, shape=[-1, 7 * 7 * L3])

    Y4 = tf.nn.relu(tf.matmul(YY, W4) + B4)
    if dropout:
      Y4 = tf.nn.dropout(Y4, pkeep)
    return tf.matmul(Y4, W5) + B5

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # `batch` counts optimizer steps (it is incremented by minimize()).
  batch = tf.Variable(0, trainable=False)
  train_size = train_labels.shape[0]
  # NOTE(review): decay_steps is the number of training examples while
  # `batch` advances once per mini-batch, so with staircase=True the rate
  # stays at 0.1 until `train_size` optimizer steps have run -- more than the
  # 3001 steps of the training cell for the 200000-example set printed above.
  # If one decay per epoch is intended, pass `batch * batch_size` as the step.
  learning_rate = tf.train.exponential_decay(0.1,
                                             batch,
                                             train_size,
                                             0.01,
                                             staircase=True)

  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=batch)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset, False))
  test_prediction = tf.nn.softmax(model(tf_test_dataset, False))

In [284]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  # Modulus that keeps every [offset, offset + batch_size) slice inside the
  # training set.
  wrap_at = train_labels_c.shape[0] - batch_size
  for step in range(num_steps):
    # Walk through the training data one mini-batch at a time, wrapping around.
    offset = (step * batch_size) % wrap_at
    batch_stop = offset + batch_size
    batch_data = train_dataset_c[offset:batch_stop]
    batch_labels = train_labels_c[offset:batch_stop]
    # Training keeps each hidden unit with probability 0.5.
    feed_dict = {
      tf_train_dataset: batch_data,
      tf_train_labels: batch_labels,
      pkeep: 0.5,
    }
    _, batch_loss, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 50 != 0:
      continue
    # Progress report every 50 steps.
    print('Minibatch loss at step %d: %f' % (step, batch_loss))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels_c))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels_c))

# original = 89.1%
# with biases set to 0.1 or num of entry points / 10 = 90.2%
#  learning_rate = tf.train.exponential_decay(lr start = 0.099 and step decay 0.0001 = 90.7%
# batch_size = 300 91.9%
# bs = 200, ps = 6, depth = 24, num_hidden = 64, dropout = 0.9  - accuracy = 92.3% 
# bs = 200, ps = 6, depth = 32, num_hidden = 100, dropout = 0.9  - accuracy = 92.9% 
# bs = 200, ps = 6, depth = 32, num_hidden = 200, dropout = 0.5  - accuracy = 92.9% 
# bs = 200, ps = 6, depth = 48, num_hidden = 100, dropout = 0.9  - accuracy = 93.1% 
# bs = 200, ps = 6, depth = 64, num_hidden = 200, dropout = 0.9  - accuracy = 93.4% 
# bs = 300, ps = 6, depth = 64, num_hidden = 200, dropout = 0.9  - accuracy = 93% 
# bs = 200, ps = 5, depth = 64, num_hidden = 200, dropout = 0.9  - accuracy = 93.3% 
# bs = 200, ps = 5, depth = 32, num_hidden = 200, dropout = 0.9  - accuracy = 93.7% 
# bs = 200, ps = 6, depth = 44, num_hidden = 200, dropout = 0.9  - accuracy = 93% 
# bs = 200, ps = 6, depth = 44, num_hidden = 32, dropout = 0.9  - accuracy = 91.8% 
# bs = 200, ps = 5, depth = 32, num_hidden = 600, dropout = 0.9  - accuracy = 93.7%
# bs = 200, ps = 5, depth = 32, num_hidden = 800, dropout = 0.9  - accuracy = 93.6%
# bs = 200, ps = 5, depth = 32, num_hidden = 200, dropout = 0.75  - accuracy = 92.8%
# bs = 200, ps = 5, depth = 32, num_hidden = 200, dropout = 0.5  - accuracy = 92.6%
# new architecture
# L1, L2, L3, L4 = 6, 12, 24, 200, bs = 200, lr decay = 0.099, 0.0001, steps = 3001 = 94.6%
# L1, L2, L3, L4 = 6, 12, 24, 200, bs = 200, lr decay = 0.1, 0.01, steps = 3001 = 94.9%
# L1, L2, L3, L4 = 6, 12, 24, 200, bs = 200, lr decay = 0.2, 0.01, steps = 1001 = 93.6%
# L1, L2, L3, L4 = 6, 12, 24, 200, bs = 200, lr decay = 0.2, 0.01, steps = 3001 = 93.7%
# L1, L2, L3, L4 = 6, 12, 24, 200, bs = 200, lr decay = 0.09, 0.01, steps = 3001 = 94.4%

Initialized
Minibatch loss at step 0: 11.841475
Minibatch accuracy: 5.5%
Validation accuracy: 8.8%


KeyboardInterrupt: 

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [41]:
batch_size = 16
num_hidden = 200  # not referenced by this graph; L4 below sets the hidden size
image_size = 28
num_channels = 1
# neural network structure for this sample (every convolution uses stride 1
# with SAME padding; down-sampling is done by 2x2 max pooling):
#
#   input data, 1-deep                                          X  [batch, 28, 28, 1]
#   conv 6x6x1=>6    + max pool stride 2   W1 [6, 6, 1, 6]      B1 [6]     Y1 [batch, 14, 14, 6]
#   conv 5x5x6=>16   + max pool stride 2   W2 [5, 5, 6, 16]     B2 [16]    Y2 [batch, 7, 7, 16]
#   conv 4x4x16=>120 + max pool stride 1   W3 [4, 4, 16, 120]   B3 [120]   Y3 [batch, 7, 7, 120] => reshaped to YY [batch, 7*7*120]
#   fully connected (relu+dropout)         W4 [7*7*120, 84]     B4 [84]    Y4 [batch, 84]
#   fully connected (softmax)              W5 [84, 10]          B5 [10]    Y  [batch, 10]

graph = tf.Graph()

with graph.as_default():

  # Input data. Training batches are fed at run time; the validation and test
  # sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset_c)
  tf_test_dataset = tf.constant(test_dataset_c)

  # Variables.
  # learning-rate placeholder; not referenced by the Adagrad optimizer below
  lr = tf.placeholder(tf.float32)
  # keep probability for dropout, fed at run time
  pkeep = tf.placeholder(tf.float32)
  # three convolutional layers with their channel counts, and a
  # fully connected layer (the last layer has num_labels softmax neurons)
  L1 = 6  # first convolutional layer output depth
  L2 = 16  # second convolutional layer output depth
  L3 = 120  # third convolutional layer output depth
  L4 = 84  # fully connected layer

  W1 = tf.Variable(tf.truncated_normal([6, 6, 1, L1], stddev=0.1))  # 6x6 patch, 1 input channel, L1 output channels
  B1 = tf.Variable(tf.constant(0.1, tf.float32, [L1]))
  W2 = tf.Variable(tf.truncated_normal([5, 5, L1, L2], stddev=0.1))
  B2 = tf.Variable(tf.constant(0.1, tf.float32, [L2]))
  W3 = tf.Variable(tf.truncated_normal([4, 4, L2, L3], stddev=0.1))
  B3 = tf.Variable(tf.constant(0.1, tf.float32, [L3]))

  W4 = tf.Variable(tf.truncated_normal([7 * 7 * L3, L4], stddev=0.1))
  B4 = tf.Variable(tf.constant(0.1, tf.float32, [L4]))
  W5 = tf.Variable(tf.truncated_normal([L4, num_labels], stddev=0.1))
  B5 = tf.Variable(tf.constant(0.1, tf.float32, [num_labels]))

  # Model.
  def model(data, dropout=True):
    """Build the forward pass on `data` and return the un-normalised logits.

    Args:
      data: image tensor laid out as [batch, 28, 28, 1], as in the summary
        at the top of this cell.
      dropout: when True, dropout with keep probability `pkeep` is applied
        to the hidden fully connected layer (training path); when False the
        layer output is used unchanged (validation / test path), so `pkeep`
        does not have to be fed.

    Returns:
      Tensor of logits with one row per example and num_labels columns.
    """
    k = 2  # stride of the down-sampling max pools
    # convolution with stride 1: output is 28x28
    Y1 = tf.nn.relu(tf.nn.conv2d(data, W1, strides=[1, 1, 1, 1], padding='SAME') + B1)
    # after max_pool with stride 2 (the same as kernel) output is 14x14
    Y1 = tf.nn.max_pool(Y1, ksize=[1, 2, 2, 1], strides=[1, k, k, 1], padding='SAME')
    Y2 = tf.nn.relu(tf.nn.conv2d(Y1, W2, strides=[1, 1, 1, 1], padding='SAME') + B2)
    # after max_pool output is 7x7
    Y2 = tf.nn.max_pool(Y2, ksize=[1, 2, 2, 1], strides=[1, k, k, 1], padding='SAME')
    Y3 = tf.nn.relu(tf.nn.conv2d(Y2, W3, strides=[1, 1, 1, 1], padding='SAME') + B3)
    # The last pool uses stride 1, so the output is still 7x7, which is the
    # size the reshape below and W4 are built for.
    Y3 = tf.nn.max_pool(Y3, ksize=[1, 2, 2, 1], strides=[1, 1, 1, 1], padding='SAME')

    # reshape the output from the third convolution for the fully connected layer
    YY = tf.reshape(Y3, shape=[-1, 7 * 7 * L3])

    Y4 = tf.nn.relu(tf.matmul(YY, W4) + B4)
    if dropout:
      Y4 = tf.nn.dropout(Y4, pkeep)
    return tf.matmul(Y4, W5) + B5

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer. The disabled alternative in this cell was plain gradient
  # descent driven by tf.train.exponential_decay(0.05, batch, train_size,
  # 0.01, staircase=True); Adagrad with a fixed rate is used instead.
  optimizer = tf.train.AdagradOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset, False))
  test_prediction = tf.nn.softmax(model(tf_test_dataset, False))

In [42]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  # Offsets are taken modulo this value so a full batch always fits.
  offset_limit = train_labels_c.shape[0] - batch_size
  for step in range(num_steps):
    # Pick the next mini-batch, cycling through the training data.
    offset = (step * batch_size) % offset_limit
    batch_rows = slice(offset, offset + batch_size)
    batch_data = train_dataset_c[batch_rows]
    batch_labels = train_labels_c[batch_rows]
    # Dropout keep probability used while training.
    feed_dict = {
      tf_train_dataset: batch_data,
      tf_train_labels: batch_labels,
      pkeep: 0.5,
    }
    fetches = [optimizer, loss, train_prediction]
    _, minibatch_loss, predictions = session.run(fetches, feed_dict=feed_dict)
    if step % 50 != 0:
      continue
    # Every 50th step: report minibatch and validation performance.
    print('Minibatch loss at step %d: %f' % (step, minibatch_loss))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels_c))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels_c))

# bs = 16, Layers: 6,16,120,84, Adagrad(0.05), num_steps = 3001 - 93%

Initialized
Minibatch loss at step 0: 3.431571
Minibatch accuracy: 12.5%
Validation accuracy: 9.8%
Minibatch loss at step 50: 2.177061
Minibatch accuracy: 31.2%
Validation accuracy: 26.1%
Minibatch loss at step 100: 1.566627
Minibatch accuracy: 37.5%
Validation accuracy: 43.0%
Minibatch loss at step 150: 1.319181
Minibatch accuracy: 56.2%
Validation accuracy: 62.3%
Minibatch loss at step 200: 0.898019
Minibatch accuracy: 68.8%
Validation accuracy: 69.1%
Minibatch loss at step 250: 1.098106
Minibatch accuracy: 68.8%
Validation accuracy: 69.1%
Minibatch loss at step 300: 1.162069
Minibatch accuracy: 68.8%
Validation accuracy: 78.0%
Minibatch loss at step 350: 1.379933
Minibatch accuracy: 56.2%
Validation accuracy: 80.1%
Minibatch loss at step 400: 0.433599
Minibatch accuracy: 81.2%
Validation accuracy: 77.8%
Minibatch loss at step 450: 1.356342
Minibatch accuracy: 68.8%
Validation accuracy: 79.6%
Minibatch loss at step 500: 0.596271
Minibatch accuracy: 87.5%
Validation accuracy: 82.5%
Mi

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

### Code by endri.deliu from the Udacity forum

In [None]:
batch_size = 16
patch_size = 3
depth = 16
num_hidden = 705
num_hidden_last = 205

graph = tf.Graph()

with graph.as_default():

  # Input data. The validation and test constants use the reshaped `_c`
  # arrays ([N, image_size, image_size, num_channels]) because conv2d needs
  # 4-D input; the raw pickle arrays have no channel axis.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset_c)
  tf_test_dataset = tf.constant(test_dataset_c)

  # Variables: five convolutional layers ...
  layerconv1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layerconv1_biases = tf.Variable(tf.zeros([depth]))
  layerconv2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth * 2], stddev=0.1))
  layerconv2_biases = tf.Variable(tf.zeros([depth * 2]))

  layerconv3_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth * 2, depth * 4], stddev=0.03))
  layerconv3_biases = tf.Variable(tf.zeros([depth * 4]))

  layerconv4_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth * 4, depth * 4], stddev=0.03))
  layerconv4_biases = tf.Variable(tf.zeros([depth * 4]))

  layerconv5_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth * 4, depth * 16], stddev=0.03))
  layerconv5_biases = tf.Variable(tf.zeros([depth * 16]))

  # ... followed by three fully connected layers.
  # The model applies four stride-2 SAME max pools, each mapping a side of
  # length n to ceil(n / 2) (28 -> 14 -> 7 -> 4 -> 2 for image_size = 28),
  # and the last convolution outputs depth * 16 channels.  The size is
  # computed with integer arithmetic: the previous expression
  # `image_size / 7 * image_size / 7 * (depth * 4)` produces a float under
  # Python 3 true division, which is not a valid shape entry.
  pooled_size = image_size
  for _ in range(4):
    pooled_size = (pooled_size + 1) // 2
  layer3_weights = tf.Variable(tf.truncated_normal(
      [pooled_size * pooled_size * (depth * 16), num_hidden], stddev=0.03))
  layer3_biases = tf.Variable(tf.zeros([num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_hidden_last], stddev=0.0532))
  layer4_biases = tf.Variable(tf.zeros([num_hidden_last]))

  layer5_weights = tf.Variable(tf.truncated_normal(
      [num_hidden_last, num_labels], stddev=0.1))
  layer5_biases = tf.Variable(tf.zeros([num_labels]))

  # Model.
  def model(data, use_dropout=False):
    """Build the forward pass on `data` and return the un-normalised logits.

    Args:
      data: 4-D image tensor [batch, image_size, image_size, num_channels]
        with a statically known shape (the shape is read via get_shape()).
      use_dropout: when True, dropout with keep probability 0.75 is applied
        after each of the two hidden fully connected layers.

    Returns:
      Tensor of logits with one row per example and num_labels columns.
    """
    conv = tf.nn.conv2d(data, layerconv1_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.elu(conv + layerconv1_biases)
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

    # Second convolution: no pooling after this layer.
    conv = tf.nn.conv2d(pool, layerconv2_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.elu(conv + layerconv2_biases)

    conv = tf.nn.conv2d(hidden, layerconv3_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.elu(conv + layerconv3_biases)
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

    conv = tf.nn.conv2d(pool, layerconv4_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.elu(conv + layerconv4_biases)
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

    conv = tf.nn.conv2d(pool, layerconv5_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.elu(conv + layerconv5_biases)
    pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

    # Flatten the last feature map for the fully connected layers; the shape
    # is printed once per model() call while the graph is being built.
    shape = pool.get_shape().as_list()
    print(shape)
    reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.elu(tf.matmul(reshape, layer3_weights) + layer3_biases)

    if use_dropout:
      hidden = tf.nn.dropout(hidden, 0.75)

    nn_hidden_layer = tf.matmul(hidden, layer4_weights) + layer4_biases
    hidden = tf.nn.elu(nn_hidden_layer)

    if use_dropout:
      hidden = tf.nn.dropout(hidden, 0.75)

    return tf.matmul(hidden, layer5_weights) + layer5_biases

  # Training computation. The arguments are passed by keyword, matching the
  # other cells of this notebook, so labels and logits cannot be swapped.
  logits = model(tf_train_dataset, True)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.1, global_step, 3000, 0.86, staircase=True)

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))


import time  # timestamp printed with every progress report

num_steps = 5001
# original 95001

with tf.Session(graph=graph) as session:
  # Same initializer the other training cells of this notebook use.
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Slice the reshaped `_c` arrays: the 4-D / 2-D indexing below does not
    # fit the raw pickle arrays (3-D images, 1-D labels).
    offset = (step * batch_size) % (train_labels_c.shape[0] - batch_size)
    batch_data = train_dataset_c[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels_c[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      # print() calls: this notebook imports print_function, under which
      # the former print statements are syntax errors.
      print('Minibatch loss at step', step, ':', l)
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      # accuracy() takes the arg-max over axis 1 of the labels, so it needs
      # the one-hot `_c` label arrays.
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels_c))
      print(time.ctime())
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels_c))


### Code by cobi.bento from the Udacity forum

In [None]:
batch_size = 16
num_channels = 1

# Output depth and kernel size of the three convolutional layers.
c1_depth = 6
c1_ker_sz = 5
c3_depth = 16
c3_ker_sz = 6
c5_depth = 120
c5_ker_sz = 6

num_hidden = 84

graph = tf.Graph()

with graph.as_default():

    # Input data. The validation and test constants use the reshaped `_c`
    # arrays ([N, image_size, image_size, num_channels]) because conv2d needs
    # 4-D input; the raw pickle arrays have no channel axis.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset_c)
    tf_test_dataset = tf.constant(test_dataset_c)

    # Variables.
    c1_weights = tf.Variable(tf.truncated_normal(
        [c1_ker_sz, c1_ker_sz, num_channels, c1_depth], stddev=0.1))
    c1_biases = tf.Variable(tf.zeros([c1_depth]))
    c3_weights = tf.Variable(tf.truncated_normal(
        [c3_ker_sz, c3_ker_sz, c1_depth, c3_depth], stddev=0.1))
    c3_biases = tf.Variable(tf.constant(1.0, shape=[c3_depth]))
    c5_weights = tf.Variable(tf.truncated_normal(
        [c5_ker_sz, c5_ker_sz, c3_depth, c5_depth], stddev=0.1))
    c5_biases = tf.Variable(tf.constant(1.0, shape=[c5_depth]))
    # Side length after the three stride-2 SAME max pools in model(): each
    # pool maps n to (n + 1) // 2.
    c5_conv_dim = (((((image_size+1)//2) + 1) // 2) + 1 )//2
    fc_weights = tf.Variable(tf.truncated_normal(
        [c5_conv_dim * c5_conv_dim * c5_depth, num_hidden], stddev=0.1))
    fc_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    out_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    out_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

    # Model.
    def model(data):
        """Build the forward pass on `data` and return the un-normalised logits.

        Three conv + relu + 2x2 max-pool stages are followed by one hidden
        fully connected layer and the output layer.  Intermediate shapes are
        printed while the graph is being built.

        Args:
            data: 4-D image tensor [batch, image_size, image_size,
                num_channels] with a statically known shape (the shape is
                read via get_shape()).

        Returns:
            Tensor of logits with one row per example and num_labels columns.
        """
        print(data.get_shape().as_list())
        conv = tf.nn.conv2d(data, c1_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + c1_biases)
        print(conv.get_shape().as_list())
        pooled = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
        print(pooled.get_shape().as_list())

        conv = tf.nn.conv2d(pooled, c3_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + c3_biases)
        pooled = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
        shape = pooled.get_shape().as_list()
        print(shape)

        conv = tf.nn.conv2d(pooled, c5_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + c5_biases)
        pooled = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
        shape = pooled.get_shape().as_list()
        print(shape)

        # Flatten the last feature map for the fully connected layers.
        reshape = tf.reshape(pooled, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, fc_weights) + fc_biases)
        return tf.matmul(hidden, out_weights) + out_biases

    # Training computation. The arguments are passed by keyword, matching the
    # other cells of this notebook, so labels and logits cannot be swapped.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(0.05).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))
