---
Introduction
---------
The code below was posted by endri.deliu in [this Deep Learning discussion](https://discussions.udacity.com/t/assignment-4-problem-2/46525/35). Below are a few of his posts:

I finally managed to smash my previous result on this dataset (97.3%) with a conv net similar in architecture to AlexNet. I used 4 convolutional layers and 2-3 fully connected layers on top. I am getting 92.8 - 92.9% on the validation set and 97.6 - 97.7% on the test set. One thing to note here is that I had to use a reduced test set of only 12k images to calculate the final test error; otherwise my machine would run out of memory if it used the full test set of 18k on the last step.

I didn't use the Adam optimizer; I will give it a try at a later point. One thing I did play with was ELU neurons (as opposed to ReLU). ELUs use an exponential function similar in shape to ReLUs, but they can have negative values and really start shining in deeper nets. Using ELUs on my deep net of 5 conv layers and 2 fully connected layers made the model converge to the final result much sooner (about 2-3x faster), and I was able to achieve 93.4% on the validation set and 97.8% on the test set. ELUs make sense if you have a lot of convolutions and/or fully connected layers and are also cheaper computationally. My previous conv net was using batch normalization and got me about 97.7%. Using ELUs did speed things up significantly. The ELU paper: http://arxiv.org/abs/1511.07289. If anyone is interested I can post the code of my conv net. I am pretty sure that with further tweaking of the hyperparameters you can get even better accuracy. Again, I had to use just 12k images (out of 18k) to calculate the test set error; otherwise my computer would throw an out-of-memory error.

Net architectures usually sharpen towards the output layer, so that you force them to learn the main classification features; they are pyramid-like, the more so for convolutional layers. The rest of the parameters were chosen by trial and error: choose whatever works better on the validation set.





In [None]:
# Hyperparameters.
batch_size = 16        # examples per training minibatch
patch_size = 3         # side length of every (square) convolution kernel
depth = 16             # channels of the first conv layer; deeper layers use multiples
num_hidden = 705       # units in the first fully connected layer
num_hidden_last = 205  # units in the second fully connected layer

graph = tf.Graph()

with graph.as_default():

  # Input data. Training batches are fed at run time through placeholders;
  # the validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables: five convolutional layers whose channel count grows from
  # depth to depth * 16.
  layerconv1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layerconv1_biases = tf.Variable(tf.zeros([depth]))
  layerconv2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth * 2], stddev=0.1))
  layerconv2_biases = tf.Variable(tf.zeros([depth * 2]))

  layerconv3_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth * 2, depth * 4], stddev=0.03))
  layerconv3_biases = tf.Variable(tf.zeros([depth * 4]))

  layerconv4_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth * 4, depth * 4], stddev=0.03))
  layerconv4_biases = tf.Variable(tf.zeros([depth * 4]))

  layerconv5_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth * 4, depth * 16], stddev=0.03))
  layerconv5_biases = tf.Variable(tf.zeros([depth * 16]))

  # Number of inputs to the first fully connected layer, i.e. the size of the
  # flattened output of the conv stack. model() applies four 2x2, stride-2
  # max-pools with 'SAME' padding, each mapping a spatial size n to
  # ceil(n / 2), and the last conv layer emits depth * 16 channels. Deriving
  # the value from the pooling count keeps this matrix in step with the model
  # for any image_size and always yields an int (plain `/` is float division
  # on Python 3).
  num_pool_layers = 4
  pooled_size = image_size
  for _ in range(num_pool_layers):
    pooled_size = (pooled_size + 1) // 2  # ceiling division by the stride
  flattened_size = pooled_size * pooled_size * (depth * 16)

  # Variables: three fully connected layers on top of the conv stack.
  layer3_weights = tf.Variable(tf.truncated_normal(
      [flattened_size, num_hidden], stddev=0.03))
  layer3_biases = tf.Variable(tf.zeros([num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_hidden_last], stddev=0.0532))
  layer4_biases = tf.Variable(tf.zeros([num_hidden_last]))

  layer5_weights = tf.Variable(tf.truncated_normal(
      [num_hidden_last, num_labels], stddev=0.1))
  layer5_biases = tf.Variable(tf.zeros([num_labels]))

  # Model.
  def model(data, use_dropout=False, keep_prob=0.75):
    """Build the forward pass and return the unnormalised class logits.

    The network is five 'SAME'-padded, stride-1 convolutions with ELU
    activations, followed by three fully connected layers. A 2x2, stride-2
    max-pool follows every convolution except the second one.

    Args:
      data: 4-D input tensor. Its batch dimension must be statically known,
        because it is used to build the reshape target below.
      use_dropout: when True, apply dropout after each of the two hidden
        fully connected layers (intended for the training graph only).
      keep_prob: keep probability handed to tf.nn.dropout when use_dropout
        is True.

    Returns:
      A 2-D tensor of logits, one row per input example.
    """
    # (weights, biases, pool_after) for each convolutional stage, in order.
    conv_stages = [
        (layerconv1_weights, layerconv1_biases, True),
        (layerconv2_weights, layerconv2_biases, False),
        (layerconv3_weights, layerconv3_biases, True),
        (layerconv4_weights, layerconv4_biases, True),
        (layerconv5_weights, layerconv5_biases, True),
    ]

    net = data
    for weights, biases, pool_after in conv_stages:
      conv = tf.nn.conv2d(net, weights, [1, 1, 1, 1], padding='SAME')
      net = tf.nn.elu(conv + biases)
      if pool_after:
        net = tf.nn.max_pool(net, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

    # Flatten the feature maps for the fully connected layers. The flattened
    # width must equal the number of rows of layer3_weights.
    shape = net.get_shape().as_list()
    print(shape)  # shows the final feature-map shape once per model() call
    reshape = tf.reshape(net, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.elu(tf.matmul(reshape, layer3_weights) + layer3_biases)

    if use_dropout:
      hidden = tf.nn.dropout(hidden, keep_prob)

    hidden = tf.nn.elu(tf.matmul(hidden, layer4_weights) + layer4_biases)

    if use_dropout:
      hidden = tf.nn.dropout(hidden, keep_prob)

    return tf.matmul(hidden, layer5_weights) + layer5_biases
  
  # Training computation: mean softmax cross-entropy over the minibatch, with
  # dropout enabled in the model. The arguments are passed by keyword because
  # the positional order of (logits, labels) is not the same across
  # TensorFlow releases, while the keyword names are.
  logits = model(tf_train_dataset, True)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
      labels=tf_train_labels, logits=logits))

  # Step counter, incremented by the optimizer on every update. It is
  # bookkeeping rather than a model parameter, so it is marked non-trainable.
  global_step = tf.Variable(0, trainable=False)
  # Start at 0.1 and multiply by 0.86 after every 3000 steps (staircase).
  learning_rate = tf.train.exponential_decay(0.1, global_step, 3000, 0.86, staircase=True)

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

  # Predictions for the training, validation, and test data. The validation
  # and test graphs call model() with its default use_dropout=False.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))


num_steps = 95001

# Train with minibatch SGD, reporting progress every 500 steps and the test
# accuracy once at the end. The print calls are written so that they emit the
# same text on Python 2 and Python 3.
with tf.Session(graph=graph) as session:
  # NOTE(review): later TensorFlow releases deprecate this initializer in
  # favour of tf.global_variables_initializer() -- switch once the minimum
  # supported TensorFlow version is confirmed.
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Slide a batch_size-wide window over the training set, wrapping around
    # before the window would run past the end.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, batch_loss, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if step % 500 == 0:
      print("Minibatch loss at step %s : %s" % (step, batch_loss))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
      print(time.ctime())
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))