## Deep Learning

### Assignment 4

Previously in 2_fullyconnected.ipynb and 3_regularization.ipynb, we trained fully connected networks to classify notMNIST characters.

The goal of this assignment is make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
import os

In [5]:
data_path = '/Users/heany/code/AI-For-NLP/dataset'
pickle_file = 'notMNIST.pickle'

with open(os.path.join(data_path, pickle_file),'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save # hint to help gc free up memory
    
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


#### Reformat into a Tensorflow-friendly shape:

+ convolutions need the image data formatted as cube (width by height by # channels)
+ labels as float 1-hot encodings.

In [6]:
image_size = 28
num_labes = 10
num_channels = 1 # grayscale

def reformat(dataset, labels):
    dataset = dataset.reshape(
        (-1, image_size, image_size, num_channels)).astype(np.float32) # shape(-1, 28,28,1)
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    labels = (np.arange(num_labes) == labels[:,None]).astype(np.float32)
    return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)    

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [25]:
train_dataset[0].shape

(28, 28, 1)

In [11]:
def accuracy(predictions, labels):
    return ( 100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels,1)) / predictions.shape[0])

**Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.**



In [42]:
batch_size = 16
patch_size = 5 # size of window
depth = 16  # num of filters
num_hidden = 64


graph = tf.Graph()
with graph.as_default():
    
    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size,num_labels))
    
    lmbda = tf.placeholder(tf.float32) # add placeholder
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    # global_step = tf.Variable(0) # ==> Add for learning rate decay. count the number of steps taken.
    
    # Variables.
    layer1_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev = 0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    
    layer2_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, depth, depth], stddev = 0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    
    layer3_weights = tf.Variable(
        tf.truncated_normal([image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
    
    # Model
    def model(data):
        conv  = tf.nn.conv2d(data,layer1_weights, [1,2,2,1],padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        
        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2]* shape[3]]) # flatten => 28x28x1=784
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        
        return tf.matmul(hidden, layer4_weights) + layer4_biases
    
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.add(tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits)), 
                  lmbda * (tf.nn.l2_loss(layer1_weights) + tf.nn.l2_loss(layer2_weights) 
                           + tf.nn.l2_loss(layer3_weights) + tf.nn.l2_loss(layer4_weights)))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Prediction for the training, validation, and the test
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))
    
    
num_steps = 1001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, lmbda : 1e-3}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict)
        
        if step % 50 == 0:
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
    


Initialized
Minibatch loss at step 0: 4.384343
Minibatch accuracy: 0.0%
Validation accuracy: 10.0%
Minibatch loss at step 50: 3.202241
Minibatch accuracy: 0.0%
Validation accuracy: 10.0%
Minibatch loss at step 100: 3.081206
Minibatch accuracy: 18.8%
Validation accuracy: 10.0%
Minibatch loss at step 150: 3.073403
Minibatch accuracy: 12.5%
Validation accuracy: 10.0%
Minibatch loss at step 200: 3.067702
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibatch loss at step 250: 3.034269
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibatch loss at step 300: 2.958816
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibatch loss at step 350: 2.876323
Minibatch accuracy: 18.8%
Validation accuracy: 10.0%
Minibatch loss at step 400: 2.881592
Minibatch accuracy: 18.8%
Validation accuracy: 10.0%
Minibatch loss at step 450: 2.820594
Minibatch accuracy: 25.0%
Validation accuracy: 10.0%
Minibatch loss at step 500: 2.878308
Minibatch accuracy: 6.2%
Validation accuracy: 10.0%
Minibat

<h3 style="color: red"> implement by miwochai using the helper-function </h3>

In [26]:
def new_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

In [27]:
def new_biases(length):
    return tf.Variable(tf.constant(0.1, shape=[length]))

####  Helper-function for creating a new Fully-Connected Layer
This function creates a new fully-connected layer in the computational graph for TensorFlow. Nothing is actually calculated here, we are just adding the mathematical formulas to the TensorFlow graph.

It is assumed that the input is a 2-dim tensor of shape [num_images, num_inputs]. The output is a 2-dim tensor of shape [num_images, num_outputs].

In [24]:
def new_fc_layer(input, num_inputs, num_outputs, use_relu = True):
    # parameter input : the previous layer.
    # Create new weights and biases.
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length = num_outputs)
    
    # Calculate teh layer as the matrix multiplication of
    # the input and weights, and then add the bias-values.
    layer = tf.matmul(input, weights) + biases
    
    # Use ReLU?
    if use_relu:
        layer = tf.nn.relu(layer)
    return layer
    

#### Helper-function for creating a new Convolutional Layer
This function creates a new convolutional layer in the computational graph for TensorFlow. Nothing is actually calculated here, we are just adding the mathematical formulas to the TensorFlow graph.

It is assumed that the input is a 4-dim tensor with the following dimensions:

+ Image number.
+ Y-axis of each image.
+ X-axis of each image.
+ Channels of each image.
Note that the input channels may either be colour-channels, or it may be filter-channels if the input is produced from a previous convolutional layer.

The output is another 4-dim tensor with the following dimensions:

+ Image number, same as input.
+ Y-axis of each image. If 2x2 pooling is used, then the height and width of the input images is divided by 2.
+ X-axis of each image. Ditto.
+ Channels produced by the convolutional filters.

In [23]:
def new_conv_layer(input,             # The previous layer.
                  num_input_channels, # Num. channels in prev. layer.
                  filter_size,        # Width and height of each filter.
                  num_filters,        # Number of filters.
                  use_pooling=True):  # Use 2 x 2 max-pooling.
    # Shape of the filter-weights for the convolution.
    # This format is determined by the Tensorflow API.
    shape = [filter_size, filter_size, num_input_channels, num_filters]
    
    # Create new Weights filters with the given shape.
    weights = new_weights(shape = shape)
    
    # Create new biases, one for each filter
    biases = new_biases(length = num_filters)
    
    # Create the TensorFlow operation for convolution.
    # Note the strides are set to 1 in all dimensions.
    # The first and last stride must always be 1,
    # because the first is for the image-number and 
    # the last is for the input-channel.
    # But e.g. strides=[1, 2, 2, 1] would mean that the filter
    # is moved 2 pixels across the x- and y-axis of the image.
    # The padding is set to'SAME' which means the input image
    # is padded with zeroes so the size of the output is the same.
    layer = tf.nn.conv2d(input = input, filter = weights, strides = [1, 2, 2, 1], padding = 'SAME')
    
    # Add the biases to the results of the convolution.
    # A bias-value is added to each filter-channels.
    layer += biases
    
    # Use pooling to down-sample the image resolution?
    if use_pooling:
        # This is 2x2 max-pooling, which means that we
        # consider 2x2 windows and select the largest value
        # in each window. Then we move 2 pixel to the next window.
        layer = tf.nn.max_pool(value = layer,
                              ksize = [1, 2, 2, 1],
                              strides=[1, 2, 2, 1],
                              padding='SAME')
        
    # Rectified Linear Unit (ReLU)
    # It calculates max(x,0) for each input pixel x.
    # This adds some non-linearity to the formula and allows us 
    # to learn more complicated functions.
    layer = tf.nn.relu(layer)

    # Note that ReLU is normally executed before the pooling,
    # but since relu(max_pool(x)) == max_pooling(relu(x)) we can
    # save 75% of the relu-operations by max-pooling first.

    # We return both the resulting layer and the filter-weights
    # because we will plot the weights later.
    return layer, weights
        
    

#### Helper-function for flattening a layer
A convolutional layer produces an output tensor with 4 dimensions. We will add fully-connected layers after the convolution layers, so we need to reduce the 4-dim tensor to 2-dim which can be used as input to the fully-connected layer.

In [21]:
def flatten_layer(layer):
    # Get the shape of the input layer.
    layer_shape = layer.get_shape()
    
    # The shape of the input layer is assumed to be:
    # layer_shape == [num_images, img_height, img_width, num_channels]
    
    # The number of features is : img_height * img_width * num_channels
    # We can use a function from Tensorflow to calculate this.
    num_features = layer_shape[1:4].num_elements()
    
    # Reshape the layer to [num_images, num_features].
    # Note that we just set the size of the second dimension
    # to num_features and the size of the first dimension to -1
    # which means the size in that dimension is calculated
    # so the total size of the tensor is unchanged from the reshaping.
    layer_flat = tf.reshape(layer, [-1, num_features])
    
    # The shape of the flattened layer is nowL
    # [num_images, img_height * img_width * num_channels]
    
    # Return both the flattened layer and the number of features
    return layer_flat, num_features
    

### Problem 1
The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (nn.max_pool()) of stride 2 and kernel size 2.

In [58]:
batch_size = 64
patch_size = 5 # size of window
depth = 16  # num of filters
depth = 36
num_hidden = 64


graph = tf.Graph()
with graph.as_default():
    
    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size,num_labels))
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    
    # Variables.
    layer1_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev = 0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    
    layer2_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, depth, depth_2], stddev = 0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth_2]))
    
    layer3_weights = tf.Variable(
        tf.truncated_normal([image_size // 4 * image_size // 4 * depth_2, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
    
    # Model
    def model(data):
        conv1  = tf.nn.conv2d(data,layer1_weights, [1,1,1,1],padding='SAME')
        biases1 = tf.nn.relu(conv1 + layer1_biases)
        
        pool1 = tf.nn.max_pool(biases1, [1,2,2,1], [1,2,2,1], padding='SAME')
        
        conv2 = tf.nn.conv2d(pool1, layer2_weights, [1,1,1,1], padding='SAME')
        biases2 = tf.nn.relu(conv2 + layer2_biases)
        
        pool2 = tf.nn.max_pool(biases2, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
        
        shape = pool2.get_shape().as_list()
        reshape = tf.reshape(pool2, [shape[0], shape[1] * shape[2]* shape[3]]) # flatten => 28x28x1=784
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        
        return tf.matmul(hidden, layer4_weights) + layer4_biases
    
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
    
    # Prediction for the training, validation, and the test
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))
    
    
num_steps = 2001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict)
        
        if step % 50 == 0:
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
    


Initialized
Minibatch loss at step 0: 3.678619
Minibatch accuracy: 3.1%
Validation accuracy: 10.1%
Minibatch loss at step 50: 1.805728
Minibatch accuracy: 43.8%
Validation accuracy: 47.4%
Minibatch loss at step 100: 0.697275
Minibatch accuracy: 73.4%
Validation accuracy: 75.6%
Minibatch loss at step 150: 0.961410
Minibatch accuracy: 70.3%
Validation accuracy: 75.3%
Minibatch loss at step 200: 0.869984
Minibatch accuracy: 71.9%
Validation accuracy: 79.5%
Minibatch loss at step 250: 0.871272
Minibatch accuracy: 76.6%
Validation accuracy: 79.5%
Minibatch loss at step 300: 0.634850
Minibatch accuracy: 82.8%
Validation accuracy: 80.7%
Minibatch loss at step 350: 0.348089
Minibatch accuracy: 90.6%
Validation accuracy: 79.9%
Minibatch loss at step 400: 0.773133
Minibatch accuracy: 78.1%
Validation accuracy: 81.4%
Minibatch loss at step 450: 0.584569
Minibatch accuracy: 87.5%
Validation accuracy: 82.9%
Minibatch loss at step 500: 0.678189
Minibatch accuracy: 81.2%
Validation accuracy: 83.3%
Mi

### Problem 2
Try to get the best performance you can using a convolutional net. Look for example at the classic LeNet5 architecture, adding Dropout, and/or adding learning rate decay.



<hr />
<h3 style="color:red">The CNN below is loosely inspired by the LeNet5 architecture.</h3>

In [52]:
batch_size = 16
patch_size = 5 # size of window
depth = 16  # num of filters
num_hidden = 64


graph = tf.Graph()
with graph.as_default():
    
    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size,num_labels))
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    
    # Variables.
    layer1_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev = 0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    
    layer2_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, depth, depth], stddev = 0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    
    size3 = ((image_size - patch_size + 1) // 2 - patch_size + 1) // 2
    
    layer3_weights = tf.Variable(
        tf.truncated_normal([size3 * size3 * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
    
    # Model
    def model(data):
        # C1 input 28 x 28
        conv1  = tf.nn.conv2d(data,layer1_weights, [1,1,1,1],padding='VALID')
        biases1 = tf.nn.relu(conv1 + layer1_biases)
        
        # P1 input 24 x 24
        pool1 = tf.nn.max_pool(biases1, [1,2,2,1], [1,2,2,1], padding='VALID')
        
        # C2 input 12 x 12
        conv2 = tf.nn.conv2d(pool1, layer2_weights, [1,1,1,1], padding='VALID')
        biases2 = tf.nn.relu(conv2 + layer2_biases)
        
        # P2 input 8 x 8
        pool2 = tf.nn.max_pool(biases2, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        
        # F1 input 4 x 4
        shape = pool2.get_shape().as_list()
        reshape = tf.reshape(pool2, [shape[0], shape[1] * shape[2]* shape[3]]) # flatten => 28x28x1=784
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        
        return tf.matmul(hidden, layer4_weights) + layer4_biases
    
    # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
    
    # Prediction for the training, validation, and the test
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))
    
    
num_steps = 2001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict)
        
        if step % 500 == 0:
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
    


Initialized
Minibatch loss at step 0: 2.961719
Minibatch accuracy: 12.5%
Validation accuracy: 13.3%
Minibatch loss at step 50: 1.449486
Minibatch accuracy: 43.8%
Validation accuracy: 58.8%
Minibatch loss at step 100: 0.842380
Minibatch accuracy: 81.2%
Validation accuracy: 67.9%
Minibatch loss at step 150: 0.637184
Minibatch accuracy: 81.2%
Validation accuracy: 66.2%
Minibatch loss at step 200: 1.527062
Minibatch accuracy: 50.0%
Validation accuracy: 72.4%
Minibatch loss at step 250: 1.122714
Minibatch accuracy: 62.5%
Validation accuracy: 76.8%
Minibatch loss at step 300: 1.005300
Minibatch accuracy: 75.0%
Validation accuracy: 74.7%
Minibatch loss at step 350: 1.040233
Minibatch accuracy: 62.5%
Validation accuracy: 77.7%
Minibatch loss at step 400: 0.885031
Minibatch accuracy: 75.0%
Validation accuracy: 78.2%
Minibatch loss at step 450: 1.285821
Minibatch accuracy: 68.8%
Validation accuracy: 75.8%
Minibatch loss at step 500: 0.528767
Minibatch accuracy: 81.2%
Validation accuracy: 80.3%
M

#### The next version of the net uses dropout and learning rate decay:

In [63]:
batch_size = 16
patch_size = 5 # size of window
depth = 16  # num of filters
depth_2 = 32
num_hidden = 64
beta_regul = 1e-3
drop_out = 0.5


graph = tf.Graph()
with graph.as_default():
    
    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size,num_labels))
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    global_step = tf.Variable(0)
    
    # Variables.
    layer1_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev = 0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    
    layer2_weights = tf.Variable(
        tf.truncated_normal([patch_size, patch_size, depth, depth_2], stddev = 0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth_2]))
    
    size3 = ((image_size - patch_size + 1) // 2 - patch_size + 1) // 2
    
    layer3_weights = tf.Variable(
        tf.truncated_normal([size3 * size3 * depth_2, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    layer4_weights = tf.Variable(tf.truncated_normal([num_hidden, num_hidden], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    layer5_weights = tf.Variable(tf.truncated_normal([num_hidden, num_labels], stddev=0.1))
    layer5_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
    
    # Model
    def model(data, keep_prob):
        # C1 input 28 x 28
        conv1  = tf.nn.conv2d(data,layer1_weights, [1,1,1,1],padding='VALID')
        biases1 = tf.nn.relu(conv1 + layer1_biases)
        
        # P1 input 24 x 24
        pool1 = tf.nn.max_pool(biases1, [1,2,2,1], [1,2,2,1], padding='VALID')
        
        # C2 input 12 x 12
        conv2 = tf.nn.conv2d(pool1, layer2_weights, [1,1,1,1], padding='VALID')
        biases2 = tf.nn.relu(conv2 + layer2_biases)
        
        # P2 input 8 x 8
        pool2 = tf.nn.max_pool(biases2, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        
        # F1 input 4 x 4
        shape = pool2.get_shape().as_list()
        reshape = tf.reshape(pool2, [shape[0], shape[1] * shape[2]* shape[3]]) # flatten => 28x28x1=784
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        
        # Dropout 1
        drop1 = tf.nn.dropout(hidden, keep_prob)
        hidden1 = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        drop2 = tf.nn.dropout(hidden1, keep_prob)
        
        return tf.matmul(drop2, layer5_weights) + layer5_biases
    
    # Training computation.
    logits = model(tf_train_dataset, drop_out)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf_train_labels, logits = logits))
    
    # Optimizer.
#     optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
    
    # Optimizer.
    learning_rate = tf.train.exponential_decay(0.05, global_step, 1000, 0.85, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    
    # Prediction for the training, validation, and the test
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset,1.0))
    test_prediction = tf.nn.softmax(model(tf_test_dataset,1.0))
    
    
num_steps = 2001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict = feed_dict)
        
        if step % 50 == 0:
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
    

ValueError: Dimensions must be equal, but are 1568 and 512 for 'MatMul' (op: 'MatMul') with input shapes: [16,1568], [512,64].