In [1]:
import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
# read mnist dataset
mnist = input_data.read_data_sets('data', one_hot=True, validation_size=0)

Extracting data\train-images-idx3-ubyte.gz
Extracting data\train-labels-idx1-ubyte.gz
Extracting data\t10k-images-idx3-ubyte.gz
Extracting data\t10k-labels-idx1-ubyte.gz


In [97]:
# parameters
learningRate = 0.001
batchSize = 128
displayStep = 20
numIters = 501

In [120]:
# tf placeholders & variables
X = tf.placeholder(tf.float32,[None,28,28,1], name='X')
Y_ = tf.placeholder(tf.float32,[None,10], name='labels')
pKeep = tf.placeholder(tf.float32)

# weight Variables
W1 = tf.Variable(tf.random_normal([7,7,1,6],stddev=0.1),name="W1") # stride=1, 28*28, padding=SAME, followed by max pool
W2 = tf.Variable(tf.random_normal([5,5,6,12],stddev=0.1),name="W2") # stride=1, 14*14, padding=SAME, followed by max pool 
W3 = tf.Variable(tf.random_normal([7*7*12,128],stddev=0.1),name="W3") # stride=1, 7*7, padding=SAME
W4 = tf.Variable(tf.random_normal([128,10],stddev=0.1), name="W4")

# biases
b1 = tf.Variable(tf.constant(0.1,tf.float32,[6]),name="b1")
b2 = tf.Variable(tf.constant(0.1,tf.float32,[12]),name="b2")
b3 = tf.Variable(tf.constant(0.1,tf.float32,[128]),name="b3")
b4 = tf.Variable(tf.constant(0.1,tf.float32,[10]),name="b4")

In [121]:
# CNN
Y1conv = tf.nn.conv2d(X,W1,strides=[1,1,1,1],padding='SAME')
Y1 = tf.nn.relu(Y1conv + b1)

Y2 = tf.nn.max_pool(Y1,[1,2,2,1],[1,2,2,1],padding='SAME')

Y3conv = tf.nn.conv2d(Y2, W2, strides=[1,1,1,1], padding='SAME')
Y3 = tf.nn.relu(Y3conv + b2)

Y3 = tf.nn.max_pool(Y3,[1,2,2,1],[1,2,2,1],'SAME')

YY = tf.reshape(Y3,[-1,7*7*12])
Y4 = tf.nn.relu(tf.matmul(YY, W3) + b3)

Ylogits = tf.matmul(Y4, W4) + b4
Y = tf.nn.softmax(Ylogits)

In [122]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y_, logits=Ylogits))
optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)

correct_prediction = tf.equal(tf.argmax(Y_,1),tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [123]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [126]:
testDict = {X:np.reshape(mnist.test.images,(-1,28,28,1)), Y_:mnist.test.labels, pKeep:1.0}
for i in range(numIters):
    batchX, batchY = mnist.train.next_batch(batchSize)
    trainDict = {X:np.reshape(batchX, (-1,28,28,1)), Y_:batchY, pKeep:0.8}
    sess.run(optimizer,feed_dict=trainDict)
    
    if i % displayStep == 0:
        train_acc, train_cost = sess.run([accuracy, loss], feed_dict=trainDict)
        print("Train: " + str(i) + ": accuracy:" + str(train_acc) + " loss: " + str(train_cost))

# moved it out of display loop because of memory issue        
test_acc, test_cost = sess.run([accuracy, loss], feed_dict=testDict)
print("Test accuracy:" + str(test_acc) + " loss: " + str(test_cost))

Train: 0: accuracy:0.0546875 loss: 2.39066
Train: 20: accuracy:0.664063 loss: 1.47133
Train: 40: accuracy:0.851563 loss: 0.554998
Train: 60: accuracy:0.898438 loss: 0.324867
Train: 80: accuracy:0.914063 loss: 0.245463
Train: 100: accuracy:0.890625 loss: 0.290564
Train: 120: accuracy:0.945313 loss: 0.17926
Train: 140: accuracy:0.953125 loss: 0.133514
Train: 160: accuracy:0.953125 loss: 0.155203
Train: 180: accuracy:0.960938 loss: 0.107094
Train: 200: accuracy:0.953125 loss: 0.146135
Train: 220: accuracy:0.914063 loss: 0.245952
Train: 240: accuracy:0.96875 loss: 0.126019
Train: 260: accuracy:0.953125 loss: 0.228144
Train: 280: accuracy:0.960938 loss: 0.117108
Train: 300: accuracy:0.953125 loss: 0.128259
Train: 320: accuracy:0.953125 loss: 0.124853
Train: 340: accuracy:0.984375 loss: 0.0860289
Train: 360: accuracy:0.96875 loss: 0.0900542
Train: 380: accuracy:0.976563 loss: 0.0670137
Train: 400: accuracy:0.992188 loss: 0.064476
Train: 420: accuracy:0.953125 loss: 0.141809
Train: 440: accur

### Understanding CNN

This project aims to understand CNNs and how the paramters impact the results.

* **Initialization of weights:**  
Initializing weights with standard deviation of 0.1 yielded better results when ran for 500 iterations. 
Train data accuracy hit over 95% in just ~200 iterations. While it struggled to reach 95% when weights were initialized with stddev = 1.0 for 500 iterations.  
Train: 500: accuracy:0.992188 loss: 0.0615625  
Test: 500: accuracy:0.9771 loss: 0.0689539  
Layer arch:  
`CONV (stride=1)` -> `RELU` -> `CONV(stride=2)` -> `RELU` -> `CONV (stride=1)` -> `RELU` -> `FULLY-CONNECTED` -> `RELU` -> `FULLY-CONNECTED` -> `SOFTMAX` -> `OUTPUT`  


* **Playing with Stride**  
Changing stride from 2 to 1 in second conv layer. Not much difference in results:  
Train: 500: accuracy:0.992188 loss: 0.0277827  
Test accuracy:0.9745 loss: 0.0767132  
Layer arch:  
`CONV (stride=1)` -> `RELU` -> `CONV(stride=1)` -> `RELU` -> `CONV (stride=1)` -> `RELU` -> `FULLY-CONNECTED` -> `RELU` -> `FULLY-CONNECTED` -> `SOFTMAX` -> `OUTPUT`  


* **Adding max pooling layer**  
Adding maxpool decreased the accuracy to approx 98.5%.  
Train: 500: accuracy:0.984375 loss: 0.0902737  
Test accuracy:0.9717 loss: 0.0938389  
Layer arch:  
`CONV (stride=1)` -> `RELU` -> `MAXPOOL` -> `CONV (stride=1)` -> `RELU` ->  `MAXPOOL` -> `FULLY-CONNECTED` -> `RELU` -> `FULLY-CONNECTED` -> `SOFTMAX` -> `OUTPUT` 

* **Adding dropout**  
Train: 500: accuracy:0.976563 loss: 0.105949  
Test accuracy:0.9743 loss: 0.0844915  
Dropout is applied while training data.


