Intro2TF
==========

(Reference: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/udacity)

### Preprocessing

These are all the modules we'll be using later. Make sure you can import them before proceeding further.

In [1]:
from __future__ import print_function, division
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

First, we load in the data. For how this data file was generated, please refer to https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/1_notmnist.ipynb.

In [2]:
pickle_file = 'notMNIST.pickle'

# NOTE: unpickling executes arbitrary code, so only load a pickle file that
# you generated yourself or otherwise trust.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Pull the six arrays out of the loaded dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as a quick sanity check.
for split_name, data, targets in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, data.shape, targets.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float one-hot encodings.

In [3]:
image_size = 28  # side length of the square input images, in pixels
num_labels = 10  # number of target classes


def reformat(dataset, labels):
    """Flatten the images and one-hot encode the labels, both as float32.

    Args:
        dataset: array that can be reshaped to rows of
            ``image_size * image_size`` values (one row per image).
        labels: 1-D array of integer class ids.

    Returns:
        A ``(flat_images, one_hot_labels)`` pair of float32 arrays; row ``i``
        of ``one_hot_labels`` has a 1.0 in column ``labels[i]`` and 0.0
        elsewhere.
    """
    flat_images = dataset.reshape(-1, image_size * image_size).astype(np.float32)
    # Broadcasting a row of class ids against a column of labels yields a
    # boolean matrix: 0 -> [True, False, ...], 1 -> [False, True, ...].
    class_ids = np.arange(num_labels)
    one_hot_labels = np.equal(class_ids, labels[:, None]).astype(np.float32)
    return flat_images, one_hot_labels
# Replace every split in place with its flattened / one-hot encoded version.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the new shapes.
for split_name, data, targets in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, data.shape, targets.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


Finally, we define some utility functions.

In [4]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose top-scoring class matches the label.

    Both arguments are 2-D, one row per example; the class of a row is the
    index of its largest entry (so one-hot labels work directly).
    """
    predicted_class = np.argmax(predictions, axis=1)
    true_class = np.argmax(labels, axis=1)
    num_correct = np.sum(predicted_class == true_class)
    return 100.0 * num_correct / predictions.shape[0]

### Multinomial Logistic Regression

We're going to train a multinomial logistic regression model using mini-batch stochastic gradient descent (with the Adagrad optimizer).

TensorFlow works like this:
* First you describe the computation that you want to see performed: what the inputs, the variables, and the operations look like. These get created as nodes over a computation graph. This description is all contained within the block below:

      with tf.Graph().as_default() as graph:
          ...

* Then you can run the operations on this graph as many times as you want by calling `session.run()`, passing it the outputs you want to fetch from the graph; their values are returned to you. This runtime operation is all contained in the block below:

      with tf.Session(graph=graph) as session:
          ...

Let's build and run the computation graph corresponding to our model.

In [5]:
# settings
in_ndim = image_size * image_size  # number of input features (flattened pixels)
out_ndim = num_labels              # one logit per class

batch_size = 128   # examples per training step; also fixes the placeholder shapes
num_steps = 10001  # number of training steps executed by the run loop


# graph: a single affine layer followed by a softmax
with tf.Graph().as_default() as graph:
    # inputs
    # Training minibatches are fed at run time through the placeholders; the
    # validation and test sets are baked into the graph as constants.
    X = tf.placeholder(tf.float32, shape=[batch_size, in_ndim])
    Y = tf.placeholder(tf.float32, shape=[batch_size, out_ndim])
    X_val = tf.constant(valid_dataset)
    X_test = tf.constant(test_dataset)


    # parameters
    W_out = tf.Variable(tf.truncated_normal([in_ndim, out_ndim], stddev=0.01))
    b_out = tf.Variable(tf.zeros([out_ndim]))


    # model
    def model(X):
        """Return the unnormalised class scores (logits) for the batch ``X``."""
        # output layer
        logits = tf.matmul(X, W_out) + b_out
        return logits


    # outputs (the same parameters are shared by all three)
    logits = model(X)
    logits_val = model(X_val)
    logits_test = model(X_test)


    # optimizer
    # Pass logits/labels by keyword: TensorFlow >= 1.0 rejects positional
    # arguments for this op, and the keywords also make the order explicit.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
    optimizer = tf.train.AdagradOptimizer(0.05).minimize(loss)


    # predictions (class probabilities)
    predictions = tf.nn.softmax(logits)
    predictions_val = tf.nn.softmax(logits_val)
    predictions_test = tf.nn.softmax(logits_test)


# run
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()

    # Batch start indices wrap around this bound so that every slice of the
    # training set holds exactly batch_size rows.
    wrap = train_dataset.shape[0] - batch_size
    for step in range(num_steps):
        start = (step * batch_size) % wrap
        stop = start + batch_size
        batch_data = train_dataset[start:stop, :]
        batch_labels = train_labels[start:stop, :]

        # One optimizer step; also fetch the batch predictions and loss.
        feed = {X: batch_data, Y: batch_labels}
        _, batch_pred, batch_loss = session.run(
            [optimizer, predictions, loss], feed_dict=feed)

        # Only report progress every 1000 steps.
        if step % 1000 != 0:
            continue
        val_acc = accuracy(predictions_val.eval(), valid_labels)
        batch_acc = accuracy(batch_pred, batch_labels)
        print('Step {}: Acc_val: {:.1f}% (Acc_batch: {:.1f}%, loss: {:.5f})'.format(
            step, val_acc, batch_acc, batch_loss))

    # Final evaluation on the held-out test set.
    test_acc = accuracy(predictions_test.eval(), test_labels)
    print('Acc_test: {:.1f}%'.format(test_acc))

Step 0: Acc_val: 44.2% (Acc_batch: 10.2%, loss: 2.29059)
Step 1000: Acc_val: 82.5% (Acc_batch: 79.7%, loss: 0.68959)
Step 2000: Acc_val: 82.7% (Acc_batch: 89.8%, loss: 0.52833)
Step 3000: Acc_val: 82.9% (Acc_batch: 85.9%, loss: 0.63319)
Step 4000: Acc_val: 83.0% (Acc_batch: 87.5%, loss: 0.55271)
Step 5000: Acc_val: 83.0% (Acc_batch: 87.5%, loss: 0.51461)
Step 6000: Acc_val: 82.9% (Acc_batch: 78.9%, loss: 0.75281)
Step 7000: Acc_val: 83.1% (Acc_batch: 81.2%, loss: 0.65789)
Step 8000: Acc_val: 83.0% (Acc_batch: 75.8%, loss: 0.98212)
Step 9000: Acc_val: 83.1% (Acc_batch: 84.4%, loss: 0.67761)
Step 10000: Acc_val: 83.1% (Acc_batch: 79.7%, loss: 0.67248)
Acc_test: 90.0%


### Deep Neural Network

In [6]:
# settings
in_ndim = image_size * image_size  # number of input features (flattened pixels)
out_ndim = num_labels              # one logit per class

beta = 0.001        # weight of the L2 penalty added to the loss
decay_steps = 1000  # learning-rate decay interval, in optimizer steps
decay_rate = 0.98   # factor applied to the learning rate at each interval
keep_prob = 0.95    # probability of keeping an activation when dropout is on

batch_size = 128   # examples per training step; also fixes the placeholder shapes
num_steps = 10001  # number of training steps executed by the run loop


# graph: three fully-connected ReLU layers (128 -> 64 -> 32) and a softmax output
with tf.Graph().as_default() as graph:
    # miscellaneous variables
    # Step counter, incremented by the optimizer and read by the
    # learning-rate schedule; excluded from training.
    global_step = tf.Variable(0, trainable=False)


    # inputs
    # Training minibatches are fed at run time through the placeholders; the
    # validation and test sets are baked into the graph as constants.
    X = tf.placeholder(tf.float32, shape=[batch_size, in_ndim])
    Y = tf.placeholder(tf.float32, shape=[batch_size, out_ndim])
    X_val = tf.constant(valid_dataset)
    X_test = tf.constant(test_dataset)


    # parameters
    # NOTE(review): the recorded run of this cell stays at ~10% accuracy for
    # the first few thousand steps; the small stddev=0.01 initialisation may
    # be a factor -- worth trying a fan-in-scaled init. TODO confirm.
    ndim1 = 128
    W1 = tf.Variable(tf.truncated_normal(shape=[in_ndim, ndim1], stddev=0.01))
    b1 = tf.Variable(tf.zeros([ndim1]))

    ndim2 = 64
    W2 = tf.Variable(tf.truncated_normal(shape=[ndim1, ndim2], stddev=0.01))
    b2 = tf.Variable(tf.zeros([ndim2]))

    ndim3 = 32
    W3 = tf.Variable(tf.truncated_normal(shape=[ndim2, ndim3], stddev=0.01))
    b3 = tf.Variable(tf.zeros([ndim3]))

    W_out = tf.Variable(tf.truncated_normal([ndim3, out_ndim], stddev=0.01))
    b_out = tf.Variable(tf.zeros([out_ndim]))


    # model
    def model(X, dropout=False):
        """Return the logits of the network for the batch ``X``.

        When ``dropout`` is true, dropout with ``keep_prob`` is applied to the
        output of every hidden layer (training); otherwise the activations
        are passed through unchanged (evaluation).
        """
        # hidden layer 1: fully-connected
        X1 = tf.nn.relu(tf.matmul(X, W1) + b1)
        if dropout:
            X1 = tf.nn.dropout(X1, keep_prob)

        # hidden layer 2: fully-connected
        X2 = tf.nn.relu(tf.matmul(X1, W2) + b2)
        if dropout:
            X2 = tf.nn.dropout(X2, keep_prob)

        # hidden layer 3: fully-connected
        X3 = tf.nn.relu(tf.matmul(X2, W3) + b3)
        if dropout:
            X3 = tf.nn.dropout(X3, keep_prob)

        # output layer
        logits = tf.matmul(X3, W_out) + b_out
        return logits


    # outputs (dropout only on the training path; parameters are shared)
    logits = model(X, dropout=True)
    logits_val = model(X_val)
    logits_test = model(X_test)


    # optimizer
    # L2 penalty over every weight matrix and bias vector, scaled by beta.
    regularizer = beta * (tf.nn.l2_loss(W1) + tf.nn.l2_loss(b1) +
                          tf.nn.l2_loss(W2) + tf.nn.l2_loss(b2) +
                          tf.nn.l2_loss(W3) + tf.nn.l2_loss(b3) +
                          tf.nn.l2_loss(W_out) + tf.nn.l2_loss(b_out))
    # Pass logits/labels by keyword: TensorFlow >= 1.0 rejects positional
    # arguments for this op, and the keywords also make the order explicit.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)) + regularizer
    # Staircase schedule: starts at 0.05 and is multiplied by decay_rate
    # once every decay_steps optimizer steps.
    learning_rate = tf.train.exponential_decay(0.05, global_step, decay_steps, decay_rate, staircase=True)
    optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step)


    # predictions (class probabilities)
    predictions = tf.nn.softmax(logits)
    predictions_val = tf.nn.softmax(logits_val)
    predictions_test = tf.nn.softmax(logits_test)


# run
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()

    # Batch start indices wrap around this bound so that every slice of the
    # training set holds exactly batch_size rows.
    wrap = train_dataset.shape[0] - batch_size
    for step in range(num_steps):
        start = (step * batch_size) % wrap
        stop = start + batch_size
        batch_data = train_dataset[start:stop, :]
        batch_labels = train_labels[start:stop, :]

        # One optimizer step; also fetch the batch predictions and loss.
        feed = {X: batch_data, Y: batch_labels}
        _, batch_pred, batch_loss = session.run(
            [optimizer, predictions, loss], feed_dict=feed)

        # Only report progress every 1000 steps.
        if step % 1000 != 0:
            continue
        val_acc = accuracy(predictions_val.eval(), valid_labels)
        batch_acc = accuracy(batch_pred, batch_labels)
        print('Step {}: Acc_val: {:.1f}% (Acc_batch: {:.1f}%, loss: {:.5f})'.format(
            step, val_acc, batch_acc, batch_loss))

    # Final evaluation on the held-out test set.
    test_acc = accuracy(predictions_test.eval(), test_labels)
    print('Acc_test: {:.1f}%'.format(test_acc))

Step 0: Acc_val: 10.0% (Acc_batch: 12.5%, loss: 2.30689)
Step 1000: Acc_val: 10.0% (Acc_batch: 10.2%, loss: 2.30576)
Step 2000: Acc_val: 10.0% (Acc_batch: 7.0%, loss: 2.30571)
Step 3000: Acc_val: 10.0% (Acc_batch: 10.2%, loss: 2.30418)
Step 4000: Acc_val: 10.0% (Acc_batch: 6.2%, loss: 2.30514)
Step 5000: Acc_val: 61.4% (Acc_batch: 60.2%, loss: 1.16410)
Step 6000: Acc_val: 83.1% (Acc_batch: 79.7%, loss: 0.76877)
Step 7000: Acc_val: 84.7% (Acc_batch: 80.5%, loss: 0.64766)
Step 8000: Acc_val: 85.4% (Acc_batch: 78.9%, loss: 0.78825)
Step 9000: Acc_val: 86.4% (Acc_batch: 84.4%, loss: 0.61684)
Step 10000: Acc_val: 87.0% (Acc_batch: 84.4%, loss: 0.55245)
Acc_test: 93.7%
