# Neural Network with TensorFlow

## Imports

In [17]:
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
from tensorflow.python.framework import ops

import os
# Expose only GPU 0 to CUDA so this notebook does not claim other GPUs on the machine.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Load the Data

In [18]:
# Load the MNIST data sets from the MNIST_data/ directory.
# one_hot=True requests one-hot encoded labels, which is what the [None, 10]
# label placeholder created in train() expects.
mnist_data = input_data.read_data_sets("MNIST_data/", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## Create the Model
A neural network is like logistic regression in the sense that first a linear function is calculated and then it is run through an activation function.  However, in a neural network, this linear/activation pair is chained together multiple times, with the result of the previous activation function feeding into the next linear function.  Each linear/activation pair is a layer in the neural network.


Again, the linear and activation functions are as follows:  
A linear function is first calculated.  
**z = xw + b**  
Where **w** is a tensor of weights, **x** is a tensor of features, and **b** is a tensor of biases.  


Then the output of the linear function is run through an activation function.  
**a = g(z)**  
Where **g()** is the activation function.


For this simple example we are assuming the ReLU activation function for the activation function of the inner layers.


All we have to do is chain together tensors that perform these two calculations for as many layers as the user wants.  The user should also be able to specify how many outputs each layer has.  We will do this by letting them specify a list of layer sizes, where the index of the list corresponds to the layer of the network, and the value at that index corresponds to the number of outputs in that layer.

In [19]:
def build_model(x_tensor, layer_sizes):
    """Build a fully connected network and return the logits of its last layer.

    Each entry of ``layer_sizes`` after the first adds one layer that computes
    ``z = a_prev . w + b``.  Every layer except the last one then applies ReLU
    to ``z``; the last layer's ``z`` is returned without an activation so the
    caller can feed it to a cost function that applies its own.

    Args:
        x_tensor: Input features.  Its second dimension must equal
            ``layer_sizes[0]`` so it can be multiplied by the first weights.
        layer_sizes: Sequence of layer widths.  Index 0 is the width of the
            input (no variables are created for it); every later index is the
            number of outputs of that layer.

    Returns:
        The linear output ``z`` of the final layer, with shape
        ``(batch, layer_sizes[-1])``.

    Raises:
        ValueError: If ``layer_sizes`` has fewer than two entries, because no
            layer with weights could be built and there would be no output.
    """
    if len(layer_sizes) < 2:
        raise ValueError(
            'layer_sizes must contain at least an input size and an output '
            'size, got {0!r}'.format(layer_sizes))

    last_layer = len(layer_sizes) - 1

    # The "activation" of layer 0 is simply the input itself.
    a_tensor = x_tensor

    # Walk over (inputs, outputs) size pairs; numbering starts at 1 so the
    # variable scopes are named Layer1, Layer2, ... as before.
    size_pairs = zip(layer_sizes[:-1], layer_sizes[1:])
    for layer, (prev_layer_size, layer_size) in enumerate(size_pairs, start=1):
        # reuse must be passed by keyword: the second positional parameter of
        # tf.variable_scope is default_name, so a positional tf.AUTO_REUSE
        # would be ignored.
        with tf.variable_scope('Layer{0}'.format(layer), reuse=tf.AUTO_REUSE):
            # Weights and biases for this layer (both Xavier-initialized).
            w_tensor = tf.get_variable(
                name='w',
                shape=(prev_layer_size, layer_size),
                initializer=tf.contrib.layers.xavier_initializer())
            b_tensor = tf.get_variable(
                name='b',
                shape=(1, layer_size),
                initializer=tf.contrib.layers.xavier_initializer())

            # The linear function for this layer.
            z_tensor = tf.matmul(a_tensor, w_tensor) + b_tensor

            if layer < last_layer:
                # Hidden layer: apply ReLU to produce the next layer's input.
                a_tensor = tf.nn.relu(z_tensor)

    return z_tensor

## Define the Cost
A measurement of how well the parameters fit the training values during training.  The goal is to minimize the difference between the predictions and the training values.


In TensorFlow, the last activation function is built into the cost.  Here, we are using the softmax activation which gives a probability of each output class being true.  All the probabilities sum to 1 for each example.


The cost function we are using is cross entropy, which measures the distance between the tensor of probabilities from the output of the softmax and the actual values, y.

In [20]:
def build_cost(z_tensor, y_tensor):
    """Return the mean softmax cross-entropy between logits and labels.

    Args:
        z_tensor: Un-activated output (logits) of the last network layer.
        y_tensor: Label tensor with the same shape as ``z_tensor``.

    Returns:
        A scalar tensor: the cross-entropy averaged over all examples.
    """
    with tf.variable_scope('CostFunction'):
        # Softmax is applied inside this op, which is why the model itself
        # returns raw logits.
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_tensor, logits=z_tensor)
        mean_loss = tf.reduce_mean(per_example_loss)

    return mean_loss

## Define the Optimizer
Choose which optimization algorithm to use.  This is the algorithm that adjusts the weights and biases each execution to bring the cost down.  TensorFlow comes with a good selection of pre-built optimizers.  We'll use Adam here.

In [21]:
def build_optimizer(cost_tensor, learning_rate):
    """Create the Adam training step that minimizes ``cost_tensor``.

    Args:
        cost_tensor: Scalar tensor to minimize.
        learning_rate: Learning rate handed to ``tf.train.AdamOptimizer``.

    Returns:
        The result of ``AdamOptimizer.minimize``; running it in a session
        performs one update of the trainable variables.
    """
    with tf.variable_scope('Optimizer'):
        adam = tf.train.AdamOptimizer(learning_rate)
        train_step = adam.minimize(cost_tensor)

    return train_step

## Define the Accuracy Measurement
The measurement of how accurate the model is at predicting outcomes.

In [22]:
def build_accuracy(y_hat_tensor, y_tensor):
    """Return the fraction of examples whose predicted class matches the label.

    Args:
        y_hat_tensor: Per-class scores for each example; the predicted class
            is the index of the largest value along axis 1.
        y_tensor: Labels with the same layout; the true class is the index of
            the largest value along axis 1.

    Returns:
        A scalar float32 tensor holding the accuracy of the batch.
    """
    with tf.variable_scope('AccuracyFunction'):
        predicted_class = tf.argmax(y_hat_tensor, axis=1)
        actual_class = tf.argmax(y_tensor, axis=1)

        # Bool hits become 1.0 / 0.0 so that their mean is the accuracy.
        hits = tf.cast(tf.equal(predicted_class, actual_class), tf.float32)
        return tf.reduce_mean(hits)

## Train Function
Put the pieces together to build the model, input features, and train it.

In [25]:
def train(layer_sizes, learning_rate=0.0001, iterations=1000, batch_size=100):
    """Build the network and train it on the module-level ``mnist_data``.

    The default graph is reset first, so every call starts from a fresh graph.
    Cost and accuracy of the current training batch are printed every 100
    iterations.  Nothing is returned, and the session is closed on exit.

    Args:
        layer_sizes: Layer widths passed to ``build_model``.
        learning_rate: Learning rate passed to ``build_optimizer``.
        iterations: Number of training steps to run.
        batch_size: Number of examples drawn from ``mnist_data.train`` per step.
    """
    ops.reset_default_graph()

    # Inputs: 784 features per example and 10 label values per example.
    x_tensor = tf.placeholder(tf.float32, [None, 784])
    y_tensor = tf.placeholder(tf.float32, [None, 10])

    # Assemble the graph: model -> cost -> optimizer, plus accuracy.
    z_tensor = build_model(x_tensor, layer_sizes)
    cost_tensor = build_cost(z_tensor, y_tensor)
    optimizer_tensor = build_optimizer(cost_tensor, learning_rate)
    accuracy_tensor = build_accuracy(z_tensor, y_tensor)

    fetches = [accuracy_tensor, cost_tensor, optimizer_tensor]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for step in range(iterations):
            iteration = step + 1
            batch_x, batch_y = mnist_data.train.next_batch(batch_size)
            feed = {x_tensor: batch_x, y_tensor: batch_y}
            accuracy, cost, _ = sess.run(fetches, feed_dict=feed)

            # Only report on every 100th iteration.
            if iteration % 100 != 0:
                continue
            print('Iteration {0} cost: {1}, accuracy: {2}'.format(iteration, cost, accuracy))

## Try it out!
With a neural network, the train accuracy is in the 99's for the MNIST dataset.  Much better than logistic regression.

In [26]:
%%time

# 784 input features, three hidden layers (100, 500 and 100 units), 10 outputs.
layer_sizes = [784, 100, 500, 100, 10]

# Train for 5000 steps on batches of 1000 examples with a learning rate of 0.01.
train(layer_sizes, learning_rate = 0.01, iterations=5000, batch_size=1000)

Iteration 100 cost: 0.08462731540203094, accuracy: 0.9779999852180481
Iteration 200 cost: 0.053787849843502045, accuracy: 0.9829999804496765
Iteration 300 cost: 0.04631044343113899, accuracy: 0.9850000143051147
Iteration 400 cost: 0.03487623482942581, accuracy: 0.9860000014305115
Iteration 500 cost: 0.033800847828388214, accuracy: 0.9860000014305115
Iteration 600 cost: 0.02459860034286976, accuracy: 0.9909999966621399
Iteration 700 cost: 0.05959990993142128, accuracy: 0.9860000014305115
Iteration 800 cost: 0.016436897218227386, accuracy: 0.9919999837875366
Iteration 900 cost: 0.014681417495012283, accuracy: 0.9959999918937683
Iteration 1000 cost: 0.010377823375165462, accuracy: 0.9980000257492065
Iteration 1100 cost: 0.0393538735806942, accuracy: 0.9900000095367432
Iteration 1200 cost: 0.036680907011032104, accuracy: 0.9890000224113464
Iteration 1300 cost: 0.021307144314050674, accuracy: 0.9950000047683716
Iteration 1400 cost: 0.008211578242480755, accuracy: 0.9959999918937683
Iteratio