Practical4: Iris Predictions using a Fully-Connected Network
=========

Author:
Jake Dailey

**Purpose:** Train a fully-connected neural network to predict Iris classes using SGD

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
import shutil
import time

In [None]:
# Path of the raw Iris CSV file; handed to datasplitter() further down.
filename = "iris.data.csv"

## Streaming in and splitting example/label pairs into training, validation and test sets

In [None]:
def datasplitter(filename):
    """Randomly split the rows of a CSV file into train/validation/test files.

    Every non-empty row of ``filename`` is assigned to one split with
    probabilities 0.6 / 0.2 / 0.2 (train / validation / test) and written to
    its own single-row CSV file, e.g. ``iristrain/iristrain0.csv``.  Newline,
    carriage-return and single-quote characters are stripped from each field.

    The ``iristrain``, ``irisvalid`` and ``iristest`` directories are deleted
    and recreated on every call, so the result of an earlier split is lost.

    Args:
        filename: Path of the CSV file to split.

    Returns:
        Tuple ``(train_rows, valid_rows, test_rows)`` holding the number of
        rows written to each split.
    """
    split_names = ('iristrain', 'irisvalid', 'iristest')
    split_probs = [.6, .2, .2]
    rows_written = [0, 0, 0]

    # Start from empty output directories so files from a previous split
    # cannot be mistaken for part of this one.
    for name in split_names:
        if os.path.exists(name):
            shutil.rmtree(name)
        os.makedirs(name)

    # newline='' is what the csv module asks for on both the reading and the
    # writing side; without it the writer's '\r\n' terminator can be
    # translated a second time (yielding '\r\r\n' on Windows).
    with open(filename, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # Blank lines carry no example.
                continue

            cleaned = [field.replace('\n', '').replace('\r', '').replace("'", '')
                       for field in row]

            # One multinomial draw per row picks the destination split.
            split = int(np.argmax(np.random.multinomial(1, split_probs)))
            name = split_names[split]
            out_path = '{0}/{0}{1}.csv'.format(name, rows_written[split])
            with open(out_path, 'w', newline='') as out_file:
                csv.writer(out_file, delimiter=',').writerow(cleaned)
            rows_written[split] += 1

    return tuple(rows_written)

## Function to format inputs

#### Continuous valued predictors and one-hot encoded labels

In [None]:
def read_file_format(filename_queue, possible_labels):
    """Parse one CSV line from the queue into a feature tensor and a one-hot label.

    Args:
        filename_queue: Queue of file names that a ``tf.TextLineReader`` reads
            lines from.
        possible_labels: String tensor of class names; the position of the
            name matching the line's last column becomes the "hot" index.

    Returns:
        ``(example, label)``: the five float columns of the line stacked into
        one tensor, and the one-hot (0/1) encoding of the sixth, string column
        with depth ``possible_labels.shape[0]``.
    """
    line_reader = tf.TextLineReader()
    _, csv_line = line_reader.read(filename_queue)

    # Column layout of a line: five float32 fields followed by the class name.
    column_defaults = [tf.constant([], dtype=tf.float32) for _ in range(5)]
    column_defaults.append(tf.constant([], dtype=tf.string))
    columns = tf.decode_csv(csv_line, record_defaults=column_defaults)

    example = tf.stack(columns[:5])

    # tf.where gives the coordinates of the matching class name; taking the
    # first match and one-hot encoding it yields a single-row matrix, whose
    # only row is the label vector.
    first_match = tf.where(tf.equal(possible_labels, columns[5]))[0]
    one_hot_rows = tf.one_hot(first_match, depth = possible_labels.shape[0],
                              on_value = 1, off_value = 0)
    return example, one_hot_rows[0]

## Function to format, queue and read inputs in batches

In [None]:
def input_pipeline(filenames, possible_labels, batch_size = 3, num_epochs = None, evaluation = False):
    """Build a queue-based pipeline that yields shuffled (example, label) batches.

    Args:
        filenames: List of CSV file paths to read examples from.
        possible_labels: String tensor of class names, forwarded to
            ``read_file_format`` for one-hot encoding.
        batch_size: Number of examples per batch.
        num_epochs: How many times to cycle through ``filenames``; passed
            straight to ``tf.train.string_input_producer``.
        evaluation: Currently unused; kept so existing callers keep working.

    Returns:
        ``(example_batch, label_batch)`` tensors produced by
        ``tf.train.shuffle_batch``.
    """
    # Queue the files the caller asked for.  This previously read the global
    # ``train_set`` regardless of the argument, so a pipeline built for the
    # validation files silently served training examples instead.
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=True)

    example, label = read_file_format(filename_queue, possible_labels)

    # min_after_dequeue defines how big a buffer we will randomly sample
    #   from -- bigger means better shuffling but slower start up and more
    #   memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    #   determines the maximum we will prefetch.  Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    min_after_dequeue = 10
    capacity = min_after_dequeue + 3 * batch_size
    example_batch, label_batch = tf.train.shuffle_batch(
        [example, label], batch_size=batch_size, capacity=capacity,
        min_after_dequeue=min_after_dequeue)

    return example_batch, label_batch

### [Tensorboard](https://www.tensorflow.org/get_started/summaries_and_tensorboard) is a powerful tool for monitoring training and better understanding the inner workings of our model

### Here, we'll define a helper function for creating layer activation summaries for Tensorboard:

In [None]:
def _activation_summary(x):
    """Attach TensorBoard summaries describing the values of tensor ``x``.

    Registers a histogram tagged ``<name>/activations`` and a scalar tagged
    ``<name>/sparsity`` holding ``tf.nn.zero_fraction(x)``, where ``<name>``
    is ``x.name`` with every ':', '(', ')' and ' ' replaced by '_'.

    Args:
        x: Tensor to summarise.
    """
    # Map each of the four unwanted characters to an underscore in one pass.
    summary_prefix = x.name.translate(str.maketrans(':() ', '____'))

    tf.summary.histogram(summary_prefix + '/activations', x)
    tf.summary.scalar(summary_prefix + '/sparsity', tf.nn.zero_fraction(x))

## Specifying the predictive function we're looking to optimize

#### Here, we'll do: linear => sigmoid => linear => softmax (the log is taken later, inside the loss function)

In [None]:
def make_prediction1(X):
    """Build the sigmoid network and return its class probabilities for ``X``.

    Architecture: linear (4 -> 3) => sigmoid => linear (3 -> 3) => softmax.
    All weights and biases are initialised from ``tf.random_normal`` and an
    activation summary is recorded after every stage.

    Args:
        X: Float tensor with four features per row.

    Returns:
        Tensor with one row per input row and three columns that sum to one.
    """
    with tf.variable_scope('sigmoid1'):
        hidden_w = tf.Variable(tf.random_normal([4, 3]), name='weights', trainable=True)
        hidden_b = tf.Variable(tf.random_normal([3]), name='bias', trainable=True)
        hidden_pre = tf.matmul(X, hidden_w) + hidden_b
        _activation_summary(hidden_pre)

        hidden_act = tf.sigmoid(hidden_pre)
        _activation_summary(hidden_act)

    with tf.variable_scope('softmax2'):
        out_w = tf.Variable(tf.random_normal([3, 3]), name='weights', trainable=True)
        out_b = tf.Variable(tf.random_normal([3]), name='bias', trainable = True)
        scores = tf.matmul(hidden_act, out_w) + out_b
        _activation_summary(scores)

        # Hand-written softmax.  The largest score in the whole batch is
        # subtracted before exponentiating to keep exp() from overflowing;
        # the transposes let the per-row sums broadcast across the classes.
        exp_scores_t = tf.transpose(tf.exp(scores - tf.reduce_max(scores)))
        row_totals = tf.reduce_sum(tf.exp(scores - tf.reduce_max(scores)), -1)
        probabilities = tf.transpose(tf.divide(exp_scores_t, row_totals))
        _activation_summary(probabilities)

    return probabilities

#### Also implementing the made-up "ReQu" unit; our model is now: linear => ReQu => linear => softmax (the log is again taken inside the loss function)

In [None]:
def make_prediction2(X):
    """Build the "ReQu" network and return its class probabilities for ``X``.

    Architecture: linear (4 -> 3) => ReQu => linear (3 -> 3) => softmax, where
    ReQu(z) = max(0, z) ** 2.  All weights and biases are initialised from
    ``tf.random_normal`` and an activation summary is recorded after every
    stage.

    Args:
        X: Float tensor with four features per row.

    Returns:
        Tensor with one row per input row and three columns that sum to one.
    """
    with tf.variable_scope('requ'):
        hidden_w = tf.Variable(tf.random_normal([4, 3]), name='weights', trainable=True)
        hidden_b = tf.Variable(tf.random_normal([3]), name='bias', trainable=True)
        hidden_pre = tf.matmul(X, hidden_w) + hidden_b
        _activation_summary(hidden_pre)

        # Rectified quadratic unit: clamp negatives to zero, then square.
        hidden_act = tf.square(tf.maximum(0., hidden_pre))
        _activation_summary(hidden_act)

    with tf.variable_scope('softmax2'):
        out_w = tf.Variable(tf.random_normal([3, 3]), name='weights', trainable=True)
        out_b = tf.Variable(tf.random_normal([3]), name='bias', trainable = True)
        scores = tf.matmul(hidden_act, out_w) + out_b
        _activation_summary(scores)

        # Hand-written softmax.  The largest score in the whole batch is
        # subtracted before exponentiating to keep exp() from overflowing;
        # the transposes let the per-row sums broadcast across the classes.
        exp_scores_t = tf.transpose(tf.exp(scores - tf.reduce_max(scores)))
        row_totals = tf.reduce_sum(tf.exp(scores - tf.reduce_max(scores)), -1)
        probabilities = tf.transpose(tf.divide(exp_scores_t, row_totals))
        _activation_summary(probabilities)

    return probabilities

#### Note the use of variable_scope so we don't have to name new tf.Variable()'s for every layer; rather we can just keep calling them weights and biases and their "scope" will be limited to their specific use in each layer

## Specifying our loss function
> #### And giving ourselves Tensorboard summaries to monitor change in loss during SGD

In [None]:
def calculate_loss(logits, labels):
    """Return the total loss after adding the batch's negative log-likelihood.

    Despite the parameter name, the log is applied directly to ``logits``;
    the training cells in this notebook pass the softmax output of the
    network here.

    Args:
        logits: Predicted class scores, one row per example.
        labels: One-hot labels with the same layout as ``logits``.

    Returns:
        Tensor named 'total_loss': the sum of everything in the 'losses'
        collection, including the term added by this call.
    """
    # The small constant keeps log() finite when a prediction is exactly 0.
    log_scores = tf.log(logits + 1e-10)
    per_example = tf.reduce_sum(tf.multiply(labels, log_scores), axis=1)
    nll = -tf.reduce_mean(per_example)

    tf.add_to_collection('losses', nll)
    return tf.add_n(tf.get_collection('losses'), name='total_loss')

#### Here, borrowing a TF function which adds a smoothed loss, to reduce noise in our loss plot

In [None]:
def _add_loss_summaries(total_loss):
    """Add raw and smoothed scalar summaries for every loss term.

    Covers each tensor in the 'losses' collection plus ``total_loss``.  The
    smoothed value is an exponential moving average with decay 0.9.

    Args:
        total_loss: Total loss tensor, as returned by ``calculate_loss``.

    Returns:
        The op that updates the moving averages of the losses.
    """
    smoother = tf.train.ExponentialMovingAverage(0.9, name='avg')
    tracked = tf.get_collection('losses') + [total_loss]
    update_averages_op = smoother.apply(tracked)

    for loss_term in tracked:
        # The unsmoothed value gets the '_raw_' suffix; the moving average
        # keeps the loss's own (sanitised) name.
        tag = loss_term.name.replace(":", "_")
        tf.summary.scalar(tag + '_raw_', loss_term)
        tf.summary.scalar(tag, smoother.average(loss_term))

    return update_averages_op

## Specifying a training operation

> #### We're going to decay our learning rate to avoid jumping over potentially lower error rates in our parameter space. Also using tf.train.MomentumOptimizer() to mitigate the same concern.

### Ultimately, our loss function will be:

## $L(X, \Theta) = -(y_1log(z^4_1) + y_2log(z^4_2) + y_3log(z^4_3))$
## $z^4_j = \frac{e^{z^3_j}}{e^{z^3_1}+e^{z^3_2}+e^{z^3_3}}$
## $z^3_j = z^2_j\Theta^2_j$
## $z^2_j = \frac{1}{1+e^{-z^1_j}}$
## $z^1_j = z^0_i\Theta^1_j$
## $z^0_i = x_i$

### We can find the Jacobian of the loss w.r.t. the data as:

## $\frac{\partial L}{\partial x_i} = -(\frac{y_1}{z^4_1}\frac{\partial z^4_1}{\partial x_i} + \frac{y_2}{z^4_2}\frac{\partial z^4_2}{\partial x_i} + \frac{y_3}{z^4_3}\frac{\partial z^4_3}{\partial x_i})$
#### Now, just focusing in on z's from one class for simplicity:
# $\frac{\partial z^4_j}{\partial x_i} = \frac{e^{z^3_j}\frac{\partial z^3_j}{\partial x_i} \centerdot (e^{z^3_j} + e^{z^3_{j+1}}+e^{z^3_{j+2}})-e^{z^3_j} \centerdot (e^{z^3_j}\frac{\partial z^3_j}{\partial x_i} +e^{z^3_{j+1}}\frac{\partial z^3_{j+1}}{\partial x_i}+e^{z^3_{j+2}}\frac{\partial z^3_{j+2}}{\partial x_i})}{(e^{z^3_j}+e^{z^3_{j+1}}+e^{z^3_{j+2}})^2}$
# $\frac{\partial z^3_j}{\partial x_i} = \Theta^2_j \centerdot \frac{\partial z^2_j}{\partial x_i}$
# $\frac{\partial z^2_j}{\partial x_i} = \frac{e^{-z^1_j}}{(1+e^{-z^1_j})^2} \centerdot \frac{\partial z^1_j}{\partial x_i}$
# $\frac{\partial z^1_j}{\partial x_i} = \Theta^1_j \centerdot \frac{\partial z^0}{\partial x_i}$
# $\frac{\partial z^0}{\partial x_i} = 1$

## Rather than code the entire gradient given above, we can use Tensorflow's compute_gradients() method:

In [None]:
def train(total_loss, global_step):
    """Build the op that performs one training step.

    Hyperparameters are read from module-level globals:
    ``num_examples_per_train_epoch``, ``batch_size``, ``num_epochs_to_decay``,
    ``initial_learning_rate``, ``learning_rate_decay_factor``, ``momentum``
    and ``moving_average_decay``.

    Args:
        total_loss: Loss tensor to minimise.
        global_step: Step-counter variable handed to the learning-rate
            schedule, the optimizer and the variable moving averages.

    Returns:
        A no-op named 'train' that only runs after the gradients have been
        applied and the moving averages of the trainable variables updated.
    """
    # How many optimisation steps pass between learning-rate drops.
    steps_per_epoch = num_examples_per_train_epoch / batch_size
    steps_per_decay = int(steps_per_epoch * num_epochs_to_decay)

    # Staircase schedule: the rate is multiplied by the decay factor once
    # every `steps_per_decay` steps instead of shrinking continuously.
    learning_rate = tf.train.exponential_decay(
        initial_learning_rate, global_step,
        steps_per_decay, learning_rate_decay_factor, staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    # Smoothed-loss bookkeeping plus the loss summaries.
    loss_averages_op = _add_loss_summaries(total_loss)

    # Gradients are computed only after the loss averages have been updated.
    with tf.control_dependencies([loss_averages_op]):
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)

    apply_gradients_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Histograms of every trainable variable ...
    for variable in tf.trainable_variables():
        tf.summary.histogram(variable.op.name, variable)

    # ... and of every gradient that exists.
    for gradient, variable in grads_and_vars:
        if gradient is not None:
            tf.summary.histogram(variable.op.name + '/gradients', gradient)

    # Maintain moving averages of all trainable variables.
    variable_smoother = tf.train.ExponentialMovingAverage(
        moving_average_decay, global_step)
    variable_averages_op = variable_smoother.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradients_op, variable_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op

In [None]:
def evaluate_accuracy(y_hat, y_):
    """Return the fraction of rows whose predicted class matches the label.

    The class of a row is the index of its largest entry.  The result is also
    recorded as the scalar summary 'validation_accuracy'.

    Args:
        y_hat: Predicted class scores, one row per example.
        y_: One-hot labels with the same layout as ``y_hat``.

    Returns:
        Scalar float32 tensor holding the accuracy.
    """
    predicted_class = tf.argmax(y_hat, 1)
    true_class = tf.argmax(y_, 1)
    hits = tf.equal(predicted_class, true_class)

    # Booleans become 0.0 / 1.0, so their mean is the share of correct rows.
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))
    tf.summary.scalar('validation_accuracy', accuracy)
    return accuracy

# With all of our functions defined, let's get to work!
## First, we'll split our data using the function we defined above

In [None]:
# Write the per-row train/validation/test files and keep the number of rows in each split.
train_len, valid_len, test_len = datasplitter(filename=filename)

### Now, because we know our dataset is small enough to read, we can test our input_pipeline() to ensure our model will be fed data correctly:

In [None]:
# Paths of the single-row CSV files written by datasplitter(), one list per split.
train_set = list(map('iristrain/iristrain{0}.csv'.format, range(train_len)))
valid_set = list(map('irisvalid/irisvalid{0}.csv'.format, range(valid_len)))
test_set = list(map('iristest/iristest{0}.csv'.format, range(test_len)))

In [None]:
# Read every training file back with pandas so the TensorFlow pipeline can be
# checked against an independent reader.  The frames are concatenated in one
# call: DataFrame.append() no longer exists in current pandas and re-copied
# the whole frame on every iteration.
debug = pd.concat(
    [pd.read_csv('iristrain/iristrain{0}.csv'.format(i), header = None)
     for i in range(train_len)])

# Keep the columns labelled 0 through 4 and drop the class-name column.
# .loc replaces the removed .ix indexer; with header=None the column labels
# are integers, so the label slice :4 includes column 4.
debug_ex = debug.loc[:, :4]

#### We'll just do one "epoch" of reading in data and append the fed-in data to a dataframe to test our pipeline

In [None]:
# Smoke test of input_pipeline(): stream the training files through the queue
# once and collect everything it yields into `pipe_ex`.
num_epochs = 1
batch_size = 3

# Empty frame with the same columns as debug_ex; the fetched batches are
# stacked underneath it once the run is over.
pipe_ex = debug_ex.iloc[0:0,:]
batch_frames = []

with tf.Graph().as_default():
    unique_labels = tf.constant(["Iris-setosa", "Iris-versicolor", "Iris-virginica"], dtype=tf.string)

    example_feed, labels_feed = input_pipeline(train_set, possible_labels = unique_labels,
                                               batch_size = batch_size, num_epochs = num_epochs)

    with tf.Session() as sess:
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)

        # Start the threads that populate the filename and example queues.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)

        step = 0
        try:
            while not coord.should_stop():
                try:
                    example_batch, label_batch = sess.run([example_feed, labels_feed])
                except (tf.errors.OutOfRangeError, tf.errors.InvalidArgumentError):
                    # The queue is exhausted once the requested epochs are done.
                    print('Done training for %d epochs, %d steps.' % (num_epochs, step))
                    break

                batch_frames.append(pd.DataFrame(example_batch))
                step += 1
        finally:
            # Always stop and join the queue threads, even when an unexpected
            # error interrupts the loop.  Leaving the `with` block closes the
            # session, so no explicit sess.close() is needed.
            coord.request_stop()
            coord.join(threads)

# One concat instead of repeated DataFrame.append(), which current pandas no
# longer provides.
pipe_ex = pd.concat([pipe_ex] + batch_frames)

#### Checking for alignment between both methods of reading data

In [None]:
# Compare column 0 as read by pandas with column 0 as delivered by the
# TensorFlow pipeline.  The pipeline shuffles, so both sides are sorted and
# cast to float before comparing; .loc replaces the removed .ix indexer.
debug_first = debug_ex.loc[:, 0].astype(float).sort_values().reset_index(drop=True)
pipe_first = pipe_ex.sort_values(0).reset_index(drop=True).loc[:, 0].astype(float)
debug_first.equals(pipe_first)

## Now, we'll set some of our model's hyperparameters as global variables to avoid passing arguments to all of the functions defined above:

In [None]:
# Hyperparameters and settings, read as module-level names by train() and by
# the training cells.  Plain assignments are enough here: a `global`
# statement only has an effect inside a function body.

# Number of training examples in one pass over the training split.
num_examples_per_train_epoch = train_len

# Passes over the training files before the input queue runs dry.
num_epochs = 10

# Examples per training batch.
batch_size = 3

# Decay of the moving averages kept for the trainable variables.
moving_average_decay = 0.9999

# Epochs between successive learning-rate drops.
num_epochs_to_decay = 5

# Factor the learning rate is multiplied by at each drop.
learning_rate_decay_factor = 0.001

# Learning rate at step 0.
initial_learning_rate = 0.01

# Momentum passed to tf.train.MomentumOptimizer.
momentum = 0.05

# Directory the TensorBoard event files are written to.
logdir = 'TF_Logs'

#### Note here I'm defining "logdir", which is the name of the folder I'd like my Tensorboard files to be written to. From a new command prompt, I can type,
#### tensorboard --logdir=TF_Logs
#### and navigate to the IP:port it provides to get a closer look at my model and the training procedure.

## Now, the moment we've all been waiting for: training!

### I'm going to construct a new tf.Graph() and fill it with all of the pieces I specified earlier on.
### Then, once I'm happy with how my graph is defined, I'll open a new tf.Session() and initialize all of the variables used throughout my pipeline
>#### Noteworthy: Tensorflow doesn't just use variables in the "model" as we usually understand it; instead, the entire pipeline is viewed as part of the graph, which allows us to make use of relatively flexible modules, but means we need to initialize variables only after they've all been specified to the graph but before they're used by any other functions

### Now, we can run our pipeline just as we did with the test pipeline
>#### Except now our examples will be passed to our training procedure and we can monitor training progress using merged summaries, written using writer below and read using Tensorboard!

In [None]:
# Train the sigmoid network on the training pipeline, then score it once on
# the validation pipeline.  The validation accuracy ends up in `acc`.
with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    unique_labels = tf.constant(["Iris-setosa", "Iris-versicolor", "Iris-virginica"], dtype=tf.string)

    example_feed, labels_feed = input_pipeline(train_set, possible_labels=unique_labels,
                                               batch_size=batch_size, num_epochs=num_epochs)

    # A single batch of valid_len examples, served once.
    valid_example_feed, valid_labels_feed = input_pipeline(
        valid_set, possible_labels=unique_labels,
        batch_size=valid_len, num_epochs=1)

    x = tf.placeholder(tf.float32, shape=[None, 4])
    y_ = tf.placeholder(tf.float32, shape = [None, 3])

    y_hat = make_prediction1(x)

    loss = calculate_loss(y_hat, y_)

    train_op = train(loss, global_step=global_step)

    accuracy = evaluate_accuracy(y_hat, y_)

    with tf.Session() as sess:
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        # Validation summaries go to their own sub-directory so TensorBoard
        # shows them as a separate run instead of mixing them into the
        # training curves.
        valid_writer = tf.summary.FileWriter(os.path.join(logdir, 'validation'))

        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)

        # Start the threads that populate the filename and example queues.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)

        step = 0
        try:
            while not coord.should_stop():
                try:
                    start_time = time.time()

                    example_batch, label_batch = sess.run([example_feed, labels_feed])
                    # The pipeline yields five numeric columns; drop the first
                    # so the remaining four match placeholder `x`.
                    example_batch = example_batch[:, 1:]

                    _, summary = sess.run([train_op, merged],
                                          feed_dict={x: example_batch,
                                                     y_: label_batch})
                    writer.add_summary(summary, global_step.eval())

                    step += 1
                    print(step)
                    duration = time.time() - start_time
                    print(duration)

                except (tf.errors.OutOfRangeError, tf.errors.InvalidArgumentError) as e:
                    # OutOfRangeError is the normal end of the input queue;
                    # say so explicitly when something else stopped training.
                    if isinstance(e, tf.errors.InvalidArgumentError):
                        print("Training loop ended by %s" % type(e).__name__)
                    print("Training complete, entering validation...")

                    print("Evaluating inputs...")
                    valid_examples, valid_labels = sess.run([valid_example_feed, valid_labels_feed])
                    valid_examples = valid_examples[:, 1:]

                    print("Calculating accuracy...")
                    acc = sess.run(accuracy, feed_dict={x: valid_examples,
                                                        y_: valid_labels})
                    print("Evaluating validation summary...")
                    summary = sess.run(merged,
                                       feed_dict={
                                           x: valid_examples, y_: valid_labels
                                       })
                    # Previously this summary was computed and then dropped.
                    valid_writer.add_summary(summary, global_step.eval())

                    print('Done training for %d epochs, %d steps.' % (num_epochs, step))
                    # When done, ask the threads to stop.
                    coord.request_stop()
        finally:
            # Stop and join the queue threads and flush the event files even
            # when an unexpected error interrupts training.  Leaving the
            # `with` block closes the session.
            coord.request_stop()
            coord.join(threads)
            writer.close()
            valid_writer.close()

### Checking what percent of validation examples we got right:

In [None]:
# Show the validation accuracy computed at the end of the training cell above.
acc

## Here, testing a second architecture with a "ReQu" unit, per practical4's requirements

In [None]:
# Train the ReQu network on the training pipeline, then score it once on the
# validation pipeline.  The validation accuracy ends up in `acc`.
with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    unique_labels = tf.constant(["Iris-setosa", "Iris-versicolor", "Iris-virginica"], dtype=tf.string)

    example_feed, labels_feed = input_pipeline(train_set, possible_labels=unique_labels,
                                               batch_size=batch_size, num_epochs=num_epochs)

    # A single batch of valid_len examples, served once.
    valid_example_feed, valid_labels_feed = input_pipeline(
        valid_set, possible_labels=unique_labels,
        batch_size=valid_len, num_epochs=1)

    x = tf.placeholder(tf.float32, shape=[None, 4])
    y_ = tf.placeholder(tf.float32, shape = [None, 3])

    y_hat = make_prediction2(x)

    loss = calculate_loss(y_hat, y_)

    train_op = train(loss, global_step=global_step)

    accuracy = evaluate_accuracy(y_hat, y_)

    with tf.Session() as sess:
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir, sess.graph)
        # Validation summaries go to their own sub-directory so TensorBoard
        # shows them as a separate run instead of mixing them into the
        # training curves.
        valid_writer = tf.summary.FileWriter(os.path.join(logdir, 'validation'))

        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)

        # Start the threads that populate the filename and example queues.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)

        step = 0
        try:
            while not coord.should_stop():
                try:
                    start_time = time.time()

                    example_batch, label_batch = sess.run([example_feed, labels_feed])
                    # The pipeline yields five numeric columns; drop the first
                    # so the remaining four match placeholder `x`.
                    example_batch = example_batch[:, 1:]

                    _, summary = sess.run([train_op, merged],
                                          feed_dict={x: example_batch,
                                                     y_: label_batch})
                    writer.add_summary(summary, global_step.eval())

                    step += 1
                    print(step)
                    duration = time.time() - start_time
                    print(duration)

                except (tf.errors.OutOfRangeError, tf.errors.InvalidArgumentError) as e:
                    # OutOfRangeError is the normal end of the input queue;
                    # say so explicitly when something else stopped training.
                    if isinstance(e, tf.errors.InvalidArgumentError):
                        print("Training loop ended by %s" % type(e).__name__)
                    print("Training complete, entering validation...")

                    print("Evaluating inputs...")
                    valid_examples, valid_labels = sess.run([valid_example_feed, valid_labels_feed])
                    valid_examples = valid_examples[:, 1:]

                    print("Calculating accuracy...")
                    acc = sess.run(accuracy, feed_dict={x: valid_examples,
                                                        y_: valid_labels})
                    print("Evaluating validation summary...")
                    summary = sess.run(merged,
                                       feed_dict={
                                           x: valid_examples, y_: valid_labels
                                       })
                    # Previously this summary was computed and then dropped.
                    valid_writer.add_summary(summary, global_step.eval())

                    print('Done training for %d epochs, %d steps.' % (num_epochs, step))
                    # When done, ask the threads to stop.
                    coord.request_stop()
        finally:
            # Stop and join the queue threads and flush the event files even
            # when an unexpected error interrupts training.  Leaving the
            # `with` block closes the session.
            coord.request_stop()
            coord.join(threads)
            writer.close()
            valid_writer.close()

### Testing validation accuracy on this second network

In [None]:
# Show the validation accuracy computed at the end of the ReQu training cell above.
acc