# Neural Network for MNIST Problem

The following document contains generalized code to create a fully connected network to solve the MNIST Problem.

### Tunable Features:

Any number of hidden layers

Any size for the hidden layers

Any function for hidden layer/output layer (sigmoid, softmax, tanh, and ReLU are given)

Minibatch gradient descent

Learning rate (for both biases and synapses)

Momentum

Max-norm regularization

Dropout on input and hidden layers

Synapse size (Synapses are instantiated to random values in [-1,1])


### Sources 
https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf - Tuning hyperparameters and structure of the final network, dropout tips

https://www.tensorflow.org/versions/master/tutorials/mnist/pros/index.html#deep-mnist-for-experts - 10 nodes as output rather than 1

http://iamtrask.github.io/2015/07/28/dropout/ - Implementing dropout to NN

http://iamtrask.github.io/2015/07/12/basic-python-network/ - Original struture of the network

https://en.wikipedia.org/ Regarding functions

## Get and Setup Test Data

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn import cross_validation
from sklearn.metrics import accuracy_score

# Retrieve the test data from the filesystem
data = pd.read_csv("Kaggle Competition MINST train.csv")
labels = data['label']
data = data.drop('label', axis=1)
data = data.div(255)

# Retrieve labels for the test data
target = pd.read_csv("Kaggle Competition MINST train revised target.csv")
target = target.drop("Unnamed: 0", axis = 1)

# Split the training data so that I can analyze testing error
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
 data, target, test_size=0.25, random_state=0)

# Get test data in label form
test_data_labels = np.zeros(len(test_target), dtype=np.int8)
test_target_arr = np.array(test_target)
for i in range(len(test_target)):
    test_data_labels[i] = test_target_arr[i].argmax()

# Get the train data in label form
train_data_labels = np.zeros(len(train_target), dtype=np.int8)
train_target_arr = np.array(train_target)
for i in range(len(train_target)):
    train_data_labels[i] = train_target_arr[i].argmax()    
    
num_attributes = len(train_data.columns)

In [2]:
# rectifier function
def rectifier(x, deriv=False):
    flag = x > 0
    if(deriv==True):
        return 1 if flag else 0  #1 / (1 + np.exp(-x))
    return x if flag else 0  #np.log(1 + np.exp(x))
rectifier = np.vectorize(rectifier)

# softmax function
def softmax(x, deriv=False):
    if(deriv==True):
        return 1
    exp_scores = np.exp(x)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# tanh function
def tanh(x,deriv=False):
    if(deriv==True):
        return 1 - np.power(x,2)
    return np.tanh(x)

# sigmoid/logistic function
def sigmoid(x,deriv=False):
    if(deriv==True):
        return x*(1-x)
    return 1/(1+np.exp(-x))

In [3]:
# Neural network architecture parameters
hidden_layer_funct = sigmoid
output_layer_funct = softmax

input_layer_size = num_attributes
num_hidden_layers = 3
hidden_layer_size = 1024
output_layer_size = 10

In [15]:
# Hyperparameters for learning
mini_batch_size = 100
training_steps = 10
epsilon = 0.0004
momentum = 0.90
synapse_size = 1
bias_size = 1
max_norm = True
c = 10
hidden_dropout, input_dropout, do_dropout = (0.5, 0.80, True)
if do_dropout:
    epsilon *= 7
if do_dropout:
    momentum = 0.95

In [16]:
# Synapse/bias instantiation
synapse = []
synapse_deltas = []
biases = []
bias_deltas = []
prev_test_error = 1.0

# Input-Hidden synapse
if(hidden_layer_funct == sigmoid or hidden_layer_funct == tanh):
    synapse.append(2 * np.random.random((input_layer_size, hidden_layer_size)) - 1)
if(hidden_layer_funct == rectifier):
    synapse.append(np.ones((input_layer_size, hidden_layer_size)))
synapse_deltas.append(np.zeros((input_layer_size, hidden_layer_size)))
biases.append(np.zeros(hidden_layer_size))
bias_deltas.append(np.zeros(hidden_layer_size))

# Hidden-Hidden synapse
for layer in range(1, num_hidden_layers):
    if(hidden_layer_funct == sigmoid or hidden_layer_funct == tanh):
        synapse.append(2 * np.random.random((hidden_layer_size, hidden_layer_size)) - 1)
    if(hidden_layer_funct == rectifier):
        synapse.append(np.ones((hidden_layer_size, hidden_layer_size)))
    synapse_deltas.append(np.zeros((hidden_layer_size, hidden_layer_size)))
    biases.append(np.zeros(hidden_layer_size))
    bias_deltas.append(np.zeros(hidden_layer_size))

# Hidden-Output synapse
synapse.append(2 * np.random.random((hidden_layer_size, output_layer_size)) - 1)
synapse_deltas.append(np.zeros((hidden_layer_size, output_layer_size)))

synapse = map(lambda(x): x*synapse_size, synapse)

def pred(input):
    current_layer = input
    for layer in range(num_hidden_layers):
        current_layer = hidden_layer_funct(np.dot(current_layer, synapse[layer])
                                            + biases[layer])
        
    output = output_layer_funct(np.dot(current_layer, synapse[num_hidden_layers]) + biases[num_hidden_layers])
    
    answer = np.zeros(len(output), dtype = np.int8)
    for i in range(len(output)):
        answer[i] = output[i].argmax()
    return answer

## Train the Network

In [17]:
# Multiple training runs Stoichiastic training of the neural network
for iteration in xrange(training_steps):
    
    # Create a set of minibatches for training
    all_inds = np.array(train_data.index)
    random.shuffle(all_inds)
    batch_inds = [all_inds[i:i+mini_batch_size] for i in xrange(0,
                                                len(all_inds), mini_batch_size)]

    # train on each batch
    for inds in batch_inds:
        X = np.array(train_data.ix[inds])
        y = np.array(train_target.ix[inds])

        layers = []
        layer_deltas = []
        # Input layer
        layers.append(X)
        if(do_dropout):
            layers[0] *= np.random.binomial([np.ones(layers[0].shape)],
                          input_dropout)[0]
            
        # Hidden Layers
        for layer in range(num_hidden_layers):
            layers.append(hidden_layer_funct(np.dot(layers[layer], synapse[layer])
                                                                  + biases[layer]))
            layer_deltas.append(np.zeros((hidden_layer_size, mini_batch_size)))
            if(do_dropout):
                layers[layer + 1] *= np.random.binomial([np.ones(layers[layer+1].shape)],
                                      hidden_dropout)[0]     
                
        # Output Layer
        layers.append(output_layer_funct(np.dot(layers[num_hidden_layers], synapse[num_hidden_layers])))
        layer_deltas.append(np.zeros((output_layer_size, mini_batch_size)))
        
        # Backpropogation: Layer contribution to errors
        layer_deltas[-1] = (layers[-1] - y) * output_layer_funct(layers[-1], True)
        for layer in range(2, num_hidden_layers + 2):
            layer_deltas[-layer] = layer_deltas[-(layer - 1)].dot(
                                    synapse[-(layer - 1)].T) * hidden_layer_funct(
                                    layers[-layer], True)
            
        # Update synapses
        for index in range(len(synapse)):
            synapse_deltas[index] = layers[index].T.dot(
                                     layer_deltas[index]) * epsilon + momentum*synapse_deltas[index]
            synapse[index] -= synapse_deltas[index]
            
        # Update biases
        for index in range(len(biases)):
            bias_deltas[index] = np.ones(mini_batch_size).dot(
                                  layer_deltas[index]) * epsilon + momentum*bias_deltas[index]
            biases[index] -= bias_deltas[index]
        
        # Find magnitude of neuron vectors and preform max-normalization
        if(max_norm):
            for i in range(len(synapse)):
                syn_mag = np.linalg.norm(synapse[i], axis = 0)
                if(any(syn_mag > c)):
                    synapse[i] = synapse[i] / (syn_mag / c)

# Scale down the synapses to accomadate using the network without dropout
if do_dropout:
    synapse[0] *= input_dropout
    for i in range(1, len(synapse)):
        synapse[i] *= hidden_dropout
        
# Add a set of zeros for the out layer bias
biases.append(np.zeros(output_layer_size))
        
# Test the model at this iteration
prediction = pred(train_data)
train_error = 1 - accuracy_score(train_data_labels, prediction)
prediction = pred(test_data)
test_error = 1 - accuracy_score(test_data_labels, prediction)

print "Training Iteration: " , iteration
print "Training Error: ", train_error
print "Testing Error: " , test_error
print

prev_test_error = test_error

Training Iteration:  499
Training Error:  0.0
Testing Error:  0.0158095238095



0.0006 - 0.0008
0.0005 - 0.00019
0.0004 - 0.00009
0.0001 - 0.01

*20 - 0.8
*10 - 0.10
*8 - 0.0383
*7 - 0.03785
*6 - 0.038
*5 - 0.0409
*4 - 0.0459
*3 - 0.0507
*1 - 0.08

In [43]:
biases[0]

array([-0.37166659, -0.57607966, -0.83539794, ..., -0.72201505,
       -0.60238743, -0.06176436])

## Save the Network Synapses and Biases

In [24]:
# Save the synapses for later use
for index in range(len(synapse)):
    snaypse_i = pd.DataFrame(synapse[index])
    bias_i = pd.DataFrame(biases[index])
    snaypse_i.to_csv("%d-Layer %s-%s %d-%d-%d nodes syn%d.csv" % (
            num_hidden_layers, hidden_layer_funct.func_name, output_layer_funct.func_name, input_layer_size, hidden_layer_size, output_layer_size, index))
    bias_i.to_csv("%d-Layer %s-%s %d-%d-%d nodes bias%d.csv" % (
            num_hidden_layers, hidden_layer_funct.func_name, output_layer_funct.func_name, input_layer_size, hidden_layer_size, output_layer_size, index))

## Test the network

In [29]:
import pandas as pd
import numpy as np
import random
from sklearn import cross_validation
from sklearn.metrics import accuracy_score

# Retrieve the test data from the filesystem
data = pd.read_csv("Kaggle Competition MINST train.csv")
target = data['label']
data = data.drop('label', axis=1)
data = data.div(255)

# Split the training data so that I can analyze testing error (same split as training)
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
 data, target, test_size=0.25, random_state=0)

In [30]:
# Neural network architecture parameters
hidden_layer_funct = sigmoid
output_layer_funct = softmax

input_layer_size = num_attributes
num_hidden_layers = 3
hidden_layer_size = 1024
output_layer_size = 10

In [34]:
# Save the synapses for later use
synapse = []
biases = []
for index in range(num_hidden_layers + 1):
    syn_filename = "%d-Layer %s-%s %d-%d-%d nodes syn%d.csv" % (
       num_hidden_layers, hidden_layer_funct.func_name, output_layer_funct.func_name, input_layer_size, hidden_layer_size, output_layer_size, index)
    bias_filename = "%d-Layer %s-%s %d-%d-%d nodes bias%d.csv" % (
       num_hidden_layers, hidden_layer_funct.func_name, output_layer_funct.func_name, input_layer_size, hidden_layer_size, output_layer_size, index)
    synapse.append(np.array(pd.read_csv(syn_filename).drop("Unnamed: 0", axis = 1)))
    biases.append(np.array(pd.read_csv(bias_filename).drop("Unnamed: 0", axis = 1)))

# rectifier function
def rectifier(x, deriv=False):
    flag = x > 0
    if(deriv==True):
        return 1 if flag else 0  #1 / (1 + np.exp(-x))
    return x if flag else 0  #np.log(1 + np.exp(x))
rectifier = np.vectorize(rectifier)

# softmax function
def softmax(x, deriv=False):
    if(deriv==True):
        return 1
    exp_scores = np.exp(x)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# tanh function
def tanh(x,deriv=False):
    if(deriv==True):
        return 1 - np.power(x,2)
    return np.tanh(x)

# sigmoid/logistic function
def sigmoid(x,deriv=False):
    if(deriv==True):
        return x*(1-x)
    return 1/(1+np.exp(-x))

def pred(input):
    current_layer = input
    for layer in range(num_hidden_layers):
        current_layer = hidden_layer_funct(np.dot(current_layer, synapse[layer])
                                            + biases[layer].T)
        
    output = output_layer_funct(np.dot(current_layer, synapse[num_hidden_layers]) + biases[num_hidden_layers].T)
    
    answer = np.zeros(len(output), dtype = np.int8)
    for i in range(len(output)):
        answer[i] = output[i].argmax()
    return answer

In [35]:
# Test the model at this iteration
prediction = pred(train_data)
train_error = 1 - accuracy_score(train_data_labels, prediction)
prediction = pred(test_data)
test_error = 1 - accuracy_score(test_data_labels, prediction)

print "Training Error: ", train_error
print "Testing Error: " , test_error
print

prev_test_error = test_error

Training Iteration:  499
Training Error:  0.0
Testing Error:  0.0158095238095

