In this notebook, we will generate a toy data set and train a simple neural network to predict the binary class label of each data point.

### Let's import some useful libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import theano
# By convention, the tensor submodule is loaded as T
import theano.tensor as T
# The below script for training a simple neural net on
# toy data is built on Theano
import simple_nn
from simple_nn import MLP

### Generate some toy data
Two Gaussian-distributed clusters in 3-D space

In [None]:
# Fix the seed so the toy data set is reproducible
np.random.seed(0)
N = 1000
# Draw a binary class label (0 or 1) for every data point
y = np.random.randint(0, 2, size=N)
# Cluster centres: class 0 sits at -mu and class 1 at +mu along every axis
mu = .7
means = np.tile([-mu, mu], (3, 1))
# Per-axis, per-class spread of each cluster; values lie in [1, 2).
# NOTE: these values scale standard-normal samples below, so they act as
# standard deviations even though the variable is named `covariances`.
covariances = 1 + np.random.random_sample((3, 2))
# Build the input matrix X one coordinate axis at a time, picking each
# point's spread and mean according to its class label
X = np.vstack([
    np.random.randn(N) * covariances[axis, y] + means[axis, y]
    for axis in range(3)
]).astype(theano.config.floatX)
# Cast the labels to the same float type as the inputs
y = y.astype(theano.config.floatX)

#### Plot the data

In [None]:
# 3-D scatter plot of the toy data, one colour per class
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
for set_label, axis_name in ((ax.set_xlabel, 'x'),
                             (ax.set_ylabel, 'y'),
                             (ax.set_zlabel, 'z')):
    set_label(axis_name, fontsize=14)
# Label 0 gets the colour value 0; every other label gets yellow ('y')
colors = ['y' if label != 0 else 0 for label in y]
# Each column of X is one (x, y, z) point; draw the points one at a time
for (x0, x1, x2), c in zip(X.T, colors):
    ax.scatter3D(x0, x1, x2, c=c, s=40, edgecolor='k', marker='o', zdir='z')

#### Let's look at a 2-D representation to get a better feel for how the two classes overlap

In [None]:
# 2-D view of the data using only the first two rows of X (x and y)
fig = plt.figure()
plt.xlabel('x', fontsize=14)
plt.ylabel('y', fontsize=14)
# One scatter call per point, coloured by the existing `colors` list
for (x0, x1), c in zip(X[:2].T, colors):
    plt.scatter(x0, x1, c=c, s=40)

### Build a simple neural network with 1 hidden layer
First specify layer sizes and instantiate a multi-layer perceptron class object ("multi-layer perceptron" may not be an accurate description of the network we are constructing but we won't get bogged down in those details in this tutorial!)

In [None]:
# Layer sizes, listed from input to output:
#   * input layer  - the dimensionality of the training data (rows of X)
#   * hidden layer - twice the size of the input layer (derived from the
#                    input size rather than hard-coded, so it stays correct
#                    if the data dimensionality changes)
#   * output layer - one unit, since this is a binary classification task
# To add more layers, insert further sizes into this list.
n_features = X.shape[0]
layer_sizes = [n_features, 2 * n_features, 1]
# Initial parameter values, one entry per layer
W_init = []
b_init = []
activations = []
for n_input, n_output in zip(layer_sizes[:-1], layer_sizes[1:]):
    # Small random weights: standard-normal draws scaled by 1/sqrt(n_input)
    W_init.append(np.random.randn(n_output, n_input) / np.sqrt(n_input))
    # Biases start at 1
    b_init.append(np.ones(n_output))
    # Sigmoid activation for every layer
    activations.append(T.nnet.sigmoid)
# Create an instance of the MLP class from the initial parameters
mlp = MLP(W_init, b_init, activations)

#### Create symbolic variables for input and target and specify learning rate

In [None]:
# Learning rate, later handed to simple_nn.gradient_updates
learning_rate = 0.01
# Symbolic Theano vector for the desired output
mlp_target = T.vector(name='mlp_target')
# Symbolic Theano matrix for the MLP input
mlp_input = T.matrix(name='mlp_input')

#### Create functions for computing cost, training the network and computing output for a given input
Note: `cross_entropy` is a method of the `MLP` class

In [None]:
# Symbolic cross-entropy cost of the network for a given input and target
cost = mlp.cross_entropy(mlp_input, mlp_target)
# Parameter updates produced by simple_nn.gradient_updates from the cost,
# the network parameters and the learning rate
sgd_updates = simple_nn.gradient_updates(cost, mlp.params, learning_rate)
# Compiled training step: returns the cost and applies the updates
train = theano.function(inputs=[mlp_input, mlp_target],
                        outputs=cost,
                        updates=sgd_updates)
# Compiled function returning the network output for a given input
mlp_output = theano.function(inputs=[mlp_input],
                             outputs=mlp.output(mlp_input))

#### Specify mini-batch size and number of epochs to train for

In [None]:
# Mini-batch size (data points per training step) and number of epochs
mb_size, n_epochs = 100, 400

#### Vstack the training data and labels so that when we shuffle the data, labels stick with their corresponding inputs

In [None]:
# Stack the labels as one extra row beneath the inputs so that each column
# holds a data point together with its label
training_data = np.vstack((X, y.reshape(1, -1)))

#### Train the network using stochastic gradient descent, monitoring training cost and training accuracy at each epoch
Typically we would hold out a test set to see how well the trained model generalizes to new data but since this is a toy example for the purpose of illustration, we won't bother measuring test cost/accuracy.

In [None]:
# Compile the full-data-set cost once, instead of building and evaluating a
# fresh symbolic graph on every epoch
compute_cost = theano.function([mlp_input, mlp_target], cost)
# Number of training points (columns of X)
n = X.shape[1]
# Per-epoch metrics over the whole training set, and per-mini-batch costs
training_cost = []
training_accuracy = []
mb_costs = []
for epoch in range(n_epochs):
    # Shuffle the columns (data points) in place; shuffling the transposed
    # view keeps every label with its corresponding inputs
    np.random.shuffle(training_data.T)
    # Walk over the shuffled data one mini-batch at a time
    for k in range(0, n, mb_size):
        mini_batch = training_data[:, k:k + mb_size]
        # The last row holds the labels, all rows above it hold the inputs
        inputs, labels = mini_batch[:-1, :], mini_batch[-1, :]
        # Train network parameters on the mini-batch
        mb_costs.append(train(inputs, labels))
    # Get the current cost taken across the training set
    current_cost = compute_cost(X, y)
    training_cost.append(current_cost)
    # We can compute the accuracy by thresholding the output and computing
    # the proportion of points whose class matches the ground truth class.
    # The already-compiled forward pass `mlp_output` is reused here.
    current_output = mlp_output(X)
    accuracy = np.mean((current_output > .5) == y)
    training_accuracy.append(accuracy)

    # Report progress every 10th epoch
    if epoch % 10 == 0:
        print('Cost: {:.3f}, Accuracy: {:.3f}'.format(float(current_cost), accuracy))

#### How did training accuracy and training cost change over epochs?

In [None]:
# Overlay both per-epoch training curves on a single set of axes
plt.figure(figsize=(12, 10))
for curve, curve_label in ((training_accuracy, 'Training Accuracy'),
                           (training_cost, 'Training Cost')):
    plt.plot(range(len(curve)), curve, label=curve_label)
plt.xlabel('Epoch', fontsize=14)
# loc=3 places the legend in the lower-left corner
plt.legend(loc=3)
plt.grid()

#### Let's plot another 2-D representation of the data using the star marker to show data points whose class is correctly predicted by our model and the circle marker to show incorrect predictions

In [None]:
# 1.0 where the thresholded network output matches the label, 0.0 otherwise;
# [0] takes the first row of the comparison result
correct_predictions = ((current_output > .5) == y).astype(theano.config.floatX)[0]
# Star marker for correct predictions, circle marker for incorrect ones
markers = ['*' if hit == 1. else 'o' for hit in correct_predictions]
# Same colour coding as the earlier plots: colour value 0 for label 0,
# yellow for every other label
colors = [0 if label == 0 else 'y' for label in y]
plt.figure(figsize=(10, 10))
plt.xlabel('x', fontsize=14)
plt.ylabel('z', fontsize=14)
plt.grid()
# NOTE: the stray `fig.add_subplot()` call that used to sit here was dropped.
# `fig` still referred to a figure created in an earlier cell, not the figure
# opened above, so the call could not affect this plot.
# Plot the x and z coordinates (rows 0 and 2 of X), one point at a time
for x0, x2, c, m in zip(X[0, :], X[2, :], colors, markers):
    plt.scatter(x0, x2, c=c, marker=m, s=60)