### Neural Net Results

A 5-layer neural network reached a peak test accuracy of 73%, a 3% improvement over the best previous machine learning classifier. 

Configuration:
- Glorot normalization for initial weights
- Random batch indexing without replacement 
- ELU activation function, alpha=0.01 (the default of `rectify`, which `model` calls without overriding it)
- RMSprop optimization (learning rate 0.0001)
- Batch normalization (BN) inserted after each linear transform, before the non-linearity
- Minibatches of 60 on train
- 1,161 x 1000 x 1000 x 1000 x 5 network 
- Minibatches of 60 on test 


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
import sklearn.metrics as sk

import pandas as pd
from collections import Counter
import numpy as np
import nltk

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [2]:
# Load the data set from a local pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
modern = pd.read_pickle('data/5color_modern_no_name_hardmode.pkl')
# Count the rows per value of the `colors` column; as the last expression of
# the cell, the resulting Counter is displayed as the cell output.
Counter(modern.colors)

Counter({u'Black': 1576,
         u'Blue': 1573,
         u'Green': 1566,
         u'Red': 1575,
         u'White': 1584})

In [3]:
# Bag-of-words featurizer with scikit-learn's default settings.
vectorizer = CountVectorizer()

# One-hot encode the labels: one indicator column per distinct value of `colors`.
y = pd.get_dummies(modern.colors)

# Learn the vocabulary from the `text` column and build the term-count matrix.
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split below, so it also contains words that occur only in test rows.
X = vectorizer.fit_transform(modern.text)

# Split into train and test sets; the fixed random_state makes the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state=42)

# Convert everything to plain ndarrays (the feature matrices are densified
# first) so rows can be index-shuffled and sliced into minibatches later on.
xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

def shuffle(x, y):
    """Return x and y reordered by one shared random permutation.

    A single permutation of range(len(x)) is drawn from NumPy's global RNG
    and used to index both arrays, so row i of the shuffled x still lines
    up with row i of the shuffled y. The inputs are not modified.
    """
    n_rows = len(x)
    # Sampling n_rows indices without replacement yields a permutation.
    order = np.random.choice(n_rows, size=n_rows, replace=False)
    shuffled_x = x[order]
    shuffled_y = y[order]
    return shuffled_x, shuffled_y


# Report how many distinct tokens the vectorizer learned. The parenthesized
# single-argument form prints the same text under Python 2's print statement
# and Python 3's print function.
print("There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_)))

There are 1,161 words in the vocabulary.


In [37]:
%%time

""" 5-layer """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

# Theano random stream. Nothing in the visible part of this cell draws from it
# (model() has no dropout/noise step) -- presumably left over from the code
# this cell was adapted from; confirm before removing.
srng = RandomStreams()

def floatX(X):
    """Return X as a NumPy array cast to Theano's configured float dtype."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape):
    """Create a Theano shared weight matrix with Glorot-style uniform init.

    `shape` must be a 2-tuple (rows, cols). Entries are drawn uniformly from
    [-limit, limit) with limit = sqrt(6) / sqrt(rows + cols), using NumPy's
    global RNG, then cast with floatX and wrapped in theano.shared.
    """
    fan_in, fan_out = shape
    # Non-linearity-dependent factor. Original author's note: 0.25 for
    # sigmoid and 0.1 for softmax, 1.0 for tanh or ReLU.
    gain = 1.0
    # Full width of the sampling interval (2 * limit).
    interval_width = 2.0 * np.sqrt(6) / np.sqrt(fan_in + fan_out) * gain
    # random_sample is uniform on [0, 1); shift so it is centred on zero.
    centered = np.random.random_sample(shape) - 0.5
    return theano.shared(floatX(centered * interval_width))

def rectify(X, alpha=0.01):
    """Apply the ELU non-linearity element-wise to a Theano tensor.

    Despite the name, this is not a plain rectifier: it returns X where
    X > 0 and alpha * (exp(X) - 1) elsewhere. `alpha` scales the negative
    branch and defaults to 0.01.
    """
    # T.switch evaluates both branches for every element. Clamping the exp
    # argument to <= 0 leaves the selected values unchanged but keeps
    # exp() from overflowing for large positive X, which would otherwise
    # turn the (zeroed) gradient of the unselected branch into 0 * inf = NaN.
    negative_branch = alpha * (T.exp(T.minimum(X, 0.)) - 1)
    return T.switch(X > 0, X, negative_branch)

def softmax(X):
    """Row-wise softmax of a 2-D Theano tensor.

    The maximum of each row is subtracted before exponentiating so the
    largest exponent is exp(0); the result is mathematically unchanged.
    """
    # dimshuffle(0, 'x') turns the per-row vector into a column so it
    # broadcasts across the columns of X.
    row_max = X.max(axis=1).dimshuffle(0, 'x')
    exps = T.exp(X - row_max)
    row_sums = exps.sum(axis=1).dimshuffle(0, 'x')
    return exps / row_sums

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    """Build the Theano update list for RMSprop.

    For every parameter a shared accumulator (initialised to zeros of the
    parameter's shape) tracks an exponential moving average of the squared
    gradient with decay `rho`. The gradient is divided by
    sqrt(average + epsilon) and the parameter moves by `lr` times that.

    Returns a list of (shared_variable, new_expression) pairs, suitable for
    the `updates` argument of theano.function: for each parameter, first the
    accumulator update, then the parameter update.
    """
    gradients = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, gradients):
        # Multiplying the current value by 0. yields zeros with matching
        # shape and dtype.
        mean_sq = theano.shared(param.get_value() * 0.)
        mean_sq_next = rho * mean_sq + (1 - rho) * grad ** 2
        scaled_grad = grad / T.sqrt(mean_sq_next + epsilon)
        updates.extend([(mean_sq, mean_sq_next),
                        (param, param - lr * scaled_grad)])
    return updates

def model(X, w_h, g_h, bb_h, w_h2, g_h2, bb_h2,
          w_h3, g_h3, bb_h3, w_o, g_ho, bb_ho):
    """Build the symbolic forward pass: three hidden layers plus an output layer.

    Every layer is a matrix product followed by batch_normalization; the
    hidden layers then apply rectify(), the output layer applies softmax().
    For each layer, `w_*` is the weight matrix and `g_*` / `bb_*` are the
    gamma / beta arguments of its batch normalization.

    NOTE(review): the mean is taken over the current batch (axis 0), but
    `std` is passed as all ones (the variance is computed only to get the
    shape), so activations are centred and not divided by their standard
    deviation. The batch mean is used in every call, including prediction.

    Returns a 4-tuple: the batch-normalized pre-activations of the second
    hidden layer, the third hidden layer and the output layer, followed by
    the softmax class probabilities.
    """
    def _linear_bn(inputs, weights, gamma, beta):
        # Linear transform followed by the batch-mean normalization above.
        z = T.dot(inputs, weights)
        return batch_normalization(z, gamma=gamma, beta=beta,
                                   mean=z.mean((0,), keepdims=True),
                                   std=T.ones_like(z.var((0,), keepdims=True)),
                                   mode='high_mem')

    z1 = _linear_bn(X, w_h, g_h, bb_h)
    z2 = _linear_bn(rectify(z1), w_h2, g_h2, bb_h2)
    z3 = _linear_bn(rectify(z2), w_h3, g_h3, bb_h3)
    z_out = _linear_bn(rectify(z3), w_o, g_ho, bb_ho)

    return z2, z3, z_out, softmax(z_out)


# Symbolic float32 matrices: X is fed the feature rows of a minibatch and Y
# the matching one-hot label rows (see the train() calls in the loop below).
X = T.fmatrix()
Y = T.fmatrix()

# Rows per minibatch, used for both training and evaluation.
batch_size = 60

# Widths of the three hidden layers.
h1_size = 1000
h2_size = 1000
h3_size = 1000

# Per layer: weight matrix (w_*), batch-norm gamma initialised to ones (g_*)
# and batch-norm beta initialised to zeros (bb_*).
# Input layer width = vocabulary size.
w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_h3 = init_weights((h2_size, h3_size))
g_h3 = theano.shared(floatX(np.ones((h3_size))))
bb_h3 = theano.shared(floatX(np.zeros((h3_size))))

# Output layer width = number of one-hot label columns.
w_o = init_weights((h3_size, yTest.shape[1]))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

# Graph used for the training cost. model() contains no stochastic step, so
# despite the "noise_" prefix this graph is identical to the one built below.
noise_h, noise_h2, noise_h3, noise_py_x = model(X, w_h, g_h, bb_h, 
                                      w_h2, g_h2, bb_h2, 
                                       w_h3, g_h3, bb_h3, 
                                      w_o, g_ho, bb_ho)

# Graph used for prediction; shares the same parameters.
h, h2, h3, py_x = model(X, w_h, g_h, bb_h, 
                    w_h2, g_h2, bb_h2, 
                     w_h3, g_h3, bb_h3, 
                    w_o, g_ho, bb_ho)

# Predicted class = index of the largest softmax probability in each row.
y_x = T.argmax(py_x, axis=1)


# Mean categorical cross-entropy over the minibatch.
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
# Every weight matrix and batch-norm gamma/beta is trained.
params = [w_h, g_h, bb_h, w_h2, g_h2, bb_h2, 
           w_h3, g_h3, bb_h3, w_o, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

# train(): one RMSprop step on a minibatch, returns the batch cost.
# predict(): class indices for a minibatch. It runs the same model(), so the
# batch-norm means come from whichever batch is passed in.
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


# Train for 11 epochs and report train/test accuracy after each one.
#
# Only full minibatches of `batch_size` rows are used, for training and for
# evaluation alike; a trailing partial batch is skipped. The start offsets run
# up to len - batch_size inclusive, so when the row count is an exact multiple
# of batch_size the final full batch is included as well.
for i in range(11):

    for start in range(0, len(xTrain) - batch_size + 1, batch_size):
        end = start + batch_size
        # NOTE(review): this rebinds the module-level name `cost` from the
        # symbolic expression to the numeric cost of the latest minibatch.
        cost = train(xTrain[start:end], yTrain[start:end])

    # Reorder both splits so the next epoch (and the evaluation below) sees
    # differently composed minibatches.
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    # Per-batch boolean arrays: True where the predicted class index equals
    # the argmax of the one-hot label row.
    trr, tr = [], []
    for start in range(0, len(xTrain) - batch_size + 1, batch_size):
        end = start + batch_size
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start in range(0, len(xTest) - batch_size + 1, batch_size):
        end = start + batch_size
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    # The mean over all collected booleans is the accuracy. The parenthesized
    # print form gives the same output on Python 2 and Python 3.
    print("Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr)))

# Blank line after the final round.
print("")



Round: 0     Test: 0.61875        Train: 0.690306122449
Round: 1     Test: 0.686979166667 Train: 0.772278911565
Round: 2     Test: 0.703125       Train: 0.815816326531
Round: 3     Test: 0.7109375      Train: 0.842517006803
Round: 4     Test: 0.722916666667 Train: 0.873299319728
Round: 5     Test: 0.721875       Train: 0.883333333333
Round: 6     Test: 0.7171875      Train: 0.896088435374
Round: 7     Test: 0.731770833333 Train: 0.907482993197
Round: 8     Test: 0.716666666667 Train: 0.915986394558
Round: 9     Test: 0.7171875      Train: 0.925   
Round: 10    Test: 0.7265625      Train: 0.922789115646

Wall time: 34.1 s
