## Neural Network

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
import sklearn.metrics as sk

import pandas as pd
from collections import Counter
import numpy as np
import nltk

In [2]:
modern = pd.read_pickle('data/5color_modern_no_name_hardmode.pkl')
Counter(modern.colors)

Counter({u'Black': 1576,
         u'Blue': 1573,
         u'Green': 1566,
         u'Red': 1575,
         u'White': 1584})

After all the data munging the classes are still amazingly balanced.

## Lets single out blue and red for a binary classification


In [4]:
UG = modern.loc[modern['colors'].isin(['Blue', 'Red'])]

UG.reset_index(inplace=True)
UG.pop('index')

UG[['name', 'colors', 'cmc', 'text']].sample(6)

Unnamed: 0,name,colors,cmc,text
1709,Renegade Doppelganger,Blue,2.0,Whenever another creature enters the battlefie...
2782,Mercurial Pretender,Blue,5.0,You may have This enter the battlefield as a c...
2907,Bloodfire Enforcers,Red,4.0,This has first strike and trample as long as a...
3106,Coastal Discovery,Blue,4.0,Draw two cards. Awaken 4—{5}{1}
2635,Forgestoker Dragon,Red,6.0,Flying {1}{1}: This deals 1 damage to target c...
279,Hearth Kami,Red,2.0,"{1}, Sacrifice This: Destroy target artifact w..."


In [5]:
dummies = pd.get_dummies(UG.colors)
dummies.head()

Unnamed: 0,Blue,Red
0,1.0,0.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [6]:
vectorizer = CountVectorizer()

vec_X = vectorizer.fit_transform(UG['text'])

xTrain, xTest, yTrain, yTest = train_test_split(vec_X, dummies,
                                             random_state=42)

xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

# xTrain = xTrain.reshape(-1, 1, 1, 815)
# xTest = xTest.reshape(-1, 1, 1, 815)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

print "There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_))

(2361L, 815L)
(2361L, 2L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(2361L, 815L)
(2361L, 2L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
There are 815 words in the vocabulary.


In [69]:
import theano
from theano import tensor as T
import numpy as np
from math import sqrt


def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    normalizer = 2.0 * sqrt(6) / sqrt(h + w) * .2  #factors: correct for uni[0,1], glo, glo, softmax deriv
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) \
                                * normalizer))  #code for using Glorot init
    
def model(X, w):
    return T.nnet.softmax(T.dot(X, w))

def adaDelta(cost, params, eta=0.2, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        #calc g-squared
        gSq = theano.shared(p.get_value() * 0.)
        dwSq = theano.shared(p.get_value() * 0.)

        #exp smoothed g squared
        gSqNew = rho * gSq + (1 - rho) * g * g

        #calc dx-squared
        dw = eta * T.sqrt(dwSq + epsilon) * g / T.sqrt(gSq + epsilon)
        dwSqNew = rho * dwSq + (1 - rho) * dw * dw

        updates.append((dwSq, dwSqNew))
        updates.append((gSq, gSqNew))
        updates.append((p, p - dw))
    return updates

X = T.fmatrix()
Y = T.fmatrix()
# grad_list = theano.shared(np.array([0,0]), name='grad_list')

w = init_weights((815 , 2))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.1]]

train = theano.function(inputs=[X, Y], 
                        outputs=[cost, gradient], 
                        updates=update, 
                        allow_input_downcast=True)

predict = theano.function(inputs=[X], 
                          outputs=y_pred, 
                          allow_input_downcast=True)


for i in range(401):
# #     for start, end in zip(range(0, xTrain.shape[0], 128), 
# #                           range(128, xTrain.shape[0], 128)):
# #         cost, gradient = train(xTrain[start:end], yTrain[start:end])
    cost, gradient = train(xTrain, yTrain)
    if i % 30 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr


Round: 0  Test: 0.651842439644  Train: 0.629394324439
Round: 30  Test: 0.810673443456  Train: 0.831427361288
Round: 60  Test: 0.833545108005  Train: 0.849216433715
Round: 90  Test: 0.853875476493  Train: 0.861075815332
Round: 120  Test: 0.852604828463  Train: 0.866581956798
Round: 150  Test: 0.855146124524  Train: 0.873358746294
Round: 180  Test: 0.861499364676  Train: 0.879288437103
Round: 210  Test: 0.864040660737  Train: 0.885218127912
Round: 240  Test: 0.870393900889  Train: 0.889030072003
Round: 270  Test: 0.866581956798  Train: 0.892842016095
Round: 300  Test: 0.866581956798  Train: 0.894536213469
Round: 330  Test: 0.869123252859  Train: 0.894112664125
Round: 360  Test: 0.87166454892  Train: 0.895383312156
Round: 390  Test: 0.87166454892  Train: 0.895806861499


### All Five Vs All Five

And now the main event - simply comparing two colors was too easy. Five way classification of all the colors.

In [3]:
vectorizer = CountVectorizer()

y = pd.get_dummies(modern.colors)

X = vectorizer.fit_transform(modern.text)

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state=42)

xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

# xTrain = xTrain.reshape(-1, 1, 1, 815)
# xTest = xTest.reshape(-1, 1, 1, 815)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

print "There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_))

(5905L, 1161L)
(5905L, 5L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(5905L, 1161L)
(5905L, 5L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
There are 1,161 words in the vocabulary.


In [9]:
%%time

import theano
from theano import tensor as T
import numpy as np
from math import sqrt

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    normalizer = 2.0 * sqrt(6) / sqrt(h + w) * .2  #factors: correct for uni[0,1], glo, glo, softmax deriv
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) \
                                * normalizer))  #code for using Glorot init
    
def model(X, w):
    return T.nnet.softmax(T.dot(X, w))

def adaDelta(cost, params, eta=0.2, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        #calc g-squared
        gSq = theano.shared(p.get_value() * 0.)
        dwSq = theano.shared(p.get_value() * 0.)

        #exp smoothed g squared
        gSqNew = rho * gSq + (1 - rho) * g * g

        #calc dx-squared
        dw = eta * T.sqrt(dwSq + epsilon) * g / T.sqrt(gSq + epsilon)
        dwSqNew = rho * dwSq + (1 - rho) * dw * dw

        updates.append((dwSq, dwSqNew))
        updates.append((gSq, gSqNew))
        updates.append((p, p - dw))
    return updates

X = T.fmatrix()
Y = T.fmatrix()

w = init_weights((len(vectorizer.vocabulary_) , yTest.shape[1]))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.1]]

train = theano.function(inputs=[X, Y], 
                        outputs=[cost, gradient], 
                        updates=update, 
                        allow_input_downcast=True)

predict = theano.function(inputs=[X], 
                          outputs=y_pred, 
                          allow_input_downcast=True)

p_v1, p_t1, i1 = [], [], []
for i in range(7000):
    cost, gradient = train(xTrain, yTrain)
    if i % 50 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        p_v1 += [tr]
        p_t1 += [trr]
        i1 += [i]
        print 'Round:', i," Test:", tr, ' Train:', trr


Round: 0  Test: 0.228542407313  Train: 0.230482641829
Round: 50  Test: 0.475368207212  Train: 0.509398814564
Round: 100  Test: 0.501777552057  Train: 0.544623200677
Round: 150  Test: 0.523108176739  Train: 0.564267569856
Round: 200  Test: 0.540883697308  Train: 0.574767146486
Round: 250  Test: 0.557643473845  Train: 0.58899237934
Round: 300  Test: 0.57592686643  Train: 0.598306519898
Round: 350  Test: 0.585576434738  Train: 0.609652836579
Round: 400  Test: 0.590655154901  Train: 0.617781541067
Round: 450  Test: 0.594210259015  Train: 0.626248941575
Round: 500  Test: 0.599288979177  Train: 0.633700254022
Round: 550  Test: 0.605891315389  Train: 0.638441998307
Round: 600  Test: 0.608938547486  Train: 0.644199830652
Round: 650  Test: 0.611477907567  Train: 0.649280270957
Round: 700  Test: 0.613001523616  Train: 0.649788314987
Round: 750  Test: 0.614525139665  Train: 0.653175275191
Round: 800  Test: 0.618080243779  Train: 0.655884843353
Round: 850  Test: 0.621635347892  Train: 0.6591024555

Amazing results, almost equal to logistic regression at 5min. 

### 4-Layer Network

In [13]:
%%time

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.05*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(T.dot(X, w_h))

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2))

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x

X = T.fmatrix()
Y = T.fmatrix()

# w = init_weights((len(vectorizer.vocabulary_) , yTest.shape[1])) # old 
w_h = init_weights((len(vectorizer.vocabulary_), 600))
w_h2 = init_weights((600, 600))
w_o = init_weights((600, yTest.shape[1]))

noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, w_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, w_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

p_v2, p_t2, i2 = [], [], []
for i in range(200):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])
    if i%10 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        p_v2 += [tr]
        p_t2 += [trr]
        i2  += [i]
        print 'Round:', i," Test:", tr, ' Train:', trr
        

Round: 0  Test: 0.263585576435  Train: 0.28382726503
Round: 10  Test: 0.303707465719  Train: 0.310584250635
Round: 20  Test: 0.490096495683  Train: 0.498899237934
Round: 30  Test: 0.601320467242  Train: 0.63116003387
Round: 40  Test: 0.652615540884  Train: 0.693480101609
Round: 50  Test: 0.675977653631  Train: 0.732768839966
Round: 60  Test: 0.68969019807  Train: 0.756308213378
Round: 70  Test: 0.696292534281  Train: 0.775444538527
Round: 80  Test: 0.701371254444  Train: 0.78933107536
Round: 90  Test: 0.710512950736  Train: 0.806096528366
Round: 100  Test: 0.716607414931  Train: 0.817104149026
Round: 110  Test: 0.717623158964  Train: 0.830990685859
Round: 120  Test: 0.728796343321  Train: 0.844877222693
Round: 130  Test: 0.724225495175  Train: 0.854868755292
Round: 140  Test: 0.726764855256  Train: 0.86316680779
Round: 150  Test: 0.728796343321  Train: 0.871634208298
Round: 160  Test: 0.725749111224  Train: 0.878577476715
Round: 170  Test: 0.731335703403  Train: 0.886367485182
Round: 1

#### Strong signs of overfitting

Next steps:  

Leaky RELU swapped out for ELU, alpha = 1.0   
Train accuracy dropped by 14% (good)  
Test accuracy dropped by 3% (bad)   

Leaky RELU swapped out for ELU, alpha = .5  
Train accuracy dropped by 7% (good)  
Test accuracy dropped by 1% (bad)   

Leaky RELU  alpha = .1
RHO = .99
Train accuracy dropped by 7% (good)  
Test accuracy up by 1% (good)  

In [54]:
%%time

""" glorot 4-layer: dropout, dropout, dropout """

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))


def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, b_h, w_h2, b_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(T.dot(X, w_h) + b_h)

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2) + b_h2)

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x



X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((len(vectorizer.vocabulary_), 600))
b_h = theano.shared(floatX(np.zeros(600,)))
w_h2 = init_weights((600, 600))
b_h2 = theano.shared(floatX(np.zeros(600,)))
w_o = init_weights((600, yTest.shape[1]))
# b_values = numpy.zeros((600,), dtype=theano.config.floatX)

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, w_h2, b_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, b_h, w_h2, b_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, w_h2, b_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(301):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])

    if i%10 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.4281361097  Train: 0.435055038103
Round: 10  Test: 0.594718131031  Train: 0.644199830652
Round: 20  Test: 0.65718638903  Train: 0.71566469094
Round: 30  Test: 0.680548501778  Train: 0.755122777307
Round: 40  Test: 0.693245302184  Train: 0.781202370872
Round: 50  Test: 0.703402742509  Train: 0.798814563929
Round: 60  Test: 0.715083798883  Train: 0.815410668925
Round: 70  Test: 0.720162519045  Train: 0.834546994073
Round: 80  Test: 0.722701879126  Train: 0.846909398815
Round: 90  Test: 0.72625698324  Train: 0.859610499577
Round: 100  Test: 0.735906551549  Train: 0.867231160034
Round: 110  Test: 0.728288471305  Train: 0.875867908552
Round: 120  Test: 0.726764855256  Train: 0.88433530906
Round: 130  Test: 0.731335703403  Train: 0.890431837426
Round: 140  Test: 0.732859319451  Train: 0.898391193903
Round: 150  Test: 0.733875063484  Train: 0.904487722269
Round: 160  Test: 0.732859319451  Train: 0.908721422523
Round: 170  Test: 0.730827831386  Train: 0.914140558848
Round: 18

### Next, batch normalization.

In [17]:
""" glorot 4-layer: batch, dropout, dropout """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, p_drop_input, p_drop_hidden):
    X = T.dot(X, w_h) + b_h
    mean = X.mean((1,), keepdims=True)
    std = T.ones_like(X.var((0,), keepdims = True))
    X = batch_normalization(X, gamma= gamma, beta= beta, 
                            mean= mean, 
                            std= std, mode='high_mem')    
    h = rectify(X)

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2) + b_h2)

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x



X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((len(vectorizer.vocabulary_), 600))
b_h = theano.shared(floatX(np.zeros(600)))
gamma = theano.shared(floatX(np.ones(600)))
beta = theano.shared(floatX(np.zeros(600)))

w_h2 = init_weights((600, 600))
b_h2 = theano.shared(floatX(np.zeros(600,)))
w_o = init_weights((600, yTest.shape[1]))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, gamma, beta, w_h2, b_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

p_v3, p_t3, i3 = [], [], []
for i in range(121):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])

    if i%5 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        p_v3 += [tr]
        p_t3 += [trr]
        i3  += [i]        
        print 'Round:', i," Test:", tr, ' Train:', trr
        

DEBUG: nvcc STDOUT mod.cu
   Creating library C:/Users/hollis_win/AppData/Local/Theano/compiledir_Windows-10-10.0.10586-Intel64_Family_6_Model_58_Stepping_9_GenuineIntel-2.7.11-64/tmpjufiuq/449ea1846932454968e0663d59c262c0.lib and object C:/Users/hollis_win/AppData/Local/Theano/compiledir_Windows-10-10.0.10586-Intel64_Family_6_Model_58_Stepping_9_GenuineIntel-2.7.11-64/tmpjufiuq/449ea1846932454968e0663d59c262c0.exp



Round: 0  Test: 0.44032503809  Train: 0.458933107536
Round: 5  Test: 0.561198577958  Train: 0.591871295512
Round: 10  Test: 0.6124936516  Train: 0.661303979678
Round: 15  Test: 0.647536820721  Train: 0.709229466554
Round: 20  Test: 0.660741493144  Train: 0.739712108383
Round: 25  Test: 0.681056373794  Train: 0.762912785775
Round: 30  Test: 0.692737430168  Train: 0.780694326842
Round: 35  Test: 0.698831894363  Train: 0.795596951736
Round: 40  Test: 0.704926358558  Train: 0.807451312447
Round: 45  Test: 0.711528694769  Train: 0.82133784928
Round: 50  Test: 0.716607414931  Train: 0.833022861981
Round: 55  Test: 0.726764855256  Train: 0.844030482642
Round: 60  Test: 0.723209751143  Train: 0.855546147333
Round: 65  Test: 0.725749111224  Train: 0.867061812024
Round: 70  Test: 0.727780599289  Train: 0.874513124471
Round: 75  Test: 0.730827831386  Train: 0.881456392887
Round: 80  Test: 0.73031995937  Train: 0.887552921253
Round: 85  Test: 0.730827831386  Train: 0.892972057578
Round: 90  Test: 

# 74% !!! 

Lets see if thats repeatable.... 

In [11]:
""" glorot 4-layer: batch, dropout, dropout """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, p_drop_input, p_drop_hidden):
    X = T.dot(X, w_h) + b_h
    mean = X.mean((0,), keepdims=True)
    std = T.ones_like(X.var((0,), keepdims = True))
    X = batch_normalization(X, gamma= gamma, beta= beta, 
                            mean= mean, #X.mean((0,), keepdims=True), 
                            std= std, mode='high_mem')    
    h = rectify(X)

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2) + b_h2)

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x



X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((len(vectorizer.vocabulary_), 600))
b_h = theano.shared(floatX(np.zeros(600)))
gamma = theano.shared(floatX(np.ones(600)))
beta = theano.shared(floatX(np.zeros(600)))

w_h2 = init_weights((600, 600))
b_h2 = theano.shared(floatX(np.zeros(600,)))
w_o = init_weights((600, yTest.shape[1]))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, gamma, beta, w_h2, b_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(101):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])

    if i%5 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.419502285424  Train: 0.459102455546
Round: 5  Test: 0.539867953276  Train: 0.585266723116
Round: 10  Test: 0.598781107161  Train: 0.661473327688
Round: 15  Test: 0.647536820721  Train: 0.706689246401
Round: 20  Test: 0.668359573388  Train: 0.743099068586
Round: 25  Test: 0.687658710005  Train: 0.766299745978
Round: 30  Test: 0.695784662265  Train: 0.788484335309
Round: 35  Test: 0.704418486541  Train: 0.80033869602
Round: 40  Test: 0.715083798883  Train: 0.812870448772
Round: 45  Test: 0.717623158964  Train: 0.828281117697
Round: 50  Test: 0.723717623159  Train: 0.838441998307
Round: 55  Test: 0.725241239208  Train: 0.848602878916
Round: 60  Test: 0.730827831386  Train: 0.856392887384
Round: 65  Test: 0.734890807517  Train: 0.865876375953
Round: 70  Test: 0.737430167598  Train: 0.876883996613
Round: 75  Test: 0.737430167598  Train: 0.881964436918
Round: 80  Test: 0.73844591163  Train: 0.890093141406
Round: 85  Test: 0.740985271712  Train: 0.89466553768
Round: 90  Test

# 74% again!

Next, 3-layer batch normalization

In [4]:
""" glorot 4-layer: batch, batch, batch """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(50):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Using gpu device 0: GeForce GTX 970 (CNMeM is disabled, CuDNN not available)


Round: 0  Test: 0.483494159472  Train: 0.528196443692
Round: 1  Test: 0.527171152869  Train: 0.573581710415
Round: 2  Test: 0.566277298121  Train: 0.613547840813
Round: 3  Test: 0.596749619096  Train: 0.650973751058
Round: 4  Test: 0.617572371762  Train: 0.681625740898
Round: 5  Test: 0.641442356526  Train: 0.710584250635
Round: 6  Test: 0.656678517014  Train: 0.731414055885
Round: 7  Test: 0.668359573388  Train: 0.750211685013
Round: 8  Test: 0.675977653631  Train: 0.764944961897
Round: 9  Test: 0.688674454038  Train: 0.776121930567
Round: 10  Test: 0.694768918233  Train: 0.791363251482
Round: 11  Test: 0.70187912646  Train: 0.802201524132
Round: 12  Test: 0.713052310818  Train: 0.813378492803
Round: 13  Test: 0.717115286948  Train: 0.819983065199
Round: 14  Test: 0.716099542915  Train: 0.828281117697
Round: 15  Test: 0.72219400711  Train: 0.836409822185
Round: 16  Test: 0.724225495175  Train: 0.845893310754
Round: 17  Test: 0.723209751143  Train: 0.85249788315
Round: 18  Test: 0.7267

In [141]:
# random indexing for batches 

index = np.random.choice(xTrain.shape[0], 3)
print index
print xTrain[index]
print yTrain[index]

[3558 5373 1030]
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
[[ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  1.  0.]]


In [156]:
""" glorot 4-layer: batch, batch, batch 
    random indexing                       """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(26):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        index = np.random.choice(xTrain.shape[0], batch_size, replace=False)
        cost = train(xTrain[index], yTrain[index])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.479939055358  Train: 0.51532599492
Round: 1  Test: 0.515490096496  Train: 0.5583403895
Round: 2  Test: 0.558151345861  Train: 0.613547840813
Round: 3  Test: 0.583544946673  Train: 0.638272650296
Round: 4  Test: 0.605383443372  Train: 0.667908552075
Round: 5  Test: 0.633316404266  Train: 0.698221845893
Round: 6  Test: 0.660741493144  Train: 0.725486875529
Round: 7  Test: 0.671914677501  Train: 0.744453852667
Round: 8  Test: 0.682579989843  Train: 0.762574089754
Round: 9  Test: 0.684103605891  Train: 0.769348010161
Round: 10  Test: 0.692229558151  Train: 0.785944115157
Round: 11  Test: 0.701371254444  Train: 0.79966130398
Round: 12  Test: 0.70594210259  Train: 0.806943268417
Round: 13  Test: 0.71406805485  Train: 0.818120237087
Round: 14  Test: 0.719146775013  Train: 0.824386113463
Round: 15  Test: 0.717115286948  Train: 0.834716342083
Round: 16  Test: 0.724733367191  Train: 0.844538526672
Round: 17  Test: 0.724225495175  Train: 0.853852667231
Round: 18  Test: 0.7211782

Random idexing (with replacement) no effect

In [157]:
""" glorot 4-layer: batch, batch, batch 
    random indexing, ELU                """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(34):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        index = np.random.choice(xTrain.shape[0], batch_size, replace=False)
        cost = train(xTrain[index], yTrain[index])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.470797359066  Train: 0.517696867062
Round: 1  Test: 0.511427120366  Train: 0.55766299746
Round: 2  Test: 0.544438801422  Train: 0.598306519898
Round: 3  Test: 0.585068562722  Train: 0.644877222693
Round: 4  Test: 0.61655662773  Train: 0.677561388654
Round: 5  Test: 0.626714068055  Train: 0.696697713802
Round: 6  Test: 0.64499746064  Train: 0.725148179509
Round: 7  Test: 0.672422549518  Train: 0.749872988992
Round: 8  Test: 0.670391061453  Train: 0.764436917866
Round: 9  Test: 0.687658710005  Train: 0.77866215072
Round: 10  Test: 0.691721686135  Train: 0.795258255715
Round: 11  Test: 0.69781615033  Train: 0.807620660457
Round: 12  Test: 0.703910614525  Train: 0.816934801016
Round: 13  Test: 0.709497206704  Train: 0.827265029636
Round: 14  Test: 0.712036566785  Train: 0.836240474174
Round: 15  Test: 0.716099542915  Train: 0.841320914479
Round: 16  Test: 0.715591670899  Train: 0.851989839119
Round: 17  Test: 0.71406805485  Train: 0.859779847587
Round: 18  Test: 0.7216861

ELU, no effect

In [10]:
%%time

""" glorot 4-layer: batch, batch, batch 
    random indexing without replacement, ELU                
    epsilon=1e-9  """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def shuffle(x, y):
    # helper function to shuffle indicies each loop 
    index = np.random.choice(len(x), len(x), replace=False)
    return x[index], y[index]

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


for i in range(41):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print

Round: 0     Test: 0.558333333333 Train: 0.597619047619
Round: 1     Test: 0.599479166667 Train: 0.666836734694
Round: 2     Test: 0.640625       Train: 0.711734693878
Round: 3     Test: 0.6734375      Train: 0.740646258503
Round: 4     Test: 0.688020833333 Train: 0.76343537415
Round: 5     Test: 0.693229166667 Train: 0.786394557823
Round: 6     Test: 0.704166666667 Train: 0.801700680272
Round: 7     Test: 0.710416666667 Train: 0.820408163265
Round: 8     Test: 0.717708333333 Train: 0.825680272109
Round: 9     Test: 0.722916666667 Train: 0.837244897959
Round: 10    Test: 0.711458333333 Train: 0.851530612245
Round: 11    Test: 0.711979166667 Train: 0.866326530612
Round: 12    Test: 0.722916666667 Train: 0.868367346939
Round: 13    Test: 0.7265625      Train: 0.876870748299
Round: 14    Test: 0.725          Train: 0.879591836735
Round: 15    Test: 0.719791666667 Train: 0.892176870748
Round: 16    Test: 0.727604166667 Train: 0.897789115646
Round: 17    Test: 0.728125       Train: 0.899489

70% at round 6!


73% at round 18. 

In [29]:
%%time

""" glorot 4-layer: batch, batch, batch 
    random indexing without replacement, RELU                
    epsilon=1e-9  """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def shuffle(x, y):
    # helper function to shuffle indicies each loop 
    index = np.random.choice(len(x), len(x), replace=False)
    return x[index], y[index]

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


for i in range(41):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print

Round: 0     Test: 0.540625       Train: 0.60119047619
Round: 1     Test: 0.601041666667 Train: 0.66768707483
Round: 2     Test: 0.647916666667 Train: 0.712414965986
Round: 3     Test: 0.665104166667 Train: 0.743027210884
Round: 4     Test: 0.680208333333 Train: 0.770918367347
Round: 5     Test: 0.696875       Train: 0.787585034014
Round: 6     Test: 0.701041666667 Train: 0.803401360544
Round: 7     Test: 0.706770833333 Train: 0.821768707483
Round: 8     Test: 0.707291666667 Train: 0.833163265306
Round: 9     Test: 0.7109375      Train: 0.840646258503
Round: 10    Test: 0.718229166667 Train: 0.853231292517
Round: 11    Test: 0.719791666667 Train: 0.859013605442
Round: 12    Test: 0.727604166667 Train: 0.87380952381
Round: 13    Test: 0.719270833333 Train: 0.875170068027
Round: 14    Test: 0.7265625      Train: 0.883163265306
Round: 15    Test: 0.721875       Train: 0.887755102041
Round: 16    Test: 0.7078125      Train: 0.894727891156
Round: 17    Test: 0.7234375      Train: 0.89846938

#### Will adding back dropout get us back to 74%?

In [18]:
%%time

""" glorot 4-layer: batch, drop, batch, drop, batch drop 
    random indexing without replacement, ELU, epsilon=1e-9  """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def shuffle(x, y):
    # helper function to shuffle indicies each loop 
    index = np.random.choice(len(x), len(x), replace=False)
    return x[index], y[index]

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho, p_drop_input, p_drop_hidden):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem') 
    X = dropout(X, p_drop_hidden)
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho, .0, .2)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho, .0, .0)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


for i in range(51):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print



Round: 0     Test: 0.515625       Train: 0.56462585034
Round: 1     Test: 0.569791666667 Train: 0.614965986395
Round: 2     Test: 0.609375       Train: 0.666156462585
Round: 3     Test: 0.628125       Train: 0.702380952381
Round: 4     Test: 0.646875       Train: 0.725170068027
Round: 5     Test: 0.669270833333 Train: 0.74574829932
Round: 6     Test: 0.678645833333 Train: 0.764115646259
Round: 7     Test: 0.686458333333 Train: 0.775680272109
Round: 8     Test: 0.693229166667 Train: 0.791836734694
Round: 9     Test: 0.7015625      Train: 0.803401360544
Round: 10    Test: 0.714583333333 Train: 0.809013605442
Round: 11    Test: 0.7125         Train: 0.822619047619
Round: 12    Test: 0.715104166667 Train: 0.829421768707
Round: 13    Test: 0.716145833333 Train: 0.83537414966
Round: 14    Test: 0.721354166667 Train: 0.838775510204
Round: 15    Test: 0.713020833333 Train: 0.852551020408
Round: 16    Test: 0.71875        Train: 0.853231292517
Round: 17    Test: 0.721354166667 Train: 0.86020408

### High of 73.9% with just a little dropout (0.0, 0.2)

In [30]:
%%time

""" glorot 4-layer: batch, drop, batch, drop, batch drop 
    random indexing without replacement, ELU, epsilon=1e-9  """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def shuffle(x, y):
    # helper function to shuffle indicies each loop 
    index = np.random.choice(len(x), len(x), replace=False)
    return x[index], y[index]

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho, p_drop_input, p_drop_hidden):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem') 
    X = dropout(X, p_drop_hidden)
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho, .0, .2)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho, .0, .0)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


for i in range(51):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print



Round: 0     Test: 0.533333333333 Train: 0.571088435374
Round: 1     Test: 0.576041666667 Train: 0.620238095238
Round: 2     Test: 0.610416666667 Train: 0.666496598639
Round: 3     Test: 0.631770833333 Train: 0.695578231293
Round: 4     Test: 0.663020833333 Train: 0.724319727891
Round: 5     Test: 0.670833333333 Train: 0.741836734694
Round: 6     Test: 0.684895833333 Train: 0.76156462585
Round: 7     Test: 0.685416666667 Train: 0.779081632653
Round: 8     Test: 0.6859375      Train: 0.791326530612
Round: 9     Test: 0.697395833333 Train: 0.799659863946
Round: 10    Test: 0.703125       Train: 0.809183673469
Round: 11    Test: 0.7125         Train: 0.819387755102
Round: 12    Test: 0.70625        Train: 0.82925170068
Round: 13    Test: 0.7078125      Train: 0.834353741497
Round: 14    Test: 0.721875       Train: 0.842857142857
Round: 15    Test: 0.719791666667 Train: 0.852380952381
Round: 16    Test: 0.7234375      Train: 0.85425170068
Round: 17    Test: 0.7171875      Train: 0.85714285

In [5]:
%%time

""" glorot 4-layer: batch, drop, batch, drop, batch drop 
    random indexing without replacement, ELU, epsilon=1e-9  """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def shuffle(x, y):
    # helper function to shuffle indicies each loop 
    index = np.random.choice(len(x), len(x), replace=False)
    return x[index], y[index]

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho, p_drop_input, p_drop_hidden):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem') 
    X = dropout(X, p_drop_hidden)
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho, .0, .2)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho, .0, .0)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


for i in range(51):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print



Round: 0     Test: 0.5296875      Train: 0.56768707483
Round: 1     Test: 0.5734375      Train: 0.618537414966
Round: 2     Test: 0.6078125      Train: 0.659183673469
Round: 3     Test: 0.628125       Train: 0.693367346939
Round: 4     Test: 0.652604166667 Train: 0.726360544218
Round: 5     Test: 0.665625       Train: 0.744727891156
Round: 6     Test: 0.675520833333 Train: 0.763605442177
Round: 7     Test: 0.677604166667 Train: 0.773639455782
Round: 8     Test: 0.6890625      Train: 0.78843537415
Round: 9     Test: 0.693229166667 Train: 0.794557823129
Round: 10    Test: 0.715104166667 Train: 0.808333333333
Round: 11    Test: 0.7109375      Train: 0.818367346939
Round: 12    Test: 0.711458333333 Train: 0.827551020408
Round: 13    Test: 0.713020833333 Train: 0.837414965986
Round: 14    Test: 0.716145833333 Train: 0.839285714286
Round: 15    Test: 0.7203125      Train: 0.848639455782
Round: 16    Test: 0.720833333333 Train: 0.855272108844
Round: 17    Test: 0.736979166667 Train: 0.8557823

In [6]:
%%time

""" glorot 4-layer: batch, drop, batch, drop, batch drop 
    random indexing without replacement, ELU, epsilon=1e-9  """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def model(X, w_h, g_h, bb_h, w_h2, g_h2, bb_h2, w_o, g_ho, bb_ho):
    
    X = T.dot(X, w_h) 
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h = rectify(X)

    h  = T.dot(h, w_h2)
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h2 = rectify(h)

    h2 = T.dot(h2, w_o)
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem') 
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 600
h2_size = 550
h3_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_h3 = init_weights((h2_size, h3_size))
g_h3 = theano.shared(floatX(np.ones((h3_size))))
bb_h3 = theano.shared(floatX(np.zeros((h3_size))))

w_o = init_weights((h3_size, yTest.shape[1]))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_py_x = model(X, w_h, g_h, bb_h, 
                                      w_h2, g_h2, bb_h2, 
                                      w_o, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, g_h, bb_h, 
                    w_h2, g_h2, bb_h2, 
                    w_o, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, g_h, bb_h, w_h2, g_h2, bb_h2, 
          w_o, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


for i in range(30):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print



Round: 0     Test: 0.544791666667 Train: 0.605782312925
Round: 1     Test: 0.6046875      Train: 0.66768707483
Round: 2     Test: 0.635416666667 Train: 0.71037414966
Round: 3     Test: 0.665625       Train: 0.741666666667
Round: 4     Test: 0.6734375      Train: 0.767517006803
Round: 5     Test: 0.690104166667 Train: 0.790476190476
Round: 6     Test: 0.698958333333 Train: 0.804081632653
Round: 7     Test: 0.706770833333 Train: 0.822108843537
Round: 8     Test: 0.697395833333 Train: 0.828571428571
Round: 9     Test: 0.715104166667 Train: 0.844387755102
Round: 10    Test: 0.716666666667 Train: 0.848979591837
Round: 11    Test: 0.715625       Train: 0.857993197279
Round: 12    Test: 0.723958333333 Train: 0.862755102041
Round: 13    Test: 0.725520833333 Train: 0.875170068027
Round: 14    Test: 0.725          Train: 0.881972789116
Round: 15    Test: 0.721875       Train: 0.889965986395
Round: 16    Test: 0.729166666667 Train: 0.895238095238
Round: 17    Test: 0.717708333333 Train: 0.8982993

In [35]:
%%time

""" 5-layer """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def shuffle(x, y):
    # helper function to shuffle indicies each loop 
    index = np.random.choice(len(x), len(x), replace=False)
    return x[index], y[index]

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def model(X, w_h, g_h, bb_h, w_h2, g_h2, bb_h2,
          w_h3, g_h3, bb_h3, w_o, g_ho, bb_ho):
    
    X = T.dot(X, w_h) 
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((1,), keepdims=True),
                            std= T.ones_like(X.std((1,), keepdims = True)), 
                            mode='high_mem') 
    h = rectify(X)

    h  = T.dot(h, w_h2)
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((1,), keepdims=True),
                            std= T.ones_like(h.std((1,), keepdims = True)), 
                            mode='high_mem') 
    h2 = rectify(h)

    h2 = T.dot(h2, w_h3)
    h2 = batch_normalization(h2, gamma= g_h3, beta= bb_h3, 
                            mean= h2.mean((1,), keepdims=True),
                            std= T.ones_like(h2.std((1,), keepdims = True)), 
                            mode='high_mem') 
    h3 = rectify(h2)
    
    h3 = T.dot(h3, w_o)
    h3 = batch_normalization(h3, gamma= g_ho, beta= bb_ho, 
                            mean= h3.mean((1,), keepdims=True),
                            std= T.ones_like(h3.std((1,), keepdims = True)), 
                            mode='high_mem') 
    py_x = softmax(h3)
    return h, h2, h3, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 1000
h2_size = 1000
h3_size = 1000

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_h3 = init_weights((h2_size, h3_size))
g_h3 = theano.shared(floatX(np.ones((h3_size))))
bb_h3 = theano.shared(floatX(np.zeros((h3_size))))

w_o = init_weights((h3_size, yTest.shape[1]))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_h3, noise_py_x = model(X, w_h, g_h, bb_h, 
                                      w_h2, g_h2, bb_h2, 
                                       w_h3, g_h3, bb_h3, 
                                      w_o, g_ho, bb_ho)

h, h2, h3, py_x = model(X, w_h, g_h, bb_h, 
                    w_h2, g_h2, bb_h2, 
                     w_h3, g_h3, bb_h3, 
                    w_o, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, g_h, bb_h, w_h2, g_h2, bb_h2, 
           w_h3, g_h3, bb_h3, w_o, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

p_v4, p_t4, i4 = [], [], []
for i in range(41):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]
    
    p_v4 += [np.mean(tr)]
    p_t4 += [np.mean(trr)]
    i4   += [i]  
    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print

Round: 0     Test: 0.622395833333 Train: 0.688605442177
Round: 1     Test: 0.6703125      Train: 0.751870748299
Round: 2     Test: 0.684895833333 Train: 0.788945578231
Round: 3     Test: 0.699479166667 Train: 0.826530612245
Round: 4     Test: 0.723958333333 Train: 0.855102040816
Round: 5     Test: 0.704166666667 Train: 0.86768707483
Round: 6     Test: 0.706770833333 Train: 0.869727891156
Round: 7     Test: 0.719791666667 Train: 0.906462585034
Round: 8     Test: 0.730729166667 Train: 0.918197278912
Round: 9     Test: 0.730729166667 Train: 0.921258503401
Round: 10    Test: 0.722916666667 Train: 0.928231292517
Round: 11    Test: 0.732291666667 Train: 0.937414965986
Round: 12    Test: 0.722916666667 Train: 0.93537414966
Round: 13    Test: 0.732291666667 Train: 0.943367346939
Round: 14    Test: 0.730208333333 Train: 0.946768707483
Round: 15    Test: 0.728125       Train: 0.94880952381
Round: 16    Test: 0.733333333333 Train: 0.949489795918
Round: 17    Test: 0.721875       Train: 0.94880952

In [39]:
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode() # run at the start of every notebook

trace0 = go.Scatter(
    x = i1[:1000],
    y = p_v1[:1000],
    mode = 'lines',
    name = '2-Layer')

trace2 = go.Scatter(
    x = i2,
    y = p_v2,
    mode = 'lines',
    name = '4-Layer Dropout')

trace4 = go.Scatter(
    x = i3,
    y = p_v3,
    mode = 'lines',
    name = '4-Layer BN')

trace6 = go.Scatter(
    x = i4,
    y = p_v4,
    mode = 'lines',
    name = '5-Layer BN')

data = [trace0, trace2, trace4, trace6 ]

layout = go.Layout(
    xaxis=dict(
        type='linear',
        range=[0,400]
    ),
    yaxis=dict(
        type='linear',
        autorange=True
    ))
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)