## Neural Network

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
import sklearn.metrics as sk

import pandas as pd
from collections import Counter
import numpy as np
import nltk

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [3]:
modern = pd.read_pickle('data/5color_modern_no_name_hardmode.pkl')
Counter(modern.colors)

Counter({u'Black': 1576,
         u'Blue': 1573,
         u'Green': 1566,
         u'Red': 1575,
         u'White': 1584})

After all the data munging the classes are still amazingly balanced.

## Lets single out blue and red for a binary classification


In [4]:
UG = modern.loc[modern['colors'].isin(['Blue', 'Red'])]

UG.reset_index(inplace=True)
UG.pop('index')

UG[['name', 'colors', 'cmc', 'text']].sample(6)

Unnamed: 0,name,colors,cmc,text
1668,Dormant Gomazoa,Blue,3,Flying This enters the battlefield tapped. Thi...
913,Prodigal Pyromancer,Red,3,Tap : This deals 1 damage to target creature o...
2655,Scouring Sands,Red,2,This deals 1 damage to each creature your oppo...
26,Evacuation,Blue,5,Return all creatures to their owners hands.
119,Disarm,Blue,1,Unattach all Equipment from target creature.
2358,Paralyzing Grasp,Blue,3,Enchant creature Enchanted creature doesnt unt...


In [5]:
dummies = pd.get_dummies(UG.colors)
dummies.head()

Unnamed: 0,Blue,Red
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0


In [6]:
vectorizer = CountVectorizer()

vec_X = vectorizer.fit_transform(UG['text'])

xTrain, xTest, yTrain, yTest = train_test_split(vec_X, dummies,
                                             random_state=42)

xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

# xTrain = xTrain.reshape(-1, 1, 1, 815)
# xTest = xTest.reshape(-1, 1, 1, 815)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

print "There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_))

(2361, 815)
(2361, 2)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(2361, 815)
(2361, 2)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
There are 815 words in the vocabulary.


In [69]:
import theano
from theano import tensor as T
import numpy as np
from math import sqrt


def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    normalizer = 2.0 * sqrt(6) / sqrt(h + w) * .2  #factors: correct for uni[0,1], glo, glo, softmax deriv
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) \
                                * normalizer))  #code for using Glorot init
    
def model(X, w):
    return T.nnet.softmax(T.dot(X, w))

def adaDelta(cost, params, eta=0.2, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        #calc g-squared
        gSq = theano.shared(p.get_value() * 0.)
        dwSq = theano.shared(p.get_value() * 0.)

        #exp smoothed g squared
        gSqNew = rho * gSq + (1 - rho) * g * g

        #calc dx-squared
        dw = eta * T.sqrt(dwSq + epsilon) * g / T.sqrt(gSq + epsilon)
        dwSqNew = rho * dwSq + (1 - rho) * dw * dw

        updates.append((dwSq, dwSqNew))
        updates.append((gSq, gSqNew))
        updates.append((p, p - dw))
    return updates

X = T.fmatrix()
Y = T.fmatrix()
# grad_list = theano.shared(np.array([0,0]), name='grad_list')

w = init_weights((815 , 2))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.1]]

train = theano.function(inputs=[X, Y], 
                        outputs=[cost, gradient], 
                        updates=update, 
                        allow_input_downcast=True)

predict = theano.function(inputs=[X], 
                          outputs=y_pred, 
                          allow_input_downcast=True)


for i in range(401):
# #     for start, end in zip(range(0, xTrain.shape[0], 128), 
# #                           range(128, xTrain.shape[0], 128)):
# #         cost, gradient = train(xTrain[start:end], yTrain[start:end])
    cost, gradient = train(xTrain, yTrain)
    if i % 30 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr


Round: 0  Test: 0.651842439644  Train: 0.629394324439
Round: 30  Test: 0.810673443456  Train: 0.831427361288
Round: 60  Test: 0.833545108005  Train: 0.849216433715
Round: 90  Test: 0.853875476493  Train: 0.861075815332
Round: 120  Test: 0.852604828463  Train: 0.866581956798
Round: 150  Test: 0.855146124524  Train: 0.873358746294
Round: 180  Test: 0.861499364676  Train: 0.879288437103
Round: 210  Test: 0.864040660737  Train: 0.885218127912
Round: 240  Test: 0.870393900889  Train: 0.889030072003
Round: 270  Test: 0.866581956798  Train: 0.892842016095
Round: 300  Test: 0.866581956798  Train: 0.894536213469
Round: 330  Test: 0.869123252859  Train: 0.894112664125
Round: 360  Test: 0.87166454892  Train: 0.895383312156
Round: 390  Test: 0.87166454892  Train: 0.895806861499


### All Five Vs All Five

And now the main event - simply comparing two colors was too easy. Five way classification of all the colors.

In [7]:
vectorizer = CountVectorizer()

y = pd.get_dummies(modern.colors)

X = vectorizer.fit_transform(modern.text)

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state=42)

xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

# xTrain = xTrain.reshape(-1, 1, 1, 815)
# xTest = xTest.reshape(-1, 1, 1, 815)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

print "There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_))

(5905, 1161)
(5905, 5)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(5905, 1161)
(5905, 5)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
There are 1,161 words in the vocabulary.


In [81]:
%%time

import theano
from theano import tensor as T
import numpy as np
from math import sqrt
# import matplotlib.pyplot as plt
# import seaborn
# %matplotlib inline

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    normalizer = 2.0 * sqrt(6) / sqrt(h + w) * .2  #factors: correct for uni[0,1], glo, glo, softmax deriv
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) \
                                * normalizer))  #code for using Glorot init
    
def model(X, w):
    return T.nnet.softmax(T.dot(X, w))

def adaDelta(cost, params, eta=0.2, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        #calc g-squared
        gSq = theano.shared(p.get_value() * 0.)
        dwSq = theano.shared(p.get_value() * 0.)

        #exp smoothed g squared
        gSqNew = rho * gSq + (1 - rho) * g * g

        #calc dx-squared
        dw = eta * T.sqrt(dwSq + epsilon) * g / T.sqrt(gSq + epsilon)
        dwSqNew = rho * dwSq + (1 - rho) * dw * dw

        updates.append((dwSq, dwSqNew))
        updates.append((gSq, gSqNew))
        updates.append((p, p - dw))
    return updates

X = T.fmatrix()
Y = T.fmatrix()

w = init_weights((len(vectorizer.vocabulary_) , yTest.shape[1]))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.1]]

train = theano.function(inputs=[X, Y], 
                        outputs=[cost, gradient], 
                        updates=update, 
                        allow_input_downcast=True)

predict = theano.function(inputs=[X], 
                          outputs=y_pred, 
                          allow_input_downcast=True)


for i in range(10001):
# #     for start, end in zip(range(0, xTrain.shape[0], 128), 
# #                           range(128, xTrain.shape[0], 128)):
# #         cost, gradient = train(xTrain[start:end], yTrain[start:end])
    cost, gradient = train(xTrain, yTrain)
    if i % 500 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr


Round: 0  Test: 0.292026409345  Train: 0.27366638442
Round: 500  Test: 0.601320467242  Train: 0.633192209992
Round: 1000  Test: 0.627729812087  Train: 0.667739204064
Round: 1500  Test: 0.641442356526  Train: 0.686875529213
Round: 2000  Test: 0.655662772981  Train: 0.700423370025
Round: 2500  Test: 0.66124936516  Train: 0.709060118544
Round: 3000  Test: 0.664296597257  Train: 0.716680779001
Round: 3500  Test: 0.676485525648  Train: 0.724132091448
Round: 4000  Test: 0.682579989843  Train: 0.728704487722
Round: 4500  Test: 0.687150837989  Train: 0.732938187976
Round: 5000  Test: 0.688674454038  Train: 0.737849280271
Round: 5500  Test: 0.688674454038  Train: 0.739881456393
Round: 6000  Test: 0.686642965973  Train: 0.741913632515
Round: 6500  Test: 0.691721686135  Train: 0.744623200677
Round: 7000  Test: 0.694261046216  Train: 0.746655376799
Round: 7500  Test: 0.695784662265  Train: 0.749364944962
Round: 8000  Test: 0.695784662265  Train: 0.750381033023
Round: 8500  Test: 0.694768918233  Tr

Amazing results, almost equal to logistic regression at 5min. 

### 4-Layer Network

In [87]:
%%time

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.05*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(T.dot(X, w_h))

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2))

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x

X = T.fmatrix()
Y = T.fmatrix()

# w = init_weights((len(vectorizer.vocabulary_) , yTest.shape[1])) # old 
w_h = init_weights((len(vectorizer.vocabulary_), 600))
w_h2 = init_weights((600, 600))
w_o = init_weights((600, yTest.shape[1]))

noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, w_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, w_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(401):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])
    if i%10 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.27780599289  Train: 0.302116850127
Round: 10  Test: 0.280853224987  Train: 0.287552921253
Round: 20  Test: 0.46063991874  Train: 0.49483488569
Round: 30  Test: 0.602336211275  Train: 0.63200677392
Round: 40  Test: 0.653631284916  Train: 0.690770533446
Round: 50  Test: 0.672930421534  Train: 0.734462320068
Round: 60  Test: 0.691213814119  Train: 0.759017781541
Round: 70  Test: 0.694261046216  Train: 0.774259102456
Round: 80  Test: 0.699847638395  Train: 0.788653683319
Round: 90  Test: 0.710512950736  Train: 0.805927180356
Round: 100  Test: 0.723209751143  Train: 0.818797629128
Round: 110  Test: 0.723717623159  Train: 0.832345469941
Round: 120  Test: 0.729304215338  Train: 0.847417442845
Round: 130  Test: 0.725241239208  Train: 0.853852667231
Round: 140  Test: 0.727272727273  Train: 0.86316680779
Round: 150  Test: 0.733875063484  Train: 0.872650296359
Round: 160  Test: 0.723717623159  Train: 0.878577476715
Round: 170  Test: 0.732351447435  Train: 0.885012701101
Round: 1

#### Strong signs of overfitting

Next steps:  

Leaky RELU swapped out for ELU, alpha = 1.0   
Train accuracy dropped by 14% (good)  
Test accuracy dropped by 3% (bad)   

Leaky RELU swapped out for ELU, alpha = .5  
Train accuracy dropped by 7% (good)  
Test accuracy dropped by 1% (bad)   

Leaky RELU  alpha = .1
RHO = .99
Train accuracy dropped by 7% (good)  
Test accuracy up by 1% (good)  

Next, batch normalization.

In [54]:
%%time

""" glorot 4-layer: dropout, dropout, dropout """

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))


def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, b_h, w_h2, b_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(T.dot(X, w_h) + b_h)

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2) + b_h2)

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x



X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((len(vectorizer.vocabulary_), 600))
b_h = theano.shared(floatX(np.zeros(600,)))
w_h2 = init_weights((600, 600))
b_h2 = theano.shared(floatX(np.zeros(600,)))
w_o = init_weights((600, yTest.shape[1]))
# b_values = numpy.zeros((600,), dtype=theano.config.floatX)

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, w_h2, b_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, b_h, w_h2, b_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, w_h2, b_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(301):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])

    if i%10 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.4281361097  Train: 0.435055038103
Round: 10  Test: 0.594718131031  Train: 0.644199830652
Round: 20  Test: 0.65718638903  Train: 0.71566469094
Round: 30  Test: 0.680548501778  Train: 0.755122777307
Round: 40  Test: 0.693245302184  Train: 0.781202370872
Round: 50  Test: 0.703402742509  Train: 0.798814563929
Round: 60  Test: 0.715083798883  Train: 0.815410668925
Round: 70  Test: 0.720162519045  Train: 0.834546994073
Round: 80  Test: 0.722701879126  Train: 0.846909398815
Round: 90  Test: 0.72625698324  Train: 0.859610499577
Round: 100  Test: 0.735906551549  Train: 0.867231160034
Round: 110  Test: 0.728288471305  Train: 0.875867908552
Round: 120  Test: 0.726764855256  Train: 0.88433530906
Round: 130  Test: 0.731335703403  Train: 0.890431837426
Round: 140  Test: 0.732859319451  Train: 0.898391193903
Round: 150  Test: 0.733875063484  Train: 0.904487722269
Round: 160  Test: 0.732859319451  Train: 0.908721422523
Round: 170  Test: 0.730827831386  Train: 0.914140558848
Round: 18

In [64]:
""" glorot 4-layer: batch, dropout, dropout """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, p_drop_input, p_drop_hidden):
    X = T.dot(X, w_h) + b_h
    mean = X.mean((0,), keepdims=True)
    std = T.ones_like(X.var((0,), keepdims = True))
    X = batch_normalization(X, gamma= gamma, beta= beta, 
                            mean= mean, #X.mean((0,), keepdims=True), 
                            std= std, mode='high_mem')    
    h = rectify(X)

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2) + b_h2)

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x



X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((len(vectorizer.vocabulary_), 600))
b_h = theano.shared(floatX(np.zeros(600)))
gamma = theano.shared(floatX(np.ones(600)))
beta = theano.shared(floatX(np.zeros(600)))

w_h2 = init_weights((600, 600))
b_h2 = theano.shared(floatX(np.zeros(600,)))
w_o = init_weights((600, yTest.shape[1]))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, b_h, gamma, beta, w_h2, b_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, gamma, beta, w_h2, b_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(101):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])

    if i%10 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr

DEBUG: nvcc STDOUT mod.cu
   Creating library C:/Users/hollis_win/AppData/Local/Theano/compiledir_Windows-10-10.0.10586-Intel64_Family_6_Model_58_Stepping_9_GenuineIntel-2.7.11-64/tmptwqeg4/c7585c6abe0348b5e3c9444eb9e46735.lib and object C:/Users/hollis_win/AppData/Local/Theano/compiledir_Windows-10-10.0.10586-Intel64_Family_6_Model_58_Stepping_9_GenuineIntel-2.7.11-64/tmptwqeg4/c7585c6abe0348b5e3c9444eb9e46735.exp

DEBUG: nvcc STDOUT mod.cu
   Creating library C:/Users/hollis_win/AppData/Local/Theano/compiledir_Windows-10-10.0.10586-Intel64_Family_6_Model_58_Stepping_9_GenuineIntel-2.7.11-64/tmpexvlpd/0a62a215d8ac7d63cdc696727d41b45b.lib and object C:/Users/hollis_win/AppData/Local/Theano/compiledir_Windows-10-10.0.10586-Intel64_Family_6_Model_58_Stepping_9_GenuineIntel-2.7.11-64/tmpexvlpd/0a62a215d8ac7d63cdc696727d41b45b.exp



Round: 0  Test: 0.430167597765  Train: 0.462489415749
Round: 10  Test: 0.601828339259  Train: 0.654022015241
Round: 20  Test: 0.664804469274  Train: 0.740220152413
Round: 30  Test: 0.695276790249  Train: 0.786960203218
Round: 40  Test: 0.708481462671  Train: 0.811515664691
Round: 50  Test: 0.726764855256  Train: 0.839796782388
Round: 60  Test: 0.732351447435  Train: 0.860118543607
Round: 70  Test: 0.739969527679  Train: 0.875867908552
Round: 80  Test: 0.740477399695  Train: 0.888907705334
Round: 90  Test: 0.740477399695  Train: 0.900254022015
Round: 100  Test: 0.73844591163  Train: 0.907874682472


In [155]:
""" glorot 4-layer: batch, batch, batch """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(25):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.461655662773  Train: 0.498221845893
Round: 1  Test: 0.518537328593  Train: 0.563420829805
Round: 2  Test: 0.556627729812  Train: 0.605927180356
Round: 3  Test: 0.589639410868  Train: 0.646570702794
Round: 4  Test: 0.617064499746  Train: 0.681287044877
Round: 5  Test: 0.645505332656  Train: 0.709060118544
Round: 6  Test: 0.657694261046  Train: 0.731075359865
Round: 7  Test: 0.668359573388  Train: 0.748856900931
Round: 8  Test: 0.675469781615  Train: 0.765114309907
Round: 9  Test: 0.687150837989  Train: 0.777476714649
Round: 10  Test: 0.694768918233  Train: 0.787806943268
Round: 11  Test: 0.701371254444  Train: 0.79949195597
Round: 12  Test: 0.704926358558  Train: 0.808975444539
Round: 13  Test: 0.709497206704  Train: 0.818458933108
Round: 14  Test: 0.717623158964  Train: 0.828111769687
Round: 15  Test: 0.72219400711  Train: 0.837087214225
Round: 16  Test: 0.724225495175  Train: 0.844538526672
Round: 17  Test: 0.727272727273  Train: 0.850296359018
Round: 18  Test: 0.727

In [141]:
# random indexing 

index = np.random.choice(xTrain.shape[0], 3)
print index
print xTrain[index]
print yTrain[index]

[3558 5373 1030]
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
[[ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  1.  0.]]


In [156]:
""" glorot 4-layer: batch, batch, batch 
    random indexing                       """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(26):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        index = np.random.choice(xTrain.shape[0], batch_size, replace=False)
        cost = train(xTrain[index], yTrain[index])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.479939055358  Train: 0.51532599492
Round: 1  Test: 0.515490096496  Train: 0.5583403895
Round: 2  Test: 0.558151345861  Train: 0.613547840813
Round: 3  Test: 0.583544946673  Train: 0.638272650296
Round: 4  Test: 0.605383443372  Train: 0.667908552075
Round: 5  Test: 0.633316404266  Train: 0.698221845893
Round: 6  Test: 0.660741493144  Train: 0.725486875529
Round: 7  Test: 0.671914677501  Train: 0.744453852667
Round: 8  Test: 0.682579989843  Train: 0.762574089754
Round: 9  Test: 0.684103605891  Train: 0.769348010161
Round: 10  Test: 0.692229558151  Train: 0.785944115157
Round: 11  Test: 0.701371254444  Train: 0.79966130398
Round: 12  Test: 0.70594210259  Train: 0.806943268417
Round: 13  Test: 0.71406805485  Train: 0.818120237087
Round: 14  Test: 0.719146775013  Train: 0.824386113463
Round: 15  Test: 0.717115286948  Train: 0.834716342083
Round: 16  Test: 0.724733367191  Train: 0.844538526672
Round: 17  Test: 0.724225495175  Train: 0.853852667231
Round: 18  Test: 0.7211782

In [157]:
""" glorot 4-layer: batch, batch, batch 
    random indexing, ELU                """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(34):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        index = np.random.choice(xTrain.shape[0], batch_size, replace=False)
        cost = train(xTrain[index], yTrain[index])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.470797359066  Train: 0.517696867062
Round: 1  Test: 0.511427120366  Train: 0.55766299746
Round: 2  Test: 0.544438801422  Train: 0.598306519898
Round: 3  Test: 0.585068562722  Train: 0.644877222693
Round: 4  Test: 0.61655662773  Train: 0.677561388654
Round: 5  Test: 0.626714068055  Train: 0.696697713802
Round: 6  Test: 0.64499746064  Train: 0.725148179509
Round: 7  Test: 0.672422549518  Train: 0.749872988992
Round: 8  Test: 0.670391061453  Train: 0.764436917866
Round: 9  Test: 0.687658710005  Train: 0.77866215072
Round: 10  Test: 0.691721686135  Train: 0.795258255715
Round: 11  Test: 0.69781615033  Train: 0.807620660457
Round: 12  Test: 0.703910614525  Train: 0.816934801016
Round: 13  Test: 0.709497206704  Train: 0.827265029636
Round: 14  Test: 0.712036566785  Train: 0.836240474174
Round: 15  Test: 0.716099542915  Train: 0.841320914479
Round: 16  Test: 0.715591670899  Train: 0.851989839119
Round: 17  Test: 0.71406805485  Train: 0.859779847587
Round: 18  Test: 0.7216861

In [8]:
""" glorot 4-layer: batch, batch, batch """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(25):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.488572879634  Train: 0.521422523285
Round: 1  Test: 0.533265617064  Train: 0.574089754445
Round: 2  Test: 0.563230066023  Train: 0.61049957663
Round: 3  Test: 0.591163026917  Train: 0.647586790855
Round: 4  Test: 0.615033011681  Train: 0.679593564776
Round: 5  Test: 0.637887252412  Train: 0.704149026249
Round: 6  Test: 0.660233621127  Train: 0.728365791702
Round: 7  Test: 0.672422549518  Train: 0.750211685013
Round: 8  Test: 0.684103605891  Train: 0.767146486029
Round: 9  Test: 0.693245302184  Train: 0.779508890771
Round: 10  Test: 0.696800406298  Train: 0.78899237934
Round: 11  Test: 0.704418486541  Train: 0.8
Round: 12  Test: 0.71000507872  Train: 0.809314140559
Round: 13  Test: 0.714575926866  Train: 0.819305673158
Round: 14  Test: 0.72219400711  Train: 0.830313293819
Round: 15  Test: 0.723209751143  Train: 0.839458086367
Round: 16  Test: 0.728796343321  Train: 0.845046570703
Round: 17  Test: 0.73031995937  Train: 0.849957662997
Round: 18  Test: 0.729304215338  Tra

In [30]:
""" glorot 4-layer: batch, batch, batch """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=1.0):
#     return T.maximum(X, 0.)
    return T.maximum(X, 0.1*X)  #leaky rectifier
#     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h,
                            #mean= T.zeros_like(X),
                            #std= T.ones_like(X), 
                            mean= T.zeros_like(X.var((0,), keepdims = True)),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem')    
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            #mean= T.zeros_like(h),
                            #std= T.ones_like(h),
                            mean= T.zeros_like(h.var((0,), keepdims = True)),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem')       
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            #mean= T.zeros_like(h2),
                            #std= T.ones_like(h2),
                            mean= T.zeros_like(h2.var((0,), keepdims = True)),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem')   
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones(h1_size)))
bb_h = theano.shared(floatX(np.zeros(h1_size)))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones(h2_size)))
bb_h2 = theano.shared(floatX(np.zeros(h2_size)))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones(yTest.shape[1])))
bb_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

batch_size = 60

for i in range(25):
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])

    tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
    trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
    print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.483494159472  Train: 0.518204911092
Round: 1  Test: 0.541899441341  Train: 0.579847586791
Round: 2  Test: 0.583544946673  Train: 0.630313293819
Round: 3  Test: 0.613509395632  Train: 0.67315834039
Round: 4  Test: 0.629761300152  Train: 0.696528365792
Round: 5  Test: 0.648044692737  Train: 0.716342082981
Round: 6  Test: 0.662772981209  Train: 0.733107535986
Round: 7  Test: 0.666835957339  Train: 0.749364944962
Round: 8  Test: 0.679024885729  Train: 0.761896697714
Round: 9  Test: 0.690705942103  Train: 0.771549534293
Round: 10  Test: 0.694768918233  Train: 0.780016934801
Round: 11  Test: 0.699339766379  Train: 0.788653683319
Round: 12  Test: 0.702894870493  Train: 0.796613039797
Round: 13  Test: 0.707465718639  Train: 0.806265876376
Round: 14  Test: 0.709497206704  Train: 0.813039796782
Round: 15  Test: 0.711528694769  Train: 0.822015241321
Round: 16  Test: 0.714575926866  Train: 0.827434377646
Round: 17  Test: 0.724225495175  Train: 0.838272650296
Round: 18  Test: 0.72

In [16]:
X[0]

Subtensor{int64}.0

In [20]:
X.var((0,))

var