In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
import sklearn.metrics as sk

import pandas as pd
from collections import Counter
import numpy as np
import nltk

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [2]:
modern = pd.read_pickle('data/5color_modern_no_name_hardmode.pkl')
Counter(modern.colors)

Counter({u'Black': 1576,
         u'Blue': 1573,
         u'Green': 1566,
         u'Red': 1575,
         u'White': 1584})

In [3]:
vectorizer = CountVectorizer()

y = pd.get_dummies(modern.colors)

X = vectorizer.fit_transform(modern.text)

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state=42)

xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

def shuffle(x, y):
    # helper function to shuffle indicies each loop 
    index = np.random.choice(len(x), len(x), replace=False)
    return x[index], y[index]


print "There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_))

There are 1,161 words in the vocabulary.


In [6]:
%%time

""" glorot 4-layer: batch, drop, batch, drop, batch drop 
    random indexing without replacement, ELU, epsilon=1e-9  """

# batch normalization code adapted from 
# https://groups.google.com/forum/#!topic/theano-users/dMV6aabL1Ds 


import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.bn import batch_normalization
import numpy as np

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * np.sqrt(6) / np.sqrt(h + w) * 1.0
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))

def rectify(X, alpha=0.01):
#     return T.maximum(X, 0.)
#    return T.maximum(X, 0.1*X)  #leaky rectifier
     return T.switch(X > 0, X, alpha * (T.exp(X) - 1)) # ELU

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.99, epsilon=1e-9):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X


def model(X, w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho, p_drop_input, p_drop_hidden):
    X = T.dot(X, w_h) + b_h
    X = batch_normalization(X, gamma= g_h, beta= bb_h, 
                            mean= X.mean((0,), keepdims=True),
                            std= T.ones_like(X.var((0,), keepdims = True)), 
                            mode='high_mem') 
    X = dropout(X, p_drop_hidden)
    h = rectify(X)

    h  = T.dot(h, w_h2) + b_h2
    h = batch_normalization(h, gamma= g_h2, beta= bb_h2, 
                            mean= h.mean((0,), keepdims=True),
                            std= T.ones_like(h.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h)

    h2 = T.dot(h2, w_o) + b_ho
    h2 = batch_normalization(h2, gamma= g_ho, beta= bb_ho, 
                            mean= h2.mean((0,), keepdims=True),
                            std= T.ones_like(h2.var((0,), keepdims = True)), 
                            mode='high_mem') 
    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(h2)
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

batch_size = 60

h1_size = 600
h2_size = 550

w_h = init_weights((len(vectorizer.vocabulary_), h1_size))
b_h = theano.shared(floatX(np.zeros(h1_size)))
g_h = theano.shared(floatX(np.ones((h1_size))))
bb_h = theano.shared(floatX(np.zeros((h1_size))))

w_h2 = init_weights((h1_size, h2_size))
b_h2 = theano.shared(floatX(np.zeros(h2_size)))
g_h2 = theano.shared(floatX(np.ones((h2_size))))
bb_h2 = theano.shared(floatX(np.zeros((h2_size))))

w_o = init_weights((h2_size, yTest.shape[1]))
b_ho = theano.shared(floatX(np.zeros(yTest.shape[1])))
g_ho = theano.shared(floatX(np.ones((yTest.shape[1]))))
bb_ho = theano.shared(floatX(np.zeros((yTest.shape[1]))))

noise_h, noise_h2, noise_py_x = model(X, w_h, b_h, g_h, bb_h, 
                                      w_h2, b_h2, g_h2, bb_h2, 
                                      w_o, b_ho, g_ho, bb_ho, .0, .2)

h, h2, py_x = model(X, w_h, b_h, g_h, bb_h, 
                    w_h2, b_h2, g_h2, bb_h2, 
                    w_o, b_ho, g_ho, bb_ho, .0, .0)

y_x = T.argmax(py_x, axis=1)


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, b_h, g_h, bb_h, w_h2, b_h2, g_h2, bb_h2, 
          w_o, b_ho, g_ho, bb_ho]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)


for i in range(51):

    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):
        cost = train(xTrain[start:end], yTrain[start:end])
        
    xTrain, yTrain = shuffle(xTrain, yTrain)
    xTest, yTest   = shuffle(xTest, yTest)

    trr, tr = [], []
    for start, end in zip(range(0, len(xTrain), batch_size), range(batch_size, len(xTrain), batch_size)):        
        trr += [np.argmax(yTrain[start:end], axis=1) == predict(xTrain[start:end])]

    for start, end in zip(range(0, len(xTest), batch_size), range(batch_size, len(xTest), batch_size)):
        tr += [np.argmax(yTest[start:end], axis=1) == predict(xTest[start:end])]

    print "Round: %-5s Test: %-14s Train: %-8s" % (i, np.mean(tr), np.mean(trr))
    
print



Round: 0     Test: 0.533333333333 Train: 0.575850340136
Round: 1     Test: 0.572916666667 Train: 0.618537414966
Round: 2     Test: 0.605208333333 Train: 0.661904761905
Round: 3     Test: 0.638541666667 Train: 0.700170068027
Round: 4     Test: 0.652083333333 Train: 0.72619047619
Round: 5     Test: 0.685416666667 Train: 0.745408163265
Round: 6     Test: 0.690625       Train: 0.763945578231
Round: 7     Test: 0.692708333333 Train: 0.781292517007
Round: 8     Test: 0.691666666667 Train: 0.793367346939
Round: 9     Test: 0.7015625      Train: 0.800170068027
Round: 10    Test: 0.713541666667 Train: 0.815136054422
Round: 11    Test: 0.70625        Train: 0.822959183673
Round: 12    Test: 0.713541666667 Train: 0.825   
Round: 13    Test: 0.719791666667 Train: 0.839285714286
Round: 14    Test: 0.721875       Train: 0.843027210884
Round: 15    Test: 0.722395833333 Train: 0.840306122449
Round: 16    Test: 0.730729166667 Train: 0.851530612245
Round: 17    Test: 0.729166666667 Train: 0.859693877551