## Neural Network

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
import sklearn.metrics as sk

import pandas as pd
from collections import Counter
import numpy as np
import nltk

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [3]:
# Load the pickled card DataFrame and count how many cards carry each colour label.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
modern = pd.read_pickle('data/5color_modern_no_name_hardmode.pkl')
Counter(modern['colors'])

Counter({u'Black': 1576,
         u'Blue': 1573,
         u'Green': 1566,
         u'Red': 1575,
         u'White': 1584})

After all the data munging the classes are still amazingly balanced.

## Let's single out blue and red for a binary classification


In [12]:
# Keep only the Blue and Red cards for the binary task.
# NOTE: despite its name, `UG` holds Blue/Red cards; the name is kept because
# later cells reference it.
# reset_index(drop=True) renumbers the rows from 0 on a fresh frame, instead of
# mutating the filtered slice in place and then popping the leftover 'index' column.
UG = modern.loc[modern['colors'].isin(['Blue', 'Red'])].reset_index(drop=True)

# Eyeball a few random rows.
UG[['name', 'colors', 'cmc', 'text']].sample(6)

Unnamed: 0,name,colors,cmc,text
1801,Volcanic Strength,Red,2.0,Enchant creature Enchanted creature gets +2/+2...
341,Unnatural Speed,Red,1.0,Target creature gains haste until end of turn.
1966,Fling,Red,2.0,"As an additional cost to cast This, sacrifice ..."
1435,Master Transmuter,Blue,4.0,"{1}, Tap , Return an artifact you control to i..."
200,Vedalken Engineer,Blue,2.0,Tap : Add two mana of any one color to your ma...
275,Goka the Unjust,Red,4.0,Tap : This deals 4 damage to target creature t...


In [64]:
# One-hot encode the colour label: one indicator column per colour present in UG.
dummies = pd.get_dummies(UG['colors'])
dummies.head()

Unnamed: 0,Blue,Red
0,1.0,0.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [65]:
vectorizer = CountVectorizer()

vec_X = vectorizer.fit_transform(UG['text'])

xTrain, xTest, yTrain, yTest = train_test_split(vec_X, dummies,
                                             random_state=42)

xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

# xTrain = xTrain.reshape(-1, 1, 1, 815)
# xTest = xTest.reshape(-1, 1, 1, 815)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

print "There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_))

(2361L, 815L)
(2361L, 2L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(2361L, 815L)
(2361L, 2L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
There are 815 words in the vocabulary.


In [69]:
import theano
from theano import tensor as T
import numpy as np
from math import sqrt
# import matplotlib.pyplot as plt
# import seaborn
# %matplotlib inline

def floatX(X):
    """Return X as a NumPy array cast to Theano's configured float dtype."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape):
    """Create a Theano shared weight matrix filled with small zero-centred values.

    shape -- (rows, cols) of the weight matrix.

    Samples are drawn uniformly from [-0.5, 0.5) and multiplied by
    2 * sqrt(6) / sqrt(rows + cols) * 0.2, i.e. a Glorot-style uniform range
    shrunk by a factor of 0.2.
    """
    (fan_in, fan_out) = shape
    # factors: correct for uni[0,1], glo, glo, softmax deriv
    scale = 2.0 * sqrt(6) / sqrt(fan_in + fan_out) * .2
    centred = np.random.random_sample(shape) - 0.5
    return theano.shared(floatX(centred * scale))
    
def model(X, w):
    """Single-layer softmax classifier: softmax applied to the product X . w."""
    logits = T.dot(X, w)
    return T.nnet.softmax(logits)

def adaDelta(cost, params, eta=0.2, rho=0.9, epsilon=1e-6):
    """Build AdaDelta-style update pairs for a Theano training function.

    cost    -- symbolic scalar to minimise.
    params  -- list of shared variables to update.
    eta     -- extra multiplier applied to every step.
    rho     -- decay rate of the two running averages.
    epsilon -- small constant added inside the square roots.

    Returns a list of (shared_variable, new_value) pairs suitable for the
    `updates` argument of theano.function.
    """
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        # running averages of squared gradients and squared steps, both start at zero
        gSq = theano.shared(p.get_value() * 0.)
        dwSq = theano.shared(p.get_value() * 0.)

        # exponentially smoothed squared gradient
        gSqNew = rho * gSq + (1 - rho) * g * g

        # AdaDelta divides by the RMS of the gradient *including* the current
        # one, so the refreshed average gSqNew is used here; the previous code
        # divided by the stale gSq, which is all zeros on the very first step.
        dw = eta * T.sqrt(dwSq + epsilon) * g / T.sqrt(gSqNew + epsilon)
        dwSqNew = rho * dwSq + (1 - rho) * dw * dw

        updates.append((dwSq, dwSqNew))
        updates.append((gSq, gSqNew))
        updates.append((p, p - dw))
    return updates

X = T.fmatrix()
Y = T.fmatrix()
# grad_list = theano.shared(np.array([0,0]), name='grad_list')

w = init_weights((815 , 2))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.1]]

train = theano.function(inputs=[X, Y], 
                        outputs=[cost, gradient], 
                        updates=update, 
                        allow_input_downcast=True)

predict = theano.function(inputs=[X], 
                          outputs=y_pred, 
                          allow_input_downcast=True)


for i in range(401):
# #     for start, end in zip(range(0, xTrain.shape[0], 128), 
# #                           range(128, xTrain.shape[0], 128)):
# #         cost, gradient = train(xTrain[start:end], yTrain[start:end])
    cost, gradient = train(xTrain, yTrain)
    if i % 30 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr


Round: 0  Test: 0.651842439644  Train: 0.629394324439
Round: 30  Test: 0.810673443456  Train: 0.831427361288
Round: 60  Test: 0.833545108005  Train: 0.849216433715
Round: 90  Test: 0.853875476493  Train: 0.861075815332
Round: 120  Test: 0.852604828463  Train: 0.866581956798
Round: 150  Test: 0.855146124524  Train: 0.873358746294
Round: 180  Test: 0.861499364676  Train: 0.879288437103
Round: 210  Test: 0.864040660737  Train: 0.885218127912
Round: 240  Test: 0.870393900889  Train: 0.889030072003
Round: 270  Test: 0.866581956798  Train: 0.892842016095
Round: 300  Test: 0.866581956798  Train: 0.894536213469
Round: 330  Test: 0.869123252859  Train: 0.894112664125
Round: 360  Test: 0.87166454892  Train: 0.895383312156
Round: 390  Test: 0.87166454892  Train: 0.895806861499


### All Five Vs All Five

And now the main event. Five way classification of all the colors.

In [76]:
vectorizer = CountVectorizer()

y = pd.get_dummies(modern.colors)

X = vectorizer.fit_transform(modern.text)

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state=42)

xTrain = np.asarray(xTrain.todense())
xTest  = np.asarray(xTest.todense())
yTrain = np.asarray(yTrain)
yTest  = np.asarray(yTest)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

# xTrain = xTrain.reshape(-1, 1, 1, 815)
# xTest = xTest.reshape(-1, 1, 1, 815)

print xTrain.shape
print yTrain.shape
print type(xTrain)
print type(yTrain)

print "There are {:,} words in the vocabulary.".format(len(vectorizer.vocabulary_))

(5905L, 1161L)
(5905L, 5L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
(5905L, 1161L)
(5905L, 5L)
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
There are 1,161 words in the vocabulary.


In [81]:
%%time

import theano
from theano import tensor as T
import numpy as np
from math import sqrt
# import matplotlib.pyplot as plt
# import seaborn
# %matplotlib inline

def floatX(X):
    """Return X as a NumPy array cast to Theano's configured float dtype."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape):
    """Create a Theano shared weight matrix filled with small zero-centred values.

    shape -- (rows, cols) of the weight matrix.

    Samples are drawn uniformly from [-0.5, 0.5) and multiplied by
    2 * sqrt(6) / sqrt(rows + cols) * 0.2, i.e. a Glorot-style uniform range
    shrunk by a factor of 0.2.
    """
    (fan_in, fan_out) = shape
    # factors: correct for uni[0,1], glo, glo, softmax deriv
    scale = 2.0 * sqrt(6) / sqrt(fan_in + fan_out) * .2
    centred = np.random.random_sample(shape) - 0.5
    return theano.shared(floatX(centred * scale))
    
def model(X, w):
    """Single-layer softmax classifier: softmax applied to the product X . w."""
    logits = T.dot(X, w)
    return T.nnet.softmax(logits)

def adaDelta(cost, params, eta=0.2, rho=0.9, epsilon=1e-6):
    """Build AdaDelta-style update pairs for a Theano training function.

    cost    -- symbolic scalar to minimise.
    params  -- list of shared variables to update.
    eta     -- extra multiplier applied to every step.
    rho     -- decay rate of the two running averages.
    epsilon -- small constant added inside the square roots.

    Returns a list of (shared_variable, new_value) pairs suitable for the
    `updates` argument of theano.function.
    """
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        # running averages of squared gradients and squared steps, both start at zero
        gSq = theano.shared(p.get_value() * 0.)
        dwSq = theano.shared(p.get_value() * 0.)

        # exponentially smoothed squared gradient
        gSqNew = rho * gSq + (1 - rho) * g * g

        # AdaDelta divides by the RMS of the gradient *including* the current
        # one, so the refreshed average gSqNew is used here; the previous code
        # divided by the stale gSq, which is all zeros on the very first step.
        dw = eta * T.sqrt(dwSq + epsilon) * g / T.sqrt(gSqNew + epsilon)
        dwSqNew = rho * dwSq + (1 - rho) * dw * dw

        updates.append((dwSq, dwSqNew))
        updates.append((gSq, gSqNew))
        updates.append((p, p - dw))
    return updates

X = T.fmatrix()
Y = T.fmatrix()

w = init_weights((len(vectorizer.vocabulary_) , yTest.shape[1]))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.1]]

train = theano.function(inputs=[X, Y], 
                        outputs=[cost, gradient], 
                        updates=update, 
                        allow_input_downcast=True)

predict = theano.function(inputs=[X], 
                          outputs=y_pred, 
                          allow_input_downcast=True)


for i in range(10001):
# #     for start, end in zip(range(0, xTrain.shape[0], 128), 
# #                           range(128, xTrain.shape[0], 128)):
# #         cost, gradient = train(xTrain[start:end], yTrain[start:end])
    cost, gradient = train(xTrain, yTrain)
    if i % 500 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr


Round: 0  Test: 0.292026409345  Train: 0.27366638442
Round: 500  Test: 0.601320467242  Train: 0.633192209992
Round: 1000  Test: 0.627729812087  Train: 0.667739204064
Round: 1500  Test: 0.641442356526  Train: 0.686875529213
Round: 2000  Test: 0.655662772981  Train: 0.700423370025
Round: 2500  Test: 0.66124936516  Train: 0.709060118544
Round: 3000  Test: 0.664296597257  Train: 0.716680779001
Round: 3500  Test: 0.676485525648  Train: 0.724132091448
Round: 4000  Test: 0.682579989843  Train: 0.728704487722
Round: 4500  Test: 0.687150837989  Train: 0.732938187976
Round: 5000  Test: 0.688674454038  Train: 0.737849280271
Round: 5500  Test: 0.688674454038  Train: 0.739881456393
Round: 6000  Test: 0.686642965973  Train: 0.741913632515
Round: 6500  Test: 0.691721686135  Train: 0.744623200677
Round: 7000  Test: 0.694261046216  Train: 0.746655376799
Round: 7500  Test: 0.695784662265  Train: 0.749364944962
Round: 8000  Test: 0.695784662265  Train: 0.750381033023
Round: 8500  Test: 0.694768918233  Tr

Amazing results, almost equal to logistic regression at 5min. 

### 4-Layer Network

In [87]:
%%time

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np

# Shared symbolic random stream; dropout() draws its binomial keep/drop masks from it.
srng = RandomStreams()

def floatX(X):
    """Return X as a NumPy array cast to Theano's configured float dtype."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape):
    """Create a Theano shared array of the given shape from N(0, 1) samples scaled by 0.01."""
    gaussian = np.random.randn(*shape)
    return theano.shared(floatX(gaussian * 0.01))

def rectify(X, alpha=1.0):
    """Leaky rectifier: element-wise maximum of X and 0.05 * X.

    `alpha` is accepted but not used by the active implementation; it only
    appears in the disabled ELU alternative listed below.
    """
    # Alternatives that were tried and are currently disabled:
    #   plain ReLU: T.maximum(X, 0.)
    #   ELU:        T.switch(X > 0, X, alpha * (T.exp(X) - 1))
    leak = 0.05 * X
    return T.maximum(X, leak)

def softmax(X):
    """Row-wise softmax; each row's maximum is subtracted before exponentiating."""
    row_max = X.max(axis=1).dimshuffle(0, 'x')
    exps = T.exp(X - row_max)
    row_total = exps.sum(axis=1).dimshuffle(0, 'x')
    return exps / row_total

def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    """Build RMSprop update pairs for a Theano training function.

    cost    -- symbolic scalar to minimise.
    params  -- list of shared variables to update.
    lr      -- step size.
    rho     -- decay rate of the running average of squared gradients.
    epsilon -- small constant added inside the square root.

    Returns a list of (shared_variable, new_value) pairs.
    """
    updates = []
    for param, grad in zip(params, T.grad(cost=cost, wrt=params)):
        # running average of the squared gradient, starts at zero
        avg_sq = theano.shared(param.get_value() * 0.)
        avg_sq_next = rho * avg_sq + (1 - rho) * grad ** 2
        # divide the gradient by the root of the refreshed average
        scaled_grad = grad / T.sqrt(avg_sq_next + epsilon)
        updates.extend([
            (avg_sq, avg_sq_next),
            (param, param - lr * scaled_grad),
        ])
    return updates

def dropout(X, p=0.):
    """Zero each element of X with probability p and rescale the rest by 1 / (1 - p).

    Returns X unchanged when p is not greater than 0.
    """
    if not p > 0:
        return X
    keep_prob = 1 - p
    mask = srng.binomial(X.shape, p=keep_prob, dtype=theano.config.floatX)
    return X * mask / keep_prob

def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    """Two hidden rectifier layers followed by a softmax output, with dropout.

    Dropout with rate p_drop_input is applied to the input and with rate
    p_drop_hidden to each hidden activation. Returns the two hidden
    activations (after their dropout) and the class probabilities.
    """
    first = dropout(rectify(T.dot(dropout(X, p_drop_input), w_h)), p_drop_hidden)
    second = dropout(rectify(T.dot(first, w_h2)), p_drop_hidden)
    py_x = softmax(T.dot(second, w_o))
    return first, second, py_x

X = T.fmatrix()
Y = T.fmatrix()

# w = init_weights((len(vectorizer.vocabulary_) , yTest.shape[1])) # old 
w_h = init_weights((len(vectorizer.vocabulary_), 600))
w_h2 = init_weights((600, 600))
w_o = init_weights((600, yTest.shape[1]))

noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.2, 0.5)
h, h2, py_x = model(X, w_h, w_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, w_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(401):
    for start, end in zip(range(0, len(xTrain), 128), range(128, len(xTrain), 128)):
        cost = train(xTrain[start:end], yTrain[start:end])
    if i%10 == 0: 
        tr = np.mean(np.argmax(yTest, axis=1) == predict(xTest))
        trr =  np.mean(np.argmax(yTrain, axis=1) == predict(xTrain))
        print 'Round:', i," Test:", tr, ' Train:', trr

Round: 0  Test: 0.27780599289  Train: 0.302116850127
Round: 10  Test: 0.280853224987  Train: 0.287552921253
Round: 20  Test: 0.46063991874  Train: 0.49483488569
Round: 30  Test: 0.602336211275  Train: 0.63200677392
Round: 40  Test: 0.653631284916  Train: 0.690770533446
Round: 50  Test: 0.672930421534  Train: 0.734462320068
Round: 60  Test: 0.691213814119  Train: 0.759017781541
Round: 70  Test: 0.694261046216  Train: 0.774259102456
Round: 80  Test: 0.699847638395  Train: 0.788653683319
Round: 90  Test: 0.710512950736  Train: 0.805927180356
Round: 100  Test: 0.723209751143  Train: 0.818797629128
Round: 110  Test: 0.723717623159  Train: 0.832345469941
Round: 120  Test: 0.729304215338  Train: 0.847417442845
Round: 130  Test: 0.725241239208  Train: 0.853852667231
Round: 140  Test: 0.727272727273  Train: 0.86316680779
Round: 150  Test: 0.733875063484  Train: 0.872650296359
Round: 160  Test: 0.723717623159  Train: 0.878577476715
Round: 170  Test: 0.732351447435  Train: 0.885012701101
Round: 1

#### Strong signs of overfitting

Next steps:  

Leaky RELU swapped out for ELU, alpha = 1.0   
Train accuracy dropped by 14% (good)  
Test accuracy dropped by 3% (bad)   

Leaky RELU swapped out for ELU, alpha = .5  
Train accuracy dropped by 7% (good)  
Test accuracy dropped by 1% (bad)   