# NN Google word vectors
Build a basic nueral net using the google word embeddings

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
import gensim
import re

In [4]:
import sys
sys.path.append('../scripts')
import classifier_helpers as ch

## Helper function

In [5]:
class LemmaTokenizer(object):
    def __init__(self):
         self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [6]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    plt.ylim(ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(estimator, 
                                                            X, 
                                                            y, 
                                                            cv=cv, 
                                                            n_jobs=n_jobs, 
                                                            train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

## Build the dataset

In [7]:
categories = ['rec.motorcycles', 'rec.autos']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

In [8]:
wv = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz',
                                                    binary=True, limit=100000)

In [9]:
wv.doesnt_match(['wheel', 'car', 'wave'])

'wave'

## Neural net

In [10]:
import tensorflow as tf

In [11]:
w2v = dict(zip(wv.index2word, wv.vectors))
vectors = []
for email in newsgroups_train.data:
    email_vector = ch.average_vectors(email, w2v, vec_length=300)
    vectors.append(email_vector)

In [12]:
len(vectors)

1192

In [13]:
vectors = np.vstack(vectors)
vectors.shape

(1192, 300)

In [18]:
with tf.Session() as sess:
    var_input = tf.placeholder(dtype=tf.float32, shape=[None, 300], name='var_input')
    b = tf.zeros([2])
    W = tf.zeros([300, 2])
    product = tf.matmul(var_input, W) + b
    result = sess.run(product, feed_dict={var_input:vectors})
    print(result)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 ...
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [19]:
isess = tf.InteractiveSession()

In [24]:
var_input = tf.placeholder(dtype=tf.float32, shape=[None, 300], name='var_input')
b = tf.zeros([2])
W = tf.zeros([300, 2])
product = tf.matmul(var_input, W) + b
isess.run(tf.global_variables_initializer())

In [29]:
product.eval(feed_dict={var_input:vectors})

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]], dtype=float32)

In [125]:
# Define some placeholder variables
x_ = tf.placeholder(tf.float32, shape=[None, 300], name='input')
y_ = tf.placeholder(tf.float32, shape=[None, 2], name='output')

# Define the network computation
W = tf.Variable(tf.zeros([300, 2]))
b = tf.Variable(tf.zeros([2]))
yhat = tf.nn.softmax(tf.matmul(x_, W) + b)

# Define our loss function
mse_loss = tf.reduce_mean(tf.square(yhat - y_))

# Compute accuracy computation
correct_prediction = tf.equal(tf.argmax(yhat,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Set up the training method
train_step = tf.train.AdamOptimizer(0.1).minimize(mse_loss)
isess.run(tf.global_variables_initializer())

In [113]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(vectors, 
                                                     newsgroups_train.target, 
                                                     test_size=0.2, 
                                                     random_state=42)

ytr = np.vstack(((y_train), (1-y_train))).T
yte = np.vstack(((y_test), (1-y_test))).T
ytr

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [1, 0],
       [1, 0]])

In [109]:
y_train[-5:]

array([0, 0, 0, 1, 1])

In [134]:
accuracy.eval(feed_dict={x_: X_train, y_: ytr})

0.5057712

In [135]:
correct_prediction.eval(feed_dict={x_: X_train, y_: ytr})

array([False, False, False,  True, False,  True,  True, False, False,
       False, False, False, False,  True, False,  True, False,  True,
        True,  True, False, False, False,  True,  True,  True,  True,
       False,  True, False, False,  True, False, False,  True, False,
       False,  True,  True, False, False, False,  True, False, False,
       False,  True,  True, False, False, False,  True, False, False,
        True, False, False, False, False,  True,  True,  True, False,
        True, False,  True, False,  True, False, False, False, False,
        True, False,  True, False,  True, False,  True, False, False,
       False,  True,  True,  True, False,  True, False,  True, False,
       False,  True,  True, False, False,  True,  True,  True, False,
        True,  True, False,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True, False, False, False,
       False, False, False,  True, False,  True, False,  True,  True,
       False,  True,

In [136]:
# Train the model!
n_iters = 1500
for i in range(n_iters+1):
    # Run through an iteration of the training process
    train_step.run(feed_dict={x_: X_train, y_: ytr})
    
    # Compute the accuracy and loss
    if i % 100 == 0:
        current_loss = mse_loss.eval(feed_dict={x_: X_train, y_: ytr})
        current_acc  = accuracy.eval(feed_dict={x_: X_test, y_: yte})
        print('Train step: {}, Loss: {}, Accuracy: {}'.format(i, 
                                                                current_loss, 
                                                                current_acc * 100.))

Train step: 0, Loss: 0.27712225914001465, Accuracy: 48.53556454181671
Train step: 100, Loss: 0.22679166495800018, Accuracy: 64.43514823913574
Train step: 200, Loss: 0.22031888365745544, Accuracy: 64.85355496406555
Train step: 300, Loss: 0.21657274663448334, Accuracy: 66.10878705978394
Train step: 400, Loss: 0.213670015335083, Accuracy: 66.10878705978394
Train step: 500, Loss: 0.21140432357788086, Accuracy: 66.10878705978394
Train step: 600, Loss: 0.20961931347846985, Accuracy: 66.52719378471375
Train step: 700, Loss: 0.20813098549842834, Accuracy: 66.94560647010803
Train step: 800, Loss: 0.20686796307563782, Accuracy: 67.78242588043213
Train step: 900, Loss: 0.20582959055900574, Accuracy: 68.20083856582642
Train step: 1000, Loss: 0.20495343208312988, Accuracy: 67.36401915550232
Train step: 1100, Loss: 0.2041929066181183, Accuracy: 67.78242588043213
Train step: 1200, Loss: 0.20361687242984772, Accuracy: 67.36401915550232
Train step: 1300, Loss: 0.20298469066619873, Accuracy: 66.94560647

## Actual NN

In [144]:
# Now we are creating two weight matrices, one that contains the
# weights connecting the input units to the hidden units, and one
# connecting the hidden units to the output units
n_inputs = 300
n_hidden = 300
n_outputs = 2
W_input_to_hidden = tf.Variable(tf.truncated_normal([n_inputs, n_hidden], stddev=0.1))
W_hidden_to_output = tf.Variable(tf.truncated_normal([n_hidden, n_outputs], stddev=0.1))
b_hidden = tf.Variable(tf.constant(0.1, shape=[n_hidden]))
b_output = tf.Variable(tf.constant(0.1, shape=[n_outputs]))

# We now redefine the neural computation. I'm showing it here in
# two steps: one for each layer in the network
hidden_activation = tf.nn.sigmoid(tf.matmul(x_, W_input_to_hidden) + b_hidden)
yhat = tf.nn.softmax(tf.matmul(hidden_activation, W_hidden_to_output) + b_output)

############################
# The rest is the same...
mse_loss = tf.reduce_mean(tf.square(yhat - y_))
correct_prediction = tf.equal(tf.argmax(yhat,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
train_step = tf.train.AdagradOptimizer(0.1).minimize(mse_loss)
isess.run(tf.initialize_all_variables())
n_iters = 1500
for i in range(n_iters+1):
    train_step.run(feed_dict={x_: X_train, y_: ytr})
    if i % 100 == 0:
        current_loss = mse_loss.eval(feed_dict={x_: X_train, y_: ytr})
        current_acc  = accuracy.eval(feed_dict={x_: X_train, y_: ytr})
        print('Train step: %d, Loss: %.3f, Accuracy: %.3f%%' % (i, 
                                                                current_loss, 
                                                                current_acc * 100.))

Train step: 0, Loss: 0.431, Accuracy: 50.577%
Train step: 100, Loss: 0.281, Accuracy: 50.577%
Train step: 200, Loss: 0.326, Accuracy: 50.577%
Train step: 300, Loss: 0.289, Accuracy: 50.577%
Train step: 400, Loss: 0.271, Accuracy: 50.577%
Train step: 500, Loss: 0.261, Accuracy: 50.577%
Train step: 600, Loss: 0.256, Accuracy: 50.577%
Train step: 700, Loss: 0.253, Accuracy: 50.577%
Train step: 800, Loss: 0.251, Accuracy: 50.577%
Train step: 900, Loss: 0.250, Accuracy: 50.577%
Train step: 1000, Loss: 0.250, Accuracy: 50.577%
Train step: 1100, Loss: 0.250, Accuracy: 51.731%
Train step: 1200, Loss: 0.250, Accuracy: 51.522%
Train step: 1300, Loss: 0.249, Accuracy: 51.731%
Train step: 1400, Loss: 0.249, Accuracy: 51.836%
Train step: 1500, Loss: 0.249, Accuracy: 51.836%
