# NN Google word vectors
Build a basic neural net using the Google (GoogleNews word2vec) word embeddings.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
import gensim
import re

In [4]:
import sys
sys.path.append('../scripts')
import classifier_helpers as ch

## Helper functions

In [5]:
class LemmaTokenizer(object):
    """Callable that splits a document into tokens and lemmatizes each one.

    NOTE(review): ``WordNetLemmatizer`` and ``word_tokenize`` are not imported
    in any of the visible notebook cells (they look like NLTK names) --
    confirm they are in scope before instantiating this class.
    """

    def __init__(self):
        # A single lemmatizer is created here and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``."""
        tokens = word_tokenize(doc)
        return [self.wnl.lemmatize(token) for token in tokens]

In [6]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : object
        Model forwarded unchanged to ``learning_curve``.
    title : str
        Title of the figure.
    X, y : array-like
        Data and targets forwarded unchanged to ``learning_curve``.
    ylim : optional
        Value handed to ``plt.ylim``; the default ``None`` is passed through
        as-is.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``), so the caller can keep
        customising or call ``.show()`` on the result.
    """
    # Imported locally: none of the visible notebook cells import
    # ``learning_curve``, so without this the call below is a NameError.
    from sklearn.model_selection import learning_curve

    plt.figure()
    plt.title(title)
    plt.ylim(ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Collapse the second axis of the score arrays into a mean and a spread
    # per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands: mean +/- one standard deviation (red = training,
    # green = cross-validation).
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")

    # Solid lines with markers: the mean scores themselves.
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

## Build the dataset

In [7]:
# Two-class problem: only the motorcycle and car newsgroups are fetched.
categories = ['rec.motorcycles', 'rec.autos']

# Fetch the train and test subsets with identical settings; headers, footers
# and quoted replies are removed from every post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset,
                       categories=categories,
                       remove=('headers', 'footers', 'quotes'))
    for subset in ('train', 'test')
)

In [10]:
# Ratio of the training-set size to 98 documents; presumably used to pick the
# 1/12 subsample fraction in the split below -- TODO confirm where 98 comes from.
len(newsgroups_train.data) / 98

12.16326530612245

In [12]:
from sklearn.model_selection import train_test_split

In [14]:
# train_test_split is used here only to draw a reproducible random subsample:
# the 1/12 "test" part is kept as the working data/target, the rest is
# discarded.
throwaway_data, data, throwaway_target, target = train_test_split(
    newsgroups_train.data,
    newsgroups_train.target,
    test_size=1/12,
    random_state=42,
)

In [15]:
# Load the pretrained GoogleNews word2vec vectors (binary format), reading at
# most 100000 word vectors from the file.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
    limit=100000,
)

In [16]:
# Quick sanity check of the loaded embeddings: ask which word does not belong
# with the other two.
wv.doesnt_match(['wheel', 'car', 'wave'])

'wave'

## Neural net

In [17]:
import tensorflow as tf

In [20]:
# Word -> embedding lookup built from the loaded KeyedVectors.
w2v = {word: vector for word, vector in zip(wv.index2word, wv.vectors)}

# One 300-long vector per document, produced by the project helper
# ch.average_vectors (presumably the mean of the document's word vectors --
# confirm in classifier_helpers).
vectors = [ch.average_vectors(email, w2v, vec_length=300) for email in data]

In [21]:
# Number of document vectors built (one per entry in `data`).
len(vectors)

100

In [22]:
# Stack the per-document vectors into one 2-D array (one row per document)
# and display its shape.
vectors = np.vstack(vectors)
vectors.shape

(100, 300)

In [23]:
# Smoke test of the TensorFlow plumbing: push the document vectors through a
# linear layer inside a short-lived session.
with tf.Session() as sess:
    # Input: any number of rows, 300 features each.
    var_input = tf.placeholder(dtype=tf.float32, shape=[None, 300], name='var_input')
    # W and b are constant zero tensors (tf.zeros, not tf.Variable), so the
    # product below is all zeros whatever the input is.
    b = tf.zeros([2])
    W = tf.zeros([300, 2])
    product = tf.matmul(var_input, W) + b
    result = sess.run(product, feed_dict={var_input:vectors})
    print(result)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [24]:
# Open an interactive session; the cells below call .eval() / .run() on
# tensors and ops without passing a session explicitly.
isess = tf.InteractiveSession()

In [25]:
# Same linear layer as in the tf.Session cell, rebuilt at top level so it can
# be evaluated through the interactive session.
var_input = tf.placeholder(dtype=tf.float32, shape=[None, 300], name='var_input')
# W and b are still constant zero tensors (tf.zeros), not trainable variables.
b = tf.zeros([2])
W = tf.zeros([300, 2])
product = tf.matmul(var_input, W) + b
# NOTE(review): this cell creates no tf.Variable, so the initializer has
# nothing from this cell to initialise -- confirm whether it is needed here.
isess.run(tf.global_variables_initializer())

In [26]:
# Evaluate the linear layer on all document vectors (no session argument is
# passed; presumably the interactive session opened above is used).
product.eval(feed_dict={var_input:vectors})

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.

In [27]:
# Placeholders for a batch of inputs (300 features per row) and the matching
# two-column targets.
x_ = tf.placeholder(tf.float32, shape=[None, 300], name='input')
y_ = tf.placeholder(tf.float32, shape=[None, 2], name='output')

# Define the network computation: a single linear layer followed by softmax.
# W and b are now trainable tf.Variable objects, both starting at zero.
W = tf.Variable(tf.zeros([300, 2]))
b = tf.Variable(tf.zeros([2]))
yhat = tf.nn.softmax(tf.matmul(x_, W) + b)

# Define our loss function: mean squared error between the softmax outputs
# and the targets (note: MSE, not cross-entropy).
mse_loss = tf.reduce_mean(tf.square(yhat - y_))

# Accuracy: fraction of rows where the arg-max of the prediction matches the
# arg-max of the target.
correct_prediction = tf.equal(tf.argmax(yhat,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Set up the training method (Adam, learning rate 0.1, minimising the MSE)
# and initialise the variables.
train_step = tf.train.AdamOptimizer(0.1).minimize(mse_loss)
isess.run(tf.global_variables_initializer())

In [28]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the document vectors for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    vectors, target, test_size=0.2, random_state=42)

# Two-column targets for the 2-unit output layer: column 0 is the original
# 0/1 label, column 1 is its complement.
ytr = np.column_stack((y_train, 1 - y_train))
yte = np.column_stack((y_test, 1 - y_test))
ytr

array([[1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1,

In [29]:
# Accuracy on the training split before any training step has been run
# (W and b are still at their zero initial values).
accuracy.eval(feed_dict={x_: X_train, y_: ytr})

0.4875

In [30]:
# Per-row view of the same check: True where the predicted arg-max equals the
# target arg-max.
correct_prediction.eval(feed_dict={x_: X_train, y_: ytr})

array([ True,  True,  True, False,  True, False, False,  True,  True,
       False, False, False,  True, False, False, False,  True, False,
       False, False,  True, False,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True,  True, False,  True,
       False,  True,  True, False, False, False, False, False, False,
        True, False,  True,  True, False,  True, False, False,  True,
        True, False, False, False,  True,  True, False,  True, False,
        True,  True, False,  True, False,  True,  True,  True, False,
       False, False,  True,  True, False, False, False,  True])

In [31]:
# Train the model!
n_iters = 1500

# The feeds never change, so build them once outside the loop.
train_feed = {x_: X_train, y_: ytr}
test_feed = {x_: X_test, y_: yte}

for i in range(n_iters + 1):
    # One optimiser step on the full training split.
    train_step.run(feed_dict=train_feed)

    # Only report every 100th step.
    if i % 100 != 0:
        continue

    # Loss and accuracy are measured on the held-out split, even though the
    # message is labelled "Train step".
    current_loss = mse_loss.eval(feed_dict=test_feed)
    current_acc = accuracy.eval(feed_dict=test_feed)
    print('Train step: {}, Loss: {}, Accuracy: {}'.format(
        i, current_loss, current_acc * 100.))

Train step: 0, Loss: 0.30002403259277344, Accuracy: 40.00000059604645
Train step: 100, Loss: 0.13190464675426483, Accuracy: 80.0000011920929
Train step: 200, Loss: 0.11605868488550186, Accuracy: 75.0
Train step: 300, Loss: 0.11584951728582382, Accuracy: 75.0
Train step: 400, Loss: 0.12045522034168243, Accuracy: 75.0
Train step: 500, Loss: 0.12665247917175293, Accuracy: 80.0000011920929
Train step: 600, Loss: 0.13285115361213684, Accuracy: 80.0000011920929
Train step: 700, Loss: 0.1384759247303009, Accuracy: 80.0000011920929
Train step: 800, Loss: 0.1434277594089508, Accuracy: 80.0000011920929
Train step: 900, Loss: 0.14775830507278442, Accuracy: 69.9999988079071
Train step: 1000, Loss: 0.15155087411403656, Accuracy: 69.9999988079071
Train step: 1100, Loss: 0.15491712093353271, Accuracy: 69.9999988079071
Train step: 1200, Loss: 0.15805566310882568, Accuracy: 69.9999988079071
Train step: 1300, Loss: 0.16127391159534454, Accuracy: 69.9999988079071
Train step: 1400, Loss: 0.164878129959106

## Actual NN

In [32]:
# Now we are creating two weight matrices, one that contains the
# weights connecting the input units to the hidden units, and one
# connecting the hidden units to the output units
n_inputs = 300
n_hidden = 300
n_outputs = 2
# Weights start as small truncated-normal noise, biases at a constant 0.1.
W_input_to_hidden = tf.Variable(tf.truncated_normal([n_inputs, n_hidden], stddev=0.1))
W_hidden_to_output = tf.Variable(tf.truncated_normal([n_hidden, n_outputs], stddev=0.1))
b_hidden = tf.Variable(tf.constant(0.1, shape=[n_hidden]))
b_output = tf.Variable(tf.constant(0.1, shape=[n_outputs]))

# We now redefine the neural computation. I'm showing it here in
# two steps: one for each layer in the network
# (sigmoid hidden layer, then softmax output layer).
hidden_activation = tf.nn.sigmoid(tf.matmul(x_, W_input_to_hidden) + b_hidden)
yhat = tf.nn.softmax(tf.matmul(hidden_activation, W_hidden_to_output) + b_output)

############################
# The rest is the same as for the single-layer model, except that the
# optimiser is Adagrad and the run is ten times longer.
mse_loss = tf.reduce_mean(tf.square(yhat - y_))
correct_prediction = tf.equal(tf.argmax(yhat,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
train_step = tf.train.AdagradOptimizer(0.1).minimize(mse_loss)
# tf.initialize_all_variables() is deprecated (TensorFlow itself prints
# "Use `tf.global_variables_initializer` instead"); use the replacement,
# which is also what the earlier cells of this notebook call.
isess.run(tf.global_variables_initializer())
n_iters = 15000
for i in range(n_iters+1):
    # One optimiser step on the full training split.
    train_step.run(feed_dict={x_: X_train, y_: ytr})
    if i % 100 == 0:
        # Reported loss/accuracy are measured on the held-out split.
        current_loss = mse_loss.eval(feed_dict={x_: X_test, y_: yte})
        current_acc  = accuracy.eval(feed_dict={x_: X_test, y_: yte})
        print('Train step: %d, Loss: %.3f, Accuracy: %.3f%%' % (i,
                                                                current_loss,
                                                                current_acc * 100.))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Train step: 0, Loss: 0.569, Accuracy: 40.000%
Train step: 100, Loss: 0.258, Accuracy: 60.000%
Train step: 200, Loss: 0.274, Accuracy: 60.000%
Train step: 300, Loss: 0.251, Accuracy: 60.000%
Train step: 400, Loss: 0.242, Accuracy: 60.000%
Train step: 500, Loss: 0.239, Accuracy: 60.000%
Train step: 600, Loss: 0.238, Accuracy: 60.000%
Train step: 700, Loss: 0.239, Accuracy: 60.000%
Train step: 800, Loss: 0.240, Accuracy: 60.000%
Train step: 900, Loss: 0.241, Accuracy: 60.000%
Train step: 1000, Loss: 0.241, Accuracy: 60.000%
Train step: 1100, Loss: 0.242, Accuracy: 65.000%
Train step: 1200, Loss: 0.242, Accuracy: 65.000%
Train step: 1300, Loss: 0.242, Accuracy: 65.000%
Train step: 1400, Loss: 0.241, Accuracy: 75.000%
Train step: 1500, Loss: 0.241, Accuracy: 75.000%
Train step: 1600, Loss: 0.240, Accuracy: 75.000%
Train step: 1700, Loss: 0.238, Accuracy: 75.000%
Train step: 1800, Loss: 0.237, Accuracy: 70.000%
Train s