# Jonathan Halverson
# Wednesday, January 17, 2018
# Melville versus Austen: With word2vec embeddings

Here we train an RNN to classify sentences as written by either Herman Melville or Jane Austen. We have roughly 12,500 sentences between the two authors (5,247 by Melville and 7,298 by Austen).

In [1]:
import nltk
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')

In [2]:
import re
from collections import Counter

### Load the books

In [3]:
# Tokenised texts from NLTK's Gutenberg corpus: one Melville novel and three
# Austen novels, each materialised as a plain list of word/punctuation tokens.
melville_raw = list(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
austen_raw = list(nltk.corpus.gutenberg.words('austen-sense.txt'))
austen_raw2 = list(nltk.corpus.gutenberg.words('austen-persuasion.txt'))
austen_raw3 = list(nltk.corpus.gutenberg.words('austen-emma.txt'))

In [4]:
# Keep each book from the first occurrence of a chosen token onwards
# (presumably to skip title/front matter -- confirm against the corpus files).
# list.index raises ValueError if the token is not present.
melville = melville_raw[melville_raw.index('Loomings'):]
austen = austen_raw[austen_raw.index('The'):]
austen2 = austen_raw2[austen_raw2.index('Sir'):]
austen3 = austen_raw3[austen_raw3.index('I'):]

In [5]:
# Second Melville text, read from a local file rather than from NLTK.
# Python 2 idiom: decode the bytes as UTF-8, drop every non-ASCII character,
# flatten newlines to spaces and split on whitespace to get a token list.
with open('melville_pierre.txt') as f:
     melville2 = f.read().decode('utf-8').encode('ascii', 'ignore').replace('\n', ' ').split()

### Load the word embeddings as generated by word2vec

In [6]:
# Read the whole word2vec text dump into memory, one list entry per line.
with open('word2vec-master/vectors.text', 'r') as f:
     lines = f.readlines()

In [7]:
# Parse the word2vec text dump into {word: vector}.  The first line is a
# header and is skipped here; every following line has the form
# "<word> <v1> <v2> ...".
word_vectors = {}
for line in lines[1:]:
    word_nums = line.split()
    if not word_nums:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
    word = word_nums[0]
    # Build the list explicitly: under Python 3 `map` returns an iterator and
    # np.array(map(...)) would produce a 0-d object array instead of a vector.
    word_vectors[word] = np.array([float(value) for value in word_nums[1:]])

In [8]:
# The second field of the header line is taken as the vector dimensionality.
embedding_size = int(lines[0].split()[1])
embedding_size

200

### Functions to prepare the data

In [9]:
def make_sentences(x):
    """Turn a list of tokens into a list of lower-cased, letters-only sentences.

    The tokens are joined with single spaces, the titles Mr/Mrs/Ms/Dr followed
    by a separate '.' token lose that period (so it does not end a sentence),
    newlines become spaces, '?' and '!' are treated as '.', and the word
    'CHAPTER' is blanked out.  The text is then split on '.', every
    non-letter character is replaced by a space, and each piece is
    lower-cased and split on whitespace.

    Returns a list of word lists; pieces without any letters yield empty lists.
    """
    text = ' '.join(x)
    # Order matters: title periods must be removed before splitting on '.'.
    substitutions = (
        ('Mr .', 'Mr'),
        ('Mrs .', 'Mrs'),
        ('Ms .', 'Ms'),
        ('Dr .', 'Dr'),
        ('\n', ' '),
        ('?', '.'),
        ('!', '.'),
        ('CHAPTER', ' '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    non_letters = re.compile("[^a-zA-Z]")
    return [non_letters.sub(" ", piece).lower().split() for piece in text.split('.')]

In [10]:
def remove_single_letters_except_ia(sentences):
    """Drop every word shorter than two characters except 'a' and 'i'.

    `sentences` is a list of word lists; a new list of word lists is returned
    with the same number of sentences (a sentence may become empty).
    """
    allowed_singles = ('a', 'i')
    return [
        [token for token in sentence if len(token) > 1 or token in allowed_singles]
        for sentence in sentences
    ]

In [11]:
def remove_short_and_long_sentences(sentences, low, high):
    """Return only the sentences whose word count lies in [low, high] (inclusive)."""
    return [sentence for sentence in sentences if low <= len(sentence) <= high]

In [12]:
def replace_missing_words_with_UNK(sentences, missing):
    """Replace every word found in `missing` with the placeholder token '</s>'.

    `sentences` is a list of word lists and `missing` any container that
    supports `in`.  A new list of word lists is returned; the input is not
    modified.
    """
    return [
        ['</s>' if token in missing else token for token in sentence]
        for sentence in sentences
    ]

In [13]:
def replace_word_with_index_and_zero_pad(sentences, dictionary, high):
    """Encode each sentence as a list of word indices, right-padded with 0.

    Every word is looked up in `dictionary`; a word that is not a key raises
    KeyError (out-of-vocabulary words are not handled here).  Sentences
    shorter than `high` are padded with zeros up to length `high`; longer
    sentences are returned unpadded and untruncated.
    """
    encoded = []
    for sentence in sentences:
        indices = [dictionary[token] for token in sentence]
        # A negative multiplier yields an empty list, so long sentences pass through.
        indices.extend([0] * (high - len(indices)))
        encoded.append(indices)
    return encoded

### Prepare the data

In [14]:
# s1 collects the cleaned sentences of both Melville texts, s2 those of the
# three Austen novels; the per-book order is preserved.
s1 = [sentence
      for book in (melville, melville2)
      for sentence in remove_single_letters_except_ia(make_sentences(book))]
s2 = [sentence
      for book in (austen, austen2, austen3)
      for sentence in remove_single_letters_except_ia(make_sentences(book))]

In [15]:
# Keep only sentences of 5 to upper_bound words (inclusive) for both authors.
upper_bound = 15
s1, s2 = (remove_short_and_long_sentences(s1, 5, upper_bound),
          remove_short_and_long_sentences(s2, 5, upper_bound))

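Words that are missing from the word2vec vocabulary are replaced with the `</s>` token (used here as the unknown-word token) in the cells below. First, the number of sentences per author (Melville, then Austen):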

In [16]:
# Number of retained sentences: s1 (Melville) and s2 (Austen).
print len(s1), len(s2)

5247 7298


In [17]:
# True (unpadded) length of every sentence, Melville sentences first.
seq_length = np.array([len(sentence) for sentence in s1] + [len(sentence) for sentence in s2])
# Class labels in the same order: 1 for s1 (Melville), 0 for s2 (Austen).
# The builtin `int` replaces the alias `np.int`, which was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
target = np.append(np.ones(len(s1)), np.zeros(len(s2))).astype(int)

In [18]:
# Flat list of every word occurrence across both authors (s1 first, then s2).
all_words = [word for sentence in s1 + s2 for word in sentence]

In [19]:
len(all_words)

119046

In [20]:
# Vocabulary of the corpus plus the '</s>' placeholder token.
unique_words = set(all_words) | {'</s>'}
vocabulary_size = len(unique_words)
vocabulary_size

9888

In [21]:
# Set of all words that have a word2vec vector (iterating a dict yields its keys).
unique_word_vector_words = set(word_vectors)

In [22]:
ct = 0
for word in unique_words:
     if word not in unique_word_vector_words:
          ct += 1
print ct

1949


In [23]:
# Corpus words without a word2vec vector (set difference).
missing = unique_words - unique_word_vector_words

In [24]:
# Substitute the '</s>' placeholder for every word that has no vector.
s1 = replace_missing_words_with_UNK(s1, missing)
s2 = replace_missing_words_with_UNK(s2, missing)

In [25]:
# Word -> integer index for every word that has a vector.  The indices follow
# set iteration order, so they are arbitrary but fixed once this cell has run.
dictionary = {word: index for index, word in enumerate(unique_words - missing)}

In [26]:
# Encode every sentence (Melville first, then Austen) as a row of word indices
# zero-padded to upper_bound, giving a 2-D integer array.
all_sentences = np.array(
    replace_word_with_index_and_zero_pad(s1 + s2, dictionary, high=upper_bound)
)

In [27]:
# Apply one random permutation to sentences, labels and lengths so the three
# arrays stay aligned.  No seed is set, so the order differs between runs.
idx = np.random.permutation(target.size)
all_sentences, target, seq_length = all_sentences[idx], target[idx], seq_length[idx]


#seq_length = np.array(seq_length.size * [1])

In [28]:
# Hold out the last 20% of the (already shuffled) data as the test set.
test_size = 0.2
idx_cut = int((1.0 - test_size) * target.size)
X_training, X_test = all_sentences[:idx_cut], all_sentences[idx_cut:]
y_training, y_test = target[:idx_cut], target[idx_cut:]
L_training, L_test = seq_length[:idx_cut], seq_length[idx_cut:]

In [29]:
def fetch_training_batch(A, b, c, batch_size):
    """Draw a random mini-batch from three aligned arrays.

    Parameters
    ----------
    A, b, c : numpy arrays sharing the same first-axis length; rows with the
        same index belong together.
    batch_size : number of rows to draw, without replacement.

    Returns
    -------
    (A[idx], b[idx], c[idx]) for one common random index array `idx`.

    Raises
    ------
    ValueError (from np.random.choice) if batch_size exceeds len(b).
    """
    # Sample from the arrays that were passed in; the previous version read
    # the global `y_training` and so only worked for that one data set.
    idx = np.random.choice(len(b), size=batch_size, replace=False)
    return A[idx], b[idx], c[idx]

### Construct the graph

In [30]:
# Start from an empty default graph so re-running the cells below does not
# pile duplicate ops onto a previous graph.
tf.reset_default_graph()

In [31]:
# Model hyper-parameters.  Sizes dictated by the data are derived from it
# rather than re-typed, so they cannot drift out of sync with the inputs.
n_inputs = embedding_size   # vector length read from the word2vec header (200 here)
n_steps = upper_bound       # sentences are zero-padded to this length (15 here)
n_neurons = 64              # number of GRU units

In [32]:
# Graph inputs.  Note that `(None)` is just `None` (fully unknown shape), not
# a 1-tuple; the explicit shapes below let TensorFlow check what is fed.
X = tf.placeholder(tf.int32, shape=(None, n_steps))        # word indices, one row per sentence
y = tf.placeholder(tf.int32, shape=(None,))                # class label per sentence
L = tf.placeholder(dtype=tf.int32, shape=(None,))          # unpadded length per sentence
training = tf.placeholder_with_default(False, shape=())    # scalar flag: enables dropout when True

In [33]:
dictionary['</s>']

1369

In [34]:
[k for k in word_vectors.keys() if '<' in k]

['</s>']

In [35]:
np.linalg.norm(word_vectors['are'])

2.293899368440778

In [36]:
np.linalg.norm(np.random.random(200))

8.4046846819093144

In [37]:
# Reverse lookup: integer index -> word.
inverse_dictionary = {index: word for word, index in dictionary.items()}

In [38]:
inverse_dictionary[0]

u'yellow'

In [39]:
dictionary['yellow']

0

In [40]:
word_vectors['yellow'][:3]

array([-0.136894, -0.037584, -0.352259])

In [66]:
from scipy.spatial import distance

In [70]:
distance.cosine(word_vectors['king'], word_vectors['queen'])

0.35946757265379703

In [68]:
# Sanity probe of the embeddings via the usual analogy arithmetic:
# king - man + woman, compared with queen.  scipy's distance.cosine is
# 1 - cosine similarity, so smaller means more similar.
q = word_vectors['king'] - word_vectors['man'] + word_vectors['woman']
distance.cosine(q, word_vectors['queen'])

0.3669084423242156

In [41]:
# Stack the word vectors so that row i holds the vector of the word whose
# dictionary index is i.
ee = [word_vectors[inverse_dictionary[index]] for index in range(len(inverse_dictionary))]
embeddings_matrix = np.array(ee)

In [42]:
embeddings_matrix.shape

(7939, 200)

In [43]:
# The pretrained vectors enter the graph as a constant, so training does not
# update them.
embeddings = tf.constant(embeddings_matrix, dtype=tf.float32)

In [44]:
# Alternative kept for reference: randomly initialised, trainable embeddings.
#embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), trainable=True)
# Replace each word index in X by its embedding row.
embed = tf.nn.embedding_lookup(params=embeddings, ids=X)

In [45]:
# Single-layer GRU over the embedded sentence.  sequence_length=L makes
# dynamic_rnn stop updating the state past each sentence's true length, so
# the zero padding (index 0 is a real word's vector) does not influence
# `states`, the final state per sentence.
cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32, sequence_length=L)
# NOTE(review): tf.layers.dropout's `rate` is the fraction of units DROPPED,
# so 0.9 zeroes 90% of the state while training -- confirm this is intended
# rather than a keep probability of 0.9.
fc_drop = tf.layers.dropout(states, rate=0.9, training=training)
logits_2d = tf.layers.dense(fc_drop, units=1, activation=None)
# Squeeze only the unit axis: without `axis`, a batch of size 1 would also
# lose its batch dimension and the static shape would become unknown.
logits = tf.squeeze(logits_2d, axis=1)
y_proba = tf.nn.sigmoid(logits)

In [46]:
# Binary cross-entropy computed from the raw logits; the integer labels are
# cast to float32 to match the logits.  The loss is the mean over the batch.
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(y, tf.float32), logits=logits)
loss = tf.reduce_mean(xentropy)

In [47]:
# Adam optimiser minimising the mean cross-entropy loss.
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
training_op = optimizer.minimize(loss)

In [48]:
# Predict class 1 when the logit is positive, i.e. when sigmoid(logit) > 0.5;
# accuracy is the fraction of predictions that equal the labels.
y_pred = tf.cast(tf.greater(logits, 0.0), tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(y, y_pred), tf.float32))

In [49]:
# Op that initialises all graph variables; it is run at the start of the session.
init = tf.global_variables_initializer()

In [50]:
# Training schedule: sentences per mini-batch and number of epochs.
batch_size = 75
epochs = 50

In [51]:
# Train the model and report progress every fifth epoch.
with tf.Session() as sess:
     init.run()
     # epochs + 1 iterations so that a report is also printed for epoch == epochs.
     for epoch in xrange(epochs + 1):
          # An "epoch" here is y_training.size // batch_size independently
          # sampled random batches, not a strict single pass over the data.
          for iteration in xrange(y_training.size // batch_size):
               X_batch, y_batch, L_batch = fetch_training_batch(X_training, y_training, L_training, batch_size)
               sess.run(training_op, feed_dict={X:X_batch, y:y_batch, L:L_batch, training:True})
          if epoch % 5 == 0:
               # Training metrics are measured on the LAST batch of the epoch
               # only (dropout off), so they are noisy; test metrics use the
               # whole test set.
               loss_batch, acc_batch = sess.run([loss, accuracy], feed_dict={X:X_batch, y:y_batch, L:L_batch, training:False})
               loss_test, acc_test = sess.run([loss, accuracy], feed_dict={X:X_test, y:y_test, L:L_test, training:False})
               # Columns: epoch, batch loss, batch accuracy, test loss, test accuracy.
               print epoch, loss_batch, acc_batch, loss_test, acc_test

0 0.266384 0.893333 0.397742 0.808689
5 0.136174 0.973333 0.334062 0.851335
10 0.172522 0.933333 0.394252 0.843364
15 0.127781 0.96 0.395126 0.852531
20 0.138739 0.96 0.523594 0.848944
25 0.0845923 0.96 0.483823 0.852531
30 0.112889 0.946667 0.498843 0.850937
35 0.120944 0.96 0.465441 0.849342
40 0.223942 0.96 0.451217 0.850538
45 0.0804978 0.973333 0.446511 0.854922
50 0.210791 0.946667 0.393126 0.834994


|n_neurons|embedding_size|embeddings_trainable|dropout_rate|dropout_during_testing|sequence|peak accuracy|
|------|------|------|------|------|------|------|
| 2 | 200| no| 0.9|no|var|81.0%|
| 64 | 200| no| 0.9|no|var|85.4%|

We see that the word2vec word embeddings give similar results to trainable word embeddings from the previous notebook.