# Jonathan Halverson
# Wednesday, January 17, 2018
# Melville versus Austen

Here we train an RNN to classify sentences as written by either Herman Melville or Jane Austen. We have 12,500 sentences between the two.

In [1]:
import nltk
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')

In [2]:
import re
from collections import Counter

### Load the books

In [3]:
melville_raw = list(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
austen_raw = list(nltk.corpus.gutenberg.words('austen-sense.txt'))
austen_raw2 = list(nltk.corpus.gutenberg.words('austen-persuasion.txt'))
austen_raw3 = list(nltk.corpus.gutenberg.words('austen-emma.txt'))

In [4]:
melville = melville_raw[melville_raw.index('Loomings'):]
austen = austen_raw[austen_raw.index('The'):]
austen2 = austen_raw2[austen_raw2.index('Sir'):]
austen3 = austen_raw3[austen_raw3.index('I'):]

In [5]:
with open('melville_pierre.txt') as f:
     melville2 = f.read().decode('utf-8').encode('ascii', 'ignore').replace('\n', ' ').split()

### Functions to prepare the data

In [6]:
def make_sentences(x):
     j = ' '.join(x).replace('Mr .', 'Mr').replace('Mrs .', 'Mrs')
     j = j.replace('Ms .', 'Ms').replace('Dr .', 'Dr').replace('\n', ' ')
     j = j.replace('?', '.').replace('!', '.').replace('CHAPTER', ' ')
     sentences = j.split('.')
     s = [re.sub("[^a-zA-Z]", " ", sentence) for sentence in sentences]
     s = [sentence.lower().split() for sentence in s]
     return s

In [7]:
def remove_single_letters_except_ia(sentences):
     new_sentences = []
     for sentence in sentences:
          cleaned_sentence = []
          for word in sentence:
               if len(word) > 1:
                    cleaned_sentence.append(word)
               else:
                    if word in ['a', 'i']:
                         cleaned_sentence.append(word)
          new_sentences.append(cleaned_sentence)
     return new_sentences

In [8]:
def remove_short_and_long_sentences(sentences, low, high):
     new_sentences = []
     for sentence in sentences:
          if (len(sentence) >= low and len(sentence) <= high):
               new_sentences.append(sentence)
     return new_sentences

In [9]:
def replace_word_with_index_and_zero_pad(sentences, dictionary, high):
     number_sentences = []
     for sentence in sentences:
          # how to handle words not in vocabulary
          number_sentence = [dictionary[word] for word in sentence]
          for _ in range(high - len(number_sentence)):
               number_sentence.append(0)
          number_sentences.append(number_sentence)
     return number_sentences

### Prepare the data

In [10]:
s1 = remove_single_letters_except_ia(make_sentences(melville)) + \
     remove_single_letters_except_ia(make_sentences(melville2))
s2 = remove_single_letters_except_ia(make_sentences(austen)) + \
     remove_single_letters_except_ia(make_sentences(austen2)) + \
     remove_single_letters_except_ia(make_sentences(austen3))

In [11]:
upper_bound = 15
s1 = remove_short_and_long_sentences(s1, 5, upper_bound)
s2 = remove_short_and_long_sentences(s2, 5, upper_bound)

In [12]:
print len(s1), len(s2)

5247 7298


In [13]:
seq_length = np.array([len(sentence) for sentence in s1] + [len(sentence) for sentence in s2])
target = np.append(np.ones(len(s1)), np.zeros(len(s2))).astype(np.int)

In [14]:
all_words = [word for sentence in s1 for word in sentence] + \
            [word for sentence in s2 for word in sentence]

In [15]:
len(all_words)

119046

In [16]:
unique_words = set(all_words)
vocabulary_size = len(unique_words)
vocabulary_size

9887

In [17]:
dictionary = dict([(word, index) for index, word in enumerate(unique_words)])

In [18]:
all_sentences = replace_word_with_index_and_zero_pad(s1, dictionary, high=upper_bound) + \
                replace_word_with_index_and_zero_pad(s2, dictionary, high=upper_bound)
all_sentences = np.array(all_sentences)

In [19]:
idx = np.arange(target.size)
np.random.shuffle(idx)
all_sentences = all_sentences[idx]
target = target[idx]
seq_length = seq_length[idx]


seq_length = np.array(seq_length.size * [3])

In [20]:
test_size = 0.2
idx_cut = int((1.0 - test_size) * target.size)
X_training = all_sentences[:idx_cut]
X_test = all_sentences[idx_cut:]
y_training = target[:idx_cut]
y_test = target[idx_cut:]
L_training = seq_length[:idx_cut]
L_test = seq_length[idx_cut:]

In [21]:
def fetch_batch(A, b, c, batch_size):
     idx = np.random.choice(np.arange(y_training.size), size=batch_size, replace=False)
     return A[idx], b[idx], c[idx]

### Construct the graph

In [22]:
tf.reset_default_graph()

In [23]:
embedding_size = 32
n_inputs = embedding_size
n_steps = 15
n_neurons = 32

In [24]:
X = tf.placeholder(tf.int32, shape=(None, n_steps))
y = tf.placeholder(tf.int32, shape=(None))
L = tf.placeholder(dtype=tf.int32, shape=(None))
training = tf.placeholder_with_default(False, shape=(None))

In [25]:
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), trainable=True)
embed = tf.nn.embedding_lookup(params=embeddings, ids=X)

In [26]:
cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32, sequence_length=L)
fc_drop = tf.layers.dropout(states, rate=0.9, training=training)
logits_2d = tf.layers.dense(fc_drop, units=1, activation=None)
logits = tf.squeeze(logits_2d)
y_proba = tf.nn.sigmoid(logits)

In [27]:
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(y, tf.float32), logits=logits)
loss = tf.reduce_mean(xentropy)

In [28]:
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
training_op = optimizer.minimize(loss)

In [29]:
y_pred = tf.cast(tf.greater(logits, 0.0), tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(y, y_pred), tf.float32))

In [30]:
init = tf.global_variables_initializer()

In [31]:
batch_size = 75
epochs = 50

In [32]:
with tf.Session() as sess:
     init.run()
     for epoch in xrange(epochs + 1):
          for iteration in xrange(y_training.size // batch_size):
               X_batch, y_batch, L_batch = fetch_batch(X_training, y_training, L_training, batch_size)
               sess.run(training_op, feed_dict={X:X_batch, y:y_batch, L:L_batch, training:True})
          if epoch % 5 == 0:
               loss_batch, acc_batch = sess.run([loss, accuracy], feed_dict={X:X_batch, y:y_batch, L:L_batch, training:False})
               loss_test, acc_test = sess.run([loss, accuracy], feed_dict={X:X_test, y:y_test, L:L_test, training:False})
               print epoch, loss_batch, acc_batch, loss_test, acc_test

0 0.535514 0.706667 0.534351 0.722599
5 0.342327 0.853333 0.574346 0.763252
10 0.218669 0.893333 0.70296 0.758868
15 0.248945 0.893333 0.727395 0.759267
20 0.287286 0.906667 0.81924 0.757672
25 0.18529 0.906667 0.84474 0.750897
30 0.21924 0.893333 0.874485 0.75568
35 0.166572 0.946667 0.922816 0.761259
40 0.207608 0.906667 0.974304 0.754882
45 0.194142 0.893333 0.974424 0.75847
50 0.256801 0.906667 0.95818 0.75847


|n_neurons|embedding_size|embeddings_trainable|dropout_rate|dropout_during_testing|sequence|peak accuracy|
|------|------|------|------|------|------|
| 64 | 64| yes| 0.9|no|var|87.0%|
| 64 | 64| yes| 0.9|yes|var|85.6%|
| 64 | 64| no| 0.9|no|var|79.9%|
| 32 | 32| yes| 0.9|no|var|87.2%|
| 16 | 16| yes| 0.9|no|var|86.2%|
| 8 | 8| yes| 0.9|no|var|87.4%|
| 2 | 2| yes| 0.9|no|var|86.7%|
| 32 | 32| yes| 0.9|no|all 25|86.1%|
| 32 | 32| yes| 0.9|no|all 5|79.7%|
| 32 | 32| yes| 0.9|no|all 3|76.3%|

### Conventional model

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
RandomForestClassifier(n_estimators=25).fit(X_training, y_training).score(X_test, y_test)

0.60900757273814266

### Aside on sequence length

https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html

This model doesn't make much sense given the zero padding.

In [35]:
tf.reset_default_graph()

In [36]:
n_steps = 2
n_inputs = 3
n_neurons = 5

In [37]:
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
seq_length = tf.placeholder(tf.int32, (None))
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32, sequence_length=seq_length)

# this a general way of obtaining the last output (and may be different to states (e.g., stacked cells))
idx = tf.range(4) * tf.shape(output_seqs)[1] + (seq_length - 1)
last_rnn_output = tf.gather(tf.reshape(output_seqs, [-1, n_neurons]), idx)

In [38]:
init = tf.global_variables_initializer()

In [39]:
X_batch = np.array([
        # t = 0      t = 1 
        [[0, 1, 2], [9, 8, 7]], # instance 1
        [[3, 4, 5], [1e6, 1e6, 1e6]], # instance 2
        [[6, 7, 8], [6, 5, 4]], # instance 3
        [[9, 0, 1], [3, 2, 1]], # instance 4
    ])
seq_length_batch = [2, 1, 2, 2]

with tf.Session() as sess:
     init.run()
     outputs_val = output_seqs.eval(feed_dict={X: X_batch, seq_length:seq_length_batch})
     states_val = states.eval(feed_dict={X: X_batch, seq_length:seq_length_batch})
     last_rnn_output = states.eval(feed_dict={X: X_batch, seq_length:seq_length_batch})

In [40]:
print(outputs_val)

[[[ 0.51830071 -0.55948418 -0.03514713  0.94566482  0.74650013]
  [ 0.99999982  0.99294418  0.93391019  1.          0.99999839]]

 [[ 0.99788857 -0.05489642  0.53856391  0.99999613  0.99871683]
  [ 0.          0.          0.          0.          0.        ]]

 [[ 0.99999297  0.47937977  0.84529084  1.          0.99999428]
  [ 0.99976665  0.99478036  0.70588583  0.99999905  0.99995583]]

 [[ 0.99777275  0.71935076  0.99963731  0.99936914  0.99851298]
  [ 0.91072702  0.98585087  0.25284082  0.98582256  0.99335402]]]


In [41]:
print(states_val)

[[ 0.99999982  0.99294418  0.93391019  1.          0.99999839]
 [ 0.99788857 -0.05489642  0.53856391  0.99999613  0.99871683]
 [ 0.99976665  0.99478036  0.70588583  0.99999905  0.99995583]
 [ 0.91072702  0.98585087  0.25284082  0.98582256  0.99335402]]


In [42]:
print(last_rnn_output)

[[ 0.99999982  0.99294418  0.93391019  1.          0.99999839]
 [ 0.99788857 -0.05489642  0.53856391  0.99999613  0.99871683]
 [ 0.99976665  0.99478036  0.70588583  0.99999905  0.99995583]
 [ 0.91072702  0.98585087  0.25284082  0.98582256  0.99335402]]


In [43]:
np.allclose(states_val, last_rnn_output)

True

This test shows that no matter the input, the final state is set by the sequence length as expected. This is important because the word embeddings for key 0 will be entered but the sequence length will not allow the result to be used. Hence one does not need to zero pad but could put in any inputs beyond the sequence length such as 1e6.