# Jonathan Halverson
# Wednesday, January 17, 2018
# Melville versus Austen

Here we train an RNN to classify sentences as written by either Herman Melville or Jane Austen. We have 12,500 sentences between the two.

In [1]:
import nltk
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')

In [2]:
import re
from collections import Counter

### Load the books

In [3]:
melville_raw = list(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
austen_raw = list(nltk.corpus.gutenberg.words('austen-sense.txt'))
austen_raw2 = list(nltk.corpus.gutenberg.words('austen-persuasion.txt'))
austen_raw3 = list(nltk.corpus.gutenberg.words('austen-emma.txt'))

In [4]:
melville = melville_raw[melville_raw.index('Loomings'):]
austen = austen_raw[austen_raw.index('The'):]
austen2 = austen_raw2[austen_raw2.index('Sir'):]
austen3 = austen_raw3[austen_raw3.index('I'):]

In [5]:
with open('melville_pierre.txt') as f:
     melville2 = f.read().decode('utf-8').encode('ascii', 'ignore').replace('\n', ' ').split()

### Functions to prepare the data

In [6]:
def make_sentences(x):
     j = ' '.join(x).replace('Mr .', 'Mr').replace('Mrs .', 'Mrs')
     j = j.replace('Ms .', 'Ms').replace('Dr .', 'Dr').replace('\n', ' ')
     j = j.replace('?', '.').replace('!', '.').replace('CHAPTER', ' ')
     sentences = j.split('.')
     s = [re.sub("[^a-zA-Z]", " ", sentence) for sentence in sentences]
     s = [sentence.lower().split() for sentence in s]
     return s

In [7]:
def remove_single_letters_except_ia(sentences):
     new_sentences = []
     for sentence in sentences:
          cleaned_sentence = []
          for word in sentence:
               if len(word) > 1:
                    cleaned_sentence.append(word)
               else:
                    if word in ['a', 'i']:
                         cleaned_sentence.append(word)
          new_sentences.append(cleaned_sentence)
     return new_sentences

In [8]:
def remove_short_and_long_sentences(sentences, low, high):
     new_sentences = []
     for sentence in sentences:
          if (len(sentence) >= low and len(sentence) <= high):
               new_sentences.append(sentence)
     return new_sentences

In [9]:
def replace_word_with_index_and_zero_pad(sentences, dictionary, high):
     number_sentences = []
     for sentence in sentences:
          # how to handle words not in vocabulary
          number_sentence = [dictionary[word] for word in sentence]
          for _ in range(high - len(number_sentence)):
               number_sentence.append(0)
          number_sentences.append(number_sentence)
     return number_sentences

### Prepare the data

In [10]:
s1 = remove_single_letters_except_ia(make_sentences(melville)) + \
     remove_single_letters_except_ia(make_sentences(melville2))
s2 = remove_single_letters_except_ia(make_sentences(austen)) + \
     remove_single_letters_except_ia(make_sentences(austen2)) + \
     remove_single_letters_except_ia(make_sentences(austen3))

In [11]:
upper_bound = 15
s1 = remove_short_and_long_sentences(s1, 5, upper_bound)
s2 = remove_short_and_long_sentences(s2, 5, upper_bound)

In [12]:
print len(s1), len(s2)

5247 7298


In [13]:
seq_length = np.array([len(sentence) for sentence in s1] + [len(sentence) for sentence in s2])
target = np.append(np.ones(len(s1)), np.zeros(len(s2))).astype(np.int)

In [14]:
all_words = [word for sentence in s1 for word in sentence] + \
            [word for sentence in s2 for word in sentence]

In [15]:
len(all_words)

119046

In [16]:
unique_words = set(all_words)
vocabulary_size = len(unique_words)
vocabulary_size

9887

In [17]:
dictionary = dict([(word, index) for index, word in enumerate(unique_words)])
#dictionary['UNK'] = vocabulary_size

In [18]:
all_sentences = replace_word_with_index_and_zero_pad(s1, dictionary, high=upper_bound) + \
                replace_word_with_index_and_zero_pad(s2, dictionary, high=upper_bound)
all_sentences = np.array(all_sentences)

In [19]:
idx = np.arange(target.size)
np.random.shuffle(idx)
all_sentences = all_sentences[idx]
target = target[idx]
seq_length = seq_length[idx]


#seq_length = np.array(seq_length.size * [25])

In [20]:
test_size = 0.2
idx_cut = int((1.0 - test_size) * target.size)
X_training = all_sentences[idx_cut:]
X_test = all_sentences[:idx_cut]
y_training = target[idx_cut:]
y_test = target[:idx_cut]
L_training = seq_length[idx_cut:]
L_test = seq_length[:idx_cut]

In [21]:
def fetch_batch(A, b, c, batch_size):
     idx = np.random.choice(np.arange(y_training.size), size=batch_size, replace=False)
     return A[idx], b[idx], c[idx]

### Construct the graph

In [22]:
tf.reset_default_graph()

In [23]:
embedding_size = 64/2
n_inputs = embedding_size
n_steps = 15
n_neurons = 75

In [24]:
X = tf.placeholder(tf.int32, shape=(None, n_steps))
y = tf.placeholder(tf.int32, shape=(None))
L = tf.placeholder(dtype=tf.int32, shape=(None))
training = tf.placeholder_with_default(False, shape=(None))

In [25]:
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), trainable=True)
embed = tf.nn.embedding_lookup(params=embeddings, ids=X)

In [26]:
cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32, sequence_length=L)
fc_drop = tf.layers.dropout(states, rate=0.9, training=training)
logits_2d = tf.layers.dense(fc_drop, units=1, activation=None)
logits = tf.squeeze(logits_2d)
y_proba = tf.nn.sigmoid(logits)

In [27]:
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(y, tf.float32), logits=logits)
loss = tf.reduce_mean(xentropy)

In [28]:
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
training_op = optimizer.minimize(loss)

In [29]:
y_pred = tf.cast(tf.greater(logits, 0.0), tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(y, y_pred), tf.float32))

In [30]:
init = tf.global_variables_initializer()

In [31]:
batch_size = 75
epochs = 25

In [32]:
with tf.Session() as sess:
     init.run()
     for epoch in xrange(epochs + 1):
          for iteration in xrange(y_training.size // batch_size):
               X_batch, y_batch, L_batch = fetch_batch(X_training, y_training, L_training, batch_size)
               sess.run(training_op, feed_dict={X:X_batch, y:y_batch, L:L_batch, training:True})
          if epoch % 5 == 0:
               loss_batch, acc_batch = sess.run([loss, accuracy], feed_dict={X:X_batch, y:y_batch, L:L_batch})
               loss_test, acc_test = sess.run([loss, accuracy], feed_dict={X:X_test, y:y_test, L:L_test})
               print epoch, loss_batch, acc_batch, loss_test, acc_test
     #Lu = embed.eval(feed_dict={X:[[123, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]})

0 0.507736 0.72 0.548803 0.725189
5 0.13829 0.973333 0.781159 0.802112
10 0.00724304 1.0 1.31518 0.796333
15 0.0175522 0.986667 1.37547 0.805998
20 0.00079619 1.0 1.43582 0.799322
25 7.21237e-05 1.0 1.49908 0.804404


# Check if zero padding is necessary for small case with sequence length

In [33]:
X_batch, y_batch, L_batch = fetch_batch(X_training, y_training, L_training, batch_size)

In [34]:
X_batch

array([[4748, 2450, 9473, ...,    0,    0,    0],
       [8236, 8173, 3847, ...,    0,    0,    0],
       [1130, 9434, 2122, ...,    0,    0,    0],
       ..., 
       [2925, 1569, 7522, ...,    0,    0,    0],
       [4748, 7113, 4964, ...,    0,    0,    0],
       [2925, 1089, 8559, ..., 4108,    0,    0]])

In [35]:
y_batch

array([1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0])

In [36]:
L_batch

array([ 5,  6, 12, 13, 15, 14, 14, 13,  5,  8,  5, 13, 14,  6,  8, 15,  9,
        7, 10, 10,  9, 15, 10,  9,  8,  7, 10,  7, 12, 14,  7,  6, 10,  9,
        5, 12, 12, 13, 13,  9, 11,  5,  6, 12, 11, 13,  6,  7, 13, 12,  8,
        6,  6, 15, 14, 15, 15, 15,  6, 15,  6,  6, 13, 12, 13, 10,  8, 10,
       10,  5, 11, 15,  6,  7, 13])

In [37]:
X_batch.shape

(75, 15)

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
RandomForestClassifier(n_estimators=25).fit(X_training, y_training).score(X_test, y_test)

0.59027500996412918

This model doesn't make much sense given the zero padding.