# Jonathan Halverson
# Wednesday, January 17, 2018
# Melville versus Austen

Here we train an RNN to classify sentences as written by either Melville or Jane Austen.

In [1]:
import nltk
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')

In [2]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

### Load the two books

In [3]:
melville_raw = list(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
austen_raw = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

In [4]:
melville = melville_raw[melville_raw.index('Loomings'):]
austen = austen_raw[austen_raw.index('The'):]

In [5]:
def make_sentences(x):
     j = ' '.join(x).replace('Mr .', 'Mr').replace('Mrs .', 'Mrs')
     j = j.replace('Ms .', 'Ms').replace('Dr .', 'Dr')
     j = j.replace('?', '.').replace('!', '.').replace('CHAPTER', ' ')
     sentences = j.split('.')
     s = [re.sub("[^a-zA-Z]", " ", sentence) for sentence in sentences]
     s = [sentence.lower().split() for sentence in s]
     return s

In [6]:
def remove_single_letters_except_ia(sentences):
     new_sentences = []
     for sentence in sentences:
          cleaned_sentence = []
          for word in sentence:
               if len(word) > 1:
                    cleaned_sentence.append(word)
               else:
                    if word in ['a', 'i']:
                         cleaned_sentence.append(word)
          new_sentences.append(cleaned_sentence)
     return new_sentences

In [7]:
def remove_short_and_long_sentences(sentences, low, high):
     new_sentences = []
     for sentence in sentences:
          if (len(sentence) >= low and len(sentence) <= high):
               new_sentences.append(sentence)
     return new_sentences

In [8]:
def replace_word_with_index_and_zero_pad(sentences, dictionary, high):
     number_sentences = []
     for sentence in sentences:
          # how to handle words not in vocabulary
          number_sentence = [dictionary[word] for word in sentence]
          for _ in range(high - len(number_sentence)):
               number_sentence.append(6311)
          number_sentences.append(number_sentence)
     return number_sentences

In [9]:
s1 = remove_single_letters_except_ia(make_sentences(melville))
s2 = remove_single_letters_except_ia(make_sentences(austen))

In [10]:
s1 = remove_short_and_long_sentences(s1, 5, 15)
s2 = remove_short_and_long_sentences(s2, 5, 15)

In [11]:
seq_length = np.array([len(sentence) for sentence in s1] + [len(sentence) for sentence in s2])
target = np.append(np.zeros(len(s2)), np.ones(len(s1))).astype(np.int)

In [12]:
all_words = [word for sentence in s1 for word in sentence] + \
            [word for sentence in s2 for word in sentence]

In [13]:
len(all_words)

46517

In [14]:
unique_words = set(all_words)
vocabulary_size = len(unique_words)
vocabulary_size

6311

In [15]:
dictionary = dict([(word, index) for index, word in enumerate(unique_words)])
dictionary['UNK'] = 6311

In [16]:
all_sentences = replace_word_with_index_and_zero_pad(s1, dictionary, high=15) + \
                replace_word_with_index_and_zero_pad(s2, dictionary, high=15)
all_sentences = np.array(all_sentences)

In [17]:
idx = np.arange(target.size)
np.random.shuffle(idx)
all_sentences = all_sentences[idx]
target = target[idx]
seq_length = seq_length[idx]

In [18]:
test_size = 0.2
idx_cut = int((1.0 - test_size) * target.size)
X_training = all_sentences[idx_cut:]
X_test = all_sentences[:idx_cut]
y_training = target[idx_cut:]
y_test = target[:idx_cut]
L_training = seq_length[idx_cut:]
L_test = seq_length[:idx_cut]

In [19]:
X_training[:3]

array([[4342, 3007, 4844, 1671, 6202,  764, 5775, 3403, 5771, 1589, 3007,
        4844, 5303, 4400, 6311],
       [ 228, 1816, 6191, 3441, 1038, 2138,  130, 6311, 6311, 6311, 6311,
        6311, 6311, 6311, 6311],
       [5296, 1916, 6138, 6138, 3361,  753, 1145, 4832, 4851, 5201, 6311,
        6311, 6311, 6311, 6311]])

In [20]:
def fetch_batch(A, b, c, batch_size):
     idx = np.random.choice(np.arange(y_training.size), size=batch_size, replace=False)
     return A[idx], b[idx], c[idx]

In [21]:
tf.reset_default_graph()

In [22]:
embedding_size = 64
n_inputs = embedding_size
n_steps = 15
n_neurons = 75

In [23]:
X = tf.placeholder(tf.int32, shape=(None, n_steps))
y = tf.placeholder(tf.int32, shape=(None))
L = tf.placeholder(dtype=tf.int32, shape=(None))

In [24]:
embeddings = tf.Variable(tf.random_uniform([vocabulary_size+1, embedding_size], -1.0, 1.0), trainable=False)

In [25]:
embed = tf.nn.embedding_lookup(params=embeddings, ids=X)

In [26]:
cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32, sequence_length=L)
logits = tf.layers.dense(states, units=1, activation=None)
y_proba = tf.nn.sigmoid(logits)

In [27]:
y_reshaped = tf.reshape(y, shape=(-1, 1))
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(y_reshaped, tf.float32), logits=logits)
loss = tf.reduce_mean(xentropy)

In [28]:
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
training_op = optimizer.minimize(loss)

In [29]:
y_pred = tf.cast(logits > 0.0, tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(y_reshaped, y_pred), tf.float32))

In [30]:
init = tf.global_variables_initializer()

In [31]:
batch_size = 75
epochs = 100

In [32]:
with tf.Session() as sess:
     init.run()
     for epoch in xrange(epochs):
          for iteration in xrange(y_training.size // batch_size):
               X_batch, y_batch, L_batch = fetch_batch(X_training, y_training, L_training, batch_size)
               sess.run(training_op, feed_dict={X:X_batch, y:y_batch, L:L_batch})
          if epoch % 5 == 0:
               J = loss.eval(feed_dict={X:X_test, y:y_test, L:L_test})
               acc = accuracy.eval(feed_dict={X:X_test, y:y_test, L:L_test})
               print epoch, J, acc
     Lu = embed.eval(feed_dict={X:[[123, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]})

0 0.662811 0.61619
5 1.17589 0.627171
10 1.79255 0.631767
15 2.29416 0.633299
20 2.4663 0.632533
25 2.57224 0.631767
30 2.66237 0.633555
35 2.74168 0.632278
40 2.80693 0.631767
45 2.86326 0.630235
50 2.9137 0.629724
55 2.96342 0.628447
60 3.00783 0.628192
65 3.05466 0.62666
70 3.08883 0.62666
75 3.12692 0.62666
80 3.15938 0.627681
85 3.19075 0.628447
90 3.22458 0.628192
95 3.25563 0.627681


In [33]:
Lu.shape

(1, 15, 64)

In [34]:
Lu[0]

array([[-0.02775908,  0.29109645,  0.91255116,  0.7115109 ,  0.65152168,
         0.07488012,  0.08151841,  0.98675632,  0.68231773, -0.33854389,
        -0.5219202 ,  0.97048092,  0.03694224, -0.39062023,  0.8427918 ,
        -0.92966485,  0.32885909, -0.13017726, -0.18814492,  0.03984714,
        -0.23912024, -0.65398192, -0.79693699,  0.91067529, -0.85411906,
        -0.34985018, -0.50667191,  0.14627624,  0.01683545,  0.87692904,
         0.25295019, -0.56621623,  0.23305345, -0.97114682,  0.16731238,
         0.2812736 ,  0.36082029,  0.47126794, -0.70702434,  0.58425045,
        -0.06201434,  0.46570754, -0.52647257,  0.0876224 , -0.84057474,
         0.6703577 , -0.26879883, -0.75048065, -0.73296309, -0.65019655,
         0.59756827,  0.82586837, -0.32293487, -0.13865614,  0.11608887,
         0.07847476, -0.01987076,  0.11737585, -0.92109537, -0.38474751,
        -0.36645174, -0.44975066,  0.87866354,  0.03913093],
       [-0.59564805,  0.3346982 ,  0.31214857, -0.32719088,  0.

In [35]:
embeddings

<tf.Variable 'Variable:0' shape=(6312, 64) dtype=float32_ref>

In [36]:
X

<tf.Tensor 'Placeholder:0' shape=(?, 15) dtype=int32>

In [37]:
embed

<tf.Tensor 'embedding_lookup:0' shape=(?, 15, 64) dtype=float32>

In [38]:
X_batch, y_batch, L_batch = fetch_batch(X_training, y_training, L_training, batch_size)

In [39]:
X_batch

array([[3398, 1916, 1387, ..., 2728, 6311, 6311],
       [4831, 3071,  294, ..., 6311, 6311, 6311],
       [2455, 2331, 1597, ..., 6311, 6311, 6311],
       ..., 
       [5589, 4832, 2138, ..., 3714, 5515, 6311],
       [3346, 3007, 3952, ..., 6311, 6311, 6311],
       [5649, 5624, 4832, ..., 3570, 6311, 6311]])

In [40]:
y_batch

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0])

In [41]:
L_batch

array([13, 11, 10,  6, 14,  6,  9, 12, 15, 10, 10,  9,  5, 13, 12, 12,  6,
       14, 15, 10, 11,  7,  6,  6, 14,  5,  8, 12,  9,  7,  7, 11, 13,  6,
       14,  5, 11, 15, 13,  6, 15,  5, 12, 10, 15, 14,  7,  9,  7, 11,  7,
       10, 12,  6, 10, 10,  7, 14, 10,  5,  8, 15, 14,  7, 11, 12,  8, 10,
        7,  7,  8,  5, 14, 11, 13])

In [42]:
X_batch.shape

(75, 15)

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
RandomForestClassifier(n_estimators=25).fit(X_training[:, :3], y_training).score(X_test[:, :3], y_test)

0.56409601634320738

This model doesn't make much sense given the zero padding.