# Jonathan Halverson
# Wednesday, January 17, 2018
# Melville versus Austen

Here we train an RNN to classify sentences as written by either Melville or Jane Austen.

In [38]:
import nltk
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')

### Load the two books

In [44]:
melville_raw = list(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
austen_raw = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

In [45]:
melville = melville_raw[melville_raw.index('Ishmael') - 2:]
austen = austen_raw[austen_raw.index('The'):]

In [47]:
def make_sentences(x):
     j = ' '.join(x).replace('Mr .', 'Mr').replace('Mrs .', 'Mrs')
     j = j.replace('Ms .', 'Ms').replace('Dr .', 'Dr')
     j = j.replace('?', '.').replace('!', '.')
     return j.split('.')

In [48]:
make_sentences(melville)[:10]

[u'Call me Ishmael ',
 u' Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world ',
 u' It is a way I have of driving off the spleen and regulating the circulation ',
 u" Whenever I find myself growing grim about the mouth ; whenever it is a damp , drizzly November in my soul ; whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; and especially whenever my hypos get such an upper hand of me , that it requires a strong moral principle to prevent me from deliberately stepping into the street , and methodically knocking people ' s hats off -- then , I account it high time to get to sea as soon as I can ",
 u' This is my substitute for pistol and ball ',
 u' With a philosophical flourish Cato throws himself upon his sword ; I quietly take to the ship ',
 u' There 

In [49]:
make_sentences(austen)[:10]

[u'The family of Dashwood had long been settled in Sussex ',
 u' Their estate was large , and their residence was at Norland Park , in the centre of their property , where , for many generations , they had lived in so respectable a manner as to engage the general good opinion of their surrounding acquaintance ',
 u' The late owner of this estate was a single man , who lived to a very advanced age , and who for many years of his life , had a constant companion and housekeeper in his sister ',
 u' But her death , which happened ten years before his own , produced a great alteration in his home ; for to supply her loss , he invited and received into his house the family of his nephew Mr Henry Dashwood , the legal inheritor of the Norland estate , and the person to whom he intended to bequeath it ',
 u" In the society of his nephew and niece , and their children , the old Gentleman ' s days were comfortably spent ",
 u' His attachment to them all increased ',
 u' The constant attention of 

In [23]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def tokenize(raw_text):
     # keep only alphabetical characters and split on whitespace
     letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
     words = letters_only.lower().split()
    
     # count the words and filter based on count and stopwords, apply stemming
     count = Counter(words)
     #porter = PorterStemmer()
     #stops = stopwords.words("english")
     #words = [porter.stem(word) for word in words if (word not in stops) and (count[word] > 1) and (len(word) > 1)]
     words = [word for word in words if (count[word] > 0) and (len(word) > 1)]

     return words

In [44]:
words = tokenize(' '.join(md))
len(set(words))

16672

In [45]:
len(words)

205697

In [46]:
with open('text8_moby_dick', 'w') as f:
     f.write(' '.join(words))

In [10]:
words[-10:]

[u'in',
 u'her',
 u'search',
 u'after',
 u'her',
 u'missing',
 u'children',
 u'only',
 u'found',
 u'another']

In [11]:
count = Counter(words)

In [12]:
count.most_common(37)

[(u'the', 14175),
 (u'of', 6469),
 (u'and', 6325),
 (u'to', 4539),
 (u'in', 4077),
 (u'that', 3045),
 (u'it', 2497),
 (u'his', 2495),
 (u'he', 1876),
 (u'but', 1805),
 (u'as', 1720),
 (u'with', 1692),
 (u'is', 1690),
 (u'was', 1627),
 (u'for', 1593),
 (u'all', 1515),
 (u'this', 1382),
 (u'at', 1304),
 (u'by', 1175),
 (u'whale', 1150),
 (u'not', 1142),
 (u'from', 1072),
 (u'him', 1058),
 (u'so', 1053),
 (u'on', 1040),
 (u'be', 1032),
 (u'one', 907),
 (u'you', 884),
 (u'there', 854),
 (u'now', 779),
 (u'had', 767),
 (u'have', 754),
 (u'or', 689),
 (u'were', 677),
 (u'they', 649),
 (u'like', 639),
 (u'me', 630)]

In [14]:
dictionary = dict()
for word, _ in count.most_common(): # loop over all words in vocabulary
     dictionary[word] = len(dictionary) # each word is assigned a unique id

In [15]:
# form running list of id's
data = list()
for word in words:
     index = dictionary[word]
     data.append(index)

In [16]:
inverted_dict = dict(zip(dictionary.values(), dictionary.keys()))

In [17]:
print('Sample data', data[:10], [inverted_dict[i] for i in data[:10]])

('Sample data', [397, 36, 1007, 40, 241, 600, 134, 283, 105, 77], [u'call', u'me', u'ishmael', u'some', u'years', u'ago', u'never', u'mind', u'how', u'long'])


The word 'me' is the 37-th most common.

In [18]:
data_index = 0

In [19]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

In [20]:
vocab_size = len(count)

In [21]:
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [22]:
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))

In [23]:
std = 1.0 / np.sqrt(embedding_size)
nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=std))
nce_biases = tf.Variable(tf.zeros([vocab_size]))

In [24]:
batch_size = 50

In [25]:
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [26]:
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [27]:
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=train_labels,
                                     inputs=embed, num_sampled=num_sampled, num_classes=vocab_size))

In [28]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

In [29]:
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [30]:
init = tf.global_variables_initializer()

In [31]:
def generate_batch():
     return inputs, labels

In [32]:
with tf.Session() as sess:
     init.run()
     for inputs, labels in generate_batch(...):
          feed_dict = {train_inputs: inputs, train_labels: labels}
          _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict)

SyntaxError: invalid syntax (<ipython-input-32-178ea3fb6027>, line 3)