In [1]:
import numpy as np
import os
import re

## Load In Some Plays

In [2]:
base = "./TXT"
tokens = []

replacer = re.compile(r"[^A-z \t\n,.:;'?!]")

for genre in os.listdir(base):
    genre_path = os.path.join(base, genre)
    for handle in os.listdir(genre_path):
        if handle.endswith(".txt"):
            play = os.path.join(genre_path, handle)
            print "reading {p}".format(p=handle)
            with open(play, "r") as f:
                for line in f.readlines():
                    line = replacer.sub("", line)
                    if line.startswith("\t"):
                        for token in line[1:]:
                            tokens.append(token.upper())
            

reading A Midsummer-Night's Dream.txt
reading All's Well that Ends Well.txt
reading As You Like It.txt
reading Cymbeline.txt
reading Love's Labour's Lost.txt
reading Measure for Measure.txt
reading Much Ado About Nothing.txt
reading Pericles, Prince of Tyre.txt
reading The Comedy of Errors.txt
reading The Merchant Of Venice.txt
reading The Merry Wives of Windsor.txt
reading The Taming of the Shrew.txt
reading The Tempest.txt
reading The Two Gentlemen of Verona.txt
reading The Winter's Tale.txt
reading Troilus and Cressida.txt
reading Twelfth-Night; or What You Will.txt
reading The Famous History of the Life of King Henry VIII.txt
reading The First Part of King Henry IV.txt
reading The First Part of King Henry VI.txt
reading The Life and Death of King John.txt
reading The Life of King Henry V.txt
reading The Second Part of King Henry IV.txt
reading The Second Part of King Henry VI.txt
reading The Third Part of King Henry VI.txt
reading The Tragedy of King Richard II.txt
reading The Trag

In [3]:
print len(tokens)
print len(set(tokens))

4252756
37


In [4]:
print "".join(tokens[:300])

NOW, FAIR HIPPOLYTA, OUR NUPTIAL HOUR
DRAWS ON APACE: FOUR HAPPY DAYS BRING IN
ANOTHER MOON; BUT O! METHINKS HOW SLOW
THIS OLD MOON WANES; SHE LINGERS MY DESIRES,
LIKE TO A STEP DAME, OR A DOWAGER
LONG WITHERING OUT A YOUNG MAN'S REVENUE.
FOUR DAYS WILL QUICKLY STEEP THEMSELVES IN NIGHT;
FOUR NIGHTS


In [5]:
num_unique_tokens = len(set(tokens))
token_to_index = dict((b, a) for a, b in enumerate(list(set(tokens))))
index_to_token = dict((b, a) for a, b in token_to_index.iteritems())

In [6]:
def TokenToInput(tok):
    idx = token_to_index[tok]
    arr = np.zeros(num_unique_tokens)
    arr[idx] = 1
    return arr

def ArrayToToken(arr):
    idx = arr.argmax()
    return index_to_token[idx]

In [7]:
corpus = np.array([TokenToInput(tok) for tok in tokens])
corpus.shape

(4252756, 37)

In [8]:
print "".join([ArrayToToken(arr) for arr in corpus[:300]])

NOW, FAIR HIPPOLYTA, OUR NUPTIAL HOUR
DRAWS ON APACE: FOUR HAPPY DAYS BRING IN
ANOTHER MOON; BUT O! METHINKS HOW SLOW
THIS OLD MOON WANES; SHE LINGERS MY DESIRES,
LIKE TO A STEP DAME, OR A DOWAGER
LONG WITHERING OUT A YOUNG MAN'S REVENUE.
FOUR DAYS WILL QUICKLY STEEP THEMSELVES IN NIGHT;
FOUR NIGHTS


## Build Tensorflow Models to Predict the Next Character

In [9]:
import tensorflow as tf
from tensorflow.models.rnn import rnn_cell

### Naive Softmax Regression

In [11]:
batch_size = 300000
learning_rate = 0.00002
num_epochs = 1

with tf.Session() as sess:
    
    # Define the computation graph
    inputs = tf.placeholder(tf.float32, [None, num_unique_tokens])
    W = tf.Variable(tf.zeros([num_unique_tokens, num_unique_tokens]))
    b = tf.Variable(tf.zeros([num_unique_tokens]))
    outputs = tf.nn.softmax(tf.matmul(inputs, W) + b)
    targets = tf.placeholder(tf.float32, [None, num_unique_tokens])
    xentropy = -tf.reduce_sum(targets * tf.log(outputs))
    
    # Initialization
    init = tf.initialize_all_variables()
    sess.run(init)
    
    # Training
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(xentropy)
    for epoch in xrange(num_epochs):
        i = 0
        move_to_next_epoch = False
        while not move_to_next_epoch:
            try:
                batch_inputs = corpus[i * batch_size:(i + 1) * batch_size]
                batch_targets = corpus[(i * batch_size) + 1:((i + 1) * batch_size) + 1]
                sess.run(train_step, feed_dict={inputs: batch_inputs, targets: batch_targets})
                correct = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))
                accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
                print epoch, i, sess.run(
                    accuracy, feed_dict={inputs: corpus[-1001:-1], targets: corpus[-1000:]})
                i += 1
            except Exception as e:
                if type(e) == KeyboardInterrupt:
                    raise(e)
                move_to_next_epoch = True

0 0 0.168
0 1 0.168
0 2 0.168
0 3 0.168
0 4 0.168
0 5 0.168
0 6 0.199
0 7 0.199
0 8 0.199
0 9 0.199
0 10 0.199
0 11 0.199
0 12 0.199
0 13 0.199


### Generalize to N-Gram Softmax With a Totally Unnecessary And Harmful Compression Layer

In [12]:
batch_size = 100000
learning_rate = 0.00001
grams = 5
hidden_layer_size = num_unique_tokens / 2
num_epochs = 1

with tf.Session() as sess:

    # Define the computation graph
    inputs = tf.placeholder(tf.float32, [None, grams * num_unique_tokens])
    W0 = tf.Variable(tf.zeros([grams * num_unique_tokens, hidden_layer_size]))
    b0 = tf.Variable(tf.zeros([hidden_layer_size]))
    hidden = tf.nn.sigmoid(tf.matmul(inputs, W0) + b0)
    W1 = tf.Variable(tf.zeros([hidden_layer_size, num_unique_tokens]))
    b1 = tf.Variable(tf.zeros([num_unique_tokens]))
    outputs = tf.nn.softmax(tf.matmul(hidden, W1) + b1)
    targets = tf.placeholder(tf.float32, [None, num_unique_tokens])
    xentropy = -tf.reduce_sum(targets * tf.log(outputs))
    
    # Initialization
    init = tf.initialize_all_variables()
    sess.run(init)
    
    # Training
    learning_rate_control = tf.placeholder(tf.float32, [])
    train_step = tf.train.GradientDescentOptimizer(
        learning_rate * learning_rate_control).minimize(xentropy)
    for epoch in xrange(num_epochs):
        i = 0
        move_to_next_epoch = False
        while not move_to_next_epoch:
            try:
                batch_inputs = np.concatenate(
                    [corpus[(i * batch_size) + grams - g:((i + 1) * batch_size) + grams - g]
                     for g in xrange(grams)],
                    axis=1)
                batch_targets = corpus[(i * batch_size) + grams + 1:((i + 1) * batch_size) + grams + 1]
                sess.run(train_step, feed_dict={
                        inputs: batch_inputs,
                        learning_rate_control: 0.25 ** epoch,
                        targets: batch_targets})
                correct = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))
                accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
                print epoch, i, sess.run(
                    accuracy, feed_dict={
                        inputs: np.concatenate(
                            [corpus[-1001 - g:-1 - g]
                             for g in xrange(grams)],
                            axis=1),
                        targets: corpus[-1000:]})
                i += 1
            except Exception as e:
                if type(e) == KeyboardInterrupt:
                    raise(e)
                move_to_next_epoch = True

0 0 0.168
0 1 0.168
0 2 0.168
0 3 0.168
0 4 0.168
0 5 0.168
0 6 0.168
0 7 0.168
0 8 0.168
0 9 0.168
0 10 0.168
0 11 0.168
0 12 0.168
0 13 0.168
0 14 0.168
0 15 0.168
0 16 0.168
0 17 0.168
0 18 0.168
0 19 0.168
0 20 0.168
0 21 0.168
0 22 0.168
0 23 0.168
0 24 0.168
0 25 0.168
0 26 0.168
0 27 0.168
0 28 0.168
0 29 0.168
0 30 0.168
0 31 0.168
0 32 0.168
0 33 0.168
0 34 0.168
0 35 0.168
0 36 0.168
0 37 0.168
0 38 0.168
0 39 0.168
0 40 0.168
0 41 0.168
