# Load the Data

In [1]:
# Load necessary libraries 
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import one_hot
import numpy as np
import random
import sys

Using TensorFlow backend.


In [2]:
# Download (or reuse the locally cached copy of) the Project Gutenberg text of
# Moby Dick. The origin uses HTTPS so the corpus is not fetched over plaintext.
path = get_file('mobydick.txt', origin='https://www.gutenberg.org/files/2701/2701-0.txt')

In [3]:
# Read the downloaded file as UTF-8 and drop its first 340 lines, keeping
# everything from line index 340 onward.
with open(path, encoding='utf8') as f:
    storyStart = list(f)[340:]

In [4]:
# Flatten the kept lines into one list of whitespace-separated tokens.
wordList = [
    token
    for line in storyStart
    for token in line.split()
]

In [5]:
# Peek at the first ten tokens.
wordList[:10]

['ETYMOLOGY.',
 '(Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar']

In [6]:
# Trim parentheses from both ends of every token: first any "(" characters,
# then any ")" characters. Other punctuation is left attached to the token.
wordList = [token.strip("(").strip(")") for token in wordList]
wordList[:10]

['ETYMOLOGY.',
 'Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar']

In [7]:
# Vocabulary: the distinct tokens in ascending sort order, so that the
# token -> index mapping built from it is the same on every run.
wordSet = list(set(wordList))
wordSet.sort()

In [8]:
# Vocabulary size: number of distinct tokens; used below as the one-hot width.
nWordSet = len(wordSet)

In [9]:
# Corpus length: total number of tokens, duplicates included.
nWordList = len(wordList)

In [10]:
# Two-way lookup between tokens and their integer index in the sorted vocabulary.
encoding = {token: index for index, token in enumerate(wordSet)}
decoding = dict(enumerate(wordSet))

# Report corpus size and vocabulary size, each preceded by a blank line.
totalMessage = "Moby Dick contains {0} words/numbers/characters.".format(nWordList)
uniqueMessage = "Moby Dick contains {0} unique words/numbers/characters".format(nWordSet)
for message in (totalMessage, uniqueMessage):
    print()
    print(message)


Moby Dick contains 214986 words/numbers/characters.

Moby Dick contains 33415 unique words/numbers/characters


# Prepare for Input

In [11]:
# Number of tokens in each training window.
sentenceLength = 30

# Accumulators for the encoded windows (X_data) and the encoded token that
# follows each window (y_data).
X_data = []
y_data = []

In [12]:
# Slide a window of `sentenceLength` tokens over the corpus in steps of 20.
# Each window becomes one input sample; the token immediately after it is the
# prediction target (stored as a one-element list).
for start in range(0, nWordList - sentenceLength, 20):
    stop = start + sentenceLength
    X_data.append([encoding[token] for token in wordList[start:stop]])
    y_data.append([encoding[wordList[stop]]])

print("We have {0} sentences of length {1} from our Moby Dick story".format(len(X_data), sentenceLength))

We have 10748 sentences of length 30 from our Moby Dick story


In [13]:
# Vectorize data: one-hot encode every window and its target token.
#   X[i, t, k] is True when position t of window i holds vocabulary entry k.
#   y[i, k]    is True when the token following window i is vocabulary entry k.
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
# NOTE(review): X is a dense (samples, sentenceLength, vocabulary) array, so
# with a large vocabulary it can need many gigabytes of RAM. Confirm it fits.
X = np.zeros((len(X_data), sentenceLength, nWordSet), dtype=bool)
y = np.zeros((len(X_data), nWordSet), dtype=bool)
for i, sentence in enumerate(X_data):
    for t, encoded_word in enumerate(sentence):
        X[i, t, encoded_word] = 1
    # y_data[i] is a one-element list, so this sets a single column of row i.
    y[i, y_data[i]] = 1

print("Vectorization complete")

Vectorization complete


# Design Network 

In [None]:
# Network: two stacked LSTM layers followed by a softmax over the vocabulary.
# The original cell contained an unfinished `model.add(` and an `LSTM()` call
# with no unit count, so it could not be executed.
model = Sequential()
# return_sequences=True makes this layer emit its output at every timestep,
# which is the 3-D input the second LSTM layer requires.
model.add(LSTM(256, input_shape=(sentenceLength, nWordSet), return_sequences=True))
# Second recurrent layer; 256 units mirrors the first layer (tune as needed).
model.add(LSTM(256))
# One output unit per vocabulary entry, normalised into a probability
# distribution over the next token.
model.add(Dense(nWordSet))
model.add(Activation('softmax'))

In [None]:
# Configure training: Adam optimizer with categorical cross-entropy, matching
# the one-hot encoded targets in `y`.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Serialise the network architecture (layers only, no weights) to YAML.
netArchitecture = model.to_yaml()
# Open in write mode so re-running this cell replaces the file instead of
# appending a second copy of the architecture to it.
with open('model.yaml', 'w') as model_file:
    model_file.write(netArchitecture)

# Check before running 

In [None]:
# Sanity check before training: print the tensor shapes next to the values
# they were built from.
print(
    "Double check y. Dimension: {0} # Sentences: {1} Words in Moby Dick: {2}".format(
        y.shape, len(X_data), nWordSet
    )
)
print(
    "Double check X. Dimension: {0} Sentence length: {1}".format(
        X.shape, sentenceLength
    )
)

# Run model 

# Generate new writings!