In [None]:
# Load necessary libraries 
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint
import numpy as np
import random
import sys

In [2]:
# Obtain the Moby Dick text (Project Gutenberg file 2701-0.txt) through Keras'
# get_file helper; `path` is what the next cell passes to open().
path = get_file(
    'mobydick.txt',
    origin='http://www.gutenberg.org/files/2701/2701-0.txt',
)

In [3]:
# Read the text as UTF-8 and discard the first 340 lines, keeping the rest.
# NOTE(review): 340 presumably skips the Project Gutenberg front matter - the
# first token that remains is 'ETYMOLOGY.'; confirm if the source file changes.
with open(path, encoding='utf8') as book_file:
    story_start = list(book_file)[340:]

In [4]:
# Flatten the kept lines into a single list of whitespace-separated tokens,
# preserving reading order.
word_list = [token for line in story_start for token in line.split()]

In [5]:
# Peek at the first ten tokens.
word_list[:10]

['ETYMOLOGY.',
 '(Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar']

In [6]:
# For every token, trim "(" characters from both ends and then ")" characters
# from both ends. Other punctuation (e.g. the "." in 'ETYMOLOGY.') is kept.
word_list = [token.strip("(").strip(")") for token in word_list]
word_list[:10]

['ETYMOLOGY.',
 'Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar']

In [7]:
# Vocabulary: the distinct tokens, sorted so each token has a fixed position.
word_set = list(set(word_list))
word_set.sort()

In [8]:
# Vocabulary size: number of distinct tokens. Used later as the width of the
# one-hot vectors and of the model's output layer.
num_word_set = len(word_set)
num_word_set

33415

In [9]:
# Corpus size: total number of tokens (duplicates included). Used later as the
# upper bound when sliding the training window over the text.
num_word_list = len(word_list)
num_word_list

214986

In [10]:
encoding = {j: i for i, j in enumerate(word_set)}
decoding = {i: j for i, j in enumerate(word_set)}
print()
print("Moby Dick contains {0} words/symbols.".format(num_word_list))
print()
print("Moby Dick contains {0} unique words/symbols.".format(num_word_set))


Moby Dick contains 214986 words/symbols.

Moby Dick contains 33415 unique words/symbols.


In [11]:
# Number of tokens in each input window.
sentence_length = 30
# Accumulators for the integer-encoded input windows and their target tokens.
X_data = []
y_data = []

In [12]:
# Slide a window of `sentence_length` tokens across the corpus, advancing 20
# tokens per step. Each window becomes one input sequence and the token that
# immediately follows it becomes the prediction target.
window_starts = range(0, num_word_list - sentence_length, 20)
for start in window_starts:
    stop = start + sentence_length
    X_data.append([encoding[token] for token in word_list[start:stop]])
    # The target is stored as a one-element list of the encoded next token.
    y_data.append([encoding[word_list[stop]]])

print("We have {0} sentences of length {1} from our Moby Dick story".format(len(X_data), sentence_length))

We have 10748 sentences of length 30 from our Moby Dick story


In [13]:
# One-hot encode the windows.
#   X[i, t, k] is True when the t-th token of window i has vocabulary index k.
#   y[i, k]    is True when the token following window i has vocabulary index k.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(X_data), sentence_length, num_word_set), dtype=bool)
y = np.zeros((len(X_data), num_word_set), dtype=bool)
for i, sentence in enumerate(X_data):
    for t, encoded_word in enumerate(sentence):
        X[i, t, encoded_word] = 1
    # y_data[i] is a one-element list, so this sets exactly one column.
    y[i, y_data[i]] = 1

print("Vectorization complete")

Vectorization complete


In [14]:
# Report the shapes of the one-hot input tensor X and the target matrix y.
x_summary = "Dimensions for X is {0} with each sentence being {1} words/symbols long"
y_summary = "Dimensions for y is {0} with {1} setences made from {2} unique words/symbols"
print(x_summary.format(X.shape, sentence_length))
print(y_summary.format(y.shape, len(X_data), num_word_set))

Dimensions for X is (10748, 30, 33415) with each sentence being 30 words/symbols long
Dimensions for y is (10748, 33415) with 10748 setences made from 33415 unique words/symbols


# Build the model 

In [16]:
# Next-word model: two stacked LSTM layers with dropout, then a dense layer
# with one unit per vocabulary entry followed by a softmax activation.
model = Sequential([
    # return_sequences=True so the second LSTM receives the whole sequence.
    LSTM(128, input_shape=(sentence_length, num_word_set), return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.1),
    Dense(num_word_set),
    Activation('softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adadelta',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 30, 128)           17174528  
_________________________________________________________________
dropout_3 (Dropout)          (None, 30, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 33415)             2171975   
_________________________________________________________________
activation_2 (Activation)    (None, 33415)             0         
Total params: 19,395,911
Trainable params: 19,395,911
Non-trainable params: 0
________________________________________________________________

In [17]:
# Serialize the model with to_yaml() and store it in model.yaml.
# The file is opened in write mode ('w'), not append mode: appending would add
# another YAML document on every re-run of this cell and leave a file that no
# longer describes a single model.
net_model = model.to_yaml()
with open('model.yaml', 'w') as model_file:
    model_file.write(net_model)

In [18]:
# Checkpoint callback: the file name template embeds the epoch number and the
# loss; the monitored quantity is "loss" in "min" mode with save_best_only=True.
file_path = "weights-{epoch:02d}-{loss:.3f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath=file_path,
    monitor="loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
callbacks = [checkpoint]

In [None]:
# Train for 10 epochs in mini-batches of 32, checkpointing via `callbacks`.
# `epochs` is the Keras 2 name of this argument; the Keras 1 spelling
# `nb_epoch` is deprecated and rejected by later releases.
model.fit(X, y, epochs=10, batch_size=32, callbacks=callbacks)