# Springboard Data Science Career Track

## Capstone Project #2: Learning Neural Networks through Text Generation

## Extended Modeling #3: Changing Sequence Length of Word-Level Model

##### By Logan Larson

In [6]:
import numpy
from numpy import array
import string
import re
from random import randint
from pickle import load, dump
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ModelCheckpoint

In [3]:
# load text and convert to lowercase
filename = "AdventuresOfSherlockHolmes.txt"
raw_text = open(filename).read()
print(raw_text[:102])

To Sherlock Holmes she is always THE woman. I have seldom heard
him mention her under any other name. 


In [4]:
# convert to lowercase
raw_text = raw_text.lower()
# replace dashes
raw_text = raw_text.replace('--', '')
# split words by white space
tokens = raw_text.split()
# remove punctuation from each word
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
tokens = [re_punc.sub('', w) for w in tokens]
# remove any non-alphabetic tokens
tokens = [word for word in tokens if word.isalpha()]
print(tokens[:19])

['to', 'sherlock', 'holmes', 'she', 'is', 'always', 'the', 'woman', 'i', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name']


In [5]:
seq_length = 100
words = sorted(list(set(tokens)))
token_to_int = dict((c, i) for i, c in enumerate(words))
dataX = []
dataY = []
n_tokens = len(tokens)
for i in range(0, n_tokens - seq_length, 1):
    seq_in = tokens[i:i + seq_length]
    seq_out = tokens[i + seq_length]
    dataX.append([token_to_int[token] for token in seq_in])
    dataY.append(token_to_int[seq_out])
n_patterns = len(dataX)
print('Number of Tokens: ', len(tokens))
print("Total Patterns: ", n_patterns)

Number of Tokens:  104202
Total Patterns:  104102


In [7]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

# normalize
X = X / float(len(words))

# one hot encode the output variable
y = np_utils.to_categorical(dataY)

print(X.shape[1], X.shape[2])

100 1


In [8]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [9]:
# fit the model
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/20

Epoch 00001: loss improved from inf to 6.80120, saving model to weights-improvement-01-6.8012.hdf5
Epoch 2/20

Epoch 00002: loss improved from 6.80120 to 6.69458, saving model to weights-improvement-02-6.6946.hdf5
Epoch 3/20

Epoch 00003: loss improved from 6.69458 to 6.68769, saving model to weights-improvement-03-6.6877.hdf5
Epoch 4/20

Epoch 00004: loss improved from 6.68769 to 6.68766, saving model to weights-improvement-04-6.6877.hdf5
Epoch 5/20

Epoch 00005: loss did not improve from 6.68766
Epoch 6/20

Epoch 00006: loss did not improve from 6.68766
Epoch 7/20

Epoch 00007: loss did not improve from 6.68766
Epoch 8/20

Epoch 00008: loss did not improve from 6.68766
Epoch 9/20

Epoch 00009: loss did not improve from 6.68766
Epoch 10/20

Epoch 00010: loss did not improve from 6.68766
Epoch 11/20

Epoch 00011: loss did not improve from 6.68766
Epoch 12/20

Epoch 00012: loss did 

<keras.callbacks.History at 0x111c484a8>

In [16]:
# load the network weights
filename = "weights-improvement-04-6.6877.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [17]:
import sys

int_to_token = dict((i, c) for i, c in enumerate(words))
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" you will i am sure forgive anything that may be wanting in our arrangements when you consider the blow which has come so suddenly upon us my dear madam said i i am an old campaigner and if i were not i can very well see that no apology is needed if i can be of any assistance either to you or to my friend here i shall be indeed happy now mr sherlock holmes said the lady as we entered a welllit diningroom upon the table of which a cold supper had been laid out i should very much "
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [18]:
start2 = numpy.random.randint(576, len(dataX)-1)
pattern = dataX[start2]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" to look at him you are looking for a situation miss he asked yes sir as governess yes sir and what salary do you ask i had pounds a month in my last place with colonel spence munro oh tut tut sweatingrank sweating he cried throwing his fat hands out into the air like a man who is in a boiling passion how could anyone offer so pitiful a sum to a lady with such attractions and accomplishments my accomplishments sir may be less than you imagine said i a little french a little german music and drawing tut tut "
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [19]:
start3 = numpy.random.randint(1291, len(dataX)-1)
pattern = dataX[start3]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" set the engine at work the lamp still stood upon the floor where i had placed it when examining the trough by its light i saw that the black ceiling was coming down upon me slowly jerkily but as none knew better than myself with a force which must within a minute grind me to a shapeless pulp i threw myself screaming against the door and dragged with my nails at the lock i implored the colonel to let me out but the remorseless clanking of the levers drowned my cries the ceiling was only a foot or two above "
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [20]:
start4 = numpy.random.randint(291, len(dataX)-1)
pattern = dataX[start4]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" pointed over the meadows look there said he a heavily timbered park stretched up in a gentle slope thickening mto a grove at the highest point from amid the branches there jutted out the gray gables and high rooftree of a very old mansion stoke moran said he yes sir that be the house of dr grimesby roylott remarked the driver there is some building going on there said holmes that is where we are going theres the village said the driver pointing to a cluster of roofs some distance to the left but if you want to get to "
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [21]:
start5 = numpy.random.randint(2291, len(dataX)-1)
pattern = dataX[start5]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" in and planked down four golden sovereigns for my weeks work it was the same next week and the same the week after every morning i was there at ten and every afternoon i left at two by degrees mr duncan ross took to coming in only once of a morning and then after a time he did not come in at all still of course i never dared to leave the room for an instant for i was not sure when he might come and the billet was such a good one and suited me so well that i "
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.
