# Springboard Data Science Career Track

## Capstone Project #2: Learning Neural Networks through Text Generation

## Extended Model #2: Using Word-Level Inputs

##### By Logan Larson

In [3]:
import numpy
from numpy import array
import string
import re
from random import randint
from pickle import load, dump
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding
from keras.callbacks import ModelCheckpoint

In [4]:
# load text and convert to lowercase
filename = "AdventuresOfSherlockHolmes.txt"
raw_text = open(filename).read()
print(raw_text[:102])

To Sherlock Holmes she is always THE woman. I have seldom heard
him mention her under any other name. 


In [13]:
# convert to lowercase
raw_text = raw_text.lower()
# replace dashes
raw_text = raw_text.replace('--', '')
# split words by white space
tokens = raw_text.split()
# remove punctuation from each word
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
tokens = [re_punc.sub('', w) for w in tokens]
# remove any non-alphabetic tokens
tokens = [word for word in tokens if word.isalpha()]
print(tokens[:19])

['to', 'sherlock', 'holmes', 'she', 'is', 'always', 'the', 'woman', 'i', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name']


In [14]:
len(tokens)

104202

In [19]:
seq_length = 8
words = sorted(list(set(tokens)))
token_to_int = dict((c, i) for i, c in enumerate(words))
dataX = []
dataY = []
n_tokens = len(tokens)
for i in range(0, n_tokens - seq_length, 1):
    seq_in = tokens[i:i + seq_length]
    seq_out = tokens[i + seq_length]
    dataX.append([token_to_int[token] for token in seq_in])
    dataY.append(token_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  104194


In [22]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))

# normalize
X = X / float(len(words))

# one hot encode the output variable
y = np_utils.to_categorical(dataY)

print(X.shape[1], X.shape[2])

8 1


In [23]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [24]:
# fit the model
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/20

Epoch 00001: loss improved from inf to 6.61080, saving model to weights-improvement-01-6.6108.hdf5
Epoch 2/20

Epoch 00002: loss improved from 6.61080 to 6.45732, saving model to weights-improvement-02-6.4573.hdf5
Epoch 3/20

Epoch 00003: loss improved from 6.45732 to 6.44729, saving model to weights-improvement-03-6.4473.hdf5
Epoch 4/20

Epoch 00004: loss improved from 6.44729 to 6.44195, saving model to weights-improvement-04-6.4419.hdf5
Epoch 5/20

Epoch 00005: loss improved from 6.44195 to 6.43669, saving model to weights-improvement-05-6.4367.hdf5
Epoch 6/20

Epoch 00006: loss improved from 6.43669 to 6.42758, saving model to weights-improvement-06-6.4276.hdf5
Epoch 7/20

Epoch 00007: loss improved from 6.42758 to 6.41364, saving model to weights-improvement-07-6.4136.hdf5
Epoch 8/20

Epoch 00008: loss improved from 6.41364 to 6.39614, saving model to weights-improvement-08-6.

<keras.callbacks.History at 0x131fa7fd0>

In [26]:
# load the network weights
filename = "weights-improvement-20-5.7552.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [31]:
import sys

int_to_token = dict((i, c) for i, c in enumerate(words))
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" sure of it i had almost overcome my "
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [32]:
start2 = numpy.random.randint(576, len(dataX)-1)
pattern = dataX[start2]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" was a very ordinary black hat of the "
the and the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [33]:
start3 = numpy.random.randint(1291, len(dataX)-1)
pattern = dataX[start3]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" reptiles neck he drew it from its horrid "
the and the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [34]:
start4 = numpy.random.randint(291, len(dataX)-1)
pattern = dataX[start4]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" the case is the disposition of the child "
of the the of the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.


In [35]:
start5 = numpy.random.randint(2291, len(dataX)-1)
pattern = dataX[start5]
print("Seed:")
print("\"", ' '.join([int_to_token[value] for value in pattern]), "\"")

# generate characters
for i in range(100):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(words))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_token[index]
    seq_in = [int_to_token[value] for value in pattern]
    sys.stdout.write(result + " ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" way as ever came before me now let "
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 
Done.
