In [1]:
# import the project Gutenberg dataset
from nltk.corpus import gutenberg

In [2]:
# list the file ids of the texts available in the NLTK Gutenberg corpus
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# build the training corpus: every Gutenberg file whose id mentions
# 'shakespeare', lowercased and concatenated in fileids() order
shakespeare_ids = [fid for fid in gutenberg.fileids() if 'shakespeare' in fid]
text = ''.join(gutenberg.raw(fid).lower() for fid in shakespeare_ids)

# vocabulary: the distinct characters of the corpus, in sorted order
chars = sorted(set(text))
# character -> column index, used when one-hot encoding
char_indices = {c: i for i, c in enumerate(chars)}
# column index -> character, used to decode a sampled index back to text
indices_char = dict(enumerate(chars))

f"corpus length: {len(text)} total chars: {len(chars)}"

'corpus length: 375542 total chars: 50'

In [4]:
# preview the first 500 characters of the corpus string
print(text[:500])

[the tragedie of julius caesar by william shakespeare 1599]


actus primus. scoena prima.

enter flauius, murellus, and certaine commoners ouer the stage.

  flauius. hence: home you idle creatures, get you home:
is this a holiday? what, know you not
(being mechanicall) you ought not walke
vpon a labouring day, without the signe
of your profession? speake, what trade art thou?
  car. why sir, a carpenter

   mur. where is thy leather apron, and thy rule?
what dost thou with thy best apparrell on


Next we’re going to chop up the source text into data samples, each a fixed-length window of `maxlen` characters.

In [5]:
# demo of a stride-3 range: the start offsets 0, 3, 6, ... below 50
list(range(0, 50, 3))

[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48]

In [6]:
maxlen = 40
step = 3

#window start offsets: every `step` characters, so consecutive samples
#overlap without being identical; stopping at len(text) - maxlen guarantees
#each window still has a following character to use as its target
window_starts = range(0, len(text) - maxlen, step)

#each training sample is a maxlen-character slice of the corpus ...
sentences = [text[start: start + maxlen] for start in window_starts]
#... paired with the single character that comes right after it
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences: ', len(sentences))

nb sequences:  125168


So we have 125,168 training samples and the character that follows each of them, the target for our model.

In [7]:
import numpy as np

#one-hot encode the training examples:
#  X[sample, position, char_index] is True for the character at that position
#  y[sample, char_index] is True for the character that follows the sample
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for row, (window, target) in enumerate(zip(sentences, next_chars)):
    #set one cell per position of the window in a single indexed assignment
    positions = np.arange(len(window))
    columns = [char_indices[c] for c in window]
    X[row, positions, columns] = True
    y[row, char_indices[target]] = True

In [8]:
#assemble a character-based LSTM model for generating text
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

#single-layer LSTM (128 units) that reads a one-hot window of shape
#(maxlen, len(chars)); the dense + softmax head yields one probability
#per character in the vocabulary
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

#categorical cross-entropy matches the one-hot targets in y
optimizer = RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               91648     
                                                                 
 dense (Dense)               (None, 50)                6450      
                                                                 
 activation (Activation)     (None, 50)                0         
                                                                 
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [9]:
#train the character-level Shakespeare text generator, checkpointing as we go
epochs = 6
batch_size = 128

#save the architecture once as JSON; the weights are written separately below
model_structure = model.to_json()
with open('shakes_lstm_model.json', "w") as json_file:
    json_file.write(model_structure)

#5 rounds of `epochs` epochs each; a numbered weights file is written after
#every round so any intermediate state can be reloaded later
weights_files = [f"shakes_lstm_weights_{i+1}.h5" for i in range(5)]
for weights_file in weights_files:
    model.fit(X, y, batch_size=batch_size, epochs=epochs)
    model.save_weights(weights_file)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [10]:
#sampler to generate character sequences
import random

def sample(preds, temperature=1.0):
    """Draw one class index from a probability vector, reshaped by temperature.

    The probabilities are re-weighted as p ** (1 / temperature) and
    renormalised, then a single index is drawn from that distribution.
    Temperatures below 1 sharpen the distribution (more conservative output),
    temperatures above 1 flatten it (more diverse output).

    Args:
        preds: 1-D array-like of class probabilities.
        temperature: positive scaling factor. A value <= 0 is treated as the
            limiting case and returns the most probable index without sampling.

    Returns:
        The sampled class index (as returned by ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')

    # limiting case: dividing by zero is undefined, but as temperature -> 0
    # the distribution collapses onto the most likely class
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) = -inf is the intended result for zero-probability classes
    # (they stay at probability 0), so silence the divide warning
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # shift so the largest logit is 0: the result is mathematically unchanged,
    # but exp() can no longer underflow every entry to 0 (which produced
    # 0/0 = NaN probabilities at very small temperatures)
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    # one multinomial trial gives a one-hot row; argmax recovers the index
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [11]:
#generate one 400-character continuation per diversity (temperature) level;
#every level starts from the same randomly chosen maxlen-character seed
import sys

start_index = random.randint(0, len(text) - maxlen - 1)
for diversity in [0.2, 0.5, 1.0]:
    print()
    print('------ diversity:', diversity)
    #the seed doubles as the initial model input window
    sentence = text[start_index: start_index + maxlen]
    generated = sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for _ in range(400):
        #one-hot encode the current window as a batch of one
        x = np.zeros((1, maxlen, len(chars)))
        positions = np.arange(len(sentence))
        columns = [char_indices[c] for c in sentence]
        x[0, positions, columns] = 1

        #predict the next-character distribution and sample from it
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        #slide the window forward by one character and stream the output
        generated += next_char
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()


------ diversity: 0.2
----- Generating with seed: "h,
by many of these traines, hath sought"
h,
by many of these traines, hath soughting:
that i shall i haue seene of the purpose
of my selfe to the marches and for they shake
to the selfe of the winde of march of the state
to the coursty of the pretter the selues:
and then the loot a country of the selues
all the convvsw-words: that we will for the strong of the selues
all the seate of my stranne of the state of the death,
and that i must be common him to the selues
all the seat

------ diversity: 0.5
----- Generating with seed: "h,
by many of these traines, hath sought"
h,
by many of these traines, hath sought,
that we to do? did you are need, and marke,
which i my lord, stand strong into much his march.
or your sweet marry then to make him to the prayers hand,
that i may thou shalt be done, and march it forreparted;
it shall follow them condent friends so behildonight
hath made him ear't them done to the selugue,
well senstlesse of h