In [1]:
# Adapted from Chapter 12 of LDL book

In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)

EPOCHS = 32
BATCH_SIZE = 256

INPUT_FILE_NAME = 'data/frankenstein.txt'

In [3]:
# Data is from here: https://www.gutenberg.org/files/84/84-0.txt

In [4]:
WINDOW_LENGTH = 40
WINDOW_STEP = 3
PREDICT_LENGTH = 3
MAX_WORDS = 10000
EMBEDDING_WIDTH = 100

In [5]:
# Open and read file
file = open(INPUT_FILE_NAME, 'r', encoding='utf-8')
text = file.read()
file.close()

In [6]:
# Make lower case and split into individual words
text = text_to_word_sequence(text)

In [7]:
text[0:10]

['\ufeffthe',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'frankenstein',
 'by',
 'mary',
 'wollstonecraft',
 'godwin']

In [8]:
# Create training examples.
fragments = []
targets = []
for i in range(0, len(text) - WINDOW_LENGTH, WINDOW_STEP):
  fragments.append(text[i: i + WINDOW_LENGTH])
  targets.append(text[i + WINDOW_LENGTH])

In [9]:
# Converting training input to word indices and then output to one-hot encoding
tokenizer = Tokenizer(num_words = MAX_WORDS, oov_token='UNK')
tokenizer.fit_on_texts(text)

In [10]:
type(tokenizer)

keras_preprocessing.text.Tokenizer

In [11]:
fragments_indexed = tokenizer.texts_to_sequences(fragments)

In [12]:
targets_indexed = tokenizer.texts_to_sequences(targets)

In [13]:
fragments[0:1]

[['\ufeffthe',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'frankenstein',
  'by',
  'mary',
  'wollstonecraft',
  'godwin',
  'shelley',
  'this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'in',
  'the',
  'united',
  'states',
  'and',
  'most',
  'other',
  'parts',
  'of',
  'the',
  'world',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions',
  'whatsoever']]

In [14]:
fragments_indexed[0]

[4294,
 100,
 88,
 687,
 5,
 305,
 24,
 2380,
 2381,
 2382,
 2383,
 26,
 687,
 32,
 23,
 2,
 387,
 5,
 982,
 3074,
 9,
 2,
 433,
 458,
 3,
 92,
 96,
 1633,
 5,
 2,
 182,
 31,
 53,
 2384,
 3,
 13,
 203,
 53,
 3075,
 2385]

In [15]:
targets[0]

'you'

In [16]:
targets_indexed[0]

[16]

In [17]:
# Convert to appropriate input and output formats
X = np.array(fragments_indexed, dtype=np.int)
y = np.zeros((len(targets_indexed), MAX_WORDS))

for i, target_index in enumerate(targets_indexed):
  y[i, target_index] = 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [18]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
# Now, build and train the model!
training_model = Sequential()
training_model.add(Embedding(output_dim=EMBEDDING_WIDTH, input_dim=MAX_WORDS, mask_zero=True, input_length=None))
training_model.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
training_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
training_model.add(Dense(128, activation='relu'))
training_model.add(Dense(MAX_WORDS, activation='softmax'))

In [20]:
training_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [21]:
training_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 10000)             1290000   
                                                                 
Total params: 2,555,344
Trainable params: 2,555,344
Non-trainable params: 0
_________________________________________________________________


In [22]:
history = training_model.fit(X, y, validation_split=0.05, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=2, shuffle=True)

Epoch 1/32
98/98 - 61s - loss: 7.1969 - val_loss: 7.7315 - 61s/epoch - 627ms/step
Epoch 2/32
98/98 - 46s - loss: 6.4788 - val_loss: 7.8823 - 46s/epoch - 468ms/step
Epoch 3/32
98/98 - 38s - loss: 6.3339 - val_loss: 8.1629 - 38s/epoch - 387ms/step
Epoch 4/32
98/98 - 40s - loss: 6.2233 - val_loss: 8.3384 - 40s/epoch - 408ms/step
Epoch 5/32
98/98 - 42s - loss: 6.0975 - val_loss: 8.3084 - 42s/epoch - 432ms/step
Epoch 6/32
98/98 - 46s - loss: 5.9743 - val_loss: 8.3917 - 46s/epoch - 467ms/step
Epoch 7/32
98/98 - 41s - loss: 5.8852 - val_loss: 8.4861 - 41s/epoch - 418ms/step
Epoch 8/32
98/98 - 40s - loss: 5.8399 - val_loss: 8.3894 - 40s/epoch - 409ms/step
Epoch 9/32
98/98 - 45s - loss: 5.7699 - val_loss: 8.5485 - 45s/epoch - 463ms/step
Epoch 10/32
98/98 - 41s - loss: 5.6885 - val_loss: 8.6191 - 41s/epoch - 422ms/step
Epoch 11/32
98/98 - 44s - loss: 5.6368 - val_loss: 8.5958 - 44s/epoch - 451ms/step
Epoch 12/32
98/98 - 44s - loss: 5.5637 - val_loss: 8.6279 - 44s/epoch - 446ms/step
Epoch 13/32
9

In [28]:
# Build a stateful model used for prediction
inference_model = Sequential()
inference_model.add(Embedding(output_dim=EMBEDDING_WIDTH, input_dim=MAX_WORDS, mask_zero=True, batch_input_shape=(1,1)))
inference_model.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, stateful=True))
inference_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, stateful=True))
inference_model.add(Dense(128, activation='relu'))
inference_model.add(Dense(MAX_WORDS, activation='softmax'))
weights = training_model.get_weights()
inference_model.set_weights(weights)

In [29]:
# Feeding the predicted output back as input with one word each time
first_words = ['i', 'saw']
first_words_indexed = tokenizer.texts_to_sequences(first_words)

inference_model.reset_states() # this is important!
predicted_string = ''

# Feed initial words to the model
for i, word_index in enumerate(first_words_indexed):
  x = np.zeros((1,1), dtype=np.int)
  x[0][0] = word_index[0]
  predicted_string += first_words[i]
  predicted_string += ' '
  y_predict = inference_model.predict(x, verbose=0)[0]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # Remove the CWD from sys.path while we load stuff.


In [30]:
# Predict PREDICT_LENGTH words
for i in range(PREDICT_LENGTH):
  new_word_index = np.argmax(y_predict)
  word = tokenizer.sequences_to_texts([[new_word_index]])
  x[0][0] = new_word_index
  predicted_string += word[0]
  predicted_string += ' '
  y_predict = inference_model.predict(x, verbose=0)[0]
print(predicted_string)

i saw i had been 


In [31]:
# Explore embedding similarities
embeddings = training_model.layers[0].get_weights()[0]
lookup_words = ['the', 'saw', 'see', 'of', 'and', 'monster', 'frankenstein', 'read', 'eat']

In [32]:
for lookup_word in lookup_words:
  lookup_word_indexed = tokenizer.texts_to_sequences([lookup_word])
  print('words close to: ', lookup_word)
  lookup_embedding = embeddings[lookup_word_indexed[0]]
  word_indices = {}
  # Calculate distances.
  for i, embedding in enumerate(embeddings):
    distance = np.linalg.norm(embedding - lookup_embedding)
    word_indices[distance] = i

  for distance in sorted(word_indices.keys())[:5]:
    word_index = word_indices[distance]
    word = tokenizer.sequences_to_texts([[word_index]])[0]
    print(word + ': ', distance)
  print(' ')

words close to:  the
the:  0.0
“the:  1.2943733
insensible:  1.4316504
adam’s:  1.4418707
perpendicular:  1.447996
 
words close to:  saw
saw:  0.0
if:  0.5204753
asked:  0.5256026
incomplete:  0.52671164
howlings:  0.526892
 
words close to:  see
see:  0.0
UNK:  0.44503567
copied:  0.49491465
UNK:  0.49917632
UNK:  0.5021899
 
words close to:  of
of:  0.0
UNK:  0.36455932
UNK:  0.37026545
accepting:  0.37322485
UNK:  0.37345725
 
words close to:  and
and:  0.0
not:  0.39689806
UNK:  0.43762556
UNK:  0.4407267
UNK:  0.4459028
 
words close to:  monster
monster:  0.0
UNK:  0.39784324
UNK:  0.41628847
UNK:  0.42031655
UNK:  0.42256182
 
words close to:  frankenstein
frankenstein:  0.0
to:  0.49774316
is:  0.5118301
not:  0.5363105
UNK:  0.55768377
 
words close to:  read
read:  0.0
jumped:  0.49513093
what:  0.5026335
misery:  0.50475585
inferiors:  0.5058931
 
words close to:  eat
eat:  0.0
chaise:  0.49543878
irresistible:  0.50338644
pure:  0.52177197
adrift:  0.5284761
 
