In [None]:
#danaderp July'19
#GenerativeLSTM

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dot, Input, Dense, Reshape, LSTM, Conv2D, Flatten, MaxPooling1D, Dropout, MaxPooling2D
from tensorflow.keras.layers import Embedding, Multiply, Subtract
from tensorflow.keras.models import Sequential, Model
from tensorflow.python.keras.layers import Lambda
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

In [None]:
from keras.utils import np_utils

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
%matplotlib inline

In [None]:
# Load the ASCII text and convert it to lowercase.
filename = "wonderland.txt"
# Read through a context manager so the file handle is closed as soon as the
# text is in memory, rather than lingering until garbage collection.
with open(filename) as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

In [None]:
# Map every distinct character of the corpus to an integer index.
# `chars` is the sorted vocabulary; `char_to_int` is the lookup table built
# from each character's position in that sorted list.
chars = sorted(set(raw_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [None]:
print(chars)

In [None]:
# Corpus length in characters and number of distinct characters.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

We will split the book text into subsequences with a fixed (and arbitrary) length of 100 characters. We could just as easily split the data by sentences, padding the shorter sequences and truncating the longer ones.

In [None]:
range(0, n_chars - 100, 1)

In [None]:
#Input Sequence
raw_text[0:0 + 100]

In [None]:
print([char_to_int[char] for char in raw_text[0:0 + 100]])

In [None]:
#Char Prediction
raw_text[0 + 100]

In [None]:
# Build (input window -> next character) training pairs, encoded as integers.
# Each input is `seq_length` consecutive characters; the target is the single
# character that immediately follows the window. The window slides one
# character at a time over the whole text.
seq_length = 100 #<------- [Hyperparameter]
dataX, dataY = [], []
for i in range(n_chars - seq_length):
    seq_in, seq_out = raw_text[i:i + seq_length], raw_text[i + seq_length]
    dataX.append(list(map(char_to_int.__getitem__, seq_in)))
    dataY.append(char_to_int[seq_out])

In [None]:
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

First, we must transform the list of input sequences into the form [samples, time steps, features] expected by an LSTM network.
Next, we rescale the integers to the range 0 to 1 to make the patterns easier to learn for the LSTM network, whose gates use a sigmoid activation function by default.
Finally, we convert the output patterns (single characters converted to integers) into a one-hot encoding.
Each y value becomes a sparse vector with a length of 47, full of zeros except for a 1 in the column for the character (integer) that the pattern represents.

In [None]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes by the vocabulary size
X = X / float(n_vocab)
# One-hot encode the output variable. The class count is pinned to the
# vocabulary size: left to inference, to_categorical would use max(dataY) + 1,
# which is smaller than n_vocab if the highest-indexed character never occurs
# as a target, and the output layer sized from y.shape[1] would shrink with it.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [None]:
X.shape

In [None]:
y

We can now define our LSTM model. Here we define a single hidden LSTM layer with 256 memory units. The network uses dropout with a probability of 0.2 (20%). The output layer is a Dense layer that uses the softmax activation function to output, for each of the 47 characters, a predicted probability between 0 and 1.

The problem is really a single character classification problem with 47 classes and as such is defined as optimizing the log loss (cross entropy), here using the ADAM optimization algorithm for speed.

In [None]:
# Single-layer LSTM classifier: 256 LSTM units, 20% dropout, and a softmax
# output with one unit per one-hot column of y.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

There is no test dataset. We are modeling the entire training dataset to learn the probability of each character in a sequence.

-----> We are not interested in the most accurate (classification accuracy) model of the training dataset. This would be a model that predicts each character in the training dataset perfectly. Instead, we are interested in a generalization of the dataset that minimizes the chosen loss function. We are seeking a balance between generalization and overfitting, stopping short of memorization.

------> The network is slow to train (about 300 seconds per epoch on an Nvidia K520 GPU). Because of the slowness and because of our optimization requirements, we will use model checkpointing to record all of the network weights to file each time an improvement in loss is observed at the end of the epoch. We will use the best set of weights (lowest loss) to instantiate our generative model in the next section.

In [None]:
# Checkpoint callback: monitors the training loss in 'min' mode and, with
# save_best_only=True, writes a file named after the epoch and loss only when
# the monitored value improves.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [1]:
###Generative Network from Tensorflow https://www.tensorflow.org/beta/tutorials/text/text_generation

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [3]:
#!pip install -q tensorflow-gpu==2.0.0-beta1
import tensorflow as tf

import numpy as np
import os
import time

In [4]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [5]:
# Read the raw bytes, then decode as UTF-8 (py2-compatible form).
# The context manager closes the file handle as soon as the bytes are read.
with open(path_to_file, 'rb') as shakespeare_file:
    text = shakespeare_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [6]:
# Take a look at the first 250 characters in text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [7]:
# The unique characters in the file
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

65 unique characters


In [8]:
# Vectorize the text.
# char2idx: character -> integer id (position in the sorted vocabulary).
# idx2char: array giving the inverse lookup, id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The whole corpus as an array of integer ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [9]:
# Pretty-print the first 20 entries of the character -> id table.
print('{')
for _, char in zip(range(20), char2idx):
    entry = '  {:4s}: {:3d},'.format(repr(char), char2idx[char])
    print(entry)
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '$' :   3,
  '&' :   4,
  "'" :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '3' :   9,
  ':' :  10,
  ';' :  11,
  '?' :  12,
  'A' :  13,
  'B' :  14,
  'C' :  15,
  'D' :  16,
  'E' :  17,
  'F' :  18,
  'G' :  19,
  ...
}


In [10]:
# Show how the first 13 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'First Citizen' ---- characters mapped to int ---- > [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [11]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(text)//seq_length

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

F
i
r
s
t


In [12]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [13]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[k] is the element that
    follows input[k] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [14]:
dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>

In [15]:
#Print the first examples input and target values:
for input_example, target_example in  dataset.take(2):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
Input data:  'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
Target data: 're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [16]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 39 ('a')
  expected output: 56 ('r')
Step    1
  input: 56 ('r')
  expected output: 43 ('e')
Step    2
  input: 43 ('e')
  expected output: 1 (' ')
Step    3
  input: 1 (' ')
  expected output: 39 ('a')
Step    4
  input: 39 ('a')
  expected output: 50 ('l')


In [17]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` rather than re-assigning `dataset`
# from itself: the cell is then idempotent, and re-running it no longer
# batches an already-batched dataset.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [18]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 512

In [19]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> stateful LSTM -> Dense model.

  Args:
    vocab_size: size of the embedding's input vocabulary and number of units
      in the final Dense layer.
    embedding_dim: output dimension of the Embedding layer.
    rnn_units: number of units in the LSTM layer.
    batch_size: fixed batch size baked into the input shape
      ([batch_size, None]); the LSTM is stateful.

  Returns:
    The tf.keras.Sequential model. The last Dense layer is given no
    activation argument.
  """
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None]))
  model.add(tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform'))
  model.add(tf.keras.layers.Dense(vocab_size))
  return model

In [20]:
# Training model: built with the training batch size (BATCH_SIZE).
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=BATCH_SIZE)

Try the model
Now run the model to see that it behaves as expected.

In [21]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 512)           1574912   
_________________________________________________________________
dense (Dense)                (64, None, 65)            33345     
Total params: 1,624,897
Trainable params: 1,624,897
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Sample one id per time step from the first example's predictions, then drop
# the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [26]:
sampled_indices

array([13, 57, 46, 57, 40,  8,  0, 11, 40, 11, 13, 44,  5,  0, 47, 17, 27,
       29, 10, 47, 46, 29, 63,  7, 57, 38,  3, 41, 43, 48,  6, 10, 64, 41,
       60, 64, 64, 47, 53, 47,  0, 17, 48, 55, 12, 53, 14,  9, 20, 64,  0,
       25, 41, 20, 45, 14, 32, 57, 63, 55, 14, 63,  0, 34, 63, 48, 32, 57,
       38, 20, 56, 35, 29, 19, 51, 31, 12, 43,  2, 29, 21, 14, 49, 25, 62,
       47, 52, 49, 60, 21, 26, 58, 52, 52, 50, 37, 17, 39, 16, 19])

In [27]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 "e not yet been seen in any house,\nNor can we lie distinguish'd by our faces\nFor man or master; then "

Next Char Predictions: 
 "Ashsb.\n;b;Af'\niEOQ:ihQy-sZ$cej,:zcvzzioi\nEjq?oB3Hz\nMcHgBTsyqBy\nVyjTsZHrWQGmS?e!QIBkMxinkvINtnnlYEaDG"


#Train the model

In [39]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  `from_logits=True` is passed, so `logits` is treated as unnormalized
  scores rather than probabilities.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.173676


In [40]:
model.compile(optimizer='adam', loss=loss)

In [41]:
# Checkpoints go under `checkpoint_dir`, one file prefix per epoch
# ("ckpt_{epoch}"); only the weights are saved.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [43]:
EPOCHS=30

In [44]:
# NOTE(review): `model` must still be the batch_size=BATCH_SIZE model here.
# The "Generate text" section rebinds `model` to a batch_size=1 copy, so
# re-running this cell afterwards fails with the batch-size ValueError
# recorded in this cell's output. Run the notebook top to bottom.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

ValueError: The batch output shape of your `Dataset` is 64, which is incompatible with the specified batch size of your Input Layer: 1

Generate text

In [33]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [34]:
# Resolve the checkpoint first: tf.train.latest_checkpoint returns None when
# the directory holds no checkpoint, and passing None to load_weights gives an
# unhelpful error. Checking before rebinding `model` also leaves the training
# model untouched when nothing can be restored.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; train the model first.".format(checkpoint_dir))

# NOTE: this rebinds `model` to a batch_size=1 copy used for generation; the
# training cells above expect the batch_size=BATCH_SIZE model.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [35]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 512)            1574912   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             33345     
Total params: 1,624,897
Trainable params: 1,624,897
Non-trainable params: 0
_________________________________________________________________


The prediction loop

In [36]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text one character at a time with the learned model.

  Args:
    model: model called as `model(input_ids)` and exposing `reset_states()`;
      the batch dimension of its output is squeezed away, so it must have
      been built with batch size 1.
    start_string: non-empty seed text. Every character must be a key of the
      module-level `char2idx` table.
    num_generate: number of characters to sample after the seed. Defaults to
      1000, the value that used to be hard-coded.
    temperature: positive divisor applied to the logits before sampling.
      Low temperatures give more predictable text, higher temperatures give
      more surprising text. Defaults to 1.0.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
    KeyError: if `start_string` contains a character missing from `char2idx`.
  """
  # Validate before touching the model: an empty seed produces a zero-length
  # input sequence, and a non-positive temperature breaks the logit scaling.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive, got {}".format(temperature))

  # Converting our start string to numbers (vectorizing), with a batch axis.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Characters sampled so far.
  text_generated = []

  # Here batch size == 1. Start from a clean recurrent state.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits, then sample from the categorical distribution; only
    # the sample for the last time step ([-1, 0]) is kept.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # The predicted character becomes the next input; the stateful model
    # carries the previous hidden state forward.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [37]:
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: fAUXFDKX
MQ33JxNxKYCLAQt3RVG?&CTSfQS&X3jXC3ZSKMxVNQKQWxjPYMLGLZLNWQJ$pKQV3FVF&VYK&QSFMZQB$HUHQ$WLYVCNK$QJjHY;;N EWMQVNQCZAJwWPJKQbSXQVQJJGjQHXHVSVZp.NN?QJJPBQSUQeXS!V$ASM$SQjx$3IEQLMxBQ!SXxV&GzJjQxYzQWIqOQN!3P!EKKQKXQ
:GRV$Gz3GCVSbIX.PXG$YVYNxuV;EQJINjVgQL3SjQV3VQQxNJNQZ$z3VKQ&v$qxN-EJ&x;-xqX$QZQjJFQYPLUVGzV!3OQCYQ3HSQQOWQQH&F$QKN3Qg3L3QvB;;NINVLQJ33VUXAVJHQYI3CZQTZEQHgKQGjCzPSz-IUWRVVIiZYVjNPZyZOHBHZkVBH3&$DXxYZX3RVYVLVXRJKLEZGQQxSZ3K?QTXKGGKCXIQQVCXXQCQU3X3&XYQ:XJQ$LgK;GVxY&ZjDCQQSEMGzQcJ&WSJJJKQ&pYXZjDQGZToTXhNICQ&3VX,xP$FzVLqqYQkULVX$QKCQ&
XZQPRXTZCIHKKOSINZ;SXN$Hz!QPXbmYXjVJJjVVSQQOQ$KJ3XXHxPJPTHXKQDU3WZYNNUASQNXZSGMBOIYSkJQMYS,N-JQWRZL.BGDKXxWq$Q&EHqaEI$zJPXXW$QZNXK,VDZkQmzYG3SCVVXDQVQKCQSESBIVXHDKT&QHZSYQ&3RZMScRKL3BjUKQ&KQ3YQnLM$3JVG$DJxDJnQ3OZqOIQjVGZCQVjVLXRPJQLFJQZBUVYGUOQ3WjFGV,LCFJKVXQxHURNZxVVXYVEYS&FQBGZQZHUJQJRD:C$QSCQPjR3VK3XQMB$YSYQQLQSPHU$OBL&3MZXDQTXNHJQ!Px-QQVQJKV!xYFCQSDRQORjMxXjfY3GQjQLjBSQVNHX&UZFQMVQQKSQNDAYS&j-KPUMQA;QZw;CIJZHNPQZX3KQ&OJ;MNa3xYJjHjQLZ$Q