### Character-level Text Generation - [TensorFlow tutorial](https://www.tensorflow.org/tutorials/sequences/text_generation)

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import os
import time

In [0]:
# Download the Shakespeare dataset
data_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# Read the whole file into a single string.
# The file is opened in binary mode and decoded explicitly so the result does not
# depend on the platform's default encoding, and the 'with' block guarantees the
# file handle is closed again (the bare open(...).read() leaked it).
with open(data_file, 'rb') as f:
  text = f.read().decode('utf-8')

# length of text is the number of characters in it
print ('Length of text: {} characters\n'.format(len(text)))
# Take a look at the first 250 characters in text
print(text[:250])

# The unique characters in the file, in sorted order.
# Vocab contains all the characters like 'a'-'z', 'A'-'Z', '!', '\n', '?', '&', ' ', '.' and so on
vocab = sorted(set(text))
# Length of the vocabulary in chars
vocab_size = len(vocab)

print ('{} unique characters'.format(len(vocab)))

Length of text: 1115394 characters

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

65 unique characters


In [0]:
# Vectorize the text: turn every character into an integer id.
# Two lookup tables are needed, one per direction (character -> id and id -> character).

# Forward table: each character maps to its position in the sorted vocabulary
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse table: an array, so a whole array of ids can be decoded with one indexing operation
idx2char = np.array(vocab)

# Encode the complete corpus as ids in the range 0 .. len(vocab) - 1
text_as_int = np.array([char2idx[ch] for ch in text])

print ('=========== Char -> Index\n', char2idx, '\n========== Index -> Char\n', idx2char)
sample_text = repr(text[:100])
sample_ids = text_as_int[:100]
print ('=========== Sample of first 100 charaters\n', sample_text, '\n===========\n', sample_ids)

 {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64} 
 ['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou' 
 [18 47 56 57 58  1 15 47 58 47 64 43 52 1

In [0]:
# Create training examples and targets. Divide the text into example sequences. Each input sequence will contain seq_length characters from the text.
# For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.

# The maximum length sentence we want for a single input in characters
seq_length = 100 # number of time-steps

# Every example consumes seq_length + 1 characters of the text (the input plus the
# one-character-shifted target), so that is the divisor for the number of examples.
# Dividing by seq_length alone over-counts the examples and inflates steps_per_epoch.
examples_per_epoch = len(text) // (seq_length + 1)

# Create a TF Dataset object from the text indices
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# take(5) yields a dataset limited to the first 5 elements (ie. numeric indices).
# Iterating over it gives one tensor per element; i.numpy() converts that to a plain number
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

F
i
r
s
t


In [0]:
# Group consecutive characters into fixed-size chunks with batch().
# Each chunk holds seq_length + 1 ids; an incomplete chunk at the end is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

for chunk_ids in sequences.take(5):
  # Decode the ids back to characters and glue them into a single string
  decoded = ''.join(idx2char[chunk_ids.numpy()])
  print(repr(decoded))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [0]:
# For each sequence, duplicate and shift it to form the input and target text by using map() to apply a simple function to each batch

def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[k] is the element that
    follows input[k] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk: each dataset element becomes a tuple of two tensors,
# (input text sequence, target text sequence).
dataset = sequences.map(split_input_target)

# Decode the first pair back to text to check the one-character offset
for input_example, target_example in dataset.take(1):
  input_text = ''.join(idx2char[input_example.numpy()])
  target_text = ''.join(idx2char[target_example.numpy()])
  print ('Input data: ', repr(input_text))
  print ('Target data:', repr(target_text))

print ('======================')

# Walk through the first five time-steps of that pair: at each step the model
# receives the input character and is expected to output the target character.
first_steps = zip(input_example[:5], target_example[:5])
for step, (input_idx, target_idx) in enumerate(first_steps):
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


In [0]:
# Create training batches - shuffle the data and pack it into batches

# Batch size
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Now, each element of the dataset will be a tuple with two tensors.
# The first tensor is an input batch and the second tensor is a target batch
# The input batch tensor has 'BATCH_SIZE' items in it, each item being an input sequence of length 'seq_length'
# The target batch tensor has 'BATCH_SIZE' items in it, each item being a target sequence of length 'seq_length'
#
# The pipeline is rebuilt from 'sequences' instead of re-assigning 'dataset' from itself,
# so that re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True))

![Text generation training diagram](https://tensorflow.org/tutorials/sequences/images/text_generation_training.png)

In [0]:
# Size of the vector each character id is embedded into
embedding_dim = 256

# Number of units in the recurrent layer
rnn_units = 1024

# The model is built from three layers:
#     tf.keras.layers.Embedding: The input layer. A trainable lookup table that maps each character id to a vector with embedding_dim dimensions;
#     tf.keras.layers.LSTM: A type of RNN with size units=rnn_units
#     tf.keras.layers.Dense: The output layer, with vocab_size outputs.

# Pick the recurrent layer constructor: the cuDNN-backed LSTM when a GPU is available,
# otherwise the generic LSTM with its recurrent activation set to 'sigmoid'.
# NOTE(review): the 'sigmoid' override is presumably there to match CuDNNLSTM's gate
# activation so both variants behave alike - confirm against the Keras docs.
if not tf.test.is_gpu_available():
  import functools
  rnn = functools.partial(tf.keras.layers.LSTM, recurrent_activation='sigmoid')
else:
  rnn = tf.keras.layers.CuDNNLSTM

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> RNN -> Dense character model.

  Args:
    vocab_size: number of distinct characters; used as the embedding input
      size and as the number of outputs of the final Dense layer.
    embedding_dim: dimensionality of the embedding vectors.
    rnn_units: number of units of the recurrent layer.
    batch_size: fixed batch size of the model input. It is baked into the
      input shape, and the recurrent layer is created with stateful=True.

  Returns:
    An uncompiled tf.keras.Sequential model. The Dense layer has no
    activation, so its outputs are unnormalized scores over the vocabulary.
  """
  embedding_layer = tf.keras.layers.Embedding(
      vocab_size, embedding_dim,
      batch_input_shape=[batch_size, None])
  # 'rnn' is the layer constructor selected at module level (CuDNNLSTM or LSTM)
  recurrent_layer = rnn(
      rnn_units,
      return_sequences=True,
      recurrent_initializer='glorot_uniform',
      stateful=True)
  output_layer = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

# Instantiate the training model (fixed to BATCH_SIZE sequences per batch) and list its layers
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (64, None, 256)           16640     
_________________________________________________________________
cu_dnnlstm_8 (CuDNNLSTM)     (64, None, 1024)          5251072   
_________________________________________________________________
dense_8 (Dense)              (64, None, 65)            66625     
Total params: 5,334,337
Trainable params: 5,334,337
Non-trainable params: 0
_________________________________________________________________


In [0]:
# We feed it sequences of length 'seq_length' but the model can be run on inputs of any length

# Try the untrained model for one example, and check the shape of the output
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# To get actual predictions from the model we need to sample from the output distribution,
# to get actual character indices. This distribution is defined by the logits over the
# character vocabulary. We must sample, not use an argmax of the output

# Get a prediction for the first example from the batch, by sampling from its outputs.
# example_batch_predictions[0] gives us the first example from the batch. It has
# shape [seq_length, vocab_size]
#
# We ask it to return to us one sample from the distribution ie. 'num_samples = 1'
# The returned 'sampled_indices' will have shape [seq_length, num_samples] ie [seq_length, 1]
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

# Squeeze() removes the second dimension since it has size=1
# 'sampled_indices' now has shape [seq_length, ] ie. there is one predicted output
# value for each character in the input sequence
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
print ('Sample indices shape ', sampled_indices.shape)

# Convert the input ids to a NumPy array before using them to index 'idx2char',
# rather than indexing a NumPy array with a Tensor.
print("Input: \n", repr("".join(idx2char[input_example_batch[0].numpy()])))

print("\nNext Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

(64, 100, 65) # (batch_size, sequence_length, vocab_size)
Sample indices shape  (100,)
Input: 
 'd and father, madam,\nI will not marry yet; and, when I do, I swear,\nIt shall be Romeo, whom you know'

Next Char Predictions: 
 'pmrcO3wj$tpoZheM. b?zw?oDJbblWRkASjHxdHjzeSPekW$G.m3G33T3cYWshk \nNlPVdK;I3NtikstVu.v\nQ.i.WoWVylFXNl?'


In [0]:
# Train the model - as a standard classification problem. Given the previous RNN state, and the 
# input for this time step, predict the class of the next character.

# Attach an optimizer, and a loss function
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  Args:
    labels: integer class ids (here: the target character ids).
    logits: unnormalized model outputs; from_logits=True tells the loss
      that no softmax has been applied to them yet.

  Returns:
    The tensor returned by tf.keras.losses.sparse_categorical_crossentropy
    (presumably one loss value per label position, not yet averaged).
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
      labels, logits, from_logits=True)
  return crossentropy

# Sanity-check the loss on the predictions of the still untrained model
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
mean_example_loss = example_batch_loss.numpy().mean()
print("scalar_loss:      ", mean_example_loss)

# Attach the optimizer and the loss function to the model
model.compile(optimizer=tf.train.AdamOptimizer(), loss=loss)

# Configure checkpoints - tf.keras.callbacks.ModelCheckpoint saves the weights during training

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; '{epoch}' is left as a placeholder in the path
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: store only the model weights, not the whole model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Execute the training
EPOCHS = 3

# The dataset is repeated indefinitely, so steps_per_epoch decides where each epoch ends
history = model.fit(
    dataset.repeat(),
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    callbacks=[checkpoint_callback])

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.1742887
Epoch 1/3


W0705 09:56:43.711948 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer
W0705 09:56:43.713227 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer.optimizer
W0705 09:56:43.714274 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer.global_step
W0705 09:56:43.717313 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer.optimizer.beta1_power
W0705 09:56:43.720095 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer.optimizer.beta2_power
W0705 09:56:43.722656 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer.optimizer's state 'm' for (root).layer_with_weights-0.embeddings
W0705 09:56:43.723775 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer.optimizer's state 'm' for (root).layer_with_weights-1.kernel
W0705 09:56:43.724933 140673951852416 util.py:244] Unresolved object in checkpoint: (root).optimizer.

Epoch 2/3
Epoch 3/3


In [0]:
# Restore the latest checkpoint. To keep this prediction step simple, use a batch size of 1.

# Look the checkpoint up first: tf.train.latest_checkpoint returns None when the
# directory holds no checkpoint. Failing here, before 'model' is re-bound, gives a
# clear message and leaves the existing model untouched.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise IOError(
      "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir))

# Same architecture as the training model, but with a batch size of 1
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
cu_dnnlstm_9 (CuDNNLSTM)     (1, None, 1024)           5251072   
_________________________________________________________________
dense_9 (Dense)              (1, None, 65)             66625     
Total params: 5,334,337
Trainable params: 5,334,337
Non-trainable params: 0
_________________________________________________________________


![Text generation sampling diagram](https://tensorflow.org/tutorials/sequences/images/text_generation_sampling.png)

In [0]:
# Generate text 
#
# Start by choosing a start string, initializing the RNN state and setting the number of characters to generate.
# Get the prediction distribution of the next character using the start string and the RNN state.
# Sample from the distribution to calculate the index of the predicted character. Use this predicted character as our next input to the model.
# 
# The RNN state returned by the model is fed back into the model so that it now has more context, instead
# of only one character. After predicting the next character, the modified RNN states are again fed back into the
# model, which is how it builds up context from the previously predicted characters.

def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text one character at a time with the learned model.

  Args:
    model: callable model that maps a [1, n] tensor of character ids to
      [1, n, vocab_size] logits and exposes reset_states(). It is called
      with a batch of exactly one sequence.
    start_string: non-empty seed text. Every character must be a key of
      the module-level 'char2idx' table.
    num_generate: number of characters to generate after the seed.
    temperature: positive number the logits are divided by before sampling.
      Low temperatures result in more predictable text, higher temperatures
      in more surprising text.

  Returns:
    start_string followed by the num_generate generated characters.

  Raises:
    ValueError: if start_string is empty or temperature is not positive.
    KeyError: if start_string contains a character missing from char2idx.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  # Add an extra dimension of size 1 for the batch. The shape is [1, len(start_string)]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected in a list and joined once at the end
  text_generated = []

  # Here batch size == 1
  # Clear any recurrent state left over from a previous call
  model.reset_states()
  for _ in range(num_generate):
    # 'input_eval' is a single character, except for the first time through
    # the loop, when it is the whole start_string
    predictions = model(input_eval)
    # remove the batch dimension -> shape [len(input_eval), vocab_size]
    predictions = tf.squeeze(predictions, 0)

    # Only the logits of the last time-step predict the next character
    last_step_logits = predictions[-1:] / temperature

    # Sample the next character id from the categorical distribution defined by
    # the logits. The result has shape [1, 1]; [0, 0] extracts the scalar id.
    predicted_id = tf.random.categorical(last_step_logits, num_samples=1)[0, 0].numpy()

    # We pass the predicted character as the next input to the model.
    # 'input_eval' now is a single character and has shape [1, 1]
    input_eval = tf.expand_dims([predicted_id], 0)

    # Save away the single predicted character
    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

# Seed the generator with a speaker tag and print the sampled continuation
generated_sample = generate_text(model, start_string=u"ROMEO: ")
print(generated_sample)

ROMEO: VIjRjE&VEJPHEqQD3YEC$NVYZNYTHIOADAUKxCZDBOFAHFMHEHBUNWNVGOCHlZAdZVMONGjOD&.ZYOZEEMEEL$ZKUCGAUABZNUUD$QYZVIQFP3VQGCKQE$ZKDZOKBRA3DP&DYUK;3RHZY:NWEARTHSGEMWONVQY
HAKENVONMENGERD LORETCOF Eame the othel men
Asheer'd when he Lith my here:
My have one-brother out ofmeniby for the gaven,
And this tould their curest complessige.

Battban:
We can thou, changles, pare, dy I cannoty:
Who, that your lordser: to more hand be more?

Second Sen:
Whure is be a king, let him betite.

LORD EDWARD:
But thind unong steke palies
To the gelour brew himself, singe
Before I am I was bud uned I ame least I know more in not
les dead? I subj, for 'tway, if:
Ifford get wish house.

Second Co:
Mur douns
And ancolth, hor how may have how he canist those fair groly'd
F, inse, betreate Hercince but bein
When deed can terus by the lire, to geet int
To sen to will bed: by soup in the clown;
That back her own pather, Hord you word.
A good knews sir, and thou lovess you,
he
Tould to faight,
Brows for their facce 

In [0]:
# Remove the checkpoints written during training.
# -f keeps this cell quiet when the directory is already gone, so the notebook
# can be re-run from any state.
!rm -rf training_checkpoints