In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np 
import os
import time

In [0]:
# Ask Colab for TensorFlow 2.x. Any exception raised by the magic is
# deliberately ignored so the cell is harmless when it cannot be honoured.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [0]:
import tensorflow as tf

In [0]:
# Pick the directory used to cache the dataset: Colab's working directory when
# the google.colab package can be imported, a local ./datasets folder otherwise.
try:
  from google.colab import files  # noqa: F401 -- imported only to detect Colab
  BASE_PATH = "/content"
except ImportError:
  # Only the "not running on Colab" case is handled here; a bare except would
  # also swallow KeyboardInterrupt/SystemExit and hide unrelated failures.
  BASE_PATH = "./datasets"

In [0]:
# Remote location of the corpus and the file name it is stored under locally
DATASET_URL = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
DATASET_NAME = "shakespeare.txt"

In [0]:
# Fetch the corpus (cached under BASE_PATH); the returned value is passed to open() below,
# i.e. it is a local file path rather than an open file object.
file_handler = tf.keras.utils.get_file(DATASET_NAME, DATASET_URL, cache_dir=BASE_PATH)

In [50]:
# Read the file and decode it.
# The context manager guarantees the file handle is closed, and the codec is
# spelled with its canonical name "utf-8".
with open(file_handler, 'rb') as dataset_file:
  text = dataset_file.read().decode(encoding='utf-8')

# print number of characters on it
print("Length of text: {} characters".format(len(text)))

Length of text: 1115394 characters


In [51]:
# Preview the first 250 characters of the corpus
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [52]:
# Vocabulary: every distinct character of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print("{} unique characters".format(len(vocab)))

65 unique characters


# Process the text 

Let's vectorize the text by mapping each character to a numerical representation.

To do so, we will create two maps:


*   chars to nums
*   nums to chars

In [0]:
# Forward map: character -> integer id (its position in the sorted vocabulary)
chars_to_nums = dict(zip(vocab, range(len(vocab))))
# Reverse map: integer id -> character, kept as an array so it can be indexed by ids
nums_to_char = np.array(vocab)

# The whole corpus encoded as an array of integer ids
text_as_int = np.array([chars_to_nums[symbol] for symbol in text])

In [54]:
# Show the first 20 entries of the character -> id map
print("{")
for symbol, number in list(chars_to_nums.items())[:20]:
  print(" {:4s}: {:3d},".format(repr(symbol), number))
print(" ...\n}")

{
 '\n':   0,
 ' ' :   1,
 '!' :   2,
 '$' :   3,
 '&' :   4,
 "'" :   5,
 ',' :   6,
 '-' :   7,
 '.' :   8,
 '3' :   9,
 ':' :  10,
 ';' :  11,
 '?' :  12,
 'A' :  13,
 'B' :  14,
 'C' :  15,
 'D' :  16,
 'E' :  17,
 'F' :  18,
 'G' :  19,
 ...
}


In [55]:
# Show how the first 13 chars from text are mapped to integers
# (the same slice is taken from the raw text and from its integer encoding)
print("{} ----- characters mapped to int ----> {}".format(repr(text[:13]), text_as_int[:13]))

'First Citizen' ----- characters mapped to int ----> [18 47 56 57 58  1 15 47 58 47 64 43 52]


# Prediction Time 

Given a character or a sequence of characters, what is the most probable next character?

We will use an RNN for this, since it maintains an internal state of what it has seen previously.

In [0]:
# Max length of the sentence we want for a single input in characters
seq_length = 100 
# Number of whole (seq_length + 1)-character chunks that fit in the corpus;
# the extra character is there because each chunk is later split into an
# input and a target shifted by one position.
examples_per_epoch = len(text)//(seq_length+1)

In [0]:
# Create training examples: a dataset that yields the encoded text one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [58]:
# Decode and print the first 14 elements of the dataset, one character per line
for char_id in char_dataset.take(14):
  decoded = nums_to_char[char_id.numpy()]
  print(decoded)

F
i
r
s
t
 
C
i
t
i
z
e
n
:


In [0]:
# Let's create sequences :) using the batch method
# Each sequence holds seq_length+1 ids; drop_remainder=True discards a final, shorter chunk.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [60]:
# Decode the first five sequences back to text to check the chunking
for chunk in sequences.take(5):
  decoded_chars = nums_to_char[chunk.numpy()]
  print(repr("".join(decoded_chars)))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [0]:
# Split a chunk into the model input and the text it has to predict
def split_input_target(chunk):
  """Return (input, target) for one chunk, the target being the input shifted by one.

  Args:
    chunk: any sliceable sequence (it is mapped over the `sequences` dataset).

  Returns:
    A tuple (chunk without its last element, chunk without its first element).
  """
  return chunk[:-1], chunk[1:]

In [0]:
# Turn every chunk of seq_length+1 ids into an (input, target) pair
dataset = sequences.map(split_input_target)

In [63]:
# Print some examples
# NOTE: input_example / target_example remain bound after the loop and are
# reused by the following cell.
for input_example, target_example in dataset.take(1):
  print("Input data: ", repr("".join(nums_to_char[input_example.numpy()])))
  print("Target data: ", repr("".join(nums_to_char[target_example.numpy()])))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data:  'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [64]:
# Walk through the first five time steps: what the model is fed and what it should predict
first_steps = zip(input_example[:5], target_example[:5])
for step, (fed_id, wanted_id) in enumerate(first_steps):
  print("Step: {:4d}".format(step))
  print(" input: {} ({:s})".format(fed_id, repr(nums_to_char[fed_id])))
  print(" expected output: {} ({:s})".format(wanted_id, repr(nums_to_char[wanted_id])))

Step:    0
 input: 18 ('F')
 expected output: 47 ('i')
Step:    1
 input: 47 ('i')
 expected output: 56 ('r')
Step:    2
 input: 56 ('r')
 expected output: 57 ('s')
Step:    3
 input: 57 ('s')
 expected output: 58 ('t')
Step:    4
 input: 58 ('t')
 expected output: 1 (' ')


# Create training batches

Before building the actual model, let's shuffle the data and pack it into batches

In [65]:
# Number of sequences per training batch
BATCH_SIZE = 64

# Buffer size used to shuffle the dataset.
# tf.data is designed to work with possibly infinite sequences, so it does not
# shuffle the whole dataset in memory; instead it maintains a buffer of this
# many elements and shuffles within it.
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset`, so running the cell twice would batch the
# already-batched data. Re-run the `sequences.map(...)` cell first.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [0]:
# Number of unique characters; passed to build_model as vocab_size
VOCAB_SIZE = len(vocab)
# Width of each character embedding vector
EMBEDDING_DIM = 256
#Number of RNN units 
rnn_units = 1024

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

  Args:
    vocab_size: number of distinct characters; sizes both the embedding table
      and the output layer.
    embedding_dim: width of each character embedding vector.
    rnn_units: number of units in the GRU layer.
    batch_size: batch size fixed in the input shape (the GRU is stateful).

  Returns:
    An uncompiled tf.keras.Sequential model whose last layer is a Dense layer
    with vocab_size outputs and no activation.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None],
  )
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform',
  )
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [0]:
# Training model: built for batches of BATCH_SIZE sequences
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, rnn_units, BATCH_SIZE)

In [69]:
# Let's check the shape of the output
# NOTE: input_example_batch, target_example_batch and example_batch_predictions
# remain bound after the loop and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
  # Run the (still untrained) model on one batch
  example_batch_predictions = model(input_example_batch)
  print("(batch_size, seq_length, vocab_size)")
  print(example_batch_predictions.shape)


(batch_size, seq_length, vocab_size)
(64, 100, 65)


In [70]:
# Show each layer's output shape and parameter count for the training model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           16640     
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_1 (Dense)              (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [71]:
# Try out the first example in the batch:
# draw one sample per time step from that example's predictions
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
print(sampled_indices.shape)
# Drop the trailing num_samples axis and convert to a NumPy array
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

(100, 1)


In [72]:
# Display the sampled character ids
sampled_indices

array([48, 44,  6, 37, 39, 12, 26, 64, 12, 20, 61, 54, 36, 61, 23, 10, 57,
       26, 12, 63, 10, 59, 49, 45, 23, 20, 43,  6, 21, 31, 49, 30, 16,  4,
       25, 56, 23, 20,  9, 46, 18, 28, 22, 51, 22, 23,  4, 26,  0, 50, 28,
       63, 58,  0, 27, 57, 34, 46, 52, 40, 40, 29, 26, 43, 64, 47, 39, 23,
        1,  4, 24, 37, 29, 33, 57,  7,  6, 50, 16, 33, 20,  7, 24,  3, 44,
       18,  9, 20, 29, 22, 56,  7, 40, 37,  4, 10, 11, 15, 33,  1])

In [73]:
# Decode the text predicted by the untrained model
# (first input sequence of the batch, followed by the characters sampled for it)
print("Input: \n", repr("".join(nums_to_char[input_example_batch[0]])))
print()
print("Next Char predictions: \n", repr("".join(nums_to_char[sampled_indices])))

Input: 
 "is\nThe rarest of all women.\n\nLEONTES:\nGo, Cleomenes;\nYourself, assisted with your honour'd friends,\n"

Next Char predictions: 
 'jf,Ya?Nz?HwpXwK:sN?y:ukgKHe,ISkRD&MrKH3hFPJmJK&N\nlPyt\nOsVhnbbQNeziaK &LYQUs-,lDUH-L$fF3HQJr-bY&:;CU '


# Train the model.

The problem will be treated as a standard classification problem. Given the previous RNN state and the input, predict the class of next character.

In [0]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  Args:
    labels: integer class ids (here: the target character ids).
    logits: unnormalised model outputs; from_logits=True tells Keras that no
      softmax has been applied to them.

  Returns:
    The value returned by tf.keras.losses.sparse_categorical_crossentropy.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
      labels,
      logits,
      from_logits=True,
  )
  return crossentropy

In [75]:
# Sanity-check the loss on the untrained model's predictions for one batch
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, "#(batch_size, seq_length, vocab_size)")
# Average the returned loss values to report a single number
print("Scalar loss       ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 65) #(batch_size, seq_length, vocab_size)
Scalar loss        4.174521


In [0]:
# Configure the model for training: Adam optimizer with default settings
# and the logits-based `loss` function defined in this notebook
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=loss)

In [0]:
# Configure checkpoints to ensure those are saved during training
CHECKPOINT_DIR = "./training_checkpoints"

# Name of checkpoint files; the {epoch} placeholder is left unformatted here
# for the checkpoint callback to fill in
checkpoint_prefix = os.path.join(CHECKPOINT_DIR, "ckpt_shake_{epoch}")

In [0]:
# Callback that writes checkpoints to checkpoint_prefix, storing only the
# model weights (save_weights_only=True) rather than the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [0]:
# Number of epochs to train for
EPOCHS = 10

In [80]:
# Training step: fit on the shuffled, batched dataset for EPOCHS epochs,
# with the checkpoint callback attached
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Generate Text 

Restore the last checkpoint stored.

We will keep the prediction step simple, and will use a batch size of 1 

Because the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once it is built.

So we need to rebuild the model and restore the weights from the checkpoint.

In [81]:
# Displayed as the cell output so the reader can see which checkpoint will be restored
tf.train.latest_checkpoint(CHECKPOINT_DIR) # Find the name of the latest saved checkpoint

'./training_checkpoints/ckpt_shake_10'

In [0]:
# Rebuild the model by loading the weights from latest checkpoint
# Same architecture as the training model, but with batch_size=1 for generation.
# NOTE: this rebinds `model`, replacing the training model.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint in CHECKPOINT_DIR
model.load_weights(tf.train.latest_checkpoint(CHECKPOINT_DIR))

# Build with input shape (1, None): batch of one, unspecified sequence length
model.build(tf.TensorShape([1,None]))

In [84]:
# Show the rebuilt model; the batch dimension of every layer is now 1
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_3 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_3 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model, start_string, num_to_generate=1000, temperature=1.0):
  """Generate text one character at a time from a trained model.

  Relies on the notebook-level `chars_to_nums` and `nums_to_char` mappings.

  Args:
    model: stateful model built for a batch size of 1; it is called on a
      (1, steps) tensor of character ids and its output is sampled from.
    start_string: non-empty seed text; every character must be a key of
      `chars_to_nums`.
    num_to_generate: number of characters to generate after the seed.
    temperature: positive scaling factor applied to the predictions before
      sampling. Low values give more predictable text, high values more
      surprising text.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
    KeyError: if `start_string` contains a character missing from
      `chars_to_nums`.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive, got {}".format(temperature))

  # Convert start_string to numbers (vectorizing step) and add a batch dimension
  input_eval = tf.expand_dims([chars_to_nums[s] for s in start_string], 0)

  # Generated characters, joined once at the end
  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak into this one
  model.reset_states()
  for _ in range(num_to_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample the next character from a categorical distribution over the
    # temperature-scaled predictions; only the last time step's sample is kept
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Feed the predicted character back as the next input; the previous
    # context is carried by the model's hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(nums_to_char[predicted_id])
  return start_string + ''.join(text_generated)

In [90]:
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: I say, eat, and God Sir Margaret
Phall be in our aptemplay'd my brother ill?

MARIANA:
Well strong as it.

KING HENRY VI:
Haw not, my gate! 'pay you?

MortAR:
I know his cousin Ence, your queen, do not
Until well assust my butcher blior.
I bide my blood is love from Rome,
O'ercount we will not be so above to Claudio,
And bade my father day love our subjects?

WISTY:
Yes, I pray, and I, with a man do ait
But one had I repore myself to have a pite
Is my acquit her to be much,
If you were in consent too? dwells, the letters ffort in your died.

NASTINCE:
His new-made brief, how you must so: But soul,
You shoul dineath from the worst cunnot do;
For I am too good.

CALIBLIHA:
No, no; but I cut from maids?

ANTONIO:
God welcome, thy troth, I tall your grace and leave unto a balt.

GHOMSO, hold of that hath fonethy thing.

LADY CAPULET:
That is my wounded, sir, you know.
O, force actionanted myself.

CAMILLO:
Should thy fault so brief, one foul sorrow
In the discords of my business wel