## We are building a character-level NLP model

In [2]:
#from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import numpy as np
import os
import time

### Download data

In [5]:
# Fetch the Shakespeare corpus; get_file hands back the local path of the file.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


### Import data

In [6]:
# Read the raw bytes, then decode explicitly as UTF-8 (py2 compat).
# The context manager closes the file handle as soon as the read is done,
# instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [7]:
# Take a look at the first 250 characters of the corpus to sanity-check the download
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [8]:
# the unique characters in the file, sorted so that a character's position
# in `vocab` can serve as its integer id
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

65 unique characters


### Convert characters to numerical index representation

In [9]:
# Two-way lookup between characters and integer ids:
# char2idx maps character -> id, idx2char (an array) maps id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [10]:
# Preview the first 20 entries of the character -> index mapping.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '$' :   3,
  '&' :   4,
  "'" :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '3' :   9,
  ':' :  10,
  ';' :  11,
  '?' :  12,
  'A' :  13,
  'B' :  14,
  'C' :  15,
  'D' :  16,
  'E' :  17,
  'F' :  18,
  'G' :  19,
  ...



In [15]:
# Show how the first 13 characters from the text are mapped to integers
print('{} ---- characters mapped to int ----> {}'.format(repr(text[:13]), text_as_int[:13]))

'First Citizen' ---- characters mapped to int ----> [18 47 56 57 58  1 15 47 58 47 64 43 52]


### Creating training examples and targets

In [17]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Number of whole (seq_length + 1)-character chunks that fit in the corpus
examples_per_epoch = len(text) // (seq_length + 1)

# Create stream of characters for training dataset
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five elements, decoded back to characters.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

F
i
r
s
t


2024-03-04 10:25:08.525486: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-03-04 10:25:08.525504: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 96.00 GB
2024-03-04 10:25:08.525509: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 36.00 GB
2024-03-04 10:25:08.525540: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-04 10:25:08.525553: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [20]:
# Each sequence has 101 characters: we'll predict the next char given the preceding ones,
# up to 100 chars of input
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Decode and show the first five chunks.
for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [21]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is every element of ``chunk`` except the last; the target is
    every element except the first, so ``target[i]`` is the element that
    follows ``input[i]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk, yielding (input, target) pairs.
dataset = sequences.map(split_input_target)

In [22]:
# Decode the first (input, target) pair back to text for inspection.
for input_example, target_example in dataset.take(1):
    input_chars = ''.join(idx2char[input_example.numpy()])
    target_chars = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_chars))
    print('Target data: ', repr(target_chars))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data:  'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [23]:
# Walk through the first five (input, target) positions of the example pair
# from the previous cell: at each step the expected output is the character
# that follows the input character.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


### Shuffle data for stochastic gradient descent

In [24]:
# batch size
BATCH_SIZE = 64

# buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory.
# Instead, it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than rebinding `dataset` from
# itself, so re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

### Design neural network

In [26]:
# length of the vocabulary in chars (also used as the width of the model's output layer)
vocab_size = len(vocab)

# embedding dimension
embedding_dim = 256

# number of RNN units
rnn_units = 1024

In [29]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> GRU -> Dense character model.

    Args:
        vocab_size: Size of the embedding vocabulary and width of the final
            Dense layer.
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of GRU units.
        batch_size: Fixed batch dimension of the input shape; the sequence
            dimension is left as None.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # stateful=True: last state for index in batch is used as initial state
    # in next batch
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [32]:
# Build the training model. Pass the vocab_size constant (defined as
# len(vocab)) so this call agrees with the later batch_size=1 rebuild,
# which also uses vocab_size.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [42]:
# Examine the shape of the output by running one batch through the model.
# input_example_batch, target_example_batch and example_batch_predictions
# are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [43]:
# Print the layer-by-layer output shapes and parameter counts of the training model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (64, None, 256)           16640     
                                                                 
 gru_2 (GRU)                 (64, None, 1024)          3938304   
                                                                 
 dense_1 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
# Sample one character id per position from the predictions for the first
# sequence in the batch, then drop the trailing num_samples axis.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [45]:
# Display the sampled character ids.
sampled_indices

array([ 6, 16, 42, 22,  9, 15, 55, 22, 28,  7, 50, 24,  3, 61, 47, 58, 48,
       54, 26, 62, 21,  9,  3, 42, 61, 12, 16,  3, 41,  9, 52, 62, 50,  1,
        8, 10, 15, 22, 22, 12, 19, 16,  0, 52, 32, 39, 55,  3, 30, 40, 37,
       27, 29, 61, 19, 64, 30, 57, 36, 63, 35,  7, 23,  2, 57, 33, 50, 31,
       20, 41,  3, 36, 50, 59, 11, 22, 59, 44, 13, 34, 51, 31,  4, 29, 26,
        6, 24, 37, 19, 53, 40, 44, 40, 41, 59, 17, 63, 42, 27, 59])

In [46]:
# Decode the first input sequence and the ids sampled for it (the model has
# not been trained yet at this point in the notebook).
first_input_text = "".join(idx2char[input_example_batch[0]])
sampled_text = "".join(idx2char[sampled_indices])
print("Input: \n", repr(first_input_text))
print()
print("Next Char Predictions: \n", repr(sampled_text))

Input: 
 ' is too much sad:\nYou promised, when you parted with the king,\nTo lay aside life-harming heaviness\nA'

Next Char Predictions: 
 ',DdJ3CqJP-lL$witjpNxI3$dw?D$c3nxl .:CJJ?GD\nnTaq$RbYOQwGzRsXyW-K!sUlSHc$Xlu;JufAVmS&QN,LYGobfbcuEydOu'


### Compile model

In [47]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` tells the loss that ``logits`` have not been passed
    through a softmax.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)

# Sanity-check the loss on the example batch before any training; the loss
# values are reduced to a single number with .mean() for display.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.174061


In [48]:
# Configure training: Adam optimizer with the logits-based loss defined above.
model.compile(optimizer='adam', loss=loss)

In [49]:
# directory where the checkpoints will be saved
checkpoint_dir = './model_output/seqGen'

# name of the checkpoint files; "{epoch}" is a placeholder left in the path
# for the callback to fill in
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: checkpoint the weights only, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Train

In [50]:
# Number of passes over the training dataset.
EPOCHS = 30

In [51]:
# Train for EPOCHS epochs; the checkpoint callback writes weights under checkpoint_dir.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/30


2024-03-04 10:54:47.406029: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### Rebuild model with single output for generating text one char at a time

In [52]:
# Show the path of the most recent checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./model_output/seqGen/ckpt_30'

In [54]:
# tf.train.latest_checkpoint returns None when no checkpoint is found;
# fail with a clear message instead of passing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir))

# Rebuild the same architecture with batch_size=1 so text can be generated
# from a single sequence, then restore the trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [55]:
# Print the summary of the rebuilt batch_size=1 model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_3 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_2 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [60]:
def generate_text(model, start_string, num_generate=10000, temperature=1.0):
    """Generate text one character at a time using the learned model.

    The start string is fed to the model once; after that, each sampled
    character is fed back in as the next input while the model keeps its
    recurrent state between calls. Relies on the notebook-level ``char2idx``
    and ``idx2char`` lookups.

    Args:
        model: Model built with batch size 1 (see the rebuild cell).
        start_string: Non-empty prompt; every character must be a key of
            ``char2idx`` (an unknown character raises KeyError).
        num_generate: Number of characters to generate.
        temperature: Positive scaling applied to the logits before sampling.
            Low temperatures result in more predictable text, higher
            temperatures in more surprising text.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    # Validate up front: an empty prompt gives the model nothing to predict
    # from, and a non-positive temperature makes the logits scaling meaningless.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {}".format(temperature))

    # converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # list collecting the generated characters
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the char returned by the model;
        # only the sample for the last position is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted char as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [61]:
# Generate text seeded with a speaker-style prompt.
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: two may, sir, when gaze one word.

GREMIO:
No, not a whit: I will recond Saying.

DUKE VINCENTIO:
Sirrah, tell me, by Saito back.
Put them to our authority,
God that the in this night I am afraid.

NORTHUMBERLAND:
The noble gratigious myself self:
Then wishou may: what man shall be
so: yea, a beggar, that there's nothing else
But father,
Whether it be not one: Let have I loved thee warm again.
D III:
Catesby!

CAPULET:
Alas, all dissembling royal grandsire steeds,
Which if they but awake, thy creature bless
My son I have fought,
And hold you for a holy many have and others that had been many;
For 'twas, good cheer!

PETRUCHIO:
But say, what man that be abhort's tongue,
I would do so recompear of it; it must be past
Break out a parlimadeneath from otherwise, but valiant growital powerful for your bosom.

KING EDWARD IV:
Now tell me what, will haze; for layour hay behind;
And say'st thou me as at home: he's said yet not meet,
Did burneing placket with charitable degreeting!
How si

In [62]:
# Generate text seeded with a longer prompt.
print(generate_text(model, start_string=u"HAMLET: To be, or not to be: that is the question:"))

HAMLET: To be, or not to be: that is the question: that we bear
I'll play among the stones' reason, that had made heart
I'll give them by a sacrifice,
A gentleman ever Duke of York.

GLOUCESTER:
It is a spend; and th Lord Nay delight
Did muster your vanquisher,
As ours o' the common bound:
Be veried our dutation, what with deceisable are
Slains my gains, Jace! Friar John, be avoided,
As 'twere to catch them. Nay, but giddy life
And halth of graves, herself,
A tallow-cornerance, that feel now his death,
Having my heart were fly. For this business
Will be as stronger bassible act are so light deep.

CLEOMENES:
Bliss and joy.
Well, sir. There's it nigh, by this time attook him here,
And not the better.

CALIBLE:
It may be so deposed.

Second Citizen:
And so dear is in the skies?

All:
Those mocked places in my sorrow's daughter Christ.

Shepherd:
Call it sad thee in him than an assure
Thou came, where le's some piece was never in the scene.

POLIXENES:
I do beseech thee,
On thy heart of E