### Generating Shakespearean Text Using a Character RNN

**Creating the Training Dataset**

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Set TensorFlow to use CPU only
# tf.config.set_visible_devices([], 'GPU')

In [9]:
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [10]:
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [11]:
# Encode the text with a TextVectorization layer
text_vec_layer = tf.keras.layers.TextVectorization(split="character",
                                                   standardize="lower")
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [12]:
text_vec_layer.vocabulary_size()

41

In [13]:
# 0 is reserved for padding tokens and 1 for unknown characters
text_vec_layer.get_vocabulary()

['',
 '[UNK]',
 ' ',
 'e',
 't',
 'o',
 'a',
 'i',
 'h',
 's',
 'r',
 'n',
 '\n',
 'l',
 'd',
 'u',
 'm',
 'y',
 'w',
 ',',
 'c',
 'f',
 'g',
 'b',
 'p',
 ':',
 'k',
 'v',
 '.',
 "'",
 ';',
 '?',
 '!',
 '-',
 'j',
 'q',
 'x',
 'z',
 '3',
 '&',
 '$']

In [14]:
encoded -=2  # drop tokens 0 (pad) and 1 (unknown), which we will not use
n_tokens = text_vec_layer.vocabulary_size() - 2  # number of distinct characters
dataset_size = len(encoded) # total number of characters - 1,115,394

In [15]:
# Helper function to prepare the dataset for a Sequence-to-Sequence RNN
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    # increase the length by 1 since we need the next character for the target
    ds = ds.window(length + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(buffer_size=100_000, seed=seed)
    ds = ds.batch(batch_size)
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

Let's run the logic of the function line by line to understand it better:

In [16]:
ds = tf.data.Dataset.from_tensor_slices(encoded)
for item in ds.take(1):
    print(item)

tf.Tensor(19, shape=(), dtype=int64)


In [17]:
ds = ds.window(10, shift=1, drop_remainder=True)


In [18]:
# ds.window() returns a dataset of windows, i.e. a nested dataset
for item in ds.take(1):
    for c in item.take(1):
        print(c)

tf.Tensor(19, shape=(), dtype=int64)


2024-05-24 10:21:16.446959: W tensorflow/core/framework/dataset.cc:959] Input of Window will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


In [19]:
# That's why we use the flat_map method
ds = ds.flat_map(lambda window_ds: window_ds.batch(10))
for item in ds.take(1):
    print(item)

tf.Tensor([19  5  8  7  2  0 18  5  2  5], shape=(10,), dtype=int64)


In [20]:
ds = ds.batch(4)
for item in ds.take(1):
    print(item)

tf.Tensor(
[[19  5  8  7  2  0 18  5  2  5]
 [ 5  8  7  2  0 18  5  2  5 35]
 [ 8  7  2  0 18  5  2  5 35  1]
 [ 7  2  0 18  5  2  5 35  1  9]], shape=(4, 10), dtype=int64)


In [21]:
ds = ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)
for item in ds.take(1):
    print(item)

(<tf.Tensor: shape=(4, 9), dtype=int64, numpy=
array([[19,  5,  8,  7,  2,  0, 18,  5,  2],
       [ 5,  8,  7,  2,  0, 18,  5,  2,  5],
       [ 8,  7,  2,  0, 18,  5,  2,  5, 35],
       [ 7,  2,  0, 18,  5,  2,  5, 35,  1]])>, <tf.Tensor: shape=(4, 9), dtype=int64, numpy=
array([[ 5,  8,  7,  2,  0, 18,  5,  2,  5],
       [ 8,  7,  2,  0, 18,  5,  2,  5, 35],
       [ 7,  2,  0, 18,  5,  2,  5, 35,  1],
       [ 2,  0, 18,  5,  2,  5, 35,  1,  9]])>)


In this batch, the first input is `[19,  5,  8,  7,  2,  0, 18,  5,  2]` and the corresponding output is `[ 5,  8,  7,  2,  0, 18,  5,  2,  5]`.

In [22]:
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True,
                       seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

In [23]:
for input, target in train_set.take(1):
    print(input)
    print(target)

tf.Tensor(
[[15  0  9 ... 11 11  0]
 [25  1  0 ...  3  9  7]
 [13  7 23 ... 17  0  4]
 ...
 [25  1  0 ...  6  1  0]
 [11 12  0 ... 21 13  2]
 [ 1  0  5 ...  5  2  6]], shape=(32, 100), dtype=int64)
tf.Tensor(
[[ 0  9  3 ... 11  0  2]
 [ 1  0  3 ...  9  7  2]
 [ 7 23 10 ...  0  4  7]
 ...
 [ 1  0 21 ...  1  0 18]
 [12  0 15 ... 13  2  0]
 [ 0  5  2 ...  2  6  0]], shape=(32, 100), dtype=int64)


**Building and Training the Char-RNN Model**

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    # We train to predict the next character, so we can't just let
    # the Dense layer output anything it wants
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          624       
                                                                 
 gru (GRU)                   (None, None, 128)         56064     
                                                                 
 dense (Dense)               (None, None, 39)          5031      
                                                                 
Total params: 61719 (241.09 KB)
Trainable params: 61719 (241.09 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# # If you think about it, the targets are integer numbers representing character IDs,
# # which can be considered class IDs. So the use of sparse_categorical_crossentropy
# # loss makes sense
# model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
#               metrics=["accuracy"])
# model_ckpt = tf.keras.callbacks.ModelCheckpoint(
#     "my_shakespeare_model", monitor="val_accuracy", save_best_only=True
# )
# history = model.fit(train_set, validation_data=valid_set, epochs=5,
#                     callbacks=[model_ckpt])

In [21]:
# shakespeare_model = tf.keras.Sequential([
#     text_vec_layer,
#     tf.keras.layers.Lambda(lambda X: X - 2),  # no <PAD> or <UNK> tokens
#     model
# ])

Unfortunately, training the above model hangs or kills the jupyter kernel on my Mac. So, we will go with the

pretrained model made available by Jullien.

In [22]:
# extra code – downloads a pretrained model
url = "https://github.com/ageron/data/raw/main/shakespeare_model.tgz"
path = tf.keras.utils.get_file("shakespeare_model.tgz", url, extract=True)
model_path = Path(path).with_name("shakespeare_model")
shakespeare_model = tf.keras.models.load_model(model_path)

In [23]:
# Predict the next character in a sentence
y_proba = shakespeare_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba)  # choose the most probable character
text_vec_layer.get_vocabulary()[y_pred + 2]



'e'

In [24]:
model_output = shakespeare_model.predict(["To be or not to b"])
model_output



array([[[2.40598023e-02, 8.39524642e-02, 2.56986148e-03, 6.81132674e-01,
         5.57369692e-03, 1.28980115e-01, 3.45016830e-02, 3.45355220e-04,
         2.05263589e-03, 1.39599069e-06, 2.25605490e-03, 1.98339135e-03,
         1.65986719e-07, 5.30304445e-04, 5.72810995e-06, 2.27981023e-02,
         1.22251199e-03, 2.89343833e-03, 2.08538680e-04, 5.40604669e-05,
         1.53302904e-06, 3.27079724e-05, 1.37306401e-04, 1.58490217e-03,
         3.71308772e-07, 2.42348968e-08, 1.97694171e-03, 2.71507044e-04,
         1.92895808e-04, 4.48364037e-04, 1.67209073e-04, 6.41081497e-05,
         5.02575084e-08, 4.98080066e-09, 4.04996481e-09, 7.58086642e-08,
         1.15317922e-09, 6.51099503e-18, 2.61223176e-09],
        [3.84590983e-01, 1.80969892e-06, 6.02273317e-03, 4.99450229e-03,
         1.43065963e-05, 5.44819981e-03, 3.13025108e-03, 7.48898042e-03,
         1.60934106e-02, 2.84634650e-01, 1.17216809e-02, 1.89255908e-01,
         2.26248503e-05, 9.31028393e-04, 7.63864140e-04, 1.8984002

In [25]:
model_output.shape

(1, 17, 39)

The shape of the model's output is (batch, time_steps, class_probabilities).

So we take output[0,-1] which means the first batch (or the first item in the batch - I'm not sure) and the last time step (since we only care about the next character).

The result is the 39 class probabilities (probabilities for each character in the vocabulary)

**Generating Fake Shakespearean Text**

In [26]:
# Example use of tf.random.categorical() function
log_probas = tf.math.log([[0.5, 0.4, 0.1]])  # probas = 50%, 40%, 10%
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)  # draw 8 samples

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 1, 0, 2, 1, 0, 0, 1]])>

In [24]:
# helper function that uses the supplied temperature
def next_char(text, temperature=1):
    y_proba = shakespeare_model.predict([text])[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    return text_vec_layer.get_vocabulary()[char_id + 2]

In [25]:
def extend_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [29]:
with tf.device("/cpu:0"):
    tf.random.set_seed(42)
    print(extend_text("To be or not to be", temperature=0.01))

To be or not to be the duke
as it is a proper strange death,
and the


In [30]:
with tf.device("/cpu:0"):
    print(extend_text("To be or not to be", temperature=1))

To be or not to behold?

second push:
gremio, lord all, a sistermen,


In [31]:
with tf.device("/cpu:0"):
    print(extend_text("To be or not to be", temperature=100))

To be or not to bef ,mt'&o3fpadm!$
wh!nse?bws3est--vgerdjw?c-y-ewznq


**Stateful RNN**

In [26]:
# helper function to prepare sequential, non-overlapping sequences with
# batch size of 1 so each batch continues where the previous batch left off.
# This technique is appropriate for stateful RNNs.
def to_dataset_for_stateful_rnn(sequence, length):
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    # shift=length because we can't use overlapping windows
    ds = ds.window(length + 1, shift=length, drop_remainder=True)
    ds = ds.flat_map(lambda window: window.batch(length + 1)).batch(1)
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [27]:
stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000],
                                                 length)
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)

In [28]:
for i, (input, target) in enumerate(stateful_train_set.take(2)):
    print(f"Input {i}:")
    print(input)
    print(f"target {i}:")
    print(target)

Input 0:
tf.Tensor(
[[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
   0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
   4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
  17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
  10 15  3 13]], shape=(1, 100), dtype=int64)
target 0:
tf.Tensor(
[[ 5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1  0
  22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1  4
   8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24 17
   0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23 10
  15  3 13  0]], shape=(1, 100), dtype=int64)
Input 1:
tf.Tensor(
[[ 0  4  8  1  0  4 11 11  0  8  1  7  3 11 25  1 12  0  8  4  2  6  1  8
   0  2  3  0 12  5  1  0  2  6  4  9  0  2  3  0 19  4 14  5  7  6 29 10
  10  4 11 11 23 10  8  1  7  3 11 25  1 12 26  0  8  1  7  3 11 25  1 12
  26 10 10 19  5  8  7  2  0 18  

In [35]:
tf.keras.backend.clear_session()
model = tf.keras.Sequential([
    # batch input shape has all the dimensions except the last one (n_features)
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16,
                              batch_input_shape=[1, None]),
    # stateful=True to preserve the state between training iterations
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True) ,
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

In [29]:
# Reset the states at the end of each epoch
class ResetStatesCallback(tf.keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs):
        self.model.reset_states()

In [30]:
# extra code – use a different directory to save the checkpoints
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_stateful_shakespeare_model",
    monitor="val_accuracy",
    save_best_only=True)

In [52]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model.fit(stateful_train_set, validation_data=stateful_valid_set,
                    epochs=10, callbacks=[ResetStatesCallback(), model_ckpt])


Epoch 1/10
   9997/Unknown - 165s 16ms/step - loss: 1.8691 - accuracy: 0.4499

2024-05-22 18:44:46.889270: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 12238912183930890742
2024-05-22 18:44:46.889285: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 16135087444076161340
2024-05-22 18:44:46.889288: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 5515724840522351501
2024-05-22 18:44:46.889307: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 10482379356136224177
2024-05-22 18:44:46.889312: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 10453229208265020866
2024-05-22 18:44:46.889326: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 18347746450447521286
2024-05-22 18:44:46.889333: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous rec

INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 2/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 3/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 4/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 5/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 6/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 7/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 8/10
Epoch 9/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets


Epoch 10/10


INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets




### Sentiment Analysis

In [31]:
# Load and split the imdb movie reviews dataset
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name="imdb_reviews",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True
)
tf.random.set_seed(42)
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)

In [32]:
for review, label in raw_train_set.take(4):
    print(review.numpy().decode("utf-8"))
    print("Label:", label.numpy())

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
Label: 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development

2024-05-24 10:22:34.696730: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [33]:
# Tokenize the text to the word level with TextVectorization layer
vocab_size = 1000  # limit max number of tokens
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))

In [45]:
# Create the model for sentiment analysis and train it
embed_size = 128
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(vocab_size, embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=2)

Epoch 1/2
Epoch 2/2


No good performance because of the many padding tokens.

**Masking**

In [34]:
from time import strftime

def get_run_logdir(root_logdir="my_logs"):
    return Path(root_logdir) / strftime("run_%Y_%m_%d_%H_%M_%S")

In [35]:
tf.keras.backend.clear_session()
# For some reason the accuracy doesn't go up on the  Mac GPU
with tf.device("/cpu:0"):
    embed_size = 128
    tf.random.set_seed(42)
    model = tf.keras.Sequential([
        text_vec_layer,
        tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),
        tf.keras.layers.GRU(128),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    tensorboard_cb = tf.keras.callbacks.TensorBoard(get_run_logdir())

    model.compile(loss="binary_crossentropy", optimizer="nadam",
                metrics=["accuracy"])
    history = model.fit(train_set, validation_data=valid_set, epochs=3,
                        callbacks=[tensorboard_cb])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [51]:
# Pass the mask manually 
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
token_ids = text_vec_layer(inputs)
mask = tf.math.not_equal(token_ids, 0)
Z = tf.keras.layers.Embedding(vocab_size, embed_size)(token_ids)
Z = tf.keras.layers.GRU(128, dropout=0.2)(Z, mask=mask)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(Z)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

In [52]:
# Instead of masking, use ragged tensors
text_vec_layer_ragged = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, ragged=True
)
text_vec_layer_ragged.adapt(train_set.map(lambda reviews, labels: reviews))
text_vec_layer_ragged(["Great movie!", "This is DiCaprio's best role."])

<tf.RaggedTensor [[86, 18], [11, 7, 1, 116, 217]]>

In [53]:
# Contrast with padded output of text_vec_layer
text_vec_layer(["Great movie!", "This is DiCaprio's best role."])

<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[ 86,  18,   0,   0,   0],
       [ 11,   7,   1, 116, 217]])>

**Reusing Pretrained Embeddings and Language Models**

In [37]:
import os
import tensorflow_hub as hub

tf.keras.backend.clear_session()

os.environ["TFHUB_CACHE_DIR"] = "my_tfhub_cache"
# https://github.com/tensorflow/hub/issues/70
model = tf.keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                trainable=True, dtype=tf.string, input_shape=[]),
    tf.keras.layers.Dense(64, activation="swish"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
            metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=3)

Epoch 1/3


KeyboardInterrupt: 

### An Encoder-Decoder Network for Neural Machine Translation

In [5]:
# Download the dataset (pairs of English-Spanish sentences)
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets",
                               extract=True)
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text()

In [6]:
print(text[:100])

Go.	Ve.
Go.	Vete.
Go.	Vaya.
Go.	Váyase.
Hi.	Hola.
Run!	¡Corre!
Run.	Corred.
Who?	¿Quién?
Fire!	¡Fueg


In [7]:
# remove special characters, shuffle and split to two separate lists
text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)  # separates the pairs into 2 lists

In [8]:
for i in range(3):
    print(sentences_en[i], "=>", sentences_es[i])

The artist always painted alone. => El artista siempre pintaba solo.
I drove all night. => Conduje toda la noche.
Where is the stop for the airport buses? => Dónde está la parada de los autobuses al aeropuerto?


In [9]:
# Create the TextVectorization layers and adapt them
vocab_size = 1000
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

2024-05-27 10:42:32.783909: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [10]:
# the first 10 words of the vocabulary, sorted by decreasing frequency
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [11]:
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [12]:
# Create training and validation sets
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])
X_train_dec  = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])
y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]])
y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]])


In [13]:
X_train[:3]

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'The artist always painted alone.', b'I drove all night.',
       b'Where is the stop for the airport buses?'], dtype=object)>

In [14]:
y_train[:3]

<tf.Tensor: shape=(3, 50), dtype=int64, numpy=
array([[ 10,   1, 100,   1,  79,   3,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1, 245,   9, 138,   3,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 88,  22,   9,   1,   4,  21,   1,  34, 800,   3,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>

In [15]:
# Start building the model
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [16]:
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [35]:
decoder_embeddings.shape

TensorShape([None, 50, 128])

In [50]:
encoder = tf.keras.layers.LSTM(512, return_state=True)
# we need the state to pass it to the decoder
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [51]:
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [52]:
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [54]:
# accuracy does not increase on the GPU sadly
with tf.device("/cpu:0"):
    model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                        outputs=[Y_proba])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
                metrics=["accuracy"])
    model.fit((X_train, X_train_dec), y_train, epochs=3,
            validation_data=((X_valid, X_valid_dec), y_valid))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [2]:
def translate(sentence_en):
    translation = ""
    for word_idx in range(max_length):
        X = np.array([sentence_en])  # encoder input
        X_dec = np.array(["startofseq " + translation])  # decoder input
        y_proba = model.predict((X, X_dec))[0, word_idx]  # last token's probas
        predicted_word_id = np.argmax(y_proba)
        predicted_word = text_vec_layer_es.get_vocabulary()[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

**Biderictional RNNs**

In [3]:
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True)
)

2024-05-27 10:36:00.070775: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-05-27 10:36:00.070796: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-05-27 10:36:00.070802: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-05-27 10:36:00.070833: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-27 10:36:00.070851: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [17]:
# The Bidrectional RNN returns four states, so we need to concatenate them
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)

In [18]:
# extra code — completes the model and trains it
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), y_valid))

Epoch 1/10


2024-05-27 10:44:41.021634: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	for Tuple type infernce function 0
	while inferring type of node 'cond_40/output/_23'


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x365847cd0>

In [20]:
translate("I like soccer")



'tom [UNK] a mary'

### Attention mechanisms

In [19]:
# we need all the encoder's outputs, so we set return_sequences=True
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences=True, return_state=True)
)

In [None]:
# We normally pass the decoder's hidden state to the attention layer.
# But to access the decoder's states at each time step we would need to write
# a custom memory cell. So, for simplicity, we just pass the decoder outputs
# instead of the decoder's states.
# Then we pass the attention layer's output to the output layer as suggested
# in the Luong attention paper
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

In [21]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x3b621aa70>

**Positional encodings**

In [38]:
# Encode word positions using an Embedding layer

max_length = 50  # max length in the whole training set
embed_size = 128
pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)
batch_max_len_enc = tf.shape(encoder_embeddings)[1]
encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))  # the positions of the words in the sentence
batch_max_len_dec = tf.shape(decoder_embeddings)[1]
decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))


In [39]:
# Encode word positions using sine / cosine functions (fixed encodings)

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 == 0, "embed_size must be even"
        # a set of coordinates to evaluate sin and cos on
        p, i = np.meshgrid(np.arange(max_length),
                           2 * np.arange(embed_size // 2))
        pos_emb = np.empty((1, max_length, embed_size))
        pos_emb[0, :, ::2] = np.sin(p / 10_000 ** (i / embed_size)).T
        pos_emb[0, :, 1::2] = np.cos(p / 10_000 ** (i / embed_size)).T
        self.pos_encodings = tf.constant(pos_emb.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        batch_max_length = tf.shape(inputs)[1]
        return inputs + self.pos_encodings[:, :batch_max_length]

In [40]:
pos_embed_layer = PositionalEncoding(max_length, embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

In [41]:
# Same as the word embeddings matrix shape
print(encoder_in.shape)
print(decoder_in.shape)

(None, 50, 128)
(None, 50, 128)


In [42]:
# Implementation of the transformer architecture's encoder
N = 2  # instead of 6
num_heads = 8
dropout_rate = 0.1
n_units = 128  # for the first dense layer in each feedforward block
encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]
Z = encoder_in
for _ in range(N):
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate
    )
    Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.Dropout(dropout_rate)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))



In [43]:
batch_max_len_dec

<KerasTensor: shape=() dtype=int32 inferred_value=[50] (created by layer 'tf.__operators__.getitem_9')>

In [44]:
# Handle masking in the decoder
decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]
causal_mask = tf.linalg.band_part(  # creates a lower triangular matrix
    tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0)

In [46]:
# decoder
encoder_outputs = Z  # let's save the encoder's final outputs
Z = decoder_in  # the decoder starts with its own inputs
for _ in range(N):
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate
    )
    Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate
    )
    Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    skip = Z
    Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [47]:
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z)
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x434bc82e0>

In [48]:
translate("I like football")



'me gusta el [UNK]'

In [49]:
translate("The weather is nice today")



'el tiempo es [UNK] hoy'

### Hugging Face's Transformers Library

In [50]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # many other tasks are available
result = classifier("The actors were very convincing.")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [51]:
result

[{'label': 'POSITIVE', 'score': 0.9998071789741516}]

In [52]:
classifier(["I am from India.", "I am from Iraq."])

[{'label': 'POSITIVE', 'score': 0.9896161556243896},
 {'label': 'NEGATIVE', 'score': 0.9811071157455444}]

In [53]:
classifier("I am from Greece.")

[{'label': 'POSITIVE', 'score': 0.9933144450187683}]

In [55]:
classifier("I am from Patras.")

[{'label': 'POSITIVE', 'score': 0.9881877303123474}]

In [56]:
model_name = "huggingface/distilbert-base-uncased-finetuned-mnli"
classifier_mnli = pipeline("text-classification", model=model_name)
classifier_mnli("She loves me. [SEP] She loves me not.")

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


[{'label': 'contradiction', 'score': 0.9790192246437073}]

In [57]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [58]:
token_ids = tokenizer(["I like soccer. [SEP] We all love soccer!",
                       "Joe lived for a very long time. [SEP] Joe is old."],
                       padding=True, return_tensors="tf")

In [59]:
token_ids

{'input_ids': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[ 101, 1045, 2066, 4715, 1012,  102, 2057, 2035, 2293, 4715,  999,
         102,    0,    0,    0],
       [ 101, 3533, 2973, 2005, 1037, 2200, 2146, 2051, 1012,  102, 3533,
        2003, 2214, 1012,  102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [60]:
outputs = model(token_ids)
outputs

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[-2.1123805 ,  1.1786788 ,  1.4101    ],
       [-0.01478534,  1.0962477 , -0.99199367]], dtype=float32)>, hidden_states=None, attentions=None)

In [61]:
Y_probas = tf.keras.activations.softmax(outputs.logits)
Y_probas

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.01619705, 0.43523592, 0.54856706],
       [0.22655931, 0.6881727 , 0.08526793]], dtype=float32)>

In [63]:
Y_pred = tf.argmax(Y_probas, axis=1)
Y_pred  # 0 = contradiction, 1 = entailment, 2 = neutral

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([2, 1])>

In [64]:
sentences = [("Sky is blue", "Sky is red"), ("I love her", "She loves me")]
X_train = tokenizer(sentences, padding=True, return_tensors="tf").data
y_train = tf.constant([0, 2])  # contradiction, neutral
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer="nadam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=2)

Epoch 1/2
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported
Epoch 2/2
