# Natural Language Processing with RNNs and Attention

In [None]:
import sys

assert sys.version_info >= (3, 7)

In [1]:
from packaging import version
import tensorflow as tf

assert version.parse(tf.__version__) >= version.parse("2.8.0")

In [2]:
tf.__version__

'2.19.0'

In [3]:
import matplotlib.pyplot as plt

plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [4]:
from pathlib import Path

IMAGES_PATH = Path() / "images" / "nlp"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) of the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    if tight_layout:
        plt.tight_layout()
    target = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [5]:
# Warn early when TensorFlow sees no GPU, with a hint for hosted notebooks.
if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. Neural nets can be very slow without a GPU.")
    # The hosting platform is detected from the modules that are already loaded.
    if "google.colab" in sys.modules:
        print("Go to Runtime > Change runtime and select a GPU hardware "
              "accelerator.")
    if "kaggle_secrets" in sys.modules:
        print("Go to Settings > Accelerator and select GPU.")

In [6]:
tf.keras.__version__

'3.9.2'

It seems that much of the code in this notebook has to be run with Keras 2, so refer to the link below for the reference implementation. The code written here may not necessarily work with Keras 3.

https://colab.research.google.com/github/ageron/handson-ml3/blob/main/16_nlp_with_rnns_and_attention.ipynb#scrollTo=EisVmaH56dmU

## Generating Shakespearean Text Using a Character RNN

In [7]:
import tensorflow as tf

# Download (and cache) the Shakespeare corpus, then load it into memory.
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly so the text does not depend on the platform's default
# encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [8]:
# extra code – shows a short text sample
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [14]:
# extra code – shows all 39 distinct characters (after converting to lower case)
"".join(sorted(set(shakespeare_text.lower())))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [9]:
# Character-level tokenizer: lower-cases the text and splits it into characters.
text_vec_layer = tf.keras.layers.TextVectorization(split="character",
                                                   standardize="lower")
# Build the vocabulary from the corpus.
text_vec_layer.adapt([shakespeare_text])
# Encode the whole corpus as a single sequence; [0] selects that sequence.
encoded = text_vec_layer([shakespeare_text])[0]

2025-06-13 14:22:59.624917: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-06-13 14:22:59.626979: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-06-13 14:22:59.627026: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
I0000 00:00:1749795779.627837 6202607 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1749795779.628056 6202607 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
# Re-encode from the source text rather than using `encoded -= 2`, so that
# re-running this cell never subtracts the offset a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2  # drop tokens 0 (pad) and 1 (unknown)
# The vocabulary still counts the two dropped special tokens, so remove them
# from the number of distinct character IDs as well.
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)

In [11]:
dataset_size

1115394

In [None]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a token sequence into batches of (input, target) windows.

    Each window holds ``length + 1`` consecutive tokens and consecutive windows
    are shifted by one token. The input is the window without its last token;
    the target is the window without its first token.

    Args:
        sequence: Token IDs to slice into windows.
        length: Number of tokens in each input (and each target) window.
        shuffle: Whether to shuffle the windows before batching.
        seed: Seed forwarded to ``Dataset.shuffle``.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=1, drop_remainder=True)
    # window() yields nested datasets; flatten each one into a single tensor.
    windows = windows.flat_map(lambda nested: nested.batch(window_size))
    if shuffle:
        windows = windows.shuffle(buffer_size=100000, seed=seed)
    batches = windows.batch(batch_size)

    def split_inputs_targets(batch):
        # Targets are the inputs shifted one step into the future.
        return batch[:, :-1], batch[:, 1:]

    return batches.map(split_inputs_targets).prefetch(1)

In [13]:
length = 100  # number of characters in each input window
tf.random.set_seed(42)
# Sequential split of the encoded corpus: first 1,000,000 tokens for training,
# the next 60,000 for validation, the remainder for testing.
# Only the training set is shuffled.
train_set = to_dataset(encoded[:1000000], length = length, shuffle = True, seed = 42)
valid_set = to_dataset(encoded[1000000:1060000], length = length)
test_set = to_dataset(encoded[1060000:], length = length)

In [15]:
# extra code – a simple example using to_dataset()
# There's just one sample in this dataset: the input represents "to b" and the
# output represents "o be"
list(to_dataset(text_vec_layer(["To be"])[0], length=4))

2025-06-13 14:48:57.458412: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[(<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[ 4,  5,  2, 23]])>,
  <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[ 5,  2, 23,  3]])>)]

### Building and Training the Char-RNN Model

Training the model below takes around 1–2 hours!

In [None]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
model = tf.keras.Sequential([
    # Encode the character IDs.
    # Note that the inputs to the Embedding layer are 2D tensors, but its
    # outputs are 3D tensors.
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    # return_sequences=True: the GRU emits an output at every time step.
    tf.keras.layers.GRU(128, return_sequences=True),
    # The output layer has n_tokens units since we want to output a
    # probability for each possible character.
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# Keep only the checkpoint with the best validation accuracy.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True)
history = model.fit(train_set, validation_data=valid_set, epochs=10,
                    callbacks=[model_ckpt])

In [20]:
# Wrap the trained model so it accepts raw strings: vectorize the text, shift
# the IDs by 2 (as was done for the training data), then run the char-RNN.
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2),  # no <PAD> or <UNK> tokens
    model
])

In [None]:
# extra code – downloads a pretrained model
# extra code – downloads a pretrained model
url = "https://github.com/ageron/data/raw/main/shakespeare_model.tgz"
# extract=True is required: without it the archive is only downloaded, never
# unpacked, so no model file would exist at the paths computed below.
path = tf.keras.utils.get_file("shakespeare_model.tgz", url, extract=True)
# Depending on the Keras version, get_file() appears to return either an
# extraction directory (name containing "_extracted") or the archive path
# itself — TODO confirm against the installed Keras version.
if "_extracted" in path:
    model_path = Path(path) / "shakespeare_model.keras"
else:
    model_path = Path(path).with_name("shakespeare_model.keras")
# NOTE(review): assumes the archive contains "shakespeare_model.keras" — verify.
shakespeare_model = tf.keras.models.load_model(model_path)

In [None]:
# [0, -1] keeps the first (only) sequence and its last time step, i.e. the
# probabilities for the character following the prompt.
y_proba = shakespeare_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba)  # choose the most probable character ID
# Add 2 back to undo the ID shift applied inside shakespeare_model.
text_vec_layer.get_vocabulary()[y_pred + 2]

### Generating Fake Shakespearean Text

When feeding the model some text, we can have it iteratively generate the most likely next letter (*greedy decoding*). However, since in practice this often leads to the same words being repeated over and over again, we can instead sample the next character randomly using TensorFlow's `tf.random.categorical()` function. This function samples random class indices given the class log probabilities (logits).

We can then divide the logits by a *temperature*, where a close to 0 value will favor high-probability characters, while a high temperature gives all characters an equal probability.

To generate more convincing text, a common technique is to sample only from the top *k* characters, or only from the smallest set of top characters whose total probability exceeds some threshold (*nucleus sampling*).

Alternatively, we can try using *beam search*, or use more GRU layers and more neurons per layer, train for longer, and add some regularization if needed.

### Stateful RNN

In *stateless* RNNs, at each training iteration the model starts with a hidden state full of zeros, then updates this state at each time step, and after the last time step it throws the state away, as it is not needed anymore.

A *stateful* RNN preserves this final state after processing a training batch and uses it as the initial state for the next training batch, allowing the model to learn long-term patterns despite only backpropagating through short sequences.

This ONLY works if each input sequence in a batch starts EXACTLY where the corresponding sequence in the previous batch left off. So when creating the dataset, we must use
- `shift=length` (instead of `shift=1`)
- must NOT call the `shuffle()` method

In [None]:
def to_dataset_for_stateful_rnn(sequence, length):
    """Build a dataset of consecutive, non-shuffled windows, one per batch.

    Windows hold ``length + 1`` tokens and are shifted by ``length``, so each
    input window starts where the previous input window ended. Every batch
    contains exactly one window.

    Args:
        sequence: Token IDs to slice into windows.
        length: Number of tokens in each input (and each target) window.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches of size 1.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True)
    # Flatten each nested window dataset into one tensor, then batch by 1.
    single_window_batches = windows.flat_map(
        lambda nested: nested.batch(window_size)).batch(1)

    def split_inputs_targets(batch):
        # Targets are the inputs shifted one step into the future.
        return batch[:, :-1], batch[:, 1:]

    return single_window_batches.map(split_inputs_targets).prefetch(1)

stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1000000], length)
stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1000000:1060000], length)
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1060000:], length)

In [None]:
# Stateful RNNs need a fixed batch size. It is declared with an explicit Input
# (batch size 1, variable sequence length) instead of passing
# `batch_input_shape` to the Embedding layer, which Keras 3 no longer accepts.
model = tf.keras.Sequential([
    tf.keras.Input(shape=[None], batch_size=1),
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    # stateful=True: the final state of one batch is the initial state of the next.
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

In [None]:
class ResetStatesCallback(tf.keras.callbacks.Callback):
    """Resets the recurrent states of the attached model before every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Clear the model's RNN states.

        Args:
            epoch: Index of the epoch that is about to start (unused).
            logs: Metrics dict supplied by Keras (unused); defaults to None to
                match the base ``Callback`` signature.
        """
        reset_model_states = getattr(self.model, "reset_states", None)
        if callable(reset_model_states):
            reset_model_states()
            return
        # Fallback for models without a model-level reset_states()
        # (reportedly the case in Keras 3 — TODO confirm): reset each
        # stateful layer directly.
        for layer in self.model.layers:
            if not getattr(layer, "stateful", False):
                continue
            reset_layer_state = (getattr(layer, "reset_state", None)
                                 or getattr(layer, "reset_states", None))
            if callable(reset_layer_state):
                reset_layer_state()

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer='nadam',
              metrics=["accuracy"])
# Use a dedicated checkpoint for the stateful model: reusing the char-RNN's
# `model_ckpt` would write to the same file and inherit that callback's state.
stateful_model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_stateful_shakespeare_model.keras", monitor="val_accuracy",
    save_best_only=True)
history = model.fit(stateful_train_set, validation_data=stateful_valid_set,
                    epochs=10,
                    callbacks=[ResetStatesCallback(), stateful_model_ckpt])

## Sentiment Analysis

A form of text classification.

Unlike the previous model, we will preprocess the text by chopping it into words instead of characters.

In [52]:
import tensorflow_datasets as tfds

# Load the IMDB reviews: 90% of the "train" split for training, the remaining
# 10% for validation, and the "test" split for testing.
# as_supervised=True yields (review, label) pairs, as unpacked in later cells.
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name="imdb_reviews",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True
)
tf.random.set_seed(42)
# Only the training set is shuffled; all three sets use batches of 32.
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)

In [53]:
for review, label in raw_train_set.take(4):
    print(review.numpy().decode("utf-8")[:200], "...")
    print("Label:", label.numpy())

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0
Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Moun ...
Label: 0
This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful perf ...
Label: 1


2025-06-13 16:21:59.391853: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:387] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
2025-06-13 16:21:59.396933: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-06-13 16:21:59.397032: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
vocab_size = 1000  # maximum vocabulary size
# NOTE: this rebinds `text_vec_layer`, replacing the character-level layer
# defined earlier in the notebook.
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
# Adapt on the training reviews only (labels are dropped).
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))

In [None]:
embed_size = 128  # dimension of each word embedding
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vec_layer,
    # Maps each token ID to an embed_size-dimensional vector for the GRU.
    tf.keras.layers.Embedding(vocab_size, embed_size),
    # No return_sequences: only the GRU's last output is passed on.
    tf.keras.layers.GRU(128),
    # Single sigmoid unit for the binary label.
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=2)

### Masking

Masking basically means skipping the time steps where the token ID is 0 (the padding token).

### Reusing Pretrained Embeddings and Language Models

A limitation of ordinary word embeddings is that each word has only one embedding, even though a word can mean different things depending on context. To address this, we can use contextualized word embeddings learned from the internal states of a deep bidirectional language model. So instead of just using pretrained embeddings in the model, we reuse part of a pretrained language model.

In [None]:
import os
import tensorflow_hub as hub

# Cache downloaded TF Hub modules in a local directory.
os.environ["TFHUB_CACHE_DIR"] = "my_tfhub_cache"
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
model = tf.keras.Sequential([
    # Pretrained sentence encoder taking raw strings as input;
    # trainable=True lets its weights be fine-tuned during training.
    hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                   trainable=True, dtype=tf.string, input_shape=[]),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit(train_set, validation_data=valid_set, epochs=10)

## An Encoder-Decoder Network for Neural Machine Translation

In [None]:
# Download and extract the English–Spanish sentence pairs.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets",
                               extract=True)
# NOTE(review): the location of the extracted files relative to `path` may
# differ between Keras versions — verify that spa-eng/spa.txt is found.
# The file contains accented Spanish characters, so decode it explicitly as
# UTF-8 instead of relying on the platform's default encoding.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

In [None]:
import numpy as np

# Remove the Spanish opening punctuation marks.
text = text.replace("¡", "").replace("¿", "")
# Each line is split on the tab character into a sentence pair.
pairs = [line.split("\t") for line in text.splitlines()]
np.random.seed(42)  # extra code – ensures reproducibility on CPU
np.random.shuffle(pairs)  # shuffles the list in place
sentences_en, sentences_es = zip(*pairs)  # separates the pairs into 2 tuples

In [None]:
for i in range(3):
    print(sentences_en[i], "=>", sentences_es[i])

In [None]:
vocab_size = 1000  # maximum vocabulary size for each language
max_length = 50    # every sentence is padded or truncated to this many tokens
# for English sentences
text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_en.adapt(sentences_en)

# for Spanish sentences: the start/end markers are added before adapting so
# that they become part of the vocabulary
text_vec_layer_es = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

In [None]:
# first 10 entries of the English vocabulary
text_vec_layer_en.get_vocabulary()[:10]

In [None]:
# first 10 entries of the Spanish vocabulary
text_vec_layer_es.get_vocabulary()[:10]

In [None]:
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [None]:
# Encoder inputs: raw English sentences (first 100,000 pairs for training,
# the rest for validation).
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])
# Decoder inputs: Spanish sentences prefixed with the start marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])

# Targets: token IDs of the Spanish sentences followed by the end marker, i.e.
# the decoder input shifted by one token.
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]])

In [None]:
embed_size = 128  # dimension of each word embedding

# Encoder: raw strings -> token IDs -> embeddings.
# mask_zero=True marks token ID 0 as a value to be masked out.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, 
                                                    mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)

# Decoder: same pipeline, with its own vectorization and embedding layers.
decoder_input_ids = text_vec_layer_es(decoder_inputs)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, 
                                                    mask_zero=True)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [None]:
# return_state=True makes the layer also return its final states, so that we
# get a reference to them.
encoder = tf.keras.layers.LSTM(512, return_state=True)
# The call returns the output followed by the two LSTM states (short-term and
# long-term); the starred target collects the states into a list.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [None]:
# return_sequences=True: the decoder emits an output at every time step.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
# Use the encoder's final states as the initial state of the decoder.
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [None]:
# One probability per vocabulary entry, at every decoder time step.
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

When the target vocabulary gets very large, consider the *sampled softmax* technique, where the loss is computed using only the logits of the correct word and of a random sample of incorrect words.

Use the `tf.nn.sampled_softmax_loss()` function for training, and use the normal softmax function at inference time.

Another way to speed up training is to tie the weights of the output layer to the transpose of the decoder's embedding matrix which can significantly reduce the number of model parameters (essentially having an *orthogonal matrix* for the embedding matrix).

In [None]:
# Two-input model: raw English sentences for the encoder and raw Spanish
# sentences (starting with "startofseq") for the decoder.
model = tf.keras.Model(inputs =[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

To use the model, use the following utility function!

In [None]:
def translate(sentence_en):
    """Greedily translate an English sentence, one word at a time.

    At each step the translation produced so far is fed back to the decoder
    and the most probable next word is appended. Decoding stops when
    "endofseq" is predicted or after ``max_length`` words.

    Relies on the notebook globals ``model``, ``text_vec_layer_es`` and
    ``max_length``.

    Args:
        sentence_en: The English sentence to translate.

    Returns:
        The translated sentence, without the start/end markers.
    """
    # The encoder input and the vocabulary do not change between steps, so
    # build them once instead of on every iteration.
    X = np.array([sentence_en])
    vocabulary = text_vec_layer_es.get_vocabulary()
    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq" + translation])
        # verbose=0 avoids printing one progress bar per decoded word.
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]
        predicted_word = vocabulary[np.argmax(y_proba)]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [None]:
translate("I like soccer")

In [None]:
# can struggle with long sentences
translate("I like soccer and also going to the beach")

Ways to improve the model include increasing the training set size and adding more LSTM layers in both the encoder and the decoder. Another option is to use bidirectional recurrent layers (bidirectional RNNs).

### Bidirectional RNNs

Normally, a regular recurrent layer only looks at past and present inputs before generating its output: it is *causal* (it cannot look into the future). However, for tasks like text classification, or in the encoder of a seq2seq model, it is often preferable to look ahead at the next words before encoding a given word.

A *bidirectional recurrent layer* runs 2 recurrent layers on the same input, one reading the words from left to right and the other reading them from right to left, then combines their outputs at each time step (by concatenating them).

In [None]:
# Bidirectional creates a clone of the LSTM layer, runs both on the same input
# and concatenates their outputs!
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True))


Since this results in 4 encoder state outputs (short-term and long-term for left to right; short-term and long-term for right to left), we concatenate the two short-term states together and the two long-term states together!

In [None]:
# The bidirectional encoder returns its output followed by four states.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
# Pair up the states by kind and concatenate each pair along the last axis:
# 2 x 256 units gives 512, matching the 512-unit decoder LSTM used afterwards.
encoder_state = [tf.concat(encoder_state[::2], axis=-1), # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)] # long-term (1 & 3)

In [None]:
# extra code — completes the model and trains it
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

In [None]:
translate("I like soccer")

### Beam Search

Keeping track of a short list of the *k* most promising sentence, and at each decoder step it tries to extend them by one word, keeping only the *k* most likely sentence. The *k* is called *beam width*.

In [None]:
# extra code – a basic implementation of beam search

def beam_search(sentence_en, beam_width, verbose=False):
    """Translate an English sentence using beam search.

    Keeps the ``beam_width`` most probable partial translations and extends
    each unfinished one by one word per step, scoring candidates by the sum of
    the log-probabilities of their words.

    Relies on the notebook globals ``model``, ``text_vec_layer_es`` and
    ``max_length``.

    Args:
        sentence_en: The English sentence to translate.
        beam_width: Number of candidate translations kept at each step.
        verbose: When true, print the kept candidates after every step.

    Returns:
        The best translation found, without the "endofseq" marker. If
        ``max_length`` is reached before every kept candidate has ended, the
        best candidate found so far is returned (instead of ``None``).
    """
    # The vocabulary and the encoder input never change: build them once
    # rather than for every candidate word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])  # encoder input
    X_dec = np.array(["startofseq"])  # decoder input
    # verbose=0 avoids printing one progress bar per predict() call.
    y_proba = model.predict((X, X_dec), verbose=0)[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]

    # extra code – displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue  # translation is finished, so don't try to extend it
            X_dec = np.array(["startofseq " + translation])  # decoder input
            # probabilities of the word at position idx
            y_proba = model.predict((X, X_dec), verbose=0)[0, idx]
            for word, word_proba in zip(vocabulary, y_proba):
                candidates.append((log_proba + np.log(word_proba),
                                   f"{translation} {word}"))
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # extra code – displays the top translation so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all(tr.endswith("endofseq") for _, tr in top_translations):
            break

    # Reached either because every kept candidate is finished or because
    # max_length was hit; in both cases return the best candidate.
    return top_translations[0][1].replace("endofseq", "").strip()

In [None]:
# extra code – shows how the model making an error
sentence_en = "I love cats and dogs"
translate(sentence_en)

In [None]:
# extra code – shows how beam search can help
beam_search(sentence_en, beam_width=3, verbose=True)

## Attention Mechanisms

Allowing the decoder to focus on the appropriate words (as encoded by the encoder) at each time step. This means that the short-term memory limitations of RNNs have much less impact.

In between the encoder and decoder model, we now send all of the encoder's outputs to the decoder as well and aggregate the encoder output where at each time step, the decoder's memory cell computes a WEIGHTED sum of all the encoder outputs (determining which word it will focus on at this step).

The weights are generated by a small neural network called an *alignment model* (or an *attention layer*), which is trained jointly with the rest of the encoder-decoder model. Since the attention layer concatenates the encoder output with the decoder's previous hidden state, it is sometimes called *concatenative attention* (or *additive attention*).

There is also the *multiplicative attention* where the dot product between the encoder's output and decoder's previous hidden state is computed.

In [None]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences=True, return_state=True))

In [None]:
# extra code – this part of the model is exactly the same as earlier
# The encoder now also returns its output at every time step
# (return_sequences=True), followed by the four final states.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [None]:
attention_layer = tf.keras.layers.Attention()
# First element is the query (decoder outputs), second is the value
# (encoder outputs).
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
# The output layer now reads the attention outputs instead of the raw decoder
# outputs.
output_layer = tf.keras.layers.Dense(vocab_size, activation = "softmax")
Y_proba = output_layer(attention_outputs)

In [None]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

In [None]:
translate("I like soccer and also going to the beach")

In [None]:
beam_search("I like soccer and also going to the beach", beam_width=3,
            verbose=True)

### Attention is All You Need: The Original Transformer Architecture

Each embedding layer outputs a 3D tensor of a shape [batch size, sequence length, embedding size].

The encoder's role is to gradually transform the inputs until each word's representation perfectly captures the meaning of the word in the context of the sentence. The decoder's role is to gradually transform each word representation in the translated sentence into a word representation of the next word in the translation.

#### Positional encodings

Without positional encodings, the model has no notion of word order: shuffling the input sequence would simply shuffle the output sequence in the same way.

In [None]:
max_length = 50  # max length in the whole training set
embed_size = 128
# Trainable positional embeddings: one embed_size vector per position index.
pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)
# Axis 1 of the embeddings is used as the sequence length of the batch.
batch_max_len_enc = tf.shape(encoder_embeddings)[1]
# Add the embedding of each position to the corresponding word embedding.
encoder_in = encoder_embeddings + pos_embed_layer (tf.range(batch_max_len_enc))
batch_max_len_dec = tf.shape(decoder_embeddings)[1]
decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))


In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds fixed sine/cosine positional encodings to its inputs.

    The encoding table is computed once at construction time: even embedding
    indices hold sines and odd indices hold cosines of the position divided by
    ``10_000 ** (i / embed_size)``.
    """

    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        """Precompute the encoding table.

        Args:
            max_length: Number of positions in the table.
            embed_size: Embedding dimension; must be even.
            dtype: Layer dtype, also used for the encoding table.
            **kwargs: Forwarded to the base ``Layer`` constructor.
        """
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 == 0, "embed_size must be even"
        positions, even_dims = np.meshgrid(np.arange(max_length),
                                           2 * np.arange(embed_size // 2))
        # Transposed so that rows are positions and columns are sin/cos pairs.
        angles = (positions / 10_000 ** (even_dims / embed_size)).T
        table = np.empty((1, max_length, embed_size))
        table[0, :, ::2] = np.sin(angles)
        table[0, :, 1::2] = np.cos(angles)
        self.pos_encodings = tf.constant(table.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        """Return ``inputs`` plus the encodings of the first positions."""
        # Only as many positions as the inputs have along axis 1 are used.
        seq_len = tf.shape(inputs)[1]
        return inputs + self.pos_encodings[:, :seq_len]

In [None]:
pos_embed_layer = PositionalEncoding(max_length, embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

In [None]:
# extra code – this cell generates and saves Figure 16–9
figure_max_length = 201
figure_embed_size = 512
pos_emb = PositionalEncoding(figure_max_length, figure_embed_size)
# Feeding zeros through the layer returns the positional encodings themselves.
zeros = np.zeros((1, figure_max_length, figure_embed_size), np.float32)
P = pos_emb(zeros)[0].numpy()
# i1/i2: the two embedding dimensions that are plotted; crop_i: number of
# dimensions shown in the bottom image; p1/p2/p3: highlighted positions.
i1, i2, crop_i = 100, 101, 150
p1, p2, p3 = 22, 60, 35
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(9, 5))
# Top panel: encoding value as a function of position for dimensions i1 and i2.
ax1.plot([p1, p1], [-1, 1], "k--", label="$p = {}$".format(p1))
ax1.plot([p2, p2], [-1, 1], "k--", label="$p = {}$".format(p2), alpha=0.5)
ax1.plot(p3, P[p3, i1], "bx", label="$p = {}$".format(p3))
ax1.plot(P[:,i1], "b-", label="$i = {}$".format(i1))
ax1.plot(P[:,i2], "r-", label="$i = {}$".format(i2))
ax1.plot([p1, p2], [P[p1, i1], P[p2, i1]], "bo")
ax1.plot([p1, p2], [P[p1, i2], P[p2, i2]], "ro")
ax1.legend(loc="center right", fontsize=14, framealpha=0.95)
ax1.set_ylabel("$P_{(p,i)}$", rotation=0, fontsize=16)
ax1.grid(True, alpha=0.3)
ax1.hlines(0, 0, figure_max_length - 1, color="k", linewidth=1, alpha=0.3)
ax1.axis([0, figure_max_length - 1, -1, 1])
# Bottom panel: the first crop_i dimensions of the table shown as an image.
ax2.imshow(P.T[:crop_i], cmap="gray", interpolation="bilinear", aspect="auto")
ax2.hlines(i1, 0, figure_max_length - 1, color="b", linewidth=3)
cheat = 2  # need to raise the red line a bit, or else it hides the blue one
ax2.hlines(i2+cheat, 0, figure_max_length - 1, color="r", linewidth=3)
ax2.plot([p1, p1], [0, crop_i], "k--")
ax2.plot([p2, p2], [0, crop_i], "k--", alpha=0.5)
ax2.plot([p1, p2], [i2+cheat, i2+cheat], "ro")
ax2.plot([p1, p2], [i1, i1], "bo")
ax2.axis([0, figure_max_length - 1, 0, crop_i])
ax2.set_xlabel("$p$", fontsize=16)
ax2.set_ylabel("$i$", rotation=0, fontsize=16)
save_fig("positional_embedding_plot")
plt.show()

#### Multi-head attention

*Scaled dot-product attention*

$$\operatorname{Attention}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = \operatorname{softmax}\left(\frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d_{keys}}}\right)\mathbf{V}$$

The output has shape [n queries, d values], i.e. one row per query, where each row represents the query result (a weighted sum of the values).

- Q = Matrix with 1 row / *query*. Shape of [n queries, d keys], where n queries is the number of queries and d keys is the number of dimensions of each query and each key.
- K = Matrix containing 1 row / *key*. Shape of [n keys, d keys]
- V = Matrix containing 1 row / *value*. Shape of [n keys, d values]


A multi-head attention layer applies *multiple* different linear transformations of the values, keys and queries. This allows the model to apply many different projections of the word representation into different subspaces, each focusing on a subset of the word's characteristics (e.g., whether it is a verb, its tense, whether it is an object, etc.).

**Encoder Part**

In [None]:
N = 2  # instead of 6
num_heads = 8
dropout_rate = 0.1
n_units = 128  # for the first Dense layer in each Feed Forward block
# True where the token ID is non-zero; the extra axis is inserted so the mask
# can be passed as attention_mask below.
encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]
Z = encoder_in
for _ in range(N):
    # Self-attention block with skip connection and layer normalization.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Feed-forward block with skip connection and layer normalization.
    skip = Z
    Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.Dropout(dropout_rate)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

**Decoder part**

In [None]:
# True where the decoder token ID is non-zero.
decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]
# Each position should ignore all tokens in the future, hence the lower
# triangular matrix.
causal_mask = tf.linalg.band_part(  # creates a lower triangular matrix
    tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0)

In [None]:
encoder_outputs = Z  # let's save the encoder's final outputs
Z = decoder_in  # the decoder starts with its own inputs
for _ in range(N):
    # Masked self-attention: causal mask combined with the padding mask.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Cross-attention: the decoder attends to the encoder's outputs.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    # note that the value here is the encoder_outputs
    Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Feed-forward block (no Dropout here, unlike in the encoder loop).
    skip = Z
    Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [None]:
# Final projection of the decoder output to one probability per vocabulary entry.
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z)
# NOTE: rebinds the global `model` that translate() and beam_search() read.
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

In [None]:
# Run the notebook's translate() helper (defined in another cell) on a sample
# sentence; presumably this uses the model trained in the previous cell — confirm.
translate("I like soccer and also going to the beach")

## An Avalanche of Transformer Models

- GPT paper by Alec Radford and other OpenAI researchers, showing the effectiveness of unsupervised pretraining using a transformer-like architecture. It was capable of many tasks, such as text classification, *entailment*, similarity, and question answering.

- Google's BERT paper, demonstrating effectiveness of self-supervised pretraining on a large corpus.

    - Masked Language Model (MLM), where each word in a sentence has a 15% probability of being masked and the model is trained to predict the masked words.
    - Next sentence prediction (NSP), where the model is trained to predict whether two sentences are consecutive or not.

- GPT-2, by OpenAI. An improvement on the GPT model for *zero-shot learning*, where it could achieve good performance on many tasks without fine-tuning.

- DistilBERT, by Hugging Face, is a small and fast transformer model based on BERT. Trained using *distillation* / transferring knowledge from a teacher model to a student one, which usually is much smaller than the teacher model. Typically done by using the teacher's predicted probabilities for each training instance as targets for the student.

- T5, framing all NLP tasks as text-to-text, using an encoder-decoder transformer. For example, the input is "translate English to Spanish: I like soccer" for translation, "summarize: {paragraph}" for summarization, "classify: {paragraph}" for classification, etc.

- Pathways Language Model (PaLM), using only decoders with masked multi-head attention layers. The model achieved incredible performance on all sorts of NLP tasks, including natural language understanding (NLU), through the use of *chain of thought prompting*.

## Vision Transformers

**Visual Attention**, where a convolutional neural network first processes the image and outputs some feature maps, then a decoder RNN equipped with an attention mechanism generates the caption, one word at a time.

At each decoder time step, the decoder uses the attention model to focus on just the right part of the image.

Also, attention mechanisms seem to make it easier for scientists to understand what led the model to produce its outputs!

Facebook researchers proposed a hybrid CNN-transformer architecture for object detection, where the CNN first processes the input images and outputs a set of feature maps, then these feature maps are converted to sequences and fed to a transformer, which outputs bounding box predictions.

Google introduced a fully transformer-based vision model, the *vision transformer* (ViT), where the image is chopped into 16 x 16 squares and the sequence of squares is treated as if it were a sequence of word representations.
1. 16x16x3 (RGB) flattened sequence
2. Use a linear layer to transform them
3. Add positional embeddings
4. Pass the result into a transformer.

Then Facebook introduced the *data-efficient image transformers* (DeiTs), where they used a distillation technique to transfer knowledge from state-of-the-art CNN models to their model.

DeepMind then introduced the *Perceiver* architecture, which is a *multimodal* transformer (the input can be text, images, audio, etc.). The architecture solves the problem of self-attention growing to a large size (since for a sequence of M tokens, the model must compute an M x M matrix for the attention layer) by improving a fairly short *latent (hidden, internal) representation* of the inputs composed of N tokens (typically just a few hundred). (look up what *cross-attention layers* mean). The model uses cross-attention layers only and feeds them the latent representation as queries, so it only requires the model to compute an M x N matrix.

DINO, an impressive vision transformer trained entirely without labels, using self-supervision, and capable of high-accuracy semantic segmentation. It uses a technique called *self-distillation*, where again a teacher and a student model are used: both start out as the same model, but gradient descent only affects the student, while the teacher's weights are an exponential moving average of the student's weights. (so they're basically almost the same model!)

Others include CLIP, DALL-E, GATO, Flamingo, etc.

## Hugging Face's Transformers Library

In [57]:
from transformers import pipeline

# No model is passed here, so the pipeline falls back to a default checkpoint
# (named in this cell's log output); pass model=... to pin a specific one.
classifier = pipeline("sentiment-analysis")  # many other tasks are available
# Score a single sentence and keep the prediction in `result` for display.
result = classifier("The actors were very convincing.")

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Device set to use 0


In [58]:
# Display the prediction stored by the previous cell: a list holding one
# dict with the predicted label and its score.
result

[{'label': 'POSITIVE', 'score': 0.9998071789741516}]

In [59]:
# Two sentences that differ only in the country name, passed as a list so
# each one gets its own prediction.
# NOTE(review): the recorded output labels them differently, which presumably
# illustrates bias picked up by the pretrained model — confirm intent.
classifier(["I am from India.", "I am from Iraq."])

[{'label': 'POSITIVE', 'score': 0.9896161556243896},
 {'label': 'NEGATIVE', 'score': 0.9811071157455444}]

In [60]:
# Same probe with a third country name, for comparison with the previous cell.
classifier("I am from Indonesia.")

[{'label': 'POSITIVE', 'score': 0.9860380291938782}]

In [61]:
# Hub id of the checkpoint to load; judging by its name, a DistilBERT model
# fine-tuned on MNLI. Later cells reuse model_name.
model_name = "huggingface/distilbert-base-uncased-finetuned-mnli"
classifier_mnli = pipeline(task="text-classification", model=model_name)

# Both sentences are packed into a single string, separated by "[SEP]".
sentence_pair = "She loves me. [SEP] She loves me not."
classifier_mnli(sentence_pair)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Device set to use 0


[{'label': 'contradiction', 'score': 0.9790191650390625}]

In [62]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Load the tokenizer and the TensorFlow sequence-classification model for the
# checkpoint named by model_name (set in the previous cell).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): this rebinds `model`, which earlier in the notebook was bound
# to the tf.keras.Model built for translation; cells that rely on that older
# object will see this one instead if run afterwards.
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [63]:
# Each example is one string holding two sentences separated by "[SEP]".
sep_joined_pairs = [
    "I like soccer. [SEP] We all love soccer!",
    "Joe lived for a very long time. [SEP] Joe is old.",
]
# padding=True pads the shorter example (the trailing zeros in the recorded
# output); return_tensors="tf" yields TensorFlow tensors.
token_ids = tokenizer(sep_joined_pairs, return_tensors="tf", padding=True)
token_ids

{'input_ids': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[ 101, 1045, 2066, 4715, 1012,  102, 2057, 2035, 2293, 4715,  999,
         102,    0,    0,    0],
       [ 101, 3533, 2973, 2005, 1037, 2200, 2146, 2051, 1012,  102, 3533,
        2003, 2214, 1012,  102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [None]:
# Equivalent to joining the sentences with "[SEP]": here each example is
# passed as a (first sentence, second sentence) tuple instead.
sentence_pairs = [
    ("I like soccer.", "We all love soccer!"),
    ("Joe lived for a very long time.", "Joe is old."),
]
token_ids = tokenizer(sentence_pairs, return_tensors="tf", padding=True)
token_ids

{'input_ids': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[ 101, 1045, 2066, 4715, 1012,  102, 2057, 2035, 2293, 4715,  999,
         102,    0,    0,    0],
       [ 101, 3533, 2973, 2005, 1037, 2200, 2146, 2051, 1012,  102, 3533,
        2003, 2214, 1012,  102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [65]:
# Call the model directly on the tokenizer output. The returned object exposes
# the raw class scores as `outputs.logits` (shape (2, 3) in the recorded
# output: one row per input pair, one column per class).
outputs = model(token_ids)
outputs

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[-2.1123815 ,  1.1786786 ,  1.4101012 ],
       [-0.01478325,  1.0962472 , -0.99199563]], dtype=float32)>, hidden_states=None, attentions=None)

In [66]:
# The model returns logits, so apply a softmax to turn each row into class
# probabilities.
Y_probas = tf.keras.activations.softmax(outputs.logits)
Y_probas

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.01619702, 0.43523565, 0.5485674 ],
       [0.22655982, 0.68817246, 0.08526779]], dtype=float32)>

In [67]:
# Pick the index of the highest probability in each row (axis=1 runs over the
# class columns).
Y_pred = tf.argmax(Y_probas, axis=1)
Y_pred  # 0 = contradiction, 1 = entailment, 2 = neutral

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([2, 1])>

In [68]:
# The model outputs logits, so the loss is told to expect them.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="nadam", loss=loss, metrics=["accuracy"])

# Tiny illustrative training set: two sentence pairs and their class ids.
sentences = [
    ("Sky is blue", "Sky is red"),
    ("I love her", "She loves me"),
]
y_train = tf.constant([0, 2])  # contradiction, neutral
# .data pulls the underlying mapping out of the tokenizer result
# (presumably the plain dict of tensors Keras expects — confirm).
X_train = tokenizer(sentences, return_tensors="tf", padding=True).data
history = model.fit(x=X_train, y=y_train, epochs=2)

Epoch 1/2
Epoch 2/2


Check out the O’Reilly book Natural Language Processing with Transformers: Building Language Applications with Hugging Face by Lewis Tunstall, Leandro von Werra, and Thomas Wolf—all from the Hugging Face team.

# Exercises