# 0.0 Installing Dependencies

In [None]:
!pip install -q torchdata==0.3.0 torchtext==0.12 "spacy>=3.2,<3.8" altair vega GPUtil
!python -m spacy download de_core_news_sm-3.2.0 --direct
!python -m spacy download en_core_web_sm-3.2.0 --direct

In [3]:
!pip install -q ipython

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.0 kB[0m [31m15.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-p

In [5]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as ttd
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from vega_datasets import data


# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [6]:
"""Some convenience helper functions used throughout the notebook.
   --------------------------------------------------------------
   These helper functions and dummy classes are used throughout the notebook
   to facilitate the execution of examples and provide placeholder
   implementations when needed.
"""

def is_interactive_notebook():
  """checks if the code is being run in an interactive notebook"""
  return __name__ == "__main__"


def show_example(fn, args=[]):
  """used to conditionally execute display the output of a given function
     fn with the provided args. It checks if it the code is in an interactive
     notebook environment and if <RUN_EXAMPLES> flag is set to True."""
  if __name__ == "__main__" and RUN_EXAMPLES:
    return fn(*args)


def execute_example(fn, args=[]):
  """it executes the given function fn with the provided args without returning
     its output. Checks the same two conditions as show_example function"""
  if __name__ == "__main__" and RUN_EXAMPLES:
    fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
  """dummy implementation of 'torch.optim.Optimizer' class. It's used as a
     placeholder optimizer when an actual optimizer is not required.
  """
  def __init__(self):
    self.param_groups = [{"lr": 0}]
    None

  def step(self):
    None

  def zero_grad(self, set_to_none=False):
    None


class DummyScheduler:
  def step(self):
    None

## Downloading the Multi-30k dataset from github

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
%cd /content/gdrive/MyDrive/10714/transformers

/content/gdrive/MyDrive/10714/transformers


In [9]:
!git clone --recursive https://github.com/multi30k/dataset.git multi30k-dataset

fatal: destination path 'multi30k-dataset' already exists and is not an empty directory.


# Part 1: Model Architecture

- Most competitive neural sequence transduction models have an encoder-decoder structure.

- The encoder maps an input sequence of symbols representations (xᵢ,...xₙ) to a sequence of continuous representations 𝐙 = (zᵢ,...zₙ)

  - These continuous representation are the filtered value matrices (𝐙) produced by the encoder.

- Given the 𝐙, the decoder then generates an output sequence (yᵢ,..yₘ) of symbols one element at a time.

- at each step the model is auto-regressive, consuming the previously generated symbols as additional input when generating the next.

In [10]:
class EncoderDecoder(nn.Module):
  """
  Standard Encoder-Decoder architecture. Base for this and many other models.
  """

  def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
    super(EncoderDecoder, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embed = src_embed
    self.tgt_embed = tgt_embed
    self.generator = generator

  def forward(self, src, tgt, src_mask, tgt_mask):
    """Take in and process masked src and target sequence
       src: The source sequence | tgt: The target sequence
       src_mask: The mask for the source sequence.
       tgt_mask: The mask for the target sequence"""
    return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

  def encode(self, src, src_mask):
    """
    applies the source embedding to the source sequence, it then passes the
    embedding source sequence and the source mask to the encoder module.
    This method returns the encoded representation of the source sequence
    """
    return self.encoder(self.src_embed(src), src_mask)

  def decode(self, memory, src_mask, tgt, tgt_mask):
    """
    Applies the target embedding to the target sequence. It passes the embedding
    target sequence, the encoded memory (output of the encoder), the source mask,
    & the target mask through the decoder module. Returns the decoded output.
    """
    return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)


In [11]:
class Generator(nn.Module):
  "Define standard linear + softmax generation step."

  def __init__(self, d_model, vocab):
    super(Generator, self).__init__()
    self.proj = nn.Linear(d_model, vocab)

  def forward(self, x):
    return log_softmax(self.proj(x), dim=-1)

## 1.1 Encoder and Decoder Stacks

### Encoder

The encoder is composed of a stack of 𝐍 = 6 identical layers.

In [12]:
def clones(module, N):
  """Produce N identical layers.  Uses a list comprehension to
     create a list of 'N' deep copies of the module, and return an nn.ModuleList
     containing the cloned modules
     """
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [13]:
class Encoder(nn.Module):
  """
  Core encoder is a stack of 'N'-layers.
  It creates 'N' identical copies of the layer module using clones function
  and assigns them to `self.layers`.
  """
  def __init__(self, layer, N):
    super(Encoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask):
    """Pass the input (and mask) through each layer and after processing
       through all the layers, it applies layer normalization to the final
       output using `self.norm(x)`and returns the normalized output.
    """
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

### 1.1.1 LayerNormalization

Layer normalization is applied after each sublayer (multi-head attention and position-wise feed-forward) in the encoder and decoder.

- The LayerNorm class takes two parameters:

  1. **features**: the (number of features or channels in the input tensor)
  2. eps: a small value added to the variance to avoid 0-division.

- it creates two learnable parameters

  - `self.a_2`: a tensor of shape (features,) initialized with ones. This represents the scale parameter.

  - `self.b_2`: a tensor of shape (features,) initialized with zeros. This represents the bias parameter.

    - The scale and bias parameters are learnable, allowing the model to adapt the normalization to the specific task.

- The purpose of layer normalization is to normalize the activations across the features within each layer. It helps to stabilize the training process and improve the convergence of the model.

  - By normalizing the activations, it reduces the internal covariate shift and allows the model to learn more effectively.




In [14]:
class LayerNorm(nn.Module):
  """
  Construct a layernorm module:
  -----------------------------
  """

  def __init__(self, features, eps=1e-6):
    super(LayerNorm, self).__init__()
    self.a_2 = nn.Parameter(torch.ones(features))
    self.b_2 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

### 1.1.2 Sub Layer

- To faciliate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimensions $d_{model}$ = 512.

- this class represents a residual connection followed by layer normalization.

  - It's used to connect sublayers (multi-head attention and position-wise feed_forward)

`SublayerConnection` takes two parameters:
  1. size: the number of features or hidden units in the sublayer
  2. dropout: the dropout probability.

`Forward` method defines the forward pass of the sublayer connection:

  - it takes two parameters `x` (the input tensor) and `sublayer` (the sublayer module, which can be either multihead attention or position wise feedforward).

The purpose of the SublayerConnection class is to facilitate the residual connection and layer normalization between sublayers in the Transformer architecture. Residual connections, also known as skip connections, allow the gradients to flow directly through the layers, mitigating the vanishing gradient problem and enabling the training of deeper networks.

By applying layer normalization before the sublayer and dropout after the sublayer, the SublayerConnection class helps to stabilize the training process and improve the convergence of the model. The residual connection (adding the input tensor to the sublayer output) allows the model to learn identity functions and facilitates the flow of information across layers.

In [15]:
class SublayerConnection(nn.Module):
  """
  A residual connection followed by a layer norm.
  Note for code simplicity the norm is first as opposed to last.
  """

  def __init__(self, size, dropout):
    super(SublayerConnection, self).__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, sublayer):
    "Apply residual connection to any sublayer with the same size."
    return x + self.dropout(sublayer(self.norm(x)))

Each layer has tw sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.

`EncoderLayer` class: consists of two main components: self-attention and feed forward neetwork.
- utilizes the `SublayerConnection` class for residual connections and layer normalization.

- It takes four parameters:
  - size: The number of features or hidden units in the layer.
  - self_attn: The self-attention module
  - feed_forward: The feed-forward network module.
  - dropout: The dropout probability.

`forward` method: defines the forward pass of the encoder layer.
- it takes two parameters `x` (input tensor) and `mask` (the attention mask).
- The input tensor x goes through the following steps:
  - It passes through the first sublayer connection, which applies layer normalization and the self-attention mechanism.
  - The self-attention is computed using self.self_attn(x, x, x, mask), where x serves as the query, key, and value, and mask is the attention mask.
  - The output of the self-attention sublayer goes through the second sublayer connection, which applies layer normalization and the feed-forward network using self.feed_forward
- the output of the second sublayer connection is returned as the output of the encoder layer.

In [16]:
class EncoderLayer(nn.Module):
  "Encoder is made up of self-attention and feed forward (defined below)"

  def __init__(self, size, self_attn, feed_forward, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 2)
    self.size = size

  def forward(self, x, mask):
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
    return self.sublayer[1](x, self.feed_forward)

## 1.2 Decoder

The `Decoder` class consists of N identical decoder layers and a final normalization layer.

It takes two parameters:
1. `layer`: the decoder layer module to be cloned
2. `N`: the number of decoder layers in the stack and creates N identical copies of the layer module using the `clones` function and assigns them to `self.layers`

`Forward` method: defines the forward pass of the decoder. It takes 4 parameters:

  - x: The input tensor to the decoder (target sequence).
  - memory: The output of the encoder (source sequence representation).
  - src_mask: The mask for the source sequence. Used to mask out padding tokens in the source sequence
  - tgt_mask: The mask for the target sequence.

- The input tensor `x` goes through the following steps:
  1. it's passed through each decoder layer in the stack using a for loop
  2. inside the loop, the layer module is called with the current x, memory, src_mask, and tgt_mask as arguments.
  3. the output of each decoder layer becomes the input `x` for the next layer in the stack.

- After processing all the decoder layers, the final output x is passed through the layer normalization module self.norm.

  - <mark >The normalized output is returned as the output of the decoder.</mark>


In [17]:
class Decoder(nn.Module):
  """
  Generic N layer decoder with masking.
  """

  def __init__(self, layer, N):
    super(Decoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, memory, src_mask, tgt_mask):
    for layer in self.layers:
      x = layer(x, memory, src_mask, tgt_mask)
    return self.norm(x)


### 1.2.1 Single Layer of the Decoder Stack

`DecoderLayer` class consists of three main components:
- self attention, source-attention (also known as encoder-decoder attention), and a feed-forward network.
- It also utilizes the `SublayerConnection` class for residual connections and layer normalization.
- It takes five parameters:
  - size: The number of features or hidden units in the layer.
  - self_attn: The self-attention module.
  - src_attn: The source-attention (encoder-decoder attention) module.
  - feed_forward: The feed-forward network module.
  - dropout: The dropout probability.

- It creates three instances of the SublayerConnection class using the clones function, passing the size and dropout parameters. These instances are assigned to self.sublayer, which is a list of three sublayer connections.

`forward` method: defines the forward pass of the decoder layer and takes 4 parameters:
1. `x`: the input tensor to the decoder layer (target sequence)
2. `memory`: the output of the encoder (source sequence representation)
3. `src_mask`: the mask for the soruce sequence
4. `tgt_mask`: the mask for the target sequence

- the memory tensor is assigned to the variable m

- the the input tensor `x` goes through the first sublayer connection, which applies layer normalization and the self-attention mechanism.
  -  The self-attention is computed using `self.self_attn(x, x, x, tgt_mask)`, where `x` serves as the query, key, and value, and `tgt_mask` is the target sequence mask.

- The output of the self-attention sublayer goes throug the second sublayer connection, which applies layer normalization and the source attention mechanism.
  - The source-attention is computed using `self.src_attn(x, m, m, src_mask)`, where `x` is the query, `m` (memory) is the key and value, and `src_mask` is the source sequence mask.

- The output of the source-attention sublayer goes through the third sublayer connection, which applies layer normalization and the feed-forward network using `self.feed_forward`.

In [18]:
class DecoderLayer(nn.Module):
  "Decoder is made of self-attn, src-attn, and feed forward"

  def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 3)

  def forward(self, x, memory, src_mask, tgt_mask):
    m = memory
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    x = self.sublayer[0](x, lambda x: self.src_attn(x, m, m, src_mask))
    return self.sublayer[2](x, self.feed_forward)



#### 1.2.1.1 Subsequent Masking in the decoder

The subsequent_mask function is used to create a mask that prevents the attention mechanism from attending to subsequent positions in the target sequence during training. This is necessary to ensure that the model does not have access to future information while generating predictions.

- `def subsequent_mask(size)`:
The function takes a single parameter `size`, which represents the size of the target sequence (i.e., the number of positions or tokens in the sequence).

- `attn_shape = (1, size, size)`
This line defines the shape of the attention mask tensor. It creates a tuple attn_shape with three dimensions: batch size (set to 1), target sequence length (size), and target sequence length (size).

- `subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)`

  - This line creates the actual subsequent mask tensor using PyTorch operations.

- `torch.ones(attn_shape)` creates a tensor of shape attn_shape filled with ones.
torch.triu(...) applies an upper triangular mask to the tensor, setting all elements above the main diagonal (specified by diagonal=1) to zero.
  - This creates a mask where the upper triangular part is filled with zeros and the main diagonal and lower triangular part are filled with ones.
- `.type(torch.uint8)` casts the mask tensor to an unsigned 8-bit integer type, which is commonly used for masks.

- By returning `subsequent_mask == 0`, the function effectively inverts the mask, so that the positions that were originally ones (allowed positions) become True, and the positions that were originally zeros (masked positions) become False.



In [19]:
def subsequent_mask(size):
  "Mask out subseqent positions"
  attn_shape = (1, size, size)
  subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
  return subsequent_mask == 0

### 1.2.2 Attention

- An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, and values, and outputs are all vectors.

$$ Attention (Q, K, V) = softmax(\frac{Q*K^T}{\sqrt{d_k}})*V$$

![alt](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQJzZKAY86Cbb-XEKTwxMTEDCBzt8f98tNq3w&s)

- This function takes the following inputs:

  - query: The query tensor of shape (batch_size, num_heads, sequence_length, d_k).
  - key: The key tensor of shape (batch_size, num_heads, sequence_length, d_k).
  - value: The value tensor of shape (batch_size, num_heads, sequence_length, d_v).
  - mask (optional): The mask tensor of shape (batch_size, 1, 1, sequence_length) or (batch_size, 1, sequence_length, sequence_length).
  - dropout (optional): The dropout layer to apply to the attention probabilities.

- d_k = query.size(-1) retrieves the dimensionality of the query vectors



In [20]:
def attention(query, key, value, mask=None, dropout=None):
  # Compute the scaled dot product Attention
  d_k = query.size(-1)
  scores = torch.matmul(query, key.transpose(-2, -1) / math.sqrt(d_k))
  if mask is not None:
    scores = scores.masked_fill(mask == 0, -1e9)
  p_attn = scores.softmax(dim=-1)
  if dropout is not None:
    p_attn = dropout(p_attn)
  return torch.matmul(p_attn, value), p_attn


### 1.2.3 Multi-Head Attention

$$
MultiHead(Q, K, V) = Concat(head_1,..,head_h)W^0
$$

$$where\ head_i = Attention(QW_i^Q, KW_i^K, VW_i^V) $$

- In the ***attention paper*** they employed 8 parallel attention layers (heads). For each of these layers they used $d_k = d_v = d_{model} / h = 64$.

  - Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.

![alt](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQWxgf_Z68_IxjQo4i8_eaNup-FfUAVUu8cww&s)



`MultiHeadAttention` class is an implementation of the multi-head attention mechanism. It allows the model to attend to different parts of the input sequence simultaneously, capturing different relationships and dependencies.

`MultiHeadAttention` class takes 3 parameters:
1. `h`: the number of attention heads
2. `d_model`: the dimensionality of the input and output
3. `dropout`: the dropout probability

- It asserts that `d_model` is divisible by h, ensuring that the dimensions are compatible.
- It calculates `self.d_k` as d_model // h, which represents the dimensionality of each attention head.
- It creates four linear layers using nn.Linear(d_model, d_model) and the clones function, and assigns them to self.linears. These linear layers are used for projecting the queries, keys, and values.
- It initializes self.attn to None, which will store the attention weights during the forward pass.

`forward` method implements the scaled dot-product multi-head attention mechanism
- It takes four parameters: `query, key, value` (the input tensors), and `mask` (the optional attention mask).
  - If mask is provided, it is unsqueezed to add an extra dimension for broadcasting across the attention heads.
- It retrieves the number of batches (`nbatches`) from the size of the `query` tensor.
- Step 1 linear Projections
  - It performs linear projections on the query, key, and value tensors using the corresponding linear layers in self.linears.
  - The projected tensors are reshaped to have dimensions (nbatches, -1, self.h, self.d_k), where -1 represents the sequence length.
  - The reshaped tensors are then transposed to have dimensions (nbatches, self.h, -1, self.d_k), allowing the attention to be applied independently for each head.

- Step 2: Apply Attention

  - It applies the attention function to the projected query, key, and value tensors, along with the mask and dropout.
  - The attention function computes the scaled dot-product attention for each head and returns the attended values (x) and the attention weights (self.attn).


- Step 3: Concatenation and Final Linear Projection
  - It transposes the attended values x to have dimensions (nbatches, -1, self.h, self.d_k).
  - It then reshapes (contiguous().view()) the attended values to have dimensions (nbatches, -1, self.h * self.d_k), effectively concatenating the outputs from all heads.
  - It applies the final linear projection using the last linear layer in self.linears to obtain the output of the multi-head attention.
  
- Finally, it returns the output of the multi-head attention.



In [21]:
class MultiHeadAttention(nn.Module):
  def __init__(self, h, d_model, dropout=0.1):
    "Take in model size and number of heads"
    super(MultiHeadAttention, self).__init__()

    # asserts d_model is divisible by h, ensuring that compatible dimensions
    assert d_model % h == 0
    # We assume d_v always equals d_k
    self.d_k = d_model // h
    self.h = h
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, query, key, value, mask=None):
    "Implement the scaled dot product of the multi-head attention"
    if mask is not None:
      # Same mask applied to all h heads
      mask = mask.unsqueeze(1)
    nbatches = query.size(0)

    # 1) Do all the linear projections in batch from d_model => h x d_k
    query, key, value = [
        lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
        for lin, x in zip(self.linears, (query, key, value))
    ]

    # 2) Apply attention on all the projected vectors in batch.
    x, self.attn = attention(
        query, key, value, mask=mask, dropout=self.dropout
    )

    # 3) "concat" using a view and apply a final linear.
    x = (
        x.transpose(1,2)
        .contiguous()
        .view(nbatches, -1, self.h * self.d_k)
    )
    del query
    del key
    del value
    return self.linears[-1](x)

### 1.2.4 Position-Wise Feed Forward Networks

- each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically.

- The FFN is applied independently to each position of the input sequence, hence the name "position-wise." This allows the model to capture local dependencies and learn position-specific transformations.

- This consists of two linear transformations with a ReLU activation in between.

  - Another way of describing this is as two convolutions with kernel size 1.

  - The dimensionality of input and output $d_{model} = 512$ and the inner-layer has dimensionality $d_{ff} = 2048$

$$FFN(x) = max(0, xW_1 + b_1)W_2 + b_2 $$

#### Code Implementation

`foward` method:
- It takes one parameter: `x` <mark >(the input tensor of shape (batch_size, sequence_length, d_model)).
- The input `x` goes through the following operations:
  - It is passed through the first linear layer self.w_1, mapping it from d_model dimensions to d_ff dimensions.
  - the output of the first linear layer is then passed through the ReLU activation function to introduce non-linearity.
  - The output of the ReLU activation is passed through the dropout layer self.dropout, which randomly sets some elements to zero during training to prevent overfitting.
  - The output of the dropout layer is then passed through the second linear layer self.w_2, mapping it back to d_model dimensions.


In [22]:
class PositionwiseFeedForward(nn.Module):
  "Implementing the FFN equation"

  def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    "x(input tensor of shape (batch_size, sequence length, d_model))"
    return self.w_2(self.dropout(self.w_1(x).relu()))

### 1.2.5 Embeddings and Softmax

<mark > **Embedding** are learned representations that capture the semantic and syntactic relationships between the tokens in the vocabulary.
- we use embeddings to convert the input tokens and output tokens to vectors of dimension $d_{model}$

- also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities.

- The multiplication by math.sqrt(self.d_model) in the forward method is a technique mentioned in the Transformer paper.
  - It scales the embedding vectors by the square root of the embedding dimensionality.
  - This scaling helps to maintain the variance of the embeddings at a similar scale to the other components in the model, such as the position embeddings and the output of the multi-head attention.
  - It has been shown to improve the stability and convergence of the training process.

- The Embeddings class is typically used in conjunction with the `nn.Embedding` layer, which is a lookup table that maps token indices to their corresponding embedding vectors.
  - The Embeddings class encapsulates this lookup process and applies the necessary scaling.

#### Code Implementation
- It creates an embedding layer using `nn.Embedding(vocab, d_model)` and assigns it to `self.lut` (lookup table).
  - This layer maps each token in the vocabulary to a dense vector of size d_model.

`forward` method:
- It takes one parameter: `x` (the input tensor of token indices, typically of shape (batch_size, sequence_length)).

- The input `x` is passed through the embedding layer `self.lut`, which looks up the corresponding embedding vectors for each token index in x.

- The resulting embedding tensor is then multiplied by math.sqrt(self.d_model). This scaling is done to mitigate the vanishing gradients problem in the softmax function when the dimensionality of the embeddings is high.



In [23]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.embedding = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.embedding(x) * math.sqrt(self.d_model)

### 1.2.6 Positional Encoding

- Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some info about the relative or absolute position of the tokens in the sequence.

- To this end, we add **`positional encodings`** to the input embeddings at the bottoms of the encoder and decoder stacks.

- The positional encodings have the same dimension $d_{model}$ as the embeddings, so that the two can be summed.

- We use sine and consine functions of different frequencies.

$$PE(pos, 2_i) = sin(\frac{pos}{10000^{\frac{2i}{d_{model}}}})$$


$$PE(pos, 2_i+1) = cos(\frac{pos}{10000^{\frac{2i}{d_{model}}}})$$

- where `pos` is the position and `i` is the dimension.
  - That is each dimension of the positional encoding corresponds to a sinusoid.

#### Code Implementation

- `pe = torch.zeros(max_len, d_model)` creates a tensor pe of shape (max_len, d_model) initialized with zeros.
- max_len represents the maximum sequence length, and d_model is the dimensionality of the positional encoding, which is typically the same as the dimensionality of the input embeddings.
- Each row of pe corresponds to a position in the sequence, and each column represents a dimension of the positional encoding.

- `position = torch.arange(0, max_len).unsqueeze(1)`:
  - This line creates a tensor position of shape (max_len, 1) containing the position indices from 0 to max_len-1.
  - unsqueeze(1) is used to add an extra dimension to the tensor, making it a column vector.

- `div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))`:
  - This line computes the denominator term for the sinusoidal positional encoding.
  - `torch.arange(0, d_model, 2)` creates a tensor of even indices from 0 to d_model-1 with a step of 2.
  - `-(math.log(10000.0) / d_model)` computes a scaling factor based on the dimensionality of the positional encoding.

- `pe = pe.unsqueeze(0)`:
  - This line adds an extra dimension to pe at the beginning, making it suitable for broadcasting with the input embeddings.
  - The resulting shape of pe is (1, max_len, d_model).

In [24]:
class PositionalEncoding(nn.Module):
  "Implement the Positional Encoding function"

  def __init__(self, d_model, dropout, max_len=5000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(p=dropout)

    # compute the positional encoding once in log space
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
    )
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer("pe", pe)

  def forward(self, x):
    x = x + self.pe[:, : x.size(1)].requires_grad_(False)
    return self.dropout(x)

## 1.3 Full Model

- This is a function from hyperparameters to a full model

### Code Implementation
`make_model` function takes several arguments:
  - `src_vocab`: the size of the source vocabulary
  - `tgt_vocab`: the size of the target vocabulary
  - `N`: the number of layers in the encoder and decoder stacks (default: 6)
  - `d_model`: the dimensionality of the model (default: 512)
  - `d_ff`: the dimensionality of the feed-forward network (default: 2048)
  - `h`: the number of attention heads (default: 8)
  - `dropout`: the dropout probability (default: 0.1)

1. `c = copy.deepcopy`: creates a function `c` that performs a deep copy of an object.
  - Used to create separate copies of attention, feed-forward, and positional encoding for each layer.
2. `attn = MultiHeadedAttention(h, d_model):` This line creates an instance of the MultiHeadedAttention module with h attention heads and a model dimensionality of d_model
3. ...` nn.Sequential(Embeddings(d_model, src_vocab), c(position)):` The source embedding module, which converts source tokens to dense vectors and applies positional encoding.
4. `Generator(d_model, tgt_vocab):` The generator module, which converts the decoder outputs to predicted target tokens using the softmax equation.

In [25]:
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
  "Helper: Construct a model from hyperparameters"

  c = copy.deepcopy
  attn = MultiHeadAttention(h, d_model)
  ff = PositionwiseFeedForward(d_model, d_ff, dropout)
  position = PositionalEncoding(d_model, dropout)
  model = EncoderDecoder(
      Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
      Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
      nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
      nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
      Generator(d_model, tgt_vocab),
  )

  # This was important for their code.
  # Initialize parameters with Glorot / fan_avg.
  for p in model.parameters():
    if p.dim() > 1:
      nn.init.xavier_uniform_(p)
  return model

### 1.3.1 Inference

- This is just a inference test. We are trying to generate a prediction of the model.

- We try to use the transfomer to memorize the input.

In [26]:
def inference_test():
    test_model = make_model(11, 11, 2)
    test_model.eval()
    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    src_mask = torch.ones(1, 1, 10)
    memory = test_model.encode(src, src_mask)
    ys = torch.zeros(1, 1).type_as(src)
    for i in range(9):
        tgt_mask = subsequent_mask(ys.size(1)).type_as(src.data)
        out = test_model.decode(memory, src_mask, ys, tgt_mask)
        prob = test_model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat(
            [ys, torch.empty(1, 1).type_as(src.data).fill_(next_word)], dim=1
        )
    print("Example Untrained Model Prediction:", ys)

def run_tests():
  for _ in range(10):
    inference_test()

show_example(run_tests)

Example Untrained Model Prediction: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Example Untrained Model Prediction: tensor([[0, 4, 8, 7, 8, 7, 8, 7, 8, 7]])
Example Untrained Model Prediction: tensor([[0, 7, 1, 2, 2, 2, 4, 4, 4, 5]])
Example Untrained Model Prediction: tensor([[ 0,  9,  9,  9,  9,  9, 10,  9, 10,  9]])
Example Untrained Model Prediction: tensor([[0, 5, 5, 5, 1, 0, 1, 0, 1, 0]])
Example Untrained Model Prediction: tensor([[0, 3, 9, 6, 9, 5, 9, 4, 6, 9]])
Example Untrained Model Prediction: tensor([[0, 1, 5, 5, 3, 1, 5, 5, 5, 5]])
Example Untrained Model Prediction: tensor([[0, 1, 3, 1, 3, 1, 3, 1, 3, 8]])
Example Untrained Model Prediction: tensor([[ 0,  1,  4,  5,  2,  1,  4,  9, 10,  9]])
Example Untrained Model Prediction: tensor([[0, 6, 6, 6, 6, 6, 6, 6, 6, 6]])


# Part 2: Model Training

- We Examines tools needed to train a standard encoder decoder model. First we define a batch object that holds the src and target sentences for training, as well as constructing the masks.

## 2.1 Batches and Masking

- Batch class is responsible for holding a batch of data along with the necessary mask during the training process.

### Code Implementation
- `src`: the source of the sequence tensor
- `tgt`: the target sequence tensor
- `pad`: the padding token index

- `self.src_mask` is created by comparing src with the padding token and unsqueezing the result to add an additional dimension at index -2. This mask is used to indicate which positions in the source sequence are valid (not padding).

- `self.tgt`: stores the target sequence excluding the last token.
- `self.tgt_y`: stores the target sequence tensor excluding the first token.
  - This is used as the ground truth for training.
- `self.tgt_mask`: created by calling the `make_std_mask` method, which creates a mask to hide padding and future words in the target sequence.
- `self.ntokens` calculates the total number of non-padding tokens in `self.tgt_y`

In [27]:
class Batch:
  "Object for holding a batch of data of data with mask during training"

  def __init__(self, src, tgt=None, pad=2):
    self.src = src
    self.src_mask = (src != pad).unsqueeze(-2)
    if tgt is not None:
      self.tgt = tgt[:, :-1] # stores target sequence excluding the last token
      self.tgt_y = tgt[:, 1:] # store target sequence excluding first token
      self.tgt_mask = self.make_std_mask(self.tgt, pad)
      self.ntokens = (self.tgt_y != pad).data.sum()

  @staticmethod
  def make_std_mask(tgt, pad):
    "Create a mask to hide padding and future words."
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
    return tgt_mask

## 2.2 Training Loop

`TrainState` class tracks the training statistics

'run_epoch' function is responsible for training the model for each single epoch and it takes in 8 paramters.

- the function iterates over the batches of data yielded by `data_iter` and for each batch:
  1. It performs a forward pass through the model, passing the source sequence (`batch.src`), target sequence (`batch.tgt`), and their respective masks (`batch.src_mask, batch.tgt_mask`) as inputs.

  2. It computes the loss using `loss_compute` function, which takes the model's output (`out`), the target labels (`batch.tgt_y`), and the number of tokens in the batch (`batch.ntokens`)

  3. If the mode is "train" or "train+log", it performs a backward pass to compute gradients and updates the `train_state` object with the current step, number of samples, and tokens processed.

  4. If the current iteration is a multiple of `accum_iter`, it performs an optimization step using the `optimizer`to update the model's parameters and resets the gradients to zero.
    - it also updates the learning rate using the `scheduler`.

  5. It accumulates the total loss and the total number of tokens processed.
  6. If the current iteration is a multiple of 40 and the mode is <mark >"train" or "train+log"</mark> it prints training stats such as the current epoch step, accumulation step, loss, tokens per second, and learning rate.
    - remember an iteration is the number of batches needed to complete an epoch.

- After processing all the batches, the function returns the average loss per token (total loss divided by total tokens) and the updated `train_state` object.

#### Gradient Accumulation and Learning Rate Scheduler

A technique used to simulate larger batch sizes without increasing memory usage. Instead of updating the model's parameters after each batch, gradients are accumulated over multiple batches before performing an optimization step.
- This allows for effective training with limited memory resources.
- The learning rate scheduler is used to adjust the learning rate during training. It helps in converging the model towards a better solution by adapting the learning rate based on the training progress.

In [28]:
class TrainState:
  """
  This class is used to track various training statistics:
  - step: the current step within the epoch
  - accum_step: the number of gradient accumulation steps performed
  - samples: the total number of examples (samples) processed
  - tokens: the total number of tokens processed.
  """
  step: int = 0 # steps in the current epoch
  accum_step: int = 0 # Number of gradient accumulation steps
  samples: int = 0 # total number of examples used
  tokens: int = 0 # total number of tokens processed

In [29]:
def run_epoch(
    data_iter, # an iterator that yields batches of data
    model, # the transformer model being trained
    loss_compute, # computes the loss given model's output and target labels
    optimizer, # optimizer used to update the model's parameters
    scheduler, # the learning rate scheduler
    mode="Train", # string indicating mode of operation (i.e., "train", "train+log").
    accum_iter=1, # the number of gradient accumulation steps
    train_state=TrainState()): # instance of TrainState class to track training stats

    """Train a single epoch"""
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    n_accum = 0
    for i, batch in enumerate(data_iter):
      out = model.forward(
          batch.src, batch.tgt, batch.src_mask, batch.tgt_mask
      )
      loss, loss_node = loss_compute(out, batch.tgt_y, batch.ntokens)
      # loss_node = loss_node / accum_iter
      if mode == "train" or mode == "train+log":
        loss_node.backward()
        train_state.step += 1
        train_state.samples += batch.src.shape[0]
        train_state.tokens += batch.ntokens
        if i % accum_iter == 0:
          optimizer.step()
          optimizer.zero_grad(set_to_none=True)
          n_accum += 1
          train_state.accum_step += 1
        scheduler.step()

      total_loss += loss
      total_tokens += batch.ntokens
      tokens += batch.ntokens
      if i % 40 == 1 and (mode == "train" or mode == "train+log"):
        lr = optimizer.param_groups[0]["lr"]
        elapsed = time.time() - start
        print(
            (
                "Epoch Step: %6d | Accumulation step: %3d | Loss: %6.2f "
                + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
            )
            % (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr)
        )
        start = time.time()
        tokens = 0
      del loss
      del loss_node
    return total_loss / total_tokens, train_state



## 2.3 Optimizer

- Used the `Adam optimizer` with $β_1=0.9$, $β_2=0.98$, and $ϵ=10^{-9}$

- Varied the learning rate over the course of training according to the formula

$$learning \ rate = d_{model}^{-0.5} * min(step \ num^{-0.5}, step \ num * warmup \ steps^{-1.5})$$

### Code Implementation
The rate function is used to calculate the learning rate at each step during the training process.
- `step`: the current training step

- `model_size`: The size of the model (usually the dimensionality of the hidden states).
- `factor`: A scaling factor for the learning rate.
- `warmup`: The number of warmup steps.

- `step * warmup ** (-1.5)`: This term is used during the warmup phase. It increases the learning rate linearly for the first warmup steps and then decreases it thereafter.
- `min(step ** (-0.5), step * warmup ** (-1.5))`: This expression selects the minimum value between the two terms. It ensures that the learning rate increases linearly during the warmup phase and then transitions to the inverse square root decay.
- `factor`: The scaling factor is applied to the learning rate to adjust its overall magnitude.

The purpose of this learning rate schedule is to adapt the learning rate during training to improve convergence and generalization. The warmup phase allows the model to stabilize in the early stages of training, while the inverse square root decay helps the model to fine-tune its parameters as training progresses.

In [30]:
def rate(step, model_size, factor, warmup):
  """
  We have to default the step to 1 for LambdaLR function
  to avoid zero raising to negative power."""

  if step == 0:
    step = 1
  return factor * (
      model_size ** (0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
  )

## 2.4 Regularization

<mark > **Label smoothing**</mark>: A regularization technique used in classification tasks to prevent the model from becoming too confident in its predictions.
- it address the problem of overfitting by smoothing the ground-truth labels, making the model less prone to memorizing the training data and more adaptable to unseen examples.
- In the context of transformers, label smoothing is applied to the output probabilities of the model during training.
  - Instead of using one-hot encoded labels, where the true class has a probability of 1 and all other classes have a probability of 0, label smoothing assigns a small non-zero probability to the non-target classes.

- They use label smoothing of value $ϵ_{ls}=0.1$, which hurts perplexity as the model learns to be more unsure, but improves accuracy and BLEU score.

- they used KL divergence loss to implement label smoothing



In [31]:
class LabelSmoothing(nn.Module):
  """
    Implement label smoothing
    --------------------------
    size: number of classes in the classification task
    padding_idx: the index of padding token, ignored during loss computation
    smoothing: the smoothing factor, determines the amount of probability
    redistributed to non-target classes.
    """

  def __init__(self, size, padding_idx, smoothing=0.0):

    super(LabelSmoothing, self).__init__()
    self.criterion = nn.KLDivLoss(reduction="sum")
    self.padding_idx = padding_idx
    self.confidence = 1.0 - smoothing
    self.smoothing = smoothing
    self.size = size
    self.true_dist = None

  def forward(self, x, target):
    """
    Takes model's output (x) and the target labels (target) as input.
    Computes the smoothed labels and returns the KL-divergence loss between
    the model's output and the smoothed labels.

    true_dist: created by cloning the model's output x
    smoothing prob: is assigned to all non-target classes in true_dist
    the confidence prob: assigned to the target class in true_dist
    """
    assert x.size(1) == self.size
    true_dist = x.data.clone()
    true_dist.fill_(self.smoothing / (self.size - 2))
    true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
    true_dist[:, self.padding_idx] = 0
    mask = torch.nonzero(target.data == self.padding_idx)
    if mask.dim() > 0:
      true_dist.index_fill_(0, mask.squeeze(), 0.0)
    self.true_dist = true_dist
    return self.criterion(x, true_dist.clone().detach())

# A First Example Using synthetic Data

#### Copy Task
- This is a simple yet fundamental task in sequence-to-sequence (Seq2Seq) learning. The goal is to train a model that can accurately reproduce or copy the input sequence as its output.
  - in other words, given an input sequence of symbols, the model should generate the same sequence of symbols as its output.

The copy task may seem trivial, but it serves several important purposes:

1. It helps validate the correctness of the model implementation. If the model can accurately copy the input sequence, it indicates that the architecture and training process are working as expected.
2. It serves as a basic test of the model's ability to capture and reproduce sequential information.
3. It provides a foundation for more complex tasks, such as language modeling or machine translation, where the model needs to generate output sequences based on input sequences.

## Synthetic Data

#### Code Implementation

`data_gen` takes 3 parameters
  1. `vocab_size`: The size of the vocabulary, i.e., the number of unique symbols in the dataset.
  2. `batch_size`: The number of sequences in each batch.
  3. `num_batches`: The total number of batches to generate.

- We start a loop that iterates num_batches times, generating a new batch of data in each iteration.
  - Inside the loop, we generate a random tensor data of shape (batch_size, 10) using torch.randint.
  - This tensor represents a batch of input sequences, where each sequence has a length of 10 symbols. The symbols are randomly selected from the range [1, vocab_siz].

- We set the first symbol of each sequence in data to be 1. This can be considered as a start-of-sequence token, which is a common practice in seq2seq models.

- We create two copies of the data tensor: src and tgt.
  - These copies are detached from the computational graph using .clone().detach(), which means they don't require gradients and won't be involved in the backward pass during training.
- We yield a Batch object containing src, tgt, and 0.
  - The Batch object is likely a custom class defined elsewhere in the code to represent a batch of data.
  - The third argument 0 could represent some additional information about the batch, such as the source sequence length.


In [32]:
def data_gen(vocab_size, batch_size, num_batches):
  "Generate random data for a src-tgt copy task"
  for i in range(num_batches):
    data = torch.randint(1, vocab_size, size=(batch_size, 10))
    data[:, 0] = 1
    src = data.clone().detach()
    tgt = data.clone().detach()
    yield Batch(src, tgt, 0)

## Loss Computation

#### Code Implementation

`__call__` method is invoked when an instance of SimpleLossCompute is called like a function. It takes 3 parameters:
  1. `x`: The output of the transformer model before passing through the generator.
  2. `y`: the target sequences
  3. `norm`: a normalization factor used to scale the loss value.
    -  It is typically the sum of the source sequence lengths in the batch.

- inside the `__call__` method the first step is to pass the model's output x through the generator to obtain the final output probabilities.
- loss then computed using `criterion` function.
  - Before passing `x` and `y` to the criterion, they are reshaped to be compatible with the expected input format of the loss function (2D tensor where first dimension is flattened batch size and sequence length and 2nd dimension is the vocab size).
  - `y` reshaped the target tensor into a 1D tensor, flattening the batch size and sequence length dimensions.

- the computed loss is then divided by the normalization factor `norm` to obtain scaled loss value.

- Finally, the method returns two values:

  - `sloss.data * norm`: The scaled loss value as a Python scalar. `.data` is used to extract the underlying data tensor from the `sloss` tensor.
  - `sloss`: The unscaled loss tensor, which can be used for backpropagation during training.

In [33]:
class SimpleLossCompute:
  """A simple loss compute and train function
  - Generator: the final layer of the transformer model that generates the'
    output probabilities.
  - criterion: The loss function used to compute the difference between
    the predicted output and the target sequences.
    Typically an instance of nn.CrossEntropyLoss or similar loss function.
    """

  def __init__(self, generator, criterion):
    self.generator = generator
    self.criterion = criterion

  def __call__(self, x, y, norm):
    x = self.generator(x)
    sloss = (
        self.criterion(
            x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
        )
        / norm
    )
    return sloss.data * norm, sloss

## Greedy Decoding

This code predicts the translation using greedy decoding for simplicity.

#### Code Implementation
`greedy_decode` function takes in the following parameters:
  - `model`: the trained transformer model
  - 'src`: the source sequence tensor
  - `src_mask`: The mask tensor for the source sequence.
  - `start_symbol`: the start-of-sequence symbol

- The first step is to encode the source sequence `src` using the transformer model's encoder.
  - This is done by calling `model.encode(src, src_mask)`, which returns the memory tensor.

- We initialize the generated sequence `ys` as a tensor of shape (1, 1) filled with the `start_symbol`. The `type_as(src)` ensures that `ys` has the same data type as the source tensor.

- We start a loop that iterates `max_len - 1` times. In each iteration:

  - We decode the previously generated sequence `ys` using the transformer model's decoder.
    - This is done by calling `model.decode(memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src))`, which returns the output tensor `out`.
  - We pass the last generated word `out[:, -1]` through the model's generator to obtain the probability distribution over the vocabulary.
  - We find the word with the highest probability using `torch.max(prob, dim=1)`, which returns the maximum value and its index.
  - We extract the index, which represents the next generated word, using `next_word = next_word.item().`

  - We concatenate the generated word next_word to the previously generated sequence ys using `torch.cat([ys, torch.ones(1, 1).type_as(src).fill_(next_word)], dim=1)`.



In [34]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
  memory = model.encode(src, src_mask)
  ys = torch.ones(1, 1).fill_(start_symbol).type_as(src)

  for i in range(max_len - 1):
    out = model.decode(
        memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src)
    )
    prob = model.generator(out[:, -1])
    _, next_word = torch.max(prob, dim=1)
    next_word = next_word.item()
    ys = torch.cat(
        [ys, torch.ones(1, 1).type_as(src).fill_(next_word)], dim=1
    )
  return ys

In [35]:
def example_simple_model():
    V = 11
    criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
    model = make_model(V, V, N=2)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, model_size=model.src_embed[0].d_model, factor=1.0, warmup=400
        ),
    )

    batch_size = 80
    for epoch in range(20):
        model.train()
        run_epoch(
            data_gen(V, batch_size, 20),
            model,
            SimpleLossCompute(model.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train",
        )
        model.eval()
        run_epoch(
            data_gen(V, batch_size, 5),
            model,
            SimpleLossCompute(model.generator, criterion),
            DummyOptimizer(),
            DummyScheduler(),
            mode="eval",
        )[0]

    model.eval()
    src = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
    max_len = src.shape[1]
    src_mask = torch.ones(1, 1, max_len)
    print(greedy_decode(model, src, src_mask, max_len=max_len, start_symbol=0))


# execute_example(example_simple_model)

# Part 3: A Real World Example

- We are training the Multi30k German-English Translation task.



## 3.1 Data Loading

- we will load the dataset using torchtext and spacy for tokenization

In [36]:
def load_tokenizers():
  try:
    spacy_de = spacy.load('de_core_news_sm')
  except IOError:
    os.system("python -m spacy download de_core_news_sm")
    spacy_de = spacy.load("de_core_news_sm")

  try:
    spacy_en = spacy.load("en_core_web_sm")
  except IOError:
    os.system("python -m spacy download en_core_web_sm")
    spacy_en = spacy.load("en_core_web_sm")

  return spacy_de, spacy_en

In [37]:
def tokenize(text, tokenizer):
  """
  takes inputs (text, tokenizer) and returns a list of tokenized words.
  - uses the `tokenizer.tokenizer(text)` to tokenize the input text into
    individual tokens
  - extracts the text of each token using `tok.text` and returns a list of
    tokenized words.
  """
  return [tok.text for tok in tokenizer.tokenizer(text)]


def yield_tokens(data_iter, tokenizer, index):
  """
  A generator that yields tokenized sentences from the data iter. It iterates
  over each `from_to_tuple` in the `data_iter`
  - Each `from_to_tuple` is a tuple containing the source and target language
    sentence.
  - for each `from_to_tuple`, it extracts the sentence at the specific index
    (src or tgt index) and tokenizes it using the tokenizer.
  - yields the tokenized sentence.
  """
  for from_to_tuple in data_iter:
    yield tokenizer(from_to_tuple[index])


In [38]:
import gzip

def build_vocabulary(spacy_de, spacy_en):
    """
    responsible for building the vocab for both german (src) and english (tgt)
    - two inner functions: tokenize the german and english text
    """
    def tokenize_de(text):
        return tokenize(text, spacy_de)

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    print("Building German Vocabulary...")
    with gzip.open('/content/gdrive/MyDrive/10714/transformers/multi30k-dataset/data/task1/raw/train.de.gz', 'rt') as f:
        train_de = [line.strip() for line in f]
    with gzip.open('/content/gdrive/MyDrive/10714/transformers/multi30k-dataset/data/task1/raw/train.en.gz', 'rt') as f:
        train_en = [line.strip() for line in f]
    with gzip.open('/content/gdrive/MyDrive/10714/transformers/multi30k-dataset/data/task1/raw/val.de.gz', 'rt') as f:
        val_de = [line.strip() for line in f]
    with gzip.open('/content/gdrive/MyDrive/10714/transformers/multi30k-dataset/data/task1/raw/val.en.gz', 'rt') as f:
        val_en = [line.strip() for line in f]
    with gzip.open('/content/gdrive/MyDrive/10714/transformers/multi30k-dataset/data/task1/raw/test_2017_mscoco.de.gz', 'rt') as f:
        test_de = [line.strip() for line in f]
    with gzip.open('/content/gdrive/MyDrive/10714/transformers/multi30k-dataset/data/task1/raw/test_2017_mscoco.en.gz', 'rt') as f:
        test_en = [line.strip() for line in f]

    vocab_src = build_vocab_from_iterator(
        yield_tokens(train_de + val_de + test_de, tokenize_de, index=0),
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"]
    )

    print("Building English Vocabulary***")
    vocab_tgt = build_vocab_from_iterator(
        yield_tokens(train_en + val_en + test_en, tokenize_en, index=0),
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"]
    )

    vocab_src.set_default_index(vocab_src["<unk>"]) # default is to unknown words
    vocab_tgt.set_default_index(vocab_tgt["<unk>"])

    return vocab_src, vocab_tgt

In [39]:
def load_vocab(spacy_de, spacy_en):
    """
    Responsible for loading the vocabularies.
    - checks to see if vocabs are already saved in the file 'vocab.pt',
      if the file does not exist, it calls `build_vocabulary()` to build the
      vocabs and save them into the 'vocab.pt'
    - if the file does exist, it loads it and prints the sizes of the loaded vocab
    - finally, returns the loaded vocabs vocab_src & vocab_tgt
    """
    if not exists("vocab.pt"):
        vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
        torch.save((vocab_src, vocab_tgt), "vocab.pt")
    else:
        vocab_src, vocab_tgt = torch.load("vocab.pt")
    print("Finished.\nVocabulary sizes:")
    print(len(vocab_src))
    print(len(vocab_tgt))
    return vocab_src, vocab_tgt

if is_interactive_notebook():
    # global variables used later in the script
    spacy_de, spacy_en = show_example(load_tokenizers)
    vocab_src, vocab_tgt = show_example(load_vocab, args=[spacy_de, spacy_en])

Finished.
Vocabulary sizes:
36
36


In [40]:
def collate_batch(
    batch,
    src_pipeline,
    tgt_pipeline,
    src_vocab,
    tgt_vocab,
    device,
    max_padding=128,
    pad_id=2,
):
    bs_id = torch.tensor([0], device=device)  # <s> token id
    eos_id = torch.tensor([1], device=device)  # </s> token id
    src_list, tgt_list = [], []
    for (_src, _tgt) in batch:
        processed_src = torch.cat(
            [
                bs_id,
                torch.tensor(
                    src_vocab(src_pipeline(_src)),
                    dtype=torch.int64,
                    device=device,
                ),
                eos_id,
            ],
            0,
        )
        processed_tgt = torch.cat(
            [
                bs_id,
                torch.tensor(
                    tgt_vocab(tgt_pipeline(_tgt)),
                    dtype=torch.int64,
                    device=device,
                ),
                eos_id,
            ],
            0,
        )
        src_list.append(
            # warning - overwrites values for negative values of padding - len
            pad(
                processed_src,
                (
                    0,
                    max_padding - len(processed_src),
                ),
                value=pad_id,
            )
        )
        tgt_list.append(
            pad(
                processed_tgt,
                (0, max_padding - len(processed_tgt)),
                value=pad_id,
            )
        )

    src = torch.stack(src_list)
    tgt = torch.stack(tgt_list)
    return (src, tgt)

In [45]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

def create_dataloaders(
    device,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
    batch_size=12000,
    max_padding=128,
    is_distributed=True,
):
    def tokenize_de(text):
        return tokenize(text, spacy_de)

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    def collate_fn(batch):
        return collate_batch(
            batch,
            tokenize_de,
            tokenize_en,
            vocab_src,
            vocab_tgt,
            device,
            max_padding=max_padding,
            pad_id=vocab_src.get_stoi()["<blank>"],
        )

    class Multi30kDataset(Dataset):
        def __init__(self, data_dir, split):
            self.data_dir = data_dir
            self.split = split
            self.src_data, self.tgt_data = self.load_data()

        def load_data(self):
            src_data = []
            tgt_data = []

            with gzip.open(f"{self.data_dir}/data/task1/raw/{self.split}.de.gz", 'rt') as f:
                src_data = [line.strip() for line in f]
            with gzip.open(f"{self.data_dir}/data/task1/raw/{self.split}.en.gz", 'rt') as f:
                tgt_data = [line.strip() for line in f]

            return src_data, tgt_data

        def __len__(self):
            return len(self.src_data)

        def __getitem__(self, idx):
            return self.src_data[idx], self.tgt_data[idx]

    data_dir = "/content/gdrive/MyDrive/10714/transformers/multi30k-dataset"
    train_dataset = Multi30kDataset(data_dir, "train")
    valid_dataset = Multi30kDataset(data_dir, "val")
    test_dataset = Multi30kDataset(data_dir, "test_2017_mscoco")

    train_sampler = DistributedSampler(train_dataset) if is_distributed else None
    valid_sampler = DistributedSampler(valid_dataset) if is_distributed else None

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        sampler=train_sampler,
        collate_fn=collate_fn,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=(valid_sampler is None),
        sampler=valid_sampler,
        collate_fn=collate_fn,
    )

    return train_dataloader, valid_dataloader

In [46]:
def train_worker(
    gpu,
    ngpus_per_node,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
    config,
    is_distributed=False,
):
    print(f"Train worker process using GPU: {gpu} for training", flush=True)
    torch.cuda.set_device(gpu)

    pad_idx = vocab_tgt["<blank>"]
    d_model = 512
    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    model.cuda(gpu)
    module = model
    is_main_process = True
    if is_distributed:
        dist.init_process_group(
            "nccl", init_method="env://", rank=gpu, world_size=ngpus_per_node
        )
        model = DDP(model, device_ids=[gpu])
        module = model.module
        is_main_process = gpu == 0

    criterion = LabelSmoothing(
        size=len(vocab_tgt), padding_idx=pad_idx, smoothing=0.1
    )
    criterion.cuda(gpu)

    train_dataloader, valid_dataloader = create_dataloaders(
        gpu,
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        batch_size=config["batch_size"] // ngpus_per_node,
        max_padding=config["max_padding"],
        is_distributed=is_distributed,
    )

    optimizer = torch.optim.Adam(
        model.parameters(), lr=config["base_lr"], betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, d_model, factor=1, warmup=config["warmup"]
        ),
    )
    train_state = TrainState()

    for epoch in range(config["num_epochs"]):
        if is_distributed:
            train_dataloader.sampler.set_epoch(epoch)
            valid_dataloader.sampler.set_epoch(epoch)

        model.train()
        print(f"[GPU{gpu}] Epoch {epoch} Training ====", flush=True)
        _, train_state = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in train_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train+log",
            accum_iter=config["accum_iter"],
            train_state=train_state,
        )

        GPUtil.showUtilization()
        if is_main_process:
            file_path = "%s%.2d.pt" % (config["file_prefix"], epoch)
            torch.save(module.state_dict(), file_path)
        torch.cuda.empty_cache()

        print(f"[GPU{gpu}] Epoch {epoch} Validation ====", flush=True)
        model.eval()
        sloss = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in valid_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            DummyOptimizer(),
            DummyScheduler(),
            mode="eval",
        )
        print(sloss)
        torch.cuda.empty_cache()

    if is_main_process:
        file_path = "%sfinal.pt" % config["file_prefix"]
        torch.save(module.state_dict(), file_path)

In [47]:
from os.path import exists

def train_distributed_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    ngpus = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12356"
    print(f"Number of GPUs detected: {ngpus}")
    print("Spawning training processes ...")
    mp.spawn(
        train_worker,
        nprocs=ngpus,
        args=(ngpus, vocab_src, vocab_tgt, spacy_de, spacy_en, config, True),
    )


def train_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    if config["distributed"]:
        train_distributed_model(
            vocab_src, vocab_tgt, spacy_de, spacy_en, config
        )
    else:
        train_worker(
            0, 1, vocab_src, vocab_tgt, spacy_de, spacy_en, config, False
        )


def load_trained_model():
    config = {
        "batch_size": 32,
        "distributed": False,
        "num_epochs": 8,
        "accum_iter": 10,
        "base_lr": 1.0,
        "max_padding": 72,
        "warmup": 3000,
        "file_prefix": "multi30k_model_",
    }
    model_path = "/content/gdrive/MyDrive/10714/transformers/multi30k_model_final.pt"
    if not exists(model_path):
        train_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config)

    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    model.load_state_dict(torch.load(model_path))
    return model


if is_interactive_notebook():
    model = load_trained_model()

Train worker process using GPU: 0 for training
[GPU0] Epoch 0 Training ====
Epoch Step:      1 | Accumulation step:   1 | Loss:   2.77 | Tokens / Sec:  1693.3 | Learning Rate: 2.8e-04
Epoch Step:     41 | Accumulation step:   5 | Loss:   3.33 | Tokens / Sec:  1680.0 | Learning Rate: 5.8e-03
Epoch Step:     81 | Accumulation step:   9 | Loss:   1.78 | Tokens / Sec:  1710.1 | Learning Rate: 1.1e-02
Epoch Step:    121 | Accumulation step:  13 | Loss:   1.35 | Tokens / Sec:  1762.5 | Learning Rate: 1.7e-02
Epoch Step:    161 | Accumulation step:  17 | Loss:   1.20 | Tokens / Sec:  1781.7 | Learning Rate: 2.2e-02
Epoch Step:    201 | Accumulation step:  21 | Loss:   0.93 | Tokens / Sec:  1796.5 | Learning Rate: 2.8e-02
Epoch Step:    241 | Accumulation step:  25 | Loss:   0.82 | Tokens / Sec:  1841.9 | Learning Rate: 3.3e-02
Epoch Step:    281 | Accumulation step:  29 | Loss:   0.80 | Tokens / Sec:  1817.6 | Learning Rate: 3.9e-02
Epoch Step:    321 | Accumulation step:  33 | Loss:   0.68 |

In [43]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [48]:
def check_outputs(
    valid_dataloader,
    model,
    vocab_src,
    vocab_tgt,
    n_examples=15,
    pad_idx=2,
    eos_string="</s>",
):
    results = [()] * n_examples
    for idx in range(n_examples):
        print("\nExample %d ========\n" % idx)
        b = next(iter(valid_dataloader))
        rb = Batch(b[0], b[1], pad_idx)
        greedy_decode(model, rb.src, rb.src_mask, 64, 0)[0]

        src_tokens = [
            vocab_src.get_itos()[x] for x in rb.src[0] if x != pad_idx
        ]
        tgt_tokens = [
            vocab_tgt.get_itos()[x] for x in rb.tgt[0] if x != pad_idx
        ]

        print(
            "Source Text (Input)        : "
            + " ".join(src_tokens).replace("\n", "")
        )
        print(
            "Target Text (Ground Truth) : "
            + " ".join(tgt_tokens).replace("\n", "")
        )
        model_out = greedy_decode(model, rb.src, rb.src_mask, 72, 0)[0]
        model_txt = (
            " ".join(
                [vocab_tgt.get_itos()[x] for x in model_out if x != pad_idx]
            ).split(eos_string, 1)[0]
            + eos_string
        )
        print("Model Output               : " + model_txt.replace("\n", ""))
        results[idx] = (rb, src_tokens, tgt_tokens, model_out, model_txt)
    return results


def run_model_example(n_examples=5):
    global vocab_src, vocab_tgt, spacy_de, spacy_en

    print("Preparing Data ...")
    _, valid_dataloader = create_dataloaders(
        torch.device("cpu"),
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        batch_size=1,
        is_distributed=False,
    )

    print("Loading Trained Model ...")

    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    model.load_state_dict(
        torch.load("multi30k_model_final.pt", map_location=torch.device("cpu"))
    )

    print("Checking Model Outputs:")
    example_data = check_outputs(
        valid_dataloader, model, vocab_src, vocab_tgt, n_examples=n_examples
    )
    return model, example_data

In [49]:
execute_example(run_model_example)


Preparing Data ...
Loading Trained Model ...
Checking Model Outputs:


Source Text (Input)        : <s> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>
Target Text (Ground Truth) : <s> A <unk> <unk> a <unk> <unk> <unk> a <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>
Model Output               : <s> D <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> D <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk></s>


Source Text (Input)        : <s> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>
Target Text (Ground Truth) : <s> A <unk> <unk> a <unk> <unk> <unk> <unk> <un

In [50]:
def mtx2df(m, max_row, max_col, row_tokens, col_tokens):
    "convert a dense matrix to a data frame with row and column indices"
    return pd.DataFrame(
        [
            (
                r,
                c,
                float(m[r, c]),
                "%.3d %s"
                % (r, row_tokens[r] if len(row_tokens) > r else "<blank>"),
                "%.3d %s"
                % (c, col_tokens[c] if len(col_tokens) > c else "<blank>"),
            )
            for r in range(m.shape[0])
            for c in range(m.shape[1])
            if r < max_row and c < max_col
        ],
        # if float(m[r,c]) != 0 and r < max_row and c < max_col],
        columns=["row", "column", "value", "row_token", "col_token"],
    )


def attn_map(attn, layer, head, row_tokens, col_tokens, max_dim=30):
    df = mtx2df(
        attn[0, head].data,
        max_dim,
        max_dim,
        row_tokens,
        col_tokens,
    )
    return (
        alt.Chart(data=df)
        .mark_rect()
        .encode(
            x=alt.X("col_token", axis=alt.Axis(title="")),
            y=alt.Y("row_token", axis=alt.Axis(title="")),
            color="value",
            tooltip=["row", "column", "value", "row_token", "col_token"],
        )
        .properties(height=400, width=400)
        .interactive()
    )

In [51]:
def get_encoder(model, layer):
    return model.encoder.layers[layer].self_attn.attn


def get_decoder_self(model, layer):
    return model.decoder.layers[layer].self_attn.attn


def get_decoder_src(model, layer):
    return model.decoder.layers[layer].src_attn.attn


def visualize_layer(model, layer, getter_fn, ntokens, row_tokens, col_tokens):
    # ntokens = last_example[0].ntokens
    attn = getter_fn(model, layer)
    n_heads = attn.shape[1]
    charts = [
        attn_map(
            attn,
            0,
            h,
            row_tokens=row_tokens,
            col_tokens=col_tokens,
            max_dim=ntokens,
        )
        for h in range(n_heads)
    ]
    assert n_heads == 8
    return alt.vconcat(
        charts[0]
        # | charts[1]
        | charts[2]
        # | charts[3]
        | charts[4]
        # | charts[5]
        | charts[6]
        # | charts[7]
        # layer + 1 due to 0-indexing
    ).properties(title="Layer %d" % (layer + 1))

## 3.2: Encoder Self Attention

In [52]:
def viz_encoder_self():
    model, example_data = run_model_example(n_examples=1)
    example = example_data[
        len(example_data) - 1
    ]  # batch object for the final example

    layer_viz = [
        visualize_layer(
            model, layer, get_encoder, len(example[1]), example[1], example[1]
        )
        for layer in range(6)
    ]
    return alt.hconcat(
        layer_viz[0]
        # & layer_viz[1]
        & layer_viz[2]
        # & layer_viz[3]
        & layer_viz[4]
        # & layer_viz[5]
    )


show_example(viz_encoder_self)

Preparing Data ...
Loading Trained Model ...
Checking Model Outputs:


Source Text (Input)        : <s> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>
Target Text (Ground Truth) : <s> A <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>
Model Output               : <s> D <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> D <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk></s>


3.3: Decoder Self Attention

In [53]:
def viz_decoder_self():
    model, example_data = run_model_example(n_examples=1)
    example = example_data[len(example_data) - 1]

    layer_viz = [
        visualize_layer(
            model,
            layer,
            get_decoder_self,
            len(example[1]),
            example[1],
            example[1],
        )
        for layer in range(6)
    ]
    return alt.hconcat(
        layer_viz[0]
        & layer_viz[1]
        & layer_viz[2]
        & layer_viz[3]
        & layer_viz[4]
        & layer_viz[5]
    )


show_example(viz_decoder_self)

Preparing Data ...
Loading Trained Model ...
Checking Model Outputs:


Source Text (Input)        : <s> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>
Target Text (Ground Truth) : <s> A <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> a <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>
Model Output               : <s> D <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk></s>
