# **Transformer Architecture**


In [1]:
!pip install torchtext==0.17.2

Collecting torchtext==0.17.2
  Downloading torchtext-0.17.2-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting torch==2.2.2 (from torchtext==0.17.2)
  Downloading torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2

In [2]:
import torchtext
import torch
import torch.nn as nn
from torch.nn import functional as F


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

## **Transformer-Encoder**

### **Token and Positional Embedding**

**Token Embedding:** Represents the input tokens (usually split by subword-based tokenization) as dense vectors.

**Positional Encoding:** Represents the position (order) of tokens in a sentence. It is typically either computed with fixed sinusoidal functions or learned during model training; this notebook uses a learned positional embedding.

In [3]:
class TokenAndPositionEmbedding(nn.Module):
  """Sum of a learned token embedding and a learned positional embedding.

  Args:
    vocab_size: number of rows in the token embedding table.
    embed_dim: width of every embedding vector.
    max_length: number of rows in the positional table, i.e. the longest
      sequence length that can be embedded.
    device: stored on ``self.device`` for backward compatibility. The forward
      pass no longer relies on it; position indices are created on the same
      device as the input tensor.
  """

  def __init__(self, vocab_size, embed_dim, max_length, device='cpu'):
    super().__init__()
    self.device = device
    #Token embedding
    self.word_emb = nn.Embedding(
      num_embeddings=vocab_size,
      embedding_dim=embed_dim
    )
    #Positional embedding
    self.pos_emb = nn.Embedding(
      num_embeddings=max_length,
      embedding_dim=embed_dim
    )

  def forward(self, x):
    """Embed a batch of token ids.

    Args:
      x: 2-D integer tensor of token ids, unpacked as ``(N, seq_len)``.

    Returns:
      Tensor of shape ``(N, seq_len, embed_dim)``: token embedding plus the
      embedding of each position index ``0 .. seq_len - 1``.
    """
    N, seq_len = x.size()
    # Build the indices directly on the input's device. Using the constructor's
    # `device` string here breaks as soon as the module is moved with `.to(...)`
    # without also passing a matching `device=` argument.
    positions = torch.arange(seq_len, device=x.device).expand(N, seq_len)
    return self.word_emb(x) + self.pos_emb(positions)

### **Transformer-Encoder Block**

**Encoder blocks:** Encode the input tokens into contextual embeddings. Each block consists of Multi-Head Attention, Add & Normalization, and a Feed-Forward Network.



In [4]:
class TransformerEncoderBlock(nn.Module):
  """One encoder layer: multi-head attention and a feed-forward network,
  each followed by dropout, a residual connection and layer normalization.

  Args:
    embed_dim: feature width of the inputs and outputs.
    num_heads: number of attention heads.
    ff_dim: hidden width of the feed-forward network.
    dropout: dropout probability applied to each sub-layer's output.
  """

  def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
    super().__init__()
    # Attention sub-layer; batch_first=True means inputs are (batch, seq, feature).
    self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
    # Feed-forward sub-layer: embed_dim -> ff_dim -> embed_dim.
    self.ffn = nn.Sequential(
      nn.Linear(embed_dim, ff_dim),
      nn.ReLU(),
      nn.Linear(ff_dim, embed_dim),
    )
    # One LayerNorm / Dropout pair per sub-layer.
    self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-6)
    self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-6)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)

  def forward(self, query, key, value):
    """Apply attention then the feed-forward network.

    The residual connection of the attention sub-layer is taken from `query`.
    Returns a tensor with the same shape as `query`.
    """
    attended = self.attn(query, key, value)[0]  # drop the attention weights
    residual = self.layernorm_1(query + self.dropout_1(attended))
    transformed = self.dropout_2(self.ffn(residual))
    return self.layernorm_2(residual + transformed)

### **Transformer-Encoder**

In [5]:
class TransformerEncoder(nn.Module):
  """Token + position embedding followed by a stack of encoder blocks.

  Args:
    src_vocab_size: size of the source vocabulary.
    embed_dim: embedding / model width.
    max_length: size of the positional embedding table.
    num_layers: number of stacked TransformerEncoderBlock layers.
    num_heads: attention heads per block.
    ff_dim: hidden width of each block's feed-forward network.
    dropout: dropout probability forwarded to every block.
    device: forwarded to the embedding layer.
  """

  def __init__(self,
              src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim,
              dropout=0.1, device='cpu'
  ):
    super().__init__()
    # Input embedding
    self.embedding = TokenAndPositionEmbedding(
      src_vocab_size, embed_dim, max_length, device
    )
    # Stack of identical encoder blocks
    blocks = [
      TransformerEncoderBlock(embed_dim, num_heads, ff_dim, dropout)
      for _ in range(num_layers)
    ]
    self.layers = nn.ModuleList(blocks)

  def forward(self, x):
    """Embed token ids `x` and pass them through every block as self-attention."""
    hidden = self.embedding(x)
    for block in self.layers:
      # query, key and value are all the current hidden states
      hidden = block(hidden, hidden, hidden)
    return hidden

In [6]:
# Hyperparameters for the encoder-only smoke test below.
batch_size = 32
src_vocab_size = 1000  # rows in the source token embedding table
embed_dim = 200  # model width; nn.MultiheadAttention requires it to be divisible by num_heads
max_length = 100  # rows in the positional embedding table
num_layers = 2  # number of stacked encoder blocks
num_heads = 4
ff_dim = 256  # hidden width of the feed-forward network

In [7]:
# Dummy batch of token ids with shape (batch_size, max_length).
# NOTE(review): high=2 draws only the ids 0 and 1, and the name `input`
# shadows the Python builtin of the same name.
input = torch.randint(
  high=2,
  size=(batch_size, max_length),
  dtype=torch.int64
)

In [11]:
# Build the encoder (dropout and device fall back to their defaults, 0.1 and 'cpu')
# and print its module tree.
encoder = TransformerEncoder(
  src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim
)
print(encoder)

TransformerEncoder(
  (embedding): TokenAndPositionEmbedding(
    (word_emb): Embedding(1000, 200)
    (pos_emb): Embedding(100, 200)
  )
  (layers): ModuleList(
    (0-1): 2 x TransformerEncoderBlock(
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
      )
      (ffn): Sequential(
        (0): Linear(in_features=200, out_features=256, bias=True)
        (1): ReLU()
        (2): Linear(in_features=256, out_features=200, bias=True)
      )
      (layernorm_1): LayerNorm((200,), eps=1e-06, elementwise_affine=True)
      (layernorm_2): LayerNorm((200,), eps=1e-06, elementwise_affine=True)
      (dropout_1): Dropout(p=0.1, inplace=False)
      (dropout_2): Dropout(p=0.1, inplace=False)
    )
  )
)


In [9]:
# Run the dummy batch through the encoder.
encoded = encoder(input)

In [10]:
# Output shape is (batch_size, max_length, embed_dim).
encoded.shape

torch.Size([32, 100, 200])

## **Transformer-Decoder**

### **Transformer-Decoder Block**

**Decoder blocks:** Take the previously generated (historical) tokens and the encoded states from the encoder as input, and decode them to predict the next token. Each block consists of Masked Multi-Head Attention (over the decoder's historical tokens), Multi-Head Attention (over the encoder outputs, queried by the current decoder state), Add & Normalization, and a Feed-Forward Network.

In [12]:
class TransformerDecoderBlock(nn.Module):
  """One decoder layer: masked self-attention, cross-attention over the
  encoder output, and a feed-forward network. Each sub-layer is followed by
  its own dropout, a residual connection and its own layer normalization.

  Args:
    embed_dim: feature width of the inputs and outputs.
    num_heads: number of attention heads in both attention modules.
    ff_dim: hidden width of the feed-forward network.
    dropout: dropout probability applied to each sub-layer's output.
  """

  def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
    super().__init__()
    # Masked Multi-Head Attention
    self.attn = nn.MultiheadAttention(
      embed_dim=embed_dim,
      num_heads=num_heads,
      batch_first=True
    )
    # Multi-Head Attention
    self.cross_attn = nn.MultiheadAttention(
      embed_dim=embed_dim,
      num_heads=num_heads,
      batch_first=True
    )
    # Feed Forward Network
    self.ffn = nn.Sequential(
      nn.Linear(in_features=embed_dim, out_features=ff_dim, bias=True),
      nn.ReLU(),
      nn.Linear(in_features=ff_dim, out_features=embed_dim, bias=True)
    )
    # Add & Normalization
    self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
    self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
    self.layernorm_3 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
    self.dropout_1 = nn.Dropout(p=dropout)
    self.dropout_2 = nn.Dropout(p=dropout)
    self.dropout_3 = nn.Dropout(p=dropout)

  def forward(self, x, enc_output, src_mask, tgt_mask):
    """Run the three decoder sub-layers.

    Args:
      x: decoder hidden states, batch-first.
      enc_output: encoder output used as key and value of the cross-attention.
      src_mask: passed as ``attn_mask`` to the cross-attention (may be None).
      tgt_mask: passed as ``attn_mask`` to the self-attention (may be None).

    Returns:
      Tensor with the same shape as `x`.
    """
    # 1) Masked self-attention over the decoder states.
    attn_output, _ = self.attn(x, x, x, attn_mask=tgt_mask)
    attn_output = self.dropout_1(attn_output)
    out_1 = self.layernorm_1(x + attn_output)

    # 2) Cross-attention: decoder states query the encoder output.
    attn_output, _ = self.cross_attn(
      out_1, enc_output, enc_output, attn_mask=src_mask
    )
    attn_output = self.dropout_2(attn_output)
    out_2 = self.layernorm_2(out_1 + attn_output)

    # 3) Feed-forward network. This sub-layer uses its own dropout_3 and
    #    layernorm_3; reusing the *_2 modules would leave layernorm_3 untrained
    #    and share layernorm_2's parameters between two sub-layers.
    ffn_output = self.ffn(out_2)
    ffn_output = self.dropout_3(ffn_output)
    out_3 = self.layernorm_3(out_2 + ffn_output)
    return out_3

### **Transformer-Decoder**

In [13]:
class TransformerDecoder(nn.Module):
  """Token + position embedding followed by a stack of decoder blocks.

  Args:
    tgt_vocab_size: size of the target vocabulary.
    embed_dim: embedding / model width.
    max_length: size of the positional embedding table.
    num_layers: number of stacked TransformerDecoderBlock layers.
    num_heads: attention heads per block.
    ff_dim: hidden width of each block's feed-forward network.
    dropout: dropout probability forwarded to every block.
    device: forwarded to the embedding layer.
  """

  def __init__(self,
      tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim,
      dropout=0.1, device='cpu'
  ):
    super().__init__()
    # Input embedding
    self.embedding = TokenAndPositionEmbedding(
      tgt_vocab_size, embed_dim, max_length, device
    )
    # Stack of identical decoder blocks
    blocks = [
      TransformerDecoderBlock(embed_dim, num_heads, ff_dim, dropout)
      for _ in range(num_layers)
    ]
    self.layers = nn.ModuleList(blocks)

  def forward(self, x, enc_output, src_mask, tgt_mask):
    """Embed target ids `x` and run every block against the same encoder output and masks."""
    hidden = self.embedding(x)
    for block in self.layers:
      hidden = block(hidden, enc_output, src_mask, tgt_mask)
    return hidden

## **Transformer**

In [14]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer with a linear projection onto the target vocabulary.

  Args:
    src_vocab_size: size of the source vocabulary (encoder embedding).
    tgt_vocab_size: size of the target vocabulary (decoder embedding and output layer).
    embed_dim: embedding / model width.
    max_length: size of the positional embedding tables.
    num_layers: number of blocks in both the encoder and the decoder.
    num_heads: attention heads per block.
    ff_dim: hidden width of the feed-forward networks.
    dropout: dropout probability, forwarded to the encoder and the decoder.
    device: stored on ``self.device`` and forwarded to the encoder and the decoder.
  """

  def __init__(self,
      src_vocab_size, tgt_vocab_size,
      embed_dim, max_length, num_layers, num_heads, ff_dim,
      dropout=0.1, device='cpu'
  ):
    super().__init__()
    self.device = device
    # Forward `dropout` and `device`; otherwise the sub-modules silently fall
    # back to their own defaults regardless of what the caller requested.
    self.encoder = TransformerEncoder(
      src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim,
      dropout, device
    )
    self.decoder = TransformerDecoder(
      tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim,
      dropout, device
    )
    self.fc = nn.Linear(embed_dim, tgt_vocab_size)

  def generate_mask(self, src, tgt):
    """Build the attention masks consumed by the decoder.

    Args:
      src: source batch; only ``src.shape[1]`` (sequence length) and its device are used.
      tgt: target batch; only ``tgt.shape[1]`` (sequence length) and its device are used.

    Returns:
      (src_mask, tgt_mask):
        src_mask: all-False bool tensor of shape ``(tgt_len, src_len)``, so the
          cross-attention masks nothing. nn.MultiheadAttention expects
          ``attn_mask`` as (target length, source length); a square
          (src_len, src_len) mask only works when both lengths are equal.
        tgt_mask: float tensor of shape ``(tgt_len, tgt_len)`` with 0.0 on and
          below the diagonal and ``-inf`` above it (causal mask).
    """
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    # Masks are created on the inputs' devices so they follow the data.
    src_mask = torch.zeros(
      (tgt_seq_len, src_seq_len),
      dtype=torch.bool,
      device=src.device
    )

    # triu(..., diagonal=1) keeps -inf strictly above the diagonal and writes
    # 0.0 everywhere else.
    tgt_mask = torch.triu(
      torch.full(
        (tgt_seq_len, tgt_seq_len), float('-inf'),
        device=tgt.device
      ),
      diagonal=1
    )
    return src_mask, tgt_mask

  def forward(self, src, tgt):
    """Encode `src`, decode `tgt` against it, and project to target-vocabulary scores.

    Returns:
      Tensor whose last dimension is ``tgt_vocab_size`` (output of ``self.fc``).
    """
    src_mask, tgt_mask = self.generate_mask(src, tgt)
    enc_output = self.encoder(src)
    dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
    output = self.fc(dec_output)
    return output

In [15]:
# Hyperparameters for the full encoder-decoder smoke test
# (these overwrite the values set for the encoder-only test above).
batch_size = 128
src_vocab_size = 1000  # rows in the encoder's token embedding table
tgt_vocab_size = 2000  # rows in the decoder's token embedding table and width of the output layer
embed_dim = 200  # model width; nn.MultiheadAttention requires it to be divisible by num_heads
max_length = 100  # rows in the positional embedding tables
num_layers = 2  # blocks in both the encoder and the decoder
num_heads = 4
ff_dim = 256  # hidden width of the feed-forward networks

In [16]:
# Build the full model (dropout and device fall back to their defaults, 0.1 and 'cpu')
# and print its module tree.
model = Transformer(
  src_vocab_size, tgt_vocab_size,
  embed_dim, max_length, num_layers, num_heads, ff_dim
)
print(model)

Transformer(
  (encoder): TransformerEncoder(
    (embedding): TokenAndPositionEmbedding(
      (word_emb): Embedding(1000, 200)
      (pos_emb): Embedding(100, 200)
    )
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (ffn): Sequential(
          (0): Linear(in_features=200, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=200, bias=True)
        )
        (layernorm_1): LayerNorm((200,), eps=1e-06, elementwise_affine=True)
        (layernorm_2): LayerNorm((200,), eps=1e-06, elementwise_affine=True)
        (dropout_1): Dropout(p=0.1, inplace=False)
        (dropout_2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): TransformerDecoder(
    (embedding): TokenAndPositionEmbedding(
      (word_emb): Embedding(2000, 200)
      (pos_emb): Embedding(10

In [17]:
# Dummy source batch of token ids, shape (batch_size, max_length).
# NOTE(review): high=2 draws only the ids 0 and 1.
src = torch.randint(
  high=2,
  size=(batch_size, max_length),
  dtype=torch.int64
)

In [18]:
# Dummy target batch of token ids, shape (batch_size, max_length).
# NOTE(review): high=2 draws only the ids 0 and 1.
tgt = torch.randint(
  high=2,
  size=(batch_size, max_length),
  dtype=torch.int64
)

In [19]:
# Forward pass through encoder, decoder and the final linear layer.
prediction = model(src, tgt)

In [20]:
# Output shape is (batch_size, max_length, tgt_vocab_size).
prediction.shape

torch.Size([128, 100, 2000])