In [None]:
import torch, math
from torch import nn
from torch.nn import functional as F
from torchsummary import summary

Part 1: Multi-head Attention

In this part, you will implement the core attention mechanism used throughout the Transformer.

The multi-head attention module projects the input into multiple query, key, and value subspaces, applies scaled dot-product attention in parallel across heads, and concatenates the results. Your implementation should support multiple heads and include appropriate linear projections and dropout.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Every head owns its own query/key/value projection from ``d_model`` to
    ``d_k = d_model // num_heads``. The per-head results are concatenated on
    the feature axis and mixed by a final ``d_model -> d_model`` linear layer.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        proj_drop: Dropout probability applied to the output projection.
        atten_drop: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model: int, num_heads: int, proj_drop: float = 0.1, atten_drop: float = 0.1) -> None:
        super().__init__()
        assert d_model % num_heads == 0, "d_model divisible by num_heads."
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        # One (d_model -> d_k) projection per head for queries, keys and values.
        self.W_q = nn.ModuleList([nn.Linear(d_model, self.d_k) for _ in range(num_heads)])
        self.W_k = nn.ModuleList([nn.Linear(d_model, self.d_k) for _ in range(num_heads)])
        self.W_v = nn.ModuleList([nn.Linear(d_model, self.d_k) for _ in range(num_heads)])

        # Output projection applied to the concatenated heads.
        self.ffn = nn.Linear(d_model, d_model)

        self.proj_drop = nn.Dropout(p=proj_drop)
        self.atten_drop = nn.Dropout(p=atten_drop)

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: Tensor of shape ``(..., L_q, d_model)``.
            key: Tensor of shape ``(..., L_k, d_model)``.
            value: Tensor of shape ``(..., L_k, d_model)``.
            mask: Optional tensor broadcastable to ``(..., L_q, L_k)``.
                Entries equal to 0 (or ``False``) mark key positions that a
                query must not attend to; any other value keeps the position.

        Returns:
            Tensor of shape ``(..., L_q, d_model)``.
        """
        scale = math.sqrt(self.d_k)
        blocked = None if mask is None else (mask == 0)

        heads: list[torch.Tensor] = []
        for w_q, w_k, w_v in zip(self.W_q, self.W_k, self.W_v):
            q = w_q(query)
            k = w_k(key)
            v = w_v(value)

            # (..., L_q, L_k); transposing the last two axes keeps any
            # leading batch dimensions intact.
            scores = q.matmul(k.transpose(-2, -1)) / scale
            if blocked is not None:
                # A very negative finite score gives the position ~0 weight
                # after softmax without producing NaN when a whole row is
                # masked (which -inf would).
                scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

            # Normalize over the key axis so each query's weights sum to 1,
            # then drop attention weights (not the head output).
            weights = self.atten_drop(F.softmax(scores, dim=-1))
            heads.append(weights.matmul(v))

        # Concatenate heads on the feature axis and apply the output projection.
        merged = torch.cat(heads, dim=-1)
        return self.proj_drop(self.ffn(merged))

Part 2: Positional Encoding

Positional encoding injects the position of each token in the input sequence. It uses sine and cosine functions of different frequencies to generate the encoding.

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to the input features.

    For sequence position ``pos`` and even feature index ``2k``::

        PE[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        PE[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    The table is computed once for ``max_seq_length`` positions and stored as
    a non-trainable buffer named ``pe`` (it still appears in ``state_dict``
    under the key ``pe`` and follows the module across devices).

    Args:
        d_model: Feature size of the inputs.
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model: int, max_seq_length: int) -> None:
        super().__init__()
        # Column vector of positions: (max_seq_length, 1).
        position = torch.arange(max_seq_length, dtype=torch.float32).unsqueeze(1)
        # 1 / 10000^(2k / d_model), one frequency per sin/cos pair.
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model)
        )
        angles = position * inv_freq

        table = torch.zeros(max_seq_length, d_model)
        table[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", table)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the encoding of its first ``seq_len`` positions.

        Args:
            x: Tensor of shape ``(..., seq_len, d_model)``; the sequence axis
                is the second-to-last one, so both unbatched and batched
                inputs are accepted.

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed table length.
        """
        seq_len = x.size(-2)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_length {self.pe.size(0)}"
            )
        return x + self.pe[:seq_len]

Part 3: EncoderLayer

In this part, you will implement a single layer of the Transformer encoder, as shown in Figure 1.

Each encoder layer consists of a multi-head self-attention block followed by a position-wise feed-forward network. Residual connections, layer normalization, and dropout are applied after each sub-layer. You should use your MultiHeadAttention module from Part 1 as a building block.

In [None]:
class EncoderLayer(nn.Module):
    """One Transformer encoder layer: self-attention then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Feature size of the inputs and outputs.
        num_heads: Number of attention heads.
        dropout: Dropout probability used for the attention module and for
            both sub-layer outputs.
    """

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1) -> None:
        super().__init__()

        # The attention module's internal dropout follows this layer's rate.
        # Its separate projection dropout is disabled because this layer
        # already applies dropout to every sub-layer output.
        self.attn = MultiHeadAttention(d_model, num_heads, proj_drop=0.0, atten_drop=dropout)

        # Position-wise feed-forward network with a 4x hidden expansion.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model),
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """Apply self-attention and the feed-forward network to ``x``.

        Args:
            x: Input features with ``d_model`` as the last dimension.
            mask: Optional attention mask forwarded unchanged to the
                self-attention module (e.g. a padding mask).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Dropout is applied to the sub-layer output before the residual add,
        # so the normalized residual stream itself is never zeroed out.
        attended = self.attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))
        return self.norm2(x + self.dropout(self.ffn(x)))

Part 4: DecoderLayer

Based on Figure 2, implement the DecoderLayer class, with two Multi-Head Attention layers, a Position-wise Feed-Forward layer, and three Layer Normalization layers.

In [None]:
class DecoderLayer(nn.Module):
    """One Transformer decoder layer.

    Three sub-layers run in order: masked self-attention over the decoder
    input, encoder-decoder attention over the encoder output, and a
    position-wise feed-forward network. Each is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Feature size of the inputs and outputs.
        num_heads: Number of attention heads.
        dropout: Dropout probability used for the attention modules and for
            all three sub-layer outputs.
    """

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1) -> None:
        super().__init__()

        # The attention modules' internal dropout follows this layer's rate.
        # Their separate projection dropout is disabled because this layer
        # already applies dropout to every sub-layer output.
        self.masked_attn = MultiHeadAttention(d_model, num_heads, proj_drop=0.0, atten_drop=dropout)
        self.attn = MultiHeadAttention(d_model, num_heads, proj_drop=0.0, atten_drop=dropout)

        # Position-wise feed-forward network with a 4x hidden expansion.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model),
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        self.dropout3 = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor, enc_out: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """Run the decoder layer.

        Args:
            x: Decoder input features with ``d_model`` as the last dimension.
            enc_out: Encoder output that the decoder attends over.
            mask: Optional mask forwarded to the self-attention module only
                (typically a causal mask); cross-attention is unmasked.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Masked self-attention on the decoder input.
        x = self.norm1(x + self.dropout1(self.masked_attn(x, x, x, mask)))

        # Encoder-decoder attention: queries come from the decoder state,
        # keys and values from the encoder output, so the result keeps the
        # decoder's sequence length.
        x = self.norm2(x + self.dropout2(self.attn(x, enc_out, enc_out)))

        # Position-wise feed-forward network.
        return self.norm3(x + self.dropout3(self.ffn(x)))

Part 5: Implement Transformer Class

In this final part, you will assemble the full Transformer model using the components you have implemented in previous parts.

Your model should include token embeddings, positional encodings, stacked encoder and decoder layers, and an output projection layer.

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer over a shared token vocabulary.

    Source and target tokens share one embedding table and one positional
    encoding. The encoder stack processes the source; every decoder layer
    attends over the final encoder output. A linear layer followed by a
    softmax over the vocabulary axis produces the output distribution.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        d_model: Embedding / feature size.
        num_heads: Number of attention heads in every layer.
        num_layers: Number of encoder layers and of decoder layers.
        dropout: Dropout probability passed to every layer.
        max_seq_length: Number of positions the positional encoding covers.
    """

    def __init__(self, vocab_size: int, d_model: int, num_heads: int, num_layers: int, dropout: float = 0.1, max_seq_length: int = 512):
        super().__init__()

        # Token embedding and positional encoding (shared by src and tgt).
        self.em = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.pe = PositionalEncoding(d_model, max_seq_length)

        # Encoder stack (N layers).
        self.encoders = nn.ModuleList(
            [EncoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)]
        )

        # Decoder stack (N layers).
        self.decoders = nn.ModuleList(
            [DecoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)]
        )

        # Output projection. The softmax must run over the vocabulary (last)
        # axis so that each position gets its own distribution over tokens.
        self.proj = nn.Sequential(
            nn.Linear(d_model, vocab_size),
            nn.Softmax(dim=-1),
        )

    def forward(self, src: torch.Tensor, tgt: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """Encode ``src``, decode ``tgt`` against it, and return token probabilities.

        Args:
            src: Integer token ids of the source sequence.
            tgt: Integer token ids of the target sequence.
            mask: Optional mask passed to every decoder layer's
                self-attention (typically a causal mask).

        Returns:
            Probabilities over the vocabulary for every target position; the
            last dimension has size ``vocab_size`` and sums to 1.
            NOTE: the output is already normalized, so it should not be fed
            to a loss that applies its own softmax (e.g. ``nn.CrossEntropyLoss``).
        """
        memory = self.pe(self.em(src))
        for encoder in self.encoders:
            memory = encoder(memory)

        hidden = self.pe(self.em(tgt))
        for decoder in self.decoders:
            hidden = decoder(hidden, memory, mask)

        return self.proj(hidden)