<a href="https://colab.research.google.com/github/itsmepriyabrata/priyabrata_ai_python/blob/main/Transformer%20imp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The transformer model uses positional encoding to incorporate the order of the sequence. We'll implement a simple version of it.


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        x = x + self.encoding[:, :x.size(1), :].to(x.device)
        return x

# Example usage
d_model = 4
pos_encoder = PositionalEncoding(d_model)
input_seq = torch.tensor([[[1.0, 0.0, 1.0, 0.0], [0.0, 2.0, 0.0, 1.0]]])  # Added batch dimension
pos_encoded_input = pos_encoder(input_seq)
print("Positional Encoded Input:\n", pos_encoded_input)


Positional Encoded Input:
 tensor([[[1.0000, 1.0000, 1.0000, 1.0000],
         [0.8415, 2.5403, 0.0100, 1.9999]]])


Multi-head attention allows the model to jointly attend to information from different representation subspaces.

In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def forward(self, x):
        batch_size = x.size(0)

        # Perform linear operation and split into num_heads
        Q = self.q_linear(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.k_linear(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.v_linear(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Compute attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        attention_weights = F.softmax(scores, dim=-1)

        # Compute the attention output
        attention_output = torch.matmul(attention_weights, V)
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)

        return self.out_linear(attention_output)

# Example usage
num_heads = 2
multi_head_attention = MultiHeadAttention(d_model, num_heads)
multi_head_output = multi_head_attention(pos_encoded_input.unsqueeze(0))
print("Multi-Head Attention Output:\n", multi_head_output)


Multi-Head Attention Output:
 tensor([[[ 0.3992,  0.3363,  0.0131, -0.1204],
         [ 0.3962,  0.2500, -0.0358, -0.1234]]], grad_fn=<ViewBackward0>)


Here's how you can integrate positional encoding and multi-head self-attention in a transformer block



In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model)
        )

    def forward(self, x):
        # Apply multi-head attention
        attention_output = self.attention(x)
        x = self.norm1(x + attention_output)

        # Apply feed-forward network
        ffn_output = self.ffn(x)
        x = self.norm2(x + ffn_output)

        return x

# Example usage
transformer_block = TransformerBlock(d_model, num_heads)
transformer_output = transformer_block(pos_encoded_input.unsqueeze(0))
print("Transformer Block Output:\n", transformer_output)


Transformer Block Output:
 tensor([[[[-0.0039,  1.1578, -1.5754,  0.4214],
          [-0.2514,  1.0388, -1.5223,  0.7349]]]],
       grad_fn=<NativeLayerNormBackward0>)


Here's a basic transformer encoder model that combines the above components.



In [5]:
class TransformerEncoder(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, max_len=5000):
        super(TransformerEncoder, self).__init__()
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([TransformerBlock(d_model, num_heads) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        x = self.pos_encoder(x)
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)

# Example usage
num_layers = 2
transformer_encoder = TransformerEncoder(d_model, num_heads, num_layers)
encoder_output = transformer_encoder(input_seq.unsqueeze(0))
print("Transformer Encoder Output:\n", encoder_output)


Transformer Encoder Output:
 tensor([[[[-0.1246,  1.5678, -1.2140, -0.2293],
          [-0.8749,  1.4999, -0.9408,  0.3158]]]],
       grad_fn=<NativeLayerNormBackward0>)
