In [None]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
import random

# Single source of truth for the seed so every RNG below starts from the same,
# easily changed value.
SEED = 24

random.seed(SEED)        # Python's built-in `random` module
np.random.seed(SEED)     # NumPy's legacy global RNG
# torch.manual_seed seeds PyTorch's default generator for all devices, not only
# the CPU. The trailing semicolon keeps the returned Generator object from being
# echoed as the cell's output.
torch.manual_seed(SEED);

In [None]:
# Set print options: No scientific notation, 2 decimal places
torch.set_printoptions(sci_mode=False, precision=4)

# Define the maximum sequence length and the embedding dimension for a model:

max_sequence_length = 5: Specifies the maximum number of tokens a sequence can have. If a sequence is shorter, it may be padded; if longer, it may be truncated.

d_model = 8: Defines the size of each token’s embedding vector, meaning each token will be represented as an 8-dimensional vector.

In [None]:
# Toy dimensions for the step-by-step walkthrough:
# 8-dimensional token embeddings, at most 5 tokens per sequence.
d_model, max_sequence_length = 8, 5

# Define three linear layers using nn.Linear in PyTorch:

w_query: Projects input embeddings into query space.

w_key: Projects input embeddings into key space.

w_value: Projects input embeddings into value space.
## These linear layers transform input embeddings (d_model dimensional) into new representations of the same size (d_model → d_model)

In [None]:
# Three separate d_model -> d_model projections, one each for queries, keys and
# values. The generator is unpacked left to right, so the layers are still
# constructed (and randomly initialised) in query, key, value order.
w_query, w_key, w_value = (nn.Linear(d_model, d_model) for _ in range(3))

# Create a tensor `X_input_tokens` with random values to simulate a batch of sequences of token embeddings.

Use torch.randn() to generate a random tensor of shape (batch_size, max_sequence_length, d_model), where:

batch_size defines the number of sequences processed simultaneously

max_sequence_length represents the number of tokens in the sequence.

d_model represents the embedding dimension.

In [None]:
# Show the size reported by the input batch.
# NOTE(review): `X_input_tokens` is not assigned in any cell visible here — confirm
# that the cell creating it (the torch.randn step described in the text above)
# exists and runs before this one, otherwise this raises NameError on a fresh kernel.
X_input_tokens.size()

In [None]:
# Display the input batch itself.
# NOTE(review): relies on `X_input_tokens`, which no visible cell assigns — confirm
# it is defined earlier in the notebook.
X_input_tokens

# Apply linear transformations to the `X_input_tokens` tensor using w_query, w_key, and w_value to obtain query (q), key (k), and value (v) representations.

## Pass `X_input_tokens` through the three linear layers to compute q, k, and v.

# Compute Per-Head Dimensions in Multi-Head Attention
In multi-head attention, we split the embedding dimension (d_model) into multiple heads to allow the model to focus on different parts of the input simultaneously.

Define the number of attention heads as num_heads = 4.

Compute the per-head query (d_q), key (d_k), and value (d_v) dimensions by dividing d_model by num_heads.

Ensure that d_q, d_k, and d_v are equal and represent the dimension per head.

# Reshape each tensor to (batch_size, max_sequence_length, num_heads, d_q), where:

batch_size is the number of input sequences in a batch.

max_sequence_length is the number of tokens per sequence.

num_heads is the number of attention heads.

d_q, d_k, and d_v are the per-head dimensions (d_model // num_heads).

Verify that the new shape correctly divides the d_model dimension across multiple heads.

Print the new shapes of q, k, and v to confirm the changes.

In [None]:
# Show the shapes of q, k and v side by side as a tuple.
# NOTE(review): `q`, `k` and `v` are not assigned in any visible cell; presumably they
# are the projected and reshaped tensors described in the text above — confirm the
# cells that create them exist and run first.
q.shape, k.shape, v.shape

# Transpose Query Tensor for Multi-Head Attention
In multi-head attention, after reshaping the query (q), key (k), and value (v) tensors, we need to transpose them to bring the num_heads dimension to the second position. This helps in efficiently computing attention scores across multiple heads.

Assume you have a query tensor (q) of shape (batch_size, max_sequence_length, num_heads, d_q) after reshaping.

Transpose q to rearrange dimensions, so that num_heads moves to the second position, resulting in (batch_size, num_heads, max_sequence_length, d_q).

Print the shape of q after transposing to verify the changes.

# Repeat the same operation for k and v.

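For a single head:
$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

where $M$ is the additive mask: $0$ at positions a query may attend to and $-\infty$ at positions it may not (omit $M$ when no masking is needed).

$$
\text{new V} = \text{self attention} \cdot V
$$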

# Compute Attention Scores for Each Head
In multi-head attention, after transposing q and k, we compute attention scores using scaled dot-product attention. This involves:

Taking the dot product of q with the transposed k to get similarity scores.

Scaling the scores by dividing by the square root of d_k (to stabilize gradients).

Printing the shape of attn_scores to verify it per head.

In [None]:
# Show the shape k would have with its last two axes swapped; this only previews
# the shape and does not rebind `k`.
# NOTE(review): `k` is not assigned in any visible cell — confirm it is defined earlier.
k.transpose(-1, -2).shape

## Masking

- This is to ensure words don't get context from words generated in the future.
- Not required in the encoders, but required in the decoders

# Create a lower triangular mask using torch.tril, which generates a matrix where only the lower triangle (including the diagonal) contains ones, while the upper triangle contains zeros. This mask is typically used in masked self-attention in transformers to ensure that each position in a sequence can only attend to previous positions and itself, preventing access to future tokens during decoding.

In [None]:
# Display the attention scores.
# NOTE(review): `attn_scores` is not assigned in any visible cell; presumably it holds
# the scaled scores after the mask described above is applied — confirm.
attn_scores

# Compute the weighted sum of value (v) vectors using attention weights, where each query token receives a context-aware representation. This ensures that each generated token attends to relevant past tokens, influencing its prediction based on learned dependencies.

In [None]:
# Display the attention weights.
# NOTE(review): `attn_weights` is not assigned in any visible cell; presumably it is
# the softmax of `attn_scores` — confirm the defining cell exists and runs first.
attn_weights

In [None]:
# Show the shape of the attention output.
# NOTE(review): `attention_output` is not assigned in any visible cell — confirm it is
# computed (weights applied to v, per the text above) before this cell runs.
attention_output.shape

In [None]:
# Display the attention output values.
# NOTE(review): relies on `attention_output`, which no visible cell assigns — confirm
# it is defined earlier in the notebook.
attention_output

# Implement MultiHeadAttention class

In [None]:
# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected to queries, keys and values, split into
    ``num_heads`` heads of ``d_model // num_heads`` dimensions each, attended
    per head, re-joined, and passed through a final linear layer.

    Args:
        d_model: Size of the last dimension of every input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Without this check an uneven split only surfaces later as an opaque
        # reshape error inside forward().
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.d_v = d_model // num_heads

        # Layers are created in the same order as before so seeded
        # initialisation is unchanged.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.fc = nn.Linear(d_model, d_model)

    def forward(self, q_input, k_input, v_input, mask=None):
        """Apply attention of ``q_input`` over ``k_input`` / ``v_input``.

        Args:
            q_input: Tensor of shape (batch_size, q_len, d_model).
            k_input: Tensor of shape (batch_size, kv_len, d_model).
            v_input: Tensor of shape (batch_size, kv_len, d_model).
            mask: Optional tensor broadcastable to
                (batch_size, num_heads, q_len, kv_len). Positions where it
                equals 0 are filled with -inf before the softmax. A query row
                whose positions are all masked yields NaN weights.

        Returns:
            Tensor of shape (batch_size, q_len, d_model).
        """
        batch_size, q_len, _ = q_input.size()
        # Keys/values may come from a different sequence than the queries
        # (encoder-decoder cross-attention), so take their length from k_input
        # instead of assuming it matches the query length.
        kv_len = k_input.size(1)

        Q = self.query(q_input)
        K = self.key(k_input)
        V = self.value(v_input)

        # Split the model dimension into heads, then move the head axis ahead
        # of the sequence axis: [batch_size, num_heads, seq_len, d_head].
        q = Q.reshape(batch_size, q_len, self.num_heads, self.d_k).transpose(1, 2)
        k = K.reshape(batch_size, kv_len, self.num_heads, self.d_k).transpose(1, 2)
        v = V.reshape(batch_size, kv_len, self.num_heads, self.d_v).transpose(1, 2)

        # Scale by sqrt(d_k) using a plain Python scalar; no tensor needs to be
        # built on every call.
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)

        # [batch_size, num_heads, q_len, d_v] -> [batch_size, q_len, d_model].
        # reshape copies when needed, since the transposed result is not
        # contiguous.
        context = torch.matmul(attn_weights, v)
        attention_output = context.transpose(1, 2).reshape(batch_size, q_len, self.d_model)

        output = self.fc(attention_output)
        return output

In [None]:
# Small configuration for trying out MultiHeadAttention.
batch_size, seq_len = 2, 5    # sequences per batch, tokens per sequence
d_model, num_heads = 16, 4    # embedding width, number of attention heads

# Create a random input tensor with batch_size, seq_len, d_model

# Self-Attention in the Encoder
## Purpose: Allows each token to attend to all other tokens in the input sequence.

### Masking: No causal masking (tokens can see all positions).

#### Input: x (same for query, key, value).

#### Mask: Usually a padding mask (not needed for random input).

# Masked Self-Attention in the Decoder
## Purpose: Prevents each token from attending to future tokens.

### Masking: Causal mask applied.

### Input: x (same for query, key, value).

### Mask: Lower triangular mask to enforce causality.

# Encoder-Decoder Cross-Attention in the Decoder
## Purpose: Allows decoder tokens to attend to all encoder tokens.

## Masking: No causal mask (attends to all encoder tokens).

### Input:

#### q (decoder representation).

#### k, v (encoder output).

# Understanding Contiguous and Non-Contiguous Tensors in PyTorch

In [None]:
import torch

# A freshly created 2x3 tensor is stored contiguously.
x = torch.randn(2, 3)
# Swapping the two axes returns a view with swapped strides over the same
# storage, which is why the result is no longer contiguous.
x_t = x.t()

print("Original tensor:\n", x)
print("Is contiguous?", x.is_contiguous())  # True

print("\nTransposed tensor:\n", x_t)
print("Is contiguous?", x_t.is_contiguous())  # False
