In [1]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import random
SEED = 24  # single source of truth so all three libraries are seeded with the same value
random.seed(SEED)  # Python's built-in RNG
np.random.seed(SEED)  # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch RNG; kept last so the cell still displays the returned Generator

<torch._C.Generator at 0x7fb8b5e4f8f0>

In [3]:
# Set print options: no scientific notation, 4 decimal places
torch.set_printoptions(sci_mode=False, precision=4)

# Define the maximum sequence length and the embedding dimension for a model:

max_sequence_length = 5: Specifies the maximum number of tokens a sequence can have. If a sequence is shorter, it may be padded; if longer, it may be truncated.

d_model = 8: Defines the size of each token’s embedding vector, meaning each token will be represented as an 8-dimensional vector.

In [4]:
max_sequence_length = 5  # maximum number of tokens per sequence
d_model = 8  # size of each token's embedding vector

# Define three linear layers using nn.Linear in PyTorch:

w_query: Projects input embeddings into query space.

w_key: Projects input embeddings into key space.

w_value: Projects input embeddings into value space.
## These linear layers transform input embeddings (d_model dimensional) into new representations of the same size (d_model → d_model)

In [5]:
# Three independent d_model -> d_model projections. They are built in
# query, key, value order so the random initialisation consumes the RNG
# in the same sequence as three separate assignments would.
w_query, w_key, w_value = (nn.Linear(d_model, d_model) for _ in range(3))

# Create a tensor X_input_tokens with random values to simulate a batch of sequences of token embeddings.

Use torch.randn() to generate a random tensor of shape (batch_size, max_sequence_length, d_model), where:

batch_size defines the number of sequences processed simultaneously

max_sequence_length represents the number of tokens in the sequence.

d_model represents the embedding dimension.

In [6]:
batch_size = 2  # number of sequences processed together
# One standard-normal embedding vector per (sequence, token) position.
X_input_tokens = torch.randn(batch_size, max_sequence_length, d_model)

In [7]:
X_input_tokens.size()

torch.Size([2, 5, 8])

In [8]:
X_input_tokens

tensor([[[ 1.5722, -1.5508, -0.9508, -0.8640, -0.2197, -0.5222,  0.5066,
           0.3882],
         [-0.3872, -2.0749,  1.8568, -0.9443,  1.5268,  2.4347, -0.9094,
          -0.4100],
         [-0.1295,  0.8775, -1.2089, -0.8320,  0.0255, -1.4403, -0.2634,
          -0.4547],
         [-0.2725,  0.3093,  1.6942, -0.7240,  0.3211,  1.2152, -0.2805,
          -0.8066],
         [ 0.4247, -2.2672,  0.3589,  0.9826, -0.5440,  0.8139,  0.7595,
           0.6310]],

        [[-0.5642, -0.3357,  0.3224, -0.2222, -0.2079, -0.1331, -0.6866,
           1.3085],
         [-1.4051, -0.7623, -0.6487,  0.9436, -0.2093,  0.6138,  0.7549,
          -0.2196],
         [ 1.1316,  0.7940, -0.4966, -1.6428,  0.4345, -0.4395, -0.1694,
           0.7543],
         [ 0.4022,  0.2770, -0.7614,  0.8873, -1.5371,  0.0049, -0.1836,
          -1.7946],
         [-1.2220,  1.0616,  0.2727, -0.6134,  1.3609,  1.3732,  0.5116,
           0.8456]]])

# Apply linear transformations to the input tensor X_input_tokens using w_query, w_key, and w_value to obtain query (q), key (k), and value (v) representations.

## Pass tokens through the three linear layers to compute q, k, and v.

In [9]:
# Feed the same input through each projection layer to get queries, keys and values.
q, k, v = (projection(X_input_tokens) for projection in (w_query, w_key, w_value))

# Compute Per-Head Dimensions in Multi-Head Attention
In multi-head attention, we split the embedding dimension (d_model) into multiple heads to allow the model to focus on different parts of the input simultaneously.

Define the number of attention heads as num_heads = 4.

Compute the per-head query (d_q), key (d_k), and value (d_v) dimensions by dividing d_model by num_heads.

Ensure that d_q, d_k, and d_v are equal and represent the dimension per head.

In [10]:
num_heads = 4
# d_model is divided evenly among the heads, so the three per-head sizes are equal.
d_q = d_k = d_v = d_model // num_heads

# Reshape each tensor to (batch_size, max_sequence_length, num_heads, d_q), where:

batch_size is the number of input sequences in a batch.

max_sequence_length is the number of tokens per sequence.

num_heads is the number of attention heads.

d_q, d_k, and d_v are the per-head dimensions (d_model // num_heads).

Verify that the new shape correctly divides the d_model dimension across multiple heads.

Print the new shapes of q, k, and v to confirm the changes.

In [11]:
# Split the last (d_model) axis of each projection into (num_heads, per-head size).
split_shape = (batch_size, max_sequence_length, num_heads)
q = q.reshape(*split_shape, d_q)
k = k.reshape(*split_shape, d_k)
v = v.reshape(*split_shape, d_v)

In [12]:
q.shape, k.shape, v.shape

(torch.Size([2, 5, 4, 2]), torch.Size([2, 5, 4, 2]), torch.Size([2, 5, 4, 2]))

# Transpose Query Tensor for Multi-Head Attention
In multi-head attention, after reshaping the query (q), key (k), and value (v) tensors, we need to transpose them to bring the num_heads dimension to the second position. This helps in efficiently computing attention scores across multiple heads.

Assume you have a query tensor (q) of shape (batch_size, max_sequence_length, num_heads, d_q) after reshaping.

Transpose q to rearrange dimensions, so that num_heads moves to the second position, resulting in (batch_size, num_heads, max_sequence_length, d_q).

Print the shape of q after transposing to verify the changes.

In [13]:
# Swap the sequence and head axes so each head holds its own (sequence, d_q) matrix.
# NOTE: q is rebound to its own transform, so running this cell a second time swaps the axes back.
q = q.permute(0, 2, 1, 3)  # [batch_size, num_heads, sequence_length, d_q]
q.shape

torch.Size([2, 4, 5, 2])

# Repeat the same operation for k and v.

In [14]:
# Same axis swap as for q: result is [batch_size, num_heads, sequence_length, d_k / d_v].
k, v = (t.permute(0, 2, 1, 3) for t in (k, v))
k.shape, v.shape

(torch.Size([2, 4, 5, 2]), torch.Size([2, 4, 5, 2]))

For a single head:
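$$
\text{self attention} = \text{softmax}\bigg(\frac{Q K^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$

Here $M$ is the additive mask: $0$ where attention is allowed and $-\infty$ where it is blocked.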

# Compute Attention Scores for Each Head
In multi-head attention, after transposing q and k, we compute attention scores using scaled dot-product attention. This involves:

Taking the dot product of q with the transposed k to get similarity scores.

Scaling the scores by dividing by the square root of d_k (to stabilize gradients).

Printing the shape of attn_scores to verify it per head.

In [15]:
# Scaled dot-product scores: within each head, every query is compared with every key.
scale = math.sqrt(d_k)  # plain Python float; no tensor needs to be allocated for a scalar
attn_scores = torch.matmul(q, k.transpose(-2, -1)) / scale
attn_scores.shape

torch.Size([2, 4, 5, 5])

In [16]:
k.transpose(-1, -2).shape

torch.Size([2, 4, 2, 5])

## Masking

- This is to ensure words don't get context from words generated in the future.
- Not required in the encoders, but required in the decoders

# Create a lower triangular mask using torch.tril, which generates a matrix where only the lower triangle (including the diagonal) contains ones, while the upper triangle contains zeros. This mask is typically used in masked self-attention in transformers to ensure that each position in a sequence can only attend to previous positions and itself, preventing access to future tokens during decoding.

In [17]:
# Ones on and below the diagonal: row i marks positions 0..i as visible.
mask = torch.ones(max_sequence_length, max_sequence_length).tril()
print(mask)

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])


In [18]:
# Positions where the mask is 0 (future tokens) get a score of -inf.
blocked = mask == 0
attn_scores = attn_scores.masked_fill(blocked, float('-inf'))

In [19]:
attn_scores

tensor([[[[ 0.2502,    -inf,    -inf,    -inf,    -inf],
          [-0.4179, -0.1734,    -inf,    -inf,    -inf],
          [ 0.0281,  0.1137, -0.0939,    -inf,    -inf],
          [-0.6354, -0.5394, -0.1098, -0.2694,    -inf],
          [-0.1238, -0.2060,  0.0896,  0.0569, -0.3147]],

         [[-0.4693,    -inf,    -inf,    -inf,    -inf],
          [-1.0784, -0.4054,    -inf,    -inf,    -inf],
          [ 0.2629, -0.2601, -0.2105,    -inf,    -inf],
          [-0.4109, -0.2085,  0.0790, -0.0139,    -inf],
          [-0.9293, -1.2110, -0.1219, -0.1696, -0.7876]],

         [[-0.1552,    -inf,    -inf,    -inf,    -inf],
          [-0.1664, -0.4841,    -inf,    -inf,    -inf],
          [ 0.0737,  0.2080,  0.2011,    -inf,    -inf],
          [-0.0485, -0.1298, -0.1409, -0.0246,    -inf],
          [-0.3010, -1.1245, -0.4832, -0.1315,  0.8267]],

         [[-0.0373,    -inf,    -inf,    -inf,    -inf],
          [-0.1096, -0.0305,    -inf,    -inf,    -inf],
          [-0.0115, -0.00

# Apply softmax to the masked scores to obtain attention weights, then compute the weighted sum of value (v) vectors using those weights, so that each query token receives a context-aware representation. This ensures that each generated token attends only to relevant past tokens, influencing its prediction based on learned dependencies.

In [20]:
# Normalise along the key axis (last dimension) to turn scores into attention weights.
attn_weights = torch.softmax(attn_scores, dim=-1)

In [21]:
attn_weights

tensor([[[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.4392, 0.5608, 0.0000, 0.0000, 0.0000],
          [0.3362, 0.3662, 0.2976, 0.0000, 0.0000],
          [0.1911, 0.2103, 0.3232, 0.2755, 0.0000],
          [0.1929, 0.1777, 0.2388, 0.2311, 0.1594]],

         [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.3378, 0.6622, 0.0000, 0.0000, 0.0000],
          [0.4513, 0.2675, 0.2811, 0.0000, 0.0000],
          [0.1871, 0.2291, 0.3054, 0.2783, 0.0000],
          [0.1372, 0.1035, 0.3077, 0.2934, 0.1581]],

         [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.5788, 0.4212, 0.0000, 0.0000, 0.0000],
          [0.3049, 0.3487, 0.3464, 0.0000, 0.0000],
          [0.2592, 0.2390, 0.2363, 0.2655, 0.0000],
          [0.1528, 0.0671, 0.1273, 0.1810, 0.4719]],

         [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.4802, 0.5198, 0.0000, 0.0000, 0.0000],
          [0.3300, 0.3319, 0.3381, 0.0000, 0.0000],
          [0.2693, 0.2827, 0.3167, 0.1313, 0.0000],
      

In [22]:
# Weighted sum of the values inside each head.
per_head_output = torch.matmul(attn_weights, v)
# Bring the head axis back next to the feature axis; the copy makes .view legal.
merged_heads = per_head_output.transpose(1, 2).contiguous()
# Flatten (num_heads, d_v) back into a single d_model axis.
attention_output = merged_heads.view(batch_size, max_sequence_length, d_model)

In [23]:
attention_output.shape

torch.Size([2, 5, 8])

In [24]:
attention_output

tensor([[[     0.5889,     -0.2544,      0.5796,     -0.2053,      0.8867,
               0.8584,     -0.1849,     -0.0656],
         [     1.3559,      0.1895,      0.1792,      0.5596,      0.5102,
               0.4807,      0.2403,     -0.2482],
         [     0.8766,     -0.1489,      0.1727,      0.2288,      0.4115,
               0.1955,      0.1986,     -0.1561],
         [     0.7896,     -0.1617,     -0.0589,      0.5489,      0.2659,
               0.1213,      0.3184,     -0.1361],
         [     0.7703,     -0.0865,     -0.0439,      0.3919,      0.2174,
               0.4571,      0.2243,     -0.2422]],

        [[    -0.1676,      0.1436,     -0.1614,      0.3819,     -0.1275,
              -0.3200,     -0.2202,     -0.5121],
         [    -0.0581,     -0.3306,     -0.4269,      0.1313,      0.0746,
              -0.0504,     -0.2902,     -0.6253],
         [    -0.0008,     -0.2957,     -0.2886,      0.4192,      0.3742,
              -0.0817,     -0.0170,     -0.3264]

# Task-1: Implement MultiHeadAttention class

In [25]:
### BEGIN SOLUTION

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention followed by an output projection.

    Args:
        d_model: Size of the last axis of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail early with a clear message instead of a reshape error inside forward().
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.d_v = d_model // num_heads

        # Layers are created in this fixed order so seeded initialisation stays reproducible.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.fc = nn.Linear(d_model, d_model)

    def forward(self, q_input, k_input, v_input, mask=None):
        """Attend from ``q_input`` over ``k_input`` / ``v_input``.

        Args:
            q_input: Tensor of shape (batch_size, q_len, d_model).
            k_input: Tensor of shape (batch_size, kv_len, d_model).
            v_input: Tensor of shape (batch_size, kv_len, d_model).
            mask: Optional tensor broadcastable to
                (batch_size, num_heads, q_len, kv_len). Positions where it
                equals 0 are excluded from attention.

        Returns:
            Tensor of shape (batch_size, q_len, d_model).
        """
        batch_size, q_len, _ = q_input.size()

        Q = self.query(q_input)
        K = self.key(k_input)
        V = self.value(v_input)

        # K and V take their sequence length from their own inputs (-1), not from
        # the query, so cross-attention works when the two lengths differ.
        q = Q.reshape(batch_size, q_len, self.num_heads, self.d_k)
        k = K.reshape(batch_size, -1, self.num_heads, self.d_k)
        v = V.reshape(batch_size, -1, self.num_heads, self.d_v)

        q = q.transpose(1, 2)  # [batch_size, num_heads, q_len, d_k]
        k = k.transpose(1, 2)  # [batch_size, num_heads, kv_len, d_k]
        v = v.transpose(1, 2)  # [batch_size, num_heads, kv_len, d_v]

        # Scaled dot-product scores: [batch_size, num_heads, q_len, kv_len]
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Masked positions get -inf so softmax assigns them zero weight.
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)

        # Merge the heads back: [batch_size, q_len, num_heads * d_v] == [batch_size, q_len, d_model]
        attention_output = (
            torch.matmul(attn_weights, v)
            .transpose(1, 2)
            .contiguous()
            .view(batch_size, q_len, self.d_model)
        )

        output = self.fc(attention_output)
        return output
    
### END SOLUTION

In [26]:
batch_size = 2  # sequences per batch
seq_len = 5  # tokens per sequence
num_heads = 4  # attention heads
d_model = 16  # embedding width, kept small for testing

In [27]:
# Uniform-random stand-in for a batch of token embeddings
x = torch.rand(batch_size, seq_len, d_model)

# Self-Attention in the Encoder
## Purpose: Allows each token to attend to all other tokens in the input sequence.

### Masking: No causal masking (tokens can see all positions).

#### Input: x (same for query, key, value).

#### Mask: Usually a padding mask (not needed for random input).

In [28]:
mha_enc = MultiHeadAttention(d_model, num_heads)

# Encoder self-attention: no mask, and the same tensor serves as query, key and value.
mask = None
output = mha_enc(q_input=x, k_input=x, v_input=x, mask=mask)
output.shape

torch.Size([2, 5, 16])

# Masked Self-Attention in the Decoder
## Purpose: Prevents each token from attending to future tokens.

### Masking: Causal mask applied.

### Input: x (same for query, key, value).

### Mask: Lower triangular mask to enforce causality.

In [29]:
mha_dec = MultiHeadAttention(d_model, num_heads)

# Size the causal mask from seq_len, the length x was created with, rather than
# max_sequence_length from the earlier walkthrough (which only matched by coincidence).
mask = torch.tril(torch.ones((seq_len, seq_len)))

# Forward pass
output = mha_dec(x, x, x, mask)
output.shape

torch.Size([2, 5, 16])

# Encoder-Decoder Cross-Attention in the Decoder
## Purpose: Allows decoder tokens to attend to all encoder tokens.

## Masking: No causal mask (attends to all encoder tokens).

### Input:

#### q (decoder representation).

#### k, v (encoder output).

In [30]:
# Build the attention module first so its weights are drawn before the inputs
mha = MultiHeadAttention(d_model, num_heads)

# Random stand-ins for the two streams
decoder_input = torch.rand(batch_size, seq_len, d_model)  # supplies the queries
encoder_output = torch.rand(batch_size, seq_len, d_model)  # supplies both keys and values

# Cross-attention may look at every encoder position, so no mask is passed
mask = None

# Queries come from the decoder; keys and values come from the encoder
cross_attn_output = mha(
    decoder_input,
    k_input=encoder_output,
    v_input=encoder_output,
    mask=mask,
)
print(cross_attn_output.shape)  # Expected: (batch_size, seq_len, d_model)

torch.Size([2, 5, 16])


# Understanding Contiguous and Non-Contiguous Tensors in PyTorch

In [31]:
# A freshly allocated tensor is laid out contiguously in memory
x = torch.randn(2, 3)
print("Original tensor:\n", x)
print("Is contiguous?", x.is_contiguous())  # True

# Swapping the two axes returns a view over the same storage, which is no longer contiguous
x_t = x.t()
print("\nTransposed tensor:\n", x_t)
print("Is contiguous?", x_t.is_contiguous())  # False

Original tensor:
 tensor([[ 0.4496, -0.3525,  0.7069],
        [ 2.0206,  0.1058,  0.8492]])
Is contiguous? True

Transposed tensor:
 tensor([[ 0.4496,  2.0206],
        [-0.3525,  0.1058],
        [ 0.7069,  0.8492]])
Is contiguous? False
