## Exercise 3.1: Comparing SelfAttention_v1 and SelfAttention_v2

This exercise demonstrates the similarity between two self-attention implementations.

- `SelfAttention_v1` uses manually initialized weight matrices via `nn.Parameter`.
- `SelfAttention_v2` uses PyTorch’s `nn.Linear`, which stores weights in a transposed form internally.

Even though the implementations are different, we can show they are functionally equivalent by copying the transposed weights from `SelfAttention_v2` into `SelfAttention_v1`. Once this is done correctly, both implementations will produce the same output for a given input.

> Note: `nn.Linear` stores weights in shape `(d_out, d_in)`, whereas matrix multiplication in `SelfAttention_v1` expects `(d_in, d_out)`, hence we use `.T` when copying.

In [7]:
from importlib.metadata import version

import torch
print("torch version:", version("torch"))

# Toy embeddings: one 3-dimensional row per token of the example sentence.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])

# Input embedding width and projection width used by the attention modules.
d_in = 3
d_out = 2

torch version: 2.6.0


In [8]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention with raw weight matrices.

    The query, key and value projections are ``nn.Parameter`` tensors of shape
    ``(d_in, d_out)`` filled by ``torch.rand`` and applied with a plain matrix
    product. The notebook feeds it a 2-D ``(num_tokens, d_in)`` tensor.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_out = d_out
        # Creation order (query, key, value) fixes which random draws each
        # matrix receives under a given seed; do not reorder.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Return one context vector of width ``d_out`` per input row."""
        queries = torch.matmul(x, self.W_query)
        keys = torch.matmul(x, self.W_key)
        values = torch.matmul(x, self.W_value)

        # Scale by sqrt(key width) before softmax.
        scale = keys.shape[-1] ** 0.5
        scaled_scores = torch.matmul(queries, keys.T) / scale
        attn_weights = torch.softmax(scaled_scores, dim=-1)

        return torch.matmul(attn_weights, values)

# Seed immediately before construction: the module's weights come from
# torch.rand, so this makes the printed context vectors reproducible.
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [9]:
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention built on ``nn.Linear``.

    The query, key and value projections are bias-free ``nn.Linear`` layers
    mapping ``d_in`` features to ``d_out`` features.

    The forward pass works on the last two dimensions only, so it accepts
    either a single sequence ``(num_tokens, d_in)`` or a batch
    ``(..., num_tokens, d_in)``. For 2-D input the result is identical to the
    previous ``keys.T`` / ``dim=1`` formulation.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_out = d_out
        # Creation order (query, key, value) determines the seeded
        # initialisation of each layer; do not reorder.
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key   = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        """Return context vectors of shape ``(..., num_tokens, d_out)``."""
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Swap only the token and feature axes. ``.T`` would reverse every
        # axis, which is wrong (and deprecated) for batched input.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Normalise over the key axis (the last one), matching the other
        # attention classes; scale by sqrt(key width) first.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

# Re-seed immediately before construction so the layer initialisation is
# reproducible. nn.Linear applies its own initialisation scheme rather than
# torch.rand, so the printed values differ from sa_v1 despite the same seed.
torch.manual_seed(123)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)


In [10]:
# Copy each projection of sa_v2 into sa_v1. nn.Linear keeps its weight as
# (d_out, d_in) while sa_v1 multiplies by (d_in, d_out), hence the transpose.
with torch.no_grad():
    for name in ("W_query", "W_key", "W_value"):
        getattr(sa_v1, name).copy_(getattr(sa_v2, name).weight.T)

# With identical weights the two implementations should agree on any input.
out_v1, out_v2 = sa_v1(inputs), sa_v2(inputs)

print("Are outputs close?", torch.allclose(out_v1, out_v2, atol=1e-6))

Are outputs close? True


## Exercise 3.2: Returning Two-Dimensional Embedding Vectors

Here I ensure that the output of `MultiHeadAttentionWrapper` has a dimensionality of 2, while keeping the number of attention heads equal to 2.

In multi-head attention, the `d_out` dimension is split equally among the heads:
$$
d_k = \frac{d_{\text{out}}}{\text{num\_heads}}
$$

To get an overall output of 2 dimensions with `num_heads = 2`, I set `d_out = 2`, so each head has `d_k = 1`.

This requires **no change to the class implementation** — I only modify the constructor:
```python
mha = MultiHeadAttentionWrapper(d_in=4, d_out=2, num_heads=2)
```

In [12]:
# Define MultiHeadAttentionWrapper
class MultiHeadAttentionWrapper(nn.Module):
    """Multi-head self-attention with bias-free linear projections.

    ``d_out`` is divided evenly across ``num_heads`` heads (``d_k`` features
    each). The per-head results are concatenated back to width ``d_out`` and
    passed through a final bias-free linear layer. No attention mask is used.
    """

    def __init__(self, d_in, d_out, num_heads):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_k = d_out // num_heads

        # Layer creation order (query, key, value, output) fixes the seeded
        # initialisation of each layer; do not reorder.
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)
        self.fc_out = nn.Linear(d_out, d_out, bias=False)

    def _split_heads(self, projected, batch, seq_len):
        """Reshape (batch, seq_len, d_out) to (batch, num_heads, seq_len, d_k)."""
        per_head = projected.view(batch, seq_len, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, d_in) to (batch, seq_len, d_out)."""
        batch, seq_len, _ = x.shape

        queries = self._split_heads(self.W_query(x), batch, seq_len)
        keys = self._split_heads(self.W_key(x), batch, seq_len)
        values = self._split_heads(self.W_value(x), batch, seq_len)

        # Scaled dot-product attention, independently per head.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.d_k ** 0.5)
        weights = torch.softmax(scores, dim=-1)
        per_head_context = torch.matmul(weights, values)

        # Put the head axis next to the feature axis, then merge the two.
        merged = per_head_context.transpose(1, 2).contiguous()
        merged = merged.view(batch, seq_len, self.num_heads * self.d_k)
        return self.fc_out(merged)


# Create input tensor
# Seed first: both the random input and the layer initialisation below draw
# from the global RNG, in this order.
# NOTE: this rebinds `inputs`, which the Exercise 3.1 cells use for the
# (6, 3) token tensor; re-running those cells afterwards picks up this
# 3-D tensor instead.
torch.manual_seed(42)
inputs = torch.randn(1, 5, 4)  # (batch=1, seq_len=5, d_in=4)

# Set d_out=2 with num_heads=2 to return 2-dimensional output
# (each of the two heads then works with d_k = 1 feature)
mha = MultiHeadAttentionWrapper(d_in=4, d_out=2, num_heads=2)
outputs = mha(inputs)

# Check output shape
print("Output shape:", outputs.shape)
print("Output:", outputs)

Output shape: torch.Size([1, 5, 2])
Output: tensor([[[-0.0267, -0.0087],
         [-0.0919, -0.0284],
         [-0.0792, -0.0155],
         [-0.0848, -0.0206],
         [-0.0685, -0.0139]]], grad_fn=<UnsafeViewBackward0>)


## Exercise 3.3: Initializing GPT-2 Size Attention Modules

In this task, we initialize a multi-head attention module that matches the smallest GPT-2 model configuration:

- Number of attention heads: **12**
- Hidden size (input/output embedding dimension): **768**
- Context length supported: **1024 tokens**

To achieve this, we initialize our `MultiHeadAttention` class with:
```python
mha = MultiHeadAttention(d_in=768, d_out=768, num_heads=12)
```

In [13]:

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with biased linear projections.

    ``d_out`` is split evenly into ``num_heads`` heads of ``d_k`` features.
    Each head runs scaled dot-product attention over the full sequence (no
    mask), and the concatenated heads go through a final linear layer.
    """

    def __init__(self, d_in, d_out, num_heads):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_k = d_out // num_heads

        # Layers use nn.Linear's default bias. Creation order (query, key,
        # value, output) fixes the seeded initialisation; do not reorder.
        self.W_query = nn.Linear(d_in, d_out)
        self.W_key = nn.Linear(d_in, d_out)
        self.W_value = nn.Linear(d_in, d_out)
        self.fc_out = nn.Linear(d_out, d_out)

    def forward(self, x):
        """Map ``x`` of shape (batch, tokens, d_in) to (batch, tokens, d_out)."""
        n_batch, n_tokens, _ = x.shape
        head_shape = (n_batch, n_tokens, self.num_heads, self.d_k)

        # Project, then move the head axis forward:
        # (batch, tokens, d_out) -> (batch, heads, tokens, d_k)
        q, k, v = (
            proj(x).view(head_shape).transpose(1, 2)
            for proj in (self.W_query, self.W_key, self.W_value)
        )

        # Scaled dot-product attention, normalised over the key axis.
        scale = self.d_k ** 0.5
        attn = torch.softmax(q @ k.transpose(-2, -1) / scale, dim=-1)

        # (batch, heads, tokens, d_k) -> (batch, tokens, heads * d_k)
        mixed = (attn @ v).transpose(1, 2).contiguous()
        mixed = mixed.view(n_batch, n_tokens, self.num_heads * self.d_k)
        return self.fc_out(mixed)


# Smallest GPT-2 attention configuration
d_model, num_heads = 768, 12
context_len, batch_size = 1024, 1

# Random stand-in for a batch of embeddings: [batch_size, context_len, d_in]
x = torch.randn(batch_size, context_len, d_model)

# Build the module with equal input/output width and run one forward pass
mha_gpt2 = MultiHeadAttention(d_in=d_model, d_out=d_model, num_heads=num_heads)
output = mha_gpt2(x)

print("Output shape:", output.shape)  # Expected: [1, 1024, 768]

Output shape: torch.Size([1, 1024, 768])
