# Chapter 4 Exercises

Solutions to the exercises from Chapter 4.

In [10]:
import os
import sys

import torch

# Locate the chapter's code folder relative to the directory the notebook runs in,
# so that modules stored there (such as `gpt`, imported in a later cell) resolve.
current_dir = os.getcwd()
main_chapter_code_dir = os.path.join(current_dir, "01_main-chapter-code")

# Register the folder only once; re-running this cell must not add duplicates.
if sys.path.count(main_chapter_code_dir) == 0:
    sys.path.append(main_chapter_code_dir)



## Exercise 4.1: Parameters in the feed forward versus attention module

Analyze and compare the number of parameters in the feed-forward and attention modules of a transformer block.

In [11]:
from gpt import TransformerBlock

# Configuration dictionary handed to TransformerBlock below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,      # shows up as in_features/out_features=768 in the printed layers
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,    # shows up as Dropout(p=0.1) in the printed block
    qkv_bias=False,   # W_query / W_key / W_value are printed with bias=False
)

# Build one transformer block and print its module tree so the submodules
# counted in the following cells (`att` and `ff`) are visible.
block = TransformerBlock(GPT_CONFIG_124M)
print(block)

TransformerBlock(
  (att): MultiHeadAttention(
    (W_query): Linear(in_features=768, out_features=768, bias=False)
    (W_key): Linear(in_features=768, out_features=768, bias=False)
    (W_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
  (norm1): LayerNorm()
  (norm2): LayerNorm()
  (drop_shortcut): Dropout(p=0.1, inplace=False)
)


In [12]:
# Add up the element counts of every parameter tensor owned by the block's `ff` submodule
total_ff_params = sum(
    weight.numel()
    for weight in block.ff.parameters()
)
print(f"Total number of parameters in feed forward module: {total_ff_params:,}")

Total number of parameters in feed forward module: 4,722,432


In [13]:
# Add up the element counts of every parameter tensor owned by the block's `att` submodule
total_att_params = sum(
    weight.numel()
    for weight in block.att.parameters()
)
print(f"Total number of parameters in attention module: {total_att_params:,}")

Total number of parameters in attention module: 2,360,064


### Mathematical Breakdown

Feed forward module (emb_dim=768):
- 1st Linear layer: 768 × (4×768) + 4×768 bias = 2,362,368
- 2nd Linear layer: (4×768) × 768 + 768 bias = 2,360,064
- Total: 4,722,432

Attention module (with `qkv_bias=False`, only `out_proj` has a bias term):
- W_query: 768 × 768 = 589,824
- W_key: 768 × 768 = 589,824
- W_value: 768 × 768 = 589,824
- out_proj: 768 × 768 + 768 bias = 590,592
- Total: 2,360,064

## Exercise 4.2: Initialize larger GPT models

Implement a function to create configurations for different GPT-2 model sizes.

In [14]:
def get_config(base_config, model_name="gpt2-small"):
    """Return a copy of ``base_config`` resized for a named GPT-2 variant.

    Args:
        base_config (dict): Configuration to start from. It is shallow-copied
            and never modified.
        model_name (str): One of "gpt2-small", "gpt2-medium", "gpt2-large"
            or "gpt2-xl".

    Returns:
        dict: Copy of ``base_config`` whose "emb_dim", "n_layers" and
        "n_heads" entries are set to the values for ``model_name``; every
        other entry is carried over unchanged.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Per-model overrides; key order mirrors the order in which the entries
    # are written into the returned configuration.
    size_overrides = {
        "gpt2-small": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
        "gpt2-medium": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "gpt2-large": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "gpt2-xl": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    # Fail loudly on a typo instead of silently handing back the base
    # configuration, which would look like a valid (but wrong-sized) result.
    if model_name not in size_overrides:
        supported = ", ".join(size_overrides)
        raise ValueError(
            f"Unknown model name {model_name!r}; expected one of: {supported}"
        )

    GPT_CONFIG = base_config.copy()
    GPT_CONFIG.update(size_overrides[model_name])
    return GPT_CONFIG

In [15]:
# Print the size-dependent settings produced for each supported model name
models = ["gpt2-small", "gpt2-medium", "gpt2-large", "gpt2-xl"]

for model in models:
    config = get_config(GPT_CONFIG_124M, model)
    # One print call; sep="\n" puts each field on its own line
    print(
        f"\n{model} configuration:",
        f"Embedding dimension: {config['emb_dim']}",
        f"Number of layers: {config['n_layers']}",
        f"Number of attention heads: {config['n_heads']}",
        sep="\n",
    )


gpt2-small configuration:
Embedding dimension: 768
Number of layers: 12
Number of attention heads: 12

gpt2-medium configuration:
Embedding dimension: 1024
Number of layers: 24
Number of attention heads: 16

gpt2-large configuration:
Embedding dimension: 1280
Number of layers: 36
Number of attention heads: 20

gpt2-xl configuration:
Embedding dimension: 1600
Number of layers: 48
Number of attention heads: 25


## Exercise 4.3: Implement a custom attention mask

In this exercise, we'll implement a custom attention mask that allows each token to attend only to itself and its immediate neighbors (one token to the left and one to the right).

In [16]:
def create_neighbor_mask(seq_length, window=1):
    """Create a boolean mask limiting each position to itself and nearby positions.

    Entry ``[i, j]`` is ``True`` exactly when ``|i - j| <= window``. With the
    default ``window=1`` each position may see itself, one position to the
    left and one position to the right.

    Args:
        seq_length (int): Number of positions in the sequence.
        window (int): Largest allowed distance between two positions.
            Defaults to 1 (immediate neighbors).

    Returns:
        torch.Tensor: Boolean tensor of shape (seq_length, seq_length).

    Raises:
        ValueError: If ``window`` is negative.
    """
    if window < 0:
        raise ValueError(f"window must be non-negative, got {window}")

    positions = torch.arange(seq_length)

    # Subtracting a row vector from a column vector broadcasts to the full
    # (seq_length, seq_length) grid of offsets i - j, so no explicit
    # expanded index matrices are needed.
    offsets = positions.unsqueeze(1) - positions.unsqueeze(0)

    return offsets.abs() <= window

In [17]:
# Build a small mask and display it as 0/1 so the banded structure is easy to read
seq_length = 5
mask = create_neighbor_mask(seq_length)

# Sanity check: the mask must be square with one row/column per position
assert mask.shape == (seq_length, seq_length)

# Derive the label from seq_length so the message cannot drift from the value used
print(f"Neighbor mask for sequence length {seq_length}:")
print(mask.int())  # Convert to integers for better visualization

Neighbor mask for sequence length 5:
tensor([[1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [0, 1, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1]], dtype=torch.int32)
