In [4]:
import torch
import torch.nn as nn

from importlib.metadata import version

print("torch version:", version("torch"))

torch version: 2.6.0


## Exercise 4.1 Number of parameters in feed forward and attention modules
Calculate and compare the number of parameters that are contained in the feed for- ward module and those that are contained in the multi-head attention module.

In [5]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [6]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [7]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

In [8]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [9]:
from previous_chapters import MultiHeadAttention

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x

In [12]:
block = TransformerBlock(GPT_CONFIG_124M)
print(block)

TransformerBlock(
  (att): MultiHeadAttention(
    (W_query): Linear(in_features=768, out_features=768, bias=False)
    (W_key): Linear(in_features=768, out_features=768, bias=False)
    (W_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
  (norm1): LayerNorm()
  (norm2): LayerNorm()
  (drop_shortcut): Dropout(p=0.1, inplace=False)
)


In [15]:
total_parameters = sum(p.numel() for p in block.ff.parameters())
print("Total feed forward parameters: ", total_parameters)

Total feed forward parameters:  4722432


In [16]:
total_parameters = sum(p.numel() for p in block.att.parameters())
print("Total attention module parameters: ", total_parameters)

Total attention module parameters:  2360064


## Exercise 4.2 Initializing larger GPT models
We initialized a 124-million-parameter GPT model, which is known as “GPT-2 small.” Without making any code modifications besides updating the configuration file, use the GPTModel class to implement GPT-2 medium (using 1,024-dimensional embed- dings, 24 transformer blocks, 16 multi-head attention heads), GPT-2 large (1,280- dimensional embeddings, 36 transformer blocks, 20 multi-head attention heads), and GPT-2 XL (1,600-dimensional embeddings, 48 transformer blocks, 25 multi-head attention heads). As a bonus, calculate the total number of parameters in each GPT model.

In [10]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [11]:
def get_config(config_name):
    if config_name == "Small":
        config = {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 768,         # Embedding dimension
            "n_heads": 12,          # Number of attention heads
            "n_layers": 12,         # Number of layers
            "drop_rate": 0.1,       # Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        }
    elif config_name == "Medium":
        config = {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 1024,         # Embedding dimension
            "n_heads": 16,          # Number of attention heads
            "n_layers": 24,         # Number of layers
            "drop_rate": 0.1,       # Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        }
    elif config_name == "Large":
        config = {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 1280,         # Embedding dimension
            "n_heads": 20,          # Number of attention heads
            "n_layers": 36,         # Number of layers
            "drop_rate": 0.1,       # Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        }
    elif config_name == "XL":
        config = {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 1600,         # Embedding dimension
            "n_heads": 25,          # Number of attention heads
            "n_layers": 48,         # Number of layers
            "drop_rate": 0.1,       # Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        }
    else:
        raise ValueError(f"Invalid configuration size: ", config_name)

    return config

In [12]:
def calculate_size(model): # based on chapter code

    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters: {total_params:,}")

    total_params_gpt2 =  total_params - sum(p.numel() for p in model.out_head.parameters())
    print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

    # Calculate the total size in bytes (assuming float32, 4 bytes per parameter)
    total_size_bytes = total_params * 4

    # Convert to megabytes
    total_size_mb = total_size_bytes / (1024 * 1024)

    print(f"Total size of the model: {total_size_mb:.2f} MB")

In [21]:
for config_name in ("Small", "Medium", "Large", "XL"):
    print("GPT-2 ", config_name, ":")
    model = GPTModel(get_config(config_name))
    calculate_size(model)

#Can't run this on my laptop without it crashing

GPT-2  Small :
GPT-2  Medium :
GPT-2  Large :
GPT-2  XL :


## Exercise 4.3 Using separate dropout parameters
At the beginning of this chapter, we defined a global drop_rate setting in the GPT_ CONFIG_124M dictionary to set the dropout rate in various places throughout the GPTModel architecture. Change the code to specify a separate dropout value for the various dropout layers throughout the model architecture. (Hint: there are three dis- tinct places where we used dropout layers: the embedding layer, shortcut layer, and multi-head attention module.)

In [13]:
config = {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 1024,         # Embedding dimension
            "n_heads": 16,          # Number of attention heads
            "n_layers": 24,         # Number of layers
            "embedding_drop_rate": 0.1,       # EMBEDDING Dropout rate
            "shortcut_drop_rate": 0.2,        # SHORTCUT Dropout rate
            "attention_drop_rate": 0.3,      # MULTI-HEAD ATTENTION Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        }

In [14]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["attention_drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["shortcut_drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x

In [15]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["embedding_drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [20]:
torch.manual_seed(123)
model = GPTModel(config)