

This notebook contains solutions to the exercises from Chapter 4.

## Exercise 4.1: Number of Parameters in Feed-Forward and Attention Modules

In [23]:
import os
import sys

# The chapter's local modules (e.g. gpt.py) may not be on the default import
# path, so the directory has to be registered BEFORE the `gpt` imports below;
# otherwise a fresh kernel (Restart & Run All) can fail with ModuleNotFoundError.
CH04_DIR = os.path.expanduser("~/PycharmProjects/ArtLLM/ch04")
if CH04_DIR not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(CH04_DIR)

import torch

import gpt
from gpt import TransformerBlock


# Hyperparameters handed to TransformerBlock below
# (presumably the 124M-parameter GPT-2 setup, going by the name — TODO confirm).
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

block = TransformerBlock(GPT_CONFIG_124M)

# Total element count over all parameter tensors of each sub-module,
# evaluated in the order: feed-forward first, attention second.
ff_params, attn_params = (
    sum(weight.numel() for weight in submodule.parameters())
    for submodule in (block.ff, block.att)
)

print(f"Feedforward parameters: {ff_params:,}")
print(f"Attention parameters: {attn_params:,}")

Feedforward parameters: 4,722,432
Attention parameters: 2,360,064


In [19]:
# Sanity check: show the filesystem path of the `gpt` module that was actually imported.
print(gpt.__file__)

/Users/Msi/PycharmProjects/ArtLLM/ch04/gpt.py


##  Exercise 4.2: Initializing Larger GPT Models

In [24]:
from gpt import GPTModel

def print_model_size(name, cfg):
    """Instantiate a ``GPTModel`` from ``cfg`` and print its parameter count.

    Args:
        name: Label shown at the start of the printed line.
        cfg: Configuration object passed unchanged to ``GPTModel``.

    Returns:
        None. The result is only printed, formatted with thousands separators.
    """
    n_total = 0
    # Accumulate the element count of every parameter tensor of the new model.
    for tensor in GPTModel(cfg).parameters():
        n_total += tensor.numel()
    print(f"{name} → Total parameters: {n_total:,}")

# The three larger GPT-2 variants differ only in embedding width, head count
# and layer count; every other setting is identical across them.
for label, emb_dim, n_heads, n_layers in (
    ("GPT-2 Medium", 1024, 16, 24),
    ("GPT-2 Large", 1280, 20, 36),
    ("GPT-2 XL", 1600, 25, 48),
):
    print_model_size(label, {
        "vocab_size": 50257,
        "context_length": 1024,
        "emb_dim": emb_dim,
        "n_heads": n_heads,
        "n_layers": n_layers,
        "drop_rate": 0.1,
        "qkv_bias": False,
    })

GPT-2 Medium → Total parameters: 406,212,608
GPT-2 Large → Total parameters: 838,220,800
GPT-2 XL → Total parameters: 1,637,792,000


## Exercise 4.3: Using Separate Dropout Parameters

In [25]:
from gptmodel import GPTModel

# Same configuration keys as before, except that the single dropout rate is
# replaced by three separate entries (attention, shortcut, embedding).
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate_attn=0.1,
    drop_rate_shortcut=0.1,
    drop_rate_emb=0.1,
    qkv_bias=False,
)

# Constructing the model confirms that GPTModel accepts this configuration.
model = GPTModel(GPT_CONFIG_124M)

print("Model successfully initialized with:")
# Echo the configured rates (read back from the config dict, not from the model).
for label, key in (
    ("- Attention dropout:", "drop_rate_attn"),
    ("- Shortcut dropout:", "drop_rate_shortcut"),
    ("- Embedding dropout:", "drop_rate_emb"),
):
    print(label, GPT_CONFIG_124M[key])

Model successfully initialized with:
- Attention dropout: 0.1
- Shortcut dropout: 0.1
- Embedding dropout: 0.1
