# LLM inference

Loads GPT-2 weights (a pretrained model size of your choice, or previously saved weights) and shows how to run inference with them.

In [101]:
import torch

In [102]:
from chapter04 import generate_text_simple
from chapter05 import text_to_token_ids, token_ids_to_text, GPTModel, load_weights_into_gpt

In [103]:
from importlib.metadata import version

# Report the runtime environment up front so the outputs below can be tied to
# specific library versions and to whether the MPS backend is usable.
print(f"Torch loaded. version {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"tiktoken version: {version('tiktoken')}")

Torch loaded. version 2.9.1
MPS available: True
tiktoken version: 0.12.0


## model config
Available sizes: "124M", "355M", "774M", "1558M".

In [104]:
# Architecture settings per GPT-2 variant, keyed by a display name of the form
# "gpt2-<label> (<size>)". "model_name" repeats the size string; it is the value
# later handed to the checkpoint download call.
model_configs = {
    f"gpt2-{label} ({size})": {
        "emb_dim": emb_dim,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "model_name": size,
    }
    # Columns: label, size, emb_dim, n_layers, n_heads
    for label, size, emb_dim, n_layers, n_heads in (
        ("small", "124M", 768, 12, 12),
        ("medium", "355M", 1024, 24, 16),
        ("large", "774M", 1280, 36, 20),
        ("xl", "1558M", 1600, 48, 25),
    )
}

In [105]:
# Base configuration passed to GPTModel. The emb_dim / n_heads / n_layers values
# here match the "gpt2-small (124M)" entry of model_configs and are overwritten
# once a model variant is selected.
GPT_CONFIG = dict(
    vocab_size=50257,     # presumably the GPT-2 tokenizer vocabulary size - confirm against the tokenizer
    context_length=1024,  # sequence length; also used for the torchinfo input and as generation context_size
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,        # presumably the dropout probability used inside GPTModel
    qkv_bias=True,        # gpt2 uses bias on the qkv weight parameters
)

In [106]:
# Choose which GPT-2 variant to run; the key must exist in model_configs.
model_name = "gpt2-xl (1558M)"

# Overlay the variant's architecture settings (emb_dim, n_layers, n_heads,
# model_name) on the base config in a single call. context_length and qkv_bias
# are restated explicitly; both equal the values GPT_CONFIG was created with,
# so they act as a guard against an edited base config rather than a change.
GPT_CONFIG.update(
    model_configs[model_name],
    context_length=1024,
    qkv_bias=True,  # gpt2 uses bias on the qkv weight parameters
)

In [107]:
# Instantiate the model from the resolved configuration (weights are not loaded yet).
gpt = GPTModel(GPT_CONFIG)
# Put the model in evaluation mode for inference.
gpt.eval()
# Literal last expression: the cell displays `True` instead of the return value
# of the call above (presumably to avoid echoing the full module repr).
True

True

In [108]:
# torchinfo is a nice package that shows more model information than PyTorch's built-in print does

In [109]:
from torchinfo import summary
# torchinfo needs an input_size so it can run a pass through the model and
# record each layer's output shape. Here that is a batch of one sequence at the
# configured context length; dtypes=[torch.long] makes the generated input
# integer-valued (the first layers are Embedding lookups).
summary(gpt, input_size=(1, GPT_CONFIG["context_length"]), dtypes=[torch.long])

Layer (type:depth-idx)                        Output Shape              Param #
GPTModel                                      [1, 1024, 50257]          --
├─Embedding: 1-1                              [1, 1024, 1600]           80,411,200
├─Embedding: 1-2                              [1024, 1600]              1,638,400
├─Dropout: 1-3                                [1, 1024, 1600]           --
├─Sequential: 1-4                             [1, 1024, 1600]           --
│    └─TransformerBlock: 2-1                  [1, 1024, 1600]           --
│    │    └─LayerNorm: 3-1                    [1, 1024, 1600]           3,200
│    │    └─MultiHeadAttention: 3-2           [1, 1024, 1600]           10,246,400
│    │    └─Dropout: 3-3                      [1, 1024, 1600]           --
│    │    └─LayerNorm: 3-4                    [1, 1024, 1600]           3,200
│    │    └─FeedForward: 3-5                  [1, 1024, 1600]           20,488,000
│    │    └─Dropout: 3-6                      [1, 1024, 16

## configure device

In [110]:
# Probe both accelerator backends once: (CUDA available, MPS available).
hardware_status = (torch.cuda.is_available(), torch.backends.mps.is_available())

# Priority order: CUDA first, then Apple MPS, otherwise plain CPU.
if hardware_status[0]:
    # CUDA wins whenever it is present, regardless of MPS.
    device = torch.device("cuda")
elif hardware_status[1]:
    # No CUDA, but the Metal backend is usable (Mac).
    device = torch.device("mps")
else:
    # Neither accelerator is available.
    device = torch.device("cpu")

print(f"Preferred device selected: {device}")
print(f"Hardware status: {hardware_status}")

Preferred device selected: mps
Hardware status: (False, True)


## load the weights and copy them over to our model

In [111]:
# Show the resolved configuration (base values overlaid with the selected
# variant's entry) before downloading the matching checkpoint.
print(GPT_CONFIG)

{'vocab_size': 50257, 'context_length': 1024, 'emb_dim': 1600, 'n_heads': 25, 'n_layers': 48, 'drop_rate': 0.1, 'qkv_bias': True, 'model_name': '1558M'}


In [112]:
from gpt_download import download_and_load_gpt2
# Download and load the GPT-2 checkpoint for the selected size, using the
# "gpt2" directory as models_dir. `params` is what gets copied into the model
# afterwards.
# NOTE(review): `settings` is not used in any cell visible here - confirm
# whether it should be checked against GPT_CONFIG.
settings, params = download_and_load_gpt2(
    model_size=GPT_CONFIG['model_name'], models_dir="gpt2"
)

checkpoint: 100%|████████████████████████████████████████████████████████████████████████████████████| 77.0/77.0 [00:00<00:00, 72.9kiB/s]
encoder.json: 100%|████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 2.33MiB/s]
hparams.json: 100%|██████████████████████████████████████████████████████████████████████████████████| 91.0/91.0 [00:00<00:00, 62.3kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████████████████████████████████████████████████████████| 6.23G/6.23G [03:47<00:00, 27.4MiB/s]
model.ckpt.index: 100%|█████████████████████████████████████████████████████████████████████████████| 20.7k/20.7k [00:00<00:00, 404kiB/s]
model.ckpt.meta: 100%|█████████████████████████████████████████████████████████████████████████████| 1.84M/1.84M [00:00<00:00, 3.25MiB/s]
vocab.bpe: 100%|█████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 1.59MiB/s]


In [113]:
# Echo the selected device as a sanity check before the model is moved onto it.
device

device(type='mps')

In [114]:
# Copy the downloaded parameters into our model.
load_weights_into_gpt(gpt, params)
# Move the populated model to the selected device.
gpt.to(device)
# Literal last expression: the cell displays `True` instead of the return value
# of .to() (presumably to avoid echoing the full module repr).
True

True

## set up tokenizer

In [115]:
import tiktoken
# tiktoken's "gpt2" encoding; used below to turn the prompt into token IDs and
# to turn generated token IDs back into text.
tokenizer = tiktoken.get_encoding("gpt2")

## run inference

In [116]:
# Both calls repeat steps already done in earlier cells (eval mode after
# construction, device move after weight loading); presumably restated so this
# section can be re-run on its own.
gpt.eval()
gpt.to(device)
# Literal last expression so the cell displays `True` rather than the return
# value of .to().
True

True

In [117]:
# Prompt text that is tokenized and passed to the model as the start of the sequence.
user_input = "Everybody loves a"

In [None]:
# Tokenize the prompt and place the result on the same device as the model.
input_ids = text_to_token_ids(user_input, tokenizer).to(device)

# Extend the prompt by up to 50 new tokens, letting the helper use the model's
# configured context length.
# NOTE(review): confirm generate_text_simple disables autograd internally;
# if it does not, this call should run under torch.no_grad().
token_ids = generate_text_simple(
    model=gpt,
    idx=input_ids,
    max_new_tokens=50,
    context_size=GPT_CONFIG["context_length"],
)

# Decode the prompt plus generated token IDs back to text and show it.
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))