# LLM inference

Loads GPT-2 weights of a chosen size (or previously saved weights) and shows how to run inference with them.

In [30]:
import torch

In [31]:
from chapter04 import generate_text_simple
from chapter05 import text_to_token_ids, token_ids_to_text, GPTModel, load_weights_into_gpt

In [32]:
from importlib.metadata import version

# Report the torch version, whether the MPS backend is available, and the
# installed tiktoken version for this run.
print(f'Torch loaded. version {torch.__version__}')
print(f'MPS available: {torch.backends.mps.is_available()}')
print("tiktoken version:", version("tiktoken"))

Torch loaded. version 2.9.1
MPS available: True
tiktoken version: 0.12.0


## model config
Available sizes: "124M", "355M", "774M", "1558M".

In [33]:
# Per-size architecture settings, keyed by a display label. Each row below is
# (label, emb_dim, n_layers, n_heads, model_name); "model_name" is the size tag
# later passed to the weight downloader as model_size.
model_configs = {
    label: {
        "emb_dim": emb_dim,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "model_name": size_tag,
    }
    for label, emb_dim, n_layers, n_heads, size_tag in (
        ("gpt2-small (124M)", 768, 12, 12, "124M"),
        ("gpt2-medium (355M)", 1024, 24, 16, "355M"),
        ("gpt2-large (774M)", 1280, 36, 20, "774M"),
        ("gpt2-xl (1558M)", 1600, 48, 25, "1558M"),
    )
}

In [34]:
# Base configuration handed to GPTModel. The size-dependent entries
# (emb_dim, n_heads, n_layers) are overwritten once a model size is selected.
GPT_CONFIG = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=True,
)

In [35]:
# Choose which GPT-2 size to run and overlay its settings onto the base config
# (the dict is updated in place, so later cells see the merged values).
model_name = "gpt2-large (774M)"
GPT_CONFIG |= model_configs[model_name]
GPT_CONFIG["qkv_bias"] = True  # gpt2 uses bias on the qkv weight parameters

In [36]:
# Build the model from the merged configuration; the pretrained weights are
# copied in by a later cell.
gpt = GPTModel(GPT_CONFIG)
gpt.eval()
# NOTE(review): the bare True becomes the cell's displayed value, presumably so
# the return value of gpt.eval() is not echoed — confirm.
True

True

In [37]:
## torchinfo is a nice package that shows more model information than PyTorch's built-in print does

In [38]:
from torchinfo import summary
# You must provide an input_size so it can "run" a pass
# The dummy input used here has shape (1, context_length) and dtype torch.long.
summary(gpt, input_size=(1, GPT_CONFIG["context_length"]), dtypes=[torch.long])

Layer (type:depth-idx)                        Output Shape              Param #
GPTModel                                      [1, 1024, 50257]          --
├─Embedding: 1-1                              [1, 1024, 1280]           64,328,960
├─Embedding: 1-2                              [1024, 1280]              1,310,720
├─Dropout: 1-3                                [1, 1024, 1280]           --
├─Sequential: 1-4                             [1, 1024, 1280]           --
│    └─TransformerBlock: 2-1                  [1, 1024, 1280]           --
│    │    └─LayerNorm: 3-1                    [1, 1024, 1280]           2,560
│    │    └─MultiHeadAttention: 3-2           [1, 1024, 1280]           6,558,720
│    │    └─Dropout: 3-3                      [1, 1024, 1280]           --
│    │    └─LayerNorm: 3-4                    [1, 1024, 1280]           2,560
│    │    └─FeedForward: 3-5                  [1, 1024, 1280]           13,113,600
│    │    └─Dropout: 3-6                      [1, 1024, 128

## configure device

In [39]:
# Probe the accelerators once: (CUDA available?, MPS available?)
hardware_status = (torch.cuda.is_available(), torch.backends.mps.is_available())

# Preference order: CUDA first, then MPS (preferred for Mac users), else CPU.
if hardware_status[0]:
    device = torch.device("cuda")
elif hardware_status[1]:
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Preferred device selected: {device}")
print(f"Hardware status: {hardware_status}")

Preferred device selected: mps
Hardware status: (False, True)


## load the weights and copy them over to our model

In [40]:
# Show the merged configuration (base settings plus the selected model's entries).
print(GPT_CONFIG)

{'vocab_size': 50257, 'context_length': 1024, 'emb_dim': 1280, 'n_heads': 20, 'n_layers': 36, 'drop_rate': 0.1, 'qkv_bias': True, 'model_name': '774M'}


In [41]:
from gpt_download import download_and_load_gpt2
# Fetch the checkpoint files for the selected size into the "gpt2" directory
# (the recorded output shows existing, up-to-date files being reused).
# `params` is copied into the model in a later cell; `settings` is not used in
# the cells visible here.
settings, params = download_and_load_gpt2(
    model_size=GPT_CONFIG['model_name'], models_dir="gpt2"
)

File already exists and is up-to-date: gpt2/774M/checkpoint
File already exists and is up-to-date: gpt2/774M/encoder.json
File already exists and is up-to-date: gpt2/774M/hparams.json
File already exists and is up-to-date: gpt2/774M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/774M/model.ckpt.index
File already exists and is up-to-date: gpt2/774M/model.ckpt.meta
File already exists and is up-to-date: gpt2/774M/vocab.bpe


In [42]:
# Display the device selected earlier; the model is moved onto it in the next cell.
device

device(type='mps')

In [43]:
# Copy the downloaded parameters into the model, then move it to the selected device.
load_weights_into_gpt(gpt, params)
gpt.to(device)
# NOTE(review): the bare True becomes the cell's displayed value, presumably so
# the return value of gpt.to(device) is not echoed — confirm.
True

True

## set up tokenizer

In [44]:
import tiktoken
# Tokenizer built from tiktoken's "gpt2" encoding; it is passed to
# text_to_token_ids / token_ids_to_text in the inference cell.
tokenizer = tiktoken.get_encoding("gpt2")

## run inference

In [45]:
# Re-apply eval mode and device placement right before generating; both calls
# were already made in earlier cells, so this only guards against stale state.
gpt.eval()
gpt.to(device)
# Bare True becomes the cell's displayed value.
True

True

In [70]:
def generate(model, idx, max_new_tokens, context_size,
             temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a ``(batch, seq)`` tensor of token ids to
            logits of shape ``(batch, seq, vocab)``.
        idx: ``(batch, seq)`` tensor holding the prompt token ids.
        max_new_tokens: Maximum number of tokens appended to each sequence.
        context_size: Only the last ``context_size`` tokens are fed to the
            model at each step.
        temperature: ``0.0`` (default) picks the highest-logit token at every
            step. A positive value divides the logits by it and samples from
            the resulting softmax distribution.
        top_k: If given, only the ``top_k`` largest logits of each row stay
            eligible; the rest are set to ``-inf``. Values larger than the
            vocabulary size are clamped to it.
        eos_id: If given, generation stops as soon as every sequence in the
            batch produces this token in the same step; the token itself is
            not appended.

    Returns:
        Tensor of shape ``(batch, seq + n)`` with ``n <= max_new_tokens``,
        containing the prompt followed by the generated token ids.
    """
    for _ in range(max_new_tokens):
        # Crop the running sequence to the model's supported context window.
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Only the prediction at the last position is needed: (batch, vocab).
        logits = logits[:, -1, :]

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing axis -> (batch, 1), so every row is compared
            # against its own k-th largest logit. A (batch,) threshold would
            # not broadcast against (batch, vocab) for batch sizes above 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Reduce with .all() so the check is a single boolean for any batch size.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [79]:
# Prompt text that the model is asked to continue.
user_input = "The main problem with Cars is"

In [85]:
# Generation below samples (temperature > 0), so seed the RNG first; otherwise
# re-running this cell, or the whole notebook, produces different text.
torch.manual_seed(123)

# Encode the prompt and place it on the same device as the model.
input_ids = text_to_token_ids(user_input, tokenizer)
input_ids = input_ids.to(device)

# Sample up to 25 new tokens, restricted to the 15 highest logits at each step.
token_ids = generate(
    model=gpt,
    idx=input_ids,
    max_new_tokens=25,
    context_size=GPT_CONFIG["context_length"],
    top_k=15,
    temperature=0.1
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 The main problem with Cars is that it's a game that's not really about cars. It's about the people who drive them. It's about the
