# A taste of transformers

## Device setup


In [1]:
import torch

# Pick the compute backend in order of preference:
# Apple's MPS first, then CUDA, and finally the CPU as a fallback.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"

device = torch.device(device_name)

## Model properties


In [2]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Fetch the pretrained GPT-2 tokenizer, configuration, and base model
# through the Hugging Face Transformers auto classes.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Use the end-of-sequence token as the padding token as well
tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained("gpt2")

# Load the weights and place the model on the selected device in one step
model = AutoModel.from_pretrained("gpt2").to(device)

# Report a few architectural properties read from the configuration
print(f"Model type: {config.model_type}")
print(f"Hidden size: {config.hidden_size}")
print(f"Number of attention heads: {config.num_attention_heads}")
print(f"Number of layers: {config.num_hidden_layers}")
print(f"Vocabulary size: {config.vocab_size}")

Model type: gpt2
Hidden size: 768
Number of attention heads: 12
Number of layers: 12
Vocabulary size: 50257


## Vocabulary


In [3]:
# Order the (token, id) vocabulary entries by token ID, then show the
# token strings for a small window of consecutive positions.
sorted_vocab = sorted(tokenizer.vocab.items(), key=lambda entry: entry[1])
example_vocab = [token for token, _token_id in sorted_vocab[2000:2010]]
print(f"Example vocabulary: {example_vocab}")

Example vocabulary: ['Ġmind', 'aff', 'omm', 'Ġfuture', 'ged', 'Ġcut', 'Ġtot', 'itch', 'Ġvideo', 'Ġinvestig']


## Tokenization


In [4]:
# Prompt used for the rest of the notebook
reference_text = "London is the capital of the United"

# Encode the prompt as a PyTorch tensor of token IDs, then move it to the
# same device as the model.
token_ids = tokenizer.encode(reference_text, return_tensors="pt")
tokens = token_ids.to(device)

# Map the IDs of the (single) encoded sequence back to their token strings
token_strings = tokenizer.convert_ids_to_tokens(tokens[0])

# Display the prompt, its token IDs, and the corresponding token strings
print(f"Input text: {reference_text}")
print(f"Tokenized input: {tokens}")
print(f"Input tokens as strings: {token_strings}")

Input text: London is the capital of the United
Tokenized input: tensor([[23421,   318,   262,  3139,   286,   262,  1578]], device='mps:0')
Input tokens as strings: ['London', 'Ġis', 'Ġthe', 'Ġcapital', 'Ġof', 'Ġthe', 'ĠUnited']


## Inference


In [5]:
from torch.nn.functional import softmax

# Run the model without tracking gradients (inference only)
with torch.no_grad():
    outputs = model(tokens)
    last_hidden_states = outputs.last_hidden_state

    # Manual language modeling head - linear projection to vocabulary size.
    # The token embedding matrix is reused as the output projection
    # (weight tying), so no separate head needs to be loaded.
    embed_weights = model.wte.weight  # [vocab_size, hidden_size]

    # Apply language modeling head: hidden_states @ embed_weights.T
    logits = torch.matmul(
        last_hidden_states, embed_weights.T
    )  # [batch_size, seq_len, vocab_size]

# Get probabilities for the next token from the last position of the
# first (and only) sequence in the batch
next_token_probs = softmax(logits[0, -1], dim=-1)

# Number of most likely next tokens to display. The heading printed below
# is derived from this value so the two cannot drift apart.
top_k = 5
top_probs, top_indices = torch.topk(next_token_probs, top_k)

print(f"\nTop {top_k} most likely next tokens:")
for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
    token_str = tokenizer.decode(idx)
    print(f"{i + 1:2d}. '{token_str}' (prob: {prob:.4f})")

# Show what text would look like with each top prediction
print("Continuing with most likely tokens:")
for i in range(top_k):
    print(f"{i + 1:2d}: '{reference_text + tokenizer.decode(top_indices[i])}'")


Top 10 most likely next tokens:
 1. ' Kingdom' (prob: 0.7349)
 2. ' States' (prob: 0.2262)
 3. ' Arab' (prob: 0.0195)
 4. ' Nations' (prob: 0.0107)
 5. ' State' (prob: 0.0009)
Continuing with most likely tokens:
 1: 'London is the capital of the United Kingdom'
 2: 'London is the capital of the United States'
 3: 'London is the capital of the United Arab'
 4: 'London is the capital of the United Nations'
 5: 'London is the capital of the United State'
