## 1) Encoder-only (BERT/DistilBERT) — sentence embedding + similarity

In [1]:
# Goal: Turn sentences into embeddings using an encoder-only model (DistilBERT)
# and compute cosine similarity.

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Pretrained encoder-only checkpoint and its matching tokenizer.
model_name = "distilbert-base-uncased"
tok = AutoTokenizer.from_pretrained(model_name)
enc = AutoModel.from_pretrained(model_name)

sentences = [
    "The cat sat on the mat.",
    "A dog rested on the rug.",
    "Transformers process sequences in parallel."
]

# Tokenize all sentences as one padded batch; tokenization itself needs no
# autograd context, only the forward pass below does.
batch = tok(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():  # inference only: skip building the autograd graph
    out = enc(**batch)
    hidden = out.last_hidden_state           # (B, T, H)
    # Use the vector at position 0 (the [CLS] slot) as the sentence embedding,
    # then L2-normalize so a plain dot product equals cosine similarity.
    cls_emb = F.normalize(hidden[:, 0], p=2, dim=1)   # (B, H)

# Cosine similarity of every sentence against sentence 0.
anchor = cls_emb[0].unsqueeze(1)             # (H, 1) column vector
sims = (cls_emb @ anchor).squeeze(1)         # (B,)
for i, (s, sim) in enumerate(zip(sentences, sims.tolist())):
    print(f"sim(sentence 0, {i}) = {sim:.3f} :: {s}")


sim(sentence 0, 0) = 1.000 :: The cat sat on the mat.
sim(sentence 0, 1) = 0.983 :: A dog rested on the rug.
sim(sentence 0, 2) = 0.796 :: Transformers process sequences in parallel.


## 2) Decoder-only (GPT-2) — text generation (causal LM)

In [2]:
# Goal: Generate text token-by-token with a decoder-only model (GPT-2).

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Pretrained decoder-only checkpoint (causal LM head) and its tokenizer.
model_name = "gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
lm  = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "In 2017, researchers discovered that attention"
inputs = tok(prompt, return_tensors="pt")

# Sampling (do_sample=True) draws from torch's global RNG. Seed it right
# before generating so Restart & Run All reproduces the same continuation.
torch.manual_seed(0)

gen_ids = lm.generate(
    **inputs,
    max_length=60,        # total length cap: prompt tokens + newly generated tokens
    do_sample=True,       # sample instead of greedy decoding
    top_p=0.9,            # nucleus sampling: keep the smallest token set with cumulative prob >= 0.9
    temperature=0.8,      # < 1.0 sharpens the distribution (less random)
    pad_token_id=tok.eos_token_id  # reuse EOS as the padding id for generation
)
print(tok.decode(gen_ids[0], skip_special_tokens=True))


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In 2017, researchers discovered that attention-seeking and attention-avoidance behaviors—such as having fewer kids—can cause children to have a higher risk of developing ADHD, and that children who are stressed out are at greater risk.

"Children's behaviors can also cause us to be more sensitive


## 3) Encoder–Decoder (T5) — summarization (seq2seq)

In [3]:
# Goal: Summarize a paragraph with an encoder-decoder model (T5).

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pretrained encoder-decoder checkpoint and its tokenizer.
model_name = "t5-small"
tok = AutoTokenizer.from_pretrained(model_name)
seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Text to summarize, assembled from three fragments into one paragraph.
article = "".join((
    "Transformers use self-attention to weigh the importance of different tokens. ",
    "By processing sequences in parallel, they handle long-range dependencies more effectively ",
    "than recurrent neural networks.",
))

# The "summarize: " prefix is prepended to the article before tokenization;
# inputs longer than 256 tokens are truncated.
task_input = "summarize: " + article
inputs = tok(task_input, return_tensors="pt", max_length=256, truncation=True)

# Beam-search decoding settings, kept together so they are easy to tune.
gen_kwargs = dict(
    max_length=40,
    num_beams=4,
    length_penalty=1.0,
    early_stopping=True,
)
summary_ids = seq2seq.generate(**inputs, **gen_kwargs)

summary = tok.decode(summary_ids[0], skip_special_tokens=True)
print(summary)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Transformers use self-attention to weigh the importance of different tokens. by processing sequences in parallel, they handle long-range dependencies more effectively than recurrent neural networks 


## 4) Tiny self-attention from scratch (to demystify the mechanism)

In [4]:
# Goal: Show the core of scaled dot-product attention on tiny tensors.

import torch
import torch.nn.functional as F

torch.manual_seed(0)  # fixed seed so the toy tensors are the same on every run

B, T, H = 1, 4, 8   # batch=1, tokens=4, hidden=8
x = torch.randn(B, T, H)  # toy inputs

# Random stand-ins for the query/key/value projection matrices of a single
# attention head (drawn in the order Wq, Wk, Wv).
Wq, Wk, Wv = (torch.randn(H, H) for _ in range(3))

# Project the inputs into queries, keys and values; each is (B, T, H).
Q, K, V = (x @ W for W in (Wq, Wk, Wv))

# Scaled dot-product attention: compare every query with every key, divide by
# sqrt(H), then softmax over the key axis so each row is a distribution.
scale = H ** 0.5
attn_scores = (Q @ K.transpose(1, 2)) / scale        # (B, T, T)
attn_weights = torch.softmax(attn_scores, dim=-1)    # (B, T, T)

# Each output token is the attention-weighted average of the value vectors.
out = attn_weights @ V                               # (B, T, H)

print("Attention weights (rows sum to 1):")
print(torch.round(attn_weights[0].detach(), decimals=3))
print("\nOutput shape:", out.shape)


Attention weights (rows sum to 1):
tensor([[0.0000, 0.0670, 0.9320, 0.0000],
        [0.0020, 0.0410, 0.0680, 0.8890],
        [0.0000, 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 1.0000, 0.0000]])

Output shape: torch.Size([1, 4, 8])
