# Performance Metrics for LMs & GNNs

In [None]:
# Quietly install (or upgrade to the latest release of) the libraries the cells below rely on.
!pip -q install -U evaluate datasets rouge_score sacrebleu transformers


In [None]:
import math, torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate

# Perplexity demo with small causal LM
tok = AutoTokenizer.from_pretrained("distilgpt2")
# Reuse EOS as the padding token so the two texts can be padded into one batch.
tok.pad_token = tok.eos_token
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

texts = ["Language models estimate distributions over text.", "PEFT reduces trainable parameters."]
enc = tok(texts, return_tensors="pt", padding=True)

# Padding positions must not be scored: label them -100 so the loss skips them.
# Masking via attention_mask (rather than "token == pad id") keeps any genuine
# EOS token in the text, since pad and EOS share the same id here.
labels = enc["input_ids"].masked_fill(enc["attention_mask"] == 0, -100)

with torch.no_grad():
    out = model(**enc, labels=labels)

# out.loss is the mean token-level cross-entropy; exponentiate to get perplexity.
ppl = math.exp(out.loss.item())
print("Perplexity (toy batch):", ppl)


In [None]:
# Score one toy summary against one reference with ROUGE and BLEU
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

preds = ["LoRA enables efficient adaptation of LMs."]
refs  = [["LoRA makes adapting language models efficient."]]

# ROUGE is handed a flat list: the first reference of each prediction.
first_refs = [ref_group[0] for ref_group in refs]
rouge_scores = rouge.compute(predictions=preds, references=first_refs)
print("ROUGE:", rouge_scores)

# BLEU is handed the nested list (a list of references per prediction) unchanged.
bleu_scores = bleu.compute(predictions=preds, references=refs)
print("BLEU:", bleu_scores)


In [None]:
# Calibration (ECE) for a binary classifier (synthetic)
import numpy as np

# Made-up predicted probabilities and their matching 0/1 labels, one entry per sample
probs = np.array([0.05, 0.2, 0.3, 0.4, 0.6, 0.7, 0.85, 0.9])
labels = np.array([0, 0, 0, 1, 1, 1, 1, 1])

def ece(probs, labels, bins=5):
    """Return the expected calibration error (ECE) of binary predictions.

    Samples are grouped into ``bins`` equal-width probability bins. For each
    non-empty bin the absolute gap between the mean predicted probability and
    the mean label is taken, and the gaps are averaged with weights equal to
    the fraction of samples that fall in the bin.

    Args:
        probs: 1-D array-like of predicted probabilities (lists are accepted).
        labels: 1-D array-like of 0/1 labels, aligned with ``probs``.
        bins: Number of equal-width bins; must be at least 1.

    Returns:
        The ECE as a plain ``float``; ``0.0`` when ``probs`` is empty.

    Raises:
        ValueError: If ``bins`` is smaller than 1.
    """
    if bins < 1:
        raise ValueError("bins must be a positive integer")

    # Coerce up front: with a plain list, ``probs * bins`` would repeat the
    # list instead of scaling its values.
    probs = np.asarray(probs)
    labels = np.asarray(labels)

    # Truncate p * bins to get the bin index; the clip puts p == 1.0 into the
    # last bin and keeps any out-of-range value inside [0, bins - 1].
    bins_idx = np.clip((probs * bins).astype(int), 0, bins - 1)
    total = len(probs)

    error = 0.0
    for b in range(bins):
        mask = bins_idx == b
        count = mask.sum()
        if count == 0:
            # Empty bins carry no weight (this also covers empty input).
            continue
        conf = probs[mask].mean()
        accu = labels[mask].mean()
        error += (count / total) * abs(accu - conf)
    return float(error)

# Evaluate the toy predictions with five bins and show the result.
toy_ece = ece(probs, labels, bins=5)
print("ECE (toy):", toy_ece)
