In [36]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import (
    accuracy_score, hamming_loss,
    f1_score, roc_auc_score, log_loss, brier_score_loss
)
from tqdm.auto import tqdm

In [3]:
# 1. Load model & tokenizer
model_name = "tingtone/go_emo_gpt"  # replace if different
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2ForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/303 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [4]:
# GPT-2 has no pad token by default—set it to eos_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [6]:
device = "cuda" if torch.cuda.is_available() else "mps"
model.to(device).eval()
print(f"Using device: {device}")

Using device: mps


In [15]:
# 2. Load and preprocess GoEmotions test set
dataset = load_dataset("go_emotions")
test_ds = dataset["test"]
num_labels = dataset["train"].features["labels"].feature.num_classes  # => 28
print(f"Number of labels: {num_labels}")
def preprocess(batch):
    toks = tokenizer(batch["text"],
                     truncation=True,
                     padding="max_length",
                     max_length=1024,
                     return_tensors="pt")
    # build multi‑hot labels
    mh = np.zeros((len(batch["labels"]), num_labels), dtype=np.int8)
    for i, lab_indices in enumerate(batch["labels"]):
        mh[i, lab_indices] = 1.0

    toks["labels"] = torch.from_numpy(mh)
    return toks

test_ds = test_ds.map(preprocess, batched=True, remove_columns=test_ds.column_names)
test_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
# ——— SUBSAMPLE 500 RANDOM EXAMPLES ———


Number of labels: 28


Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [16]:
test_ds = test_ds.shuffle(seed=42).select(range(32))
dataloader = DataLoader(test_ds, batch_size=32)

In [33]:
loss_fn    = torch.nn.BCEWithLogitsLoss()
all_logits = []
all_labels = []
total_loss = 0.0

for batch in tqdm(dataloader, desc="Evaluating"):
    input_ids      = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels         = batch["labels"].to(device)
    labels = labels.float()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        # loss   = loss_fn(logits, labels)

    # total_loss += loss.item()
    all_logits.append(logits.cpu().numpy())
    all_labels.append(labels.cpu().numpy())

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

In [34]:
# stack arrays
all_logits = np.vstack(all_logits)
all_labels = np.vstack(all_labels)
probs      = torch.sigmoid(torch.tensor(all_logits)).numpy()
preds      = (probs >= 0.5).astype(int)


In [37]:
# Element‑wise accuracy
elementwise_acc = accuracy_score(all_labels.flatten(), preds.flatten())

# Exact‑match (subset) accuracy
subset_acc = np.mean(np.all(all_labels == preds, axis=1))

# Hamming loss
hamming = hamming_loss(all_labels, preds)

# F1
f1_micro = f1_score(all_labels, preds, average="micro", zero_division=0)
f1_macro = f1_score(all_labels, preds, average="macro", zero_division=0)

# # ROC-AUC
# roc_micro = roc_auc_score(all_labels, probs, average="micro")
# roc_macro = roc_auc_score(all_labels, probs, average="macro")

# Log loss
lloss = log_loss(all_labels.ravel(), probs.ravel())

# Brier score
brier = brier_score_loss(all_labels.ravel(), probs.ravel())

In [38]:
# Per-class ECE
def per_class_ece(probs, labels, n_bins=15):
    edges = np.linspace(0.0, 1.0, n_bins+1)
    eces  = []
    for c in range(labels.shape[1]):
        conf = probs[:, c]
        true = labels[:, c]
        ece_c = 0.0
        for i in range(n_bins):
            lo, hi = edges[i], edges[i+1]
            mask = (conf > lo) & (conf <= hi)
            if mask.any():
                acc_in  = true[mask].mean()
                conf_in = conf[mask].mean()
                ece_c  += np.abs(acc_in - conf_in) * mask.mean()
        eces.append(ece_c)
    return np.array(eces)

In [39]:
eces     = per_class_ece(probs, all_labels)
mean_ece = eces.mean()

In [40]:
print(f"Avg Loss:            {total_loss/len(dataloader):.4f}")
print(f"Element-wise Acc:    {elementwise_acc:.4f}")
print(f"Subset Exact Acc:    {subset_acc:.4f}")
print(f"Hamming Loss:        {hamming:.4f}")
print(f"Brier Score:         {brier:.4f}")
print(f"F1 (micro):          {f1_micro:.4f}")
print(f"F1 (macro):          {f1_macro:.4f}")
# print(f"ROC AUC (micro):     {roc_micro:.4f}")
# print(f"ROC AUC (macro):     {roc_macro:.4f}")
# print(f"Log Loss:            {lloss:.4f}")
print(f"Mean ECE:            {mean_ece:.4f}")

Avg Loss:            0.0000
Element-wise Acc:    0.9721
Subset Exact Acc:    0.5000
Hamming Loss:        0.0279
Brier Score:         0.0233
F1 (micro):          0.6154
F1 (macro):          0.2444
Mean ECE:            0.0340
