In [2]:
!pip -q install "openai>=1.40" datasets scikit-learn tqdm python-dotenv huggingface_hub

In [3]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login for this notebook session.
notebook_login()  # paste token when prompted

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import os, getpass

# Ask for the key only when it is not already present, so re-running the
# notebook (or running it where the key is pre-set) does not prompt again or
# overwrite a working key. Surrounding whitespace from copy/paste is stripped
# because it would otherwise make every request fail authentication.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ").strip()

OpenAI API key: ··········


In [5]:
# ==== Evaluate OpenAI GPT-5 on TheFinAI/en-fpb (MCQ schema) ====
# Prerequisites (run once per session, see the setup cells at the top):
#   - packages: datasets, scikit-learn, tqdm, openai
#   - OPENAI_API_KEY present in the environment; enter it interactively
#     (e.g. via getpass) and never hardcode the key in the notebook.

DATASET = "TheFinAI/en-fpb"   # columns: id, query, answer, text, choices, gold
SPLIT   = "test"              # will fall back to validation/valid/train if missing
MODEL   = "gpt-5"             # auto-fallback to "gpt-5-mini" if unavailable
MAX_SAMPLES = 0               # 0 = use FULL split; set a small number first to sanity-check cost
QPS_DELAY = 1.0               # polite throttle between requests (sec)
MAX_RETRIES = 6               # exponential backoff attempts
# The cache file is keyed by model as well as dataset/split, so predictions
# cached for one model are never silently reused after MODEL is changed.
CACHE_PATH = f"gpt5_eval_cache__{MODEL.replace('/','_')}__{DATASET.replace('/','__')}__{SPLIT}.jsonl"

import os, json, time, random, re
from pathlib import Path
from typing import List, Tuple
from collections import OrderedDict

from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix
from tqdm import tqdm
from openai import OpenAI, APIStatusError, APIError, APIConnectionError, RateLimitError

# Stop here with a clear message if the API key is missing or empty, before
# any dataset download or request is attempted.
assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY first."
# Module-level client shared by the request helpers defined below.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# ---------- Helpers for this dataset ----------
def row_text(row):
    """Build the text shown to the model from a row's "query" and "text" fields.

    Missing or None fields count as empty and both fields are stripped. When
    both are non-empty they are joined as "<query>\\n\\nText:\\n<text>";
    otherwise whichever one is non-empty is returned ("" if neither is).
    """
    query = (row.get("query") or "").strip()
    passage = (row.get("text") or "").strip()
    if query and passage:
        return f"{query}\n\nText:\n{passage}"
    return query or passage

def normalize_choices(raw):
    """Flatten a raw `choices` value into a list of non-empty, stripped strings.

    `raw` is only processed when it is a list; anything else yields []. String
    items are stripped. Dict items contribute the first truthy value among the
    keys "label", "text", "value", "choice". Any other item is converted with
    str(). Items that end up empty are dropped.
    """
    if not isinstance(raw, list):
        return []

    def _to_text(item):
        if isinstance(item, str):
            return item.strip()
        if isinstance(item, dict):
            picked = (
                item.get("label")
                or item.get("text")
                or item.get("value")
                or item.get("choice")
                or ""
            )
            return str(picked).strip()
        return str(item).strip()

    return [cleaned for cleaned in map(_to_text, raw) if cleaned]

def gold_to_label(row, choices: List[str]):
    """Resolve a row's gold answer to a label string.

    Reads row["gold"], falling back to row["answer"] when "gold" is missing or
    None. An int is treated as an index into `choices` (out-of-range indices
    are returned as their string form). A string is stripped and matched
    case-insensitively against `choices`, returning the matching choice, or the
    stripped string itself when nothing matches. Any other value is returned
    via str().
    """
    target = row.get("gold", None)
    if target is None:
        target = row.get("answer", None)

    if isinstance(target, int):
        in_range = 0 <= target < len(choices)
        return choices[target] if in_range else str(target)

    if not isinstance(target, str):
        return str(target)

    stripped = target.strip()
    wanted = stripped.lower()
    return next((choice for choice in choices if choice.lower() == wanted), stripped)

# ---------- Robust OpenAI calling with backoff & fallback ----------
def call_model(prompt: str, preferred_model: str):
    """Send `prompt` to the Responses API with retries, backoff and a model fallback.

    Args:
        prompt: Full prompt text passed as the request `input`.
        preferred_model: Model name to try first.

    Returns:
        The response object returned by `client.responses.create`.

    Raises:
        APIStatusError: immediately when the API reports `insufficient_quota`
            (retrying cannot succeed and only burns time).
        Any exception raised by the final attempt made after MAX_RETRIES
        failed tries.

    Behaviour:
        * A 400/404 status while using "gpt-5" switches to "gpt-5-mini" and
          tries again without sleeping.
        * Any other API error sleeps 2**attempt seconds plus random jitter
          before the next attempt.
    """
    def _once(m):
        return client.responses.create(model=m, input=prompt)

    def _error_code(exc):
        """Best-effort extraction of the API error code; None when unavailable."""
        try:
            err = exc.response.json().get("error") or {}
            code = err.get("code")
        except Exception:
            # Body missing / not JSON / unexpected shape: treat as "no code".
            code = None
        return code or getattr(exc, "code", None)

    model = preferred_model
    for k in range(MAX_RETRIES):
        try:
            return _once(model)
        except APIStatusError as e:
            # If gpt-5 not available on your org, fall back once to gpt-5-mini
            if e.status_code in (400, 404) and model == "gpt-5":
                model = "gpt-5-mini"
                continue
            # Pass through insufficient_quota immediately. The code lookup is
            # done in a helper so this `raise` is NOT wrapped in a try/except
            # that would swallow it.
            if _error_code(e) == "insufficient_quota":
                raise
            time.sleep((2 ** k) + random.random())
        except (RateLimitError, APIConnectionError, APIError):
            time.sleep((2 ** k) + random.random())
    # Last attempt: let whatever it raises propagate to the caller.
    return _once(model)

def call_gpt5(text: str, labels: List[str]) -> Tuple[str, float, dict]:
    """Ask the configured model to pick ONE label from `labels` for `text`.

    Args:
        text: The text to classify (inserted verbatim into the prompt).
        labels: Allowed label strings; must be non-empty.

    Returns:
        (label, confidence, raw) where `label` is always an element of
        `labels`. When the reply contains a usable JSON object, `confidence`
        is its "confidence" value (0.0 if absent or not numeric) and `raw` is
        the parsed object. Otherwise `confidence` is 0.0 and `raw` is
        {"label": <label>, "raw": <reply text>}.

    Raises:
        ValueError: if `labels` is empty (checked before any request is sent).

    Parsing strategy, in order:
        1. The whole reply as JSON.
        2. The outermost {...} span of the reply (handles code fences or
           surrounding prose).
        3. First label (in `labels` order) that appears as a whole word in the
           reply, case-insensitively.
        4. labels[0] as a last-resort fallback.
    """
    if not labels:
        raise ValueError("labels must contain at least one label")

    prompt = f"""
You are a precise financial classifier. Choose ONE label from this set:
{labels}

Return ONLY a compact JSON object with no explanation:
{{"label": "<one of {labels}>", "confidence": <0..1>}}

Text:
{text}
""".strip()

    r = call_model(prompt, MODEL)

    # Extract text across SDK variants
    out = getattr(r, "output_text", None)
    if not out:
        try:
            content = (r.output or [])[0].content or []
            out = getattr(content[0], "text", "")
        except Exception:
            out = ""
    out = (out or "").strip()

    # Lower-cased label -> canonical label (first occurrence wins).
    canonical = {}
    for lab in labels:
        canonical.setdefault(lab.lower(), lab)

    # Candidate JSON payloads: the full reply, then the outermost braces.
    candidates = [out]
    braced = re.search(r"\{.*\}", out, flags=re.DOTALL)
    if braced and braced.group(0) != out:
        candidates.append(braced.group(0))

    for candidate in candidates:
        try:
            obj = json.loads(candidate)
        except ValueError:  # includes json.JSONDecodeError
            continue
        if not isinstance(obj, dict):
            continue
        raw_label = obj.get("label")
        if not isinstance(raw_label, str):
            continue
        # Exact match first; otherwise tolerate case/whitespace differences.
        label = raw_label if raw_label in labels else canonical.get(raw_label.strip().lower())
        if label is None:
            continue
        try:
            conf = float(obj.get("confidence", 0.0))
        except (TypeError, ValueError):
            conf = 0.0
        return label, conf, obj

    # No usable JSON: salvage a label token from the free text.
    low = out.lower()
    for lab in labels:
        if re.search(rf"\b{re.escape(lab.lower())}\b", low):
            return lab, 0.0, {"label": lab, "raw": out}
    return labels[0], 0.0, {"label": labels[0], "raw": out}  # safe fallback

# ---------- Load dataset (with split fallbacks) ----------
# Try the configured split first; if it cannot be loaded, try the other common
# split names in order. The split that was just tried is skipped, the switch is
# reported (metrics would otherwise be attributed to the wrong split), and the
# original error is kept as the cause when nothing loads.
_requested_split = SPLIT
try:
    ds = load_dataset(DATASET, split=SPLIT)
except Exception as first_err:
    for sp in ["validation", "valid", "test", "train"]:
        if sp == _requested_split:
            continue  # already failed above
        try:
            ds = load_dataset(DATASET, split=sp)
        except Exception:
            continue
        print(f"WARNING: could not load split '{_requested_split}' ({first_err!r}); using '{sp}' instead.")
        # NOTE: CACHE_PATH was built from the originally requested split name.
        SPLIT = sp
        break
    else:
        raise RuntimeError("Failed to load any split.") from first_err

print("Columns:", ds.column_names)

# ---------- Index / full dataset ----------
# Row positions to evaluate: the first MAX_SAMPLES rows when a positive cap is
# configured, otherwise every row of the split.
_cap_rows = bool(MAX_SAMPLES) and MAX_SAMPLES > 0
_all_positions = list(range(len(ds)))
idx = _all_positions[:MAX_SAMPLES] if _cap_rows else _all_positions

# ---------- Resume-safe cache ----------
# In-memory map of row id -> predicted label, pre-filled from the JSONL cache
# file (one {"id": ..., "pred": ...} object per line) so an interrupted run can
# resume without re-querying rows that were already answered.
cache = {}
cache_path = Path(CACHE_PATH)
if cache_path.exists():
    # Read with the same encoding cache_append writes with; the platform
    # default is not guaranteed to be UTF-8.
    for line in cache_path.read_text(encoding="utf-8").splitlines():
        try:
            obj = json.loads(line)
            cache[obj["id"]] = obj["pred"]
        except (ValueError, KeyError, TypeError):
            # Blank, truncated or malformed line (e.g. from an interrupted
            # write): skip it and keep the remaining entries.
            continue

def cache_append(row_id, pred):
    """Append one prediction to the cache file as a single JSON line.

    Args:
        row_id: Identifier stored under "id".
        pred: Predicted label stored under "pred".
    """
    with cache_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps({"id": row_id, "pred": pred}, ensure_ascii=False) + "\n")

# ---------- Evaluate ----------
# Accumulators consumed by the metrics / saving steps further down.
all_label_set = OrderedDict()  # every label seen, in first-seen order
gold = []
pred = []
rows = []

pbar = tqdm(idx, desc=f"Querying {MODEL} on {SPLIT} (N={len(idx)})")
for i in pbar:
    row = ds[i]
    row_id = str(row.get("id", i))
    txt = row_text(row)

    # Per-example choice set, with the three sentiment labels as default.
    choices = normalize_choices(row.get("choices"))
    if not choices:
        choices = ["negative", "neutral", "positive"]

    g_label = gold_to_label(row, choices)
    # Register the choices first, then the gold label (it may not be among them).
    for seen_label in [*choices, g_label]:
        all_label_set.setdefault(seen_label, None)

    if row_id not in cache:
        # Not answered yet: throttle, query the model, then persist the answer
        # both in memory and on disk.
        time.sleep(QPS_DELAY)
        guess, conf, raw = call_gpt5(txt, choices)
        cache[row_id] = guess
        cache_append(row_id, guess)
    else:
        # Cached rows store only the label, so no confidence is available.
        guess, conf = cache[row_id], 0.0

    gold.append(g_label)
    pred.append(guess)
    rows.append({"text": txt, "gold": g_label, "pred": guess})

# ---------- Metrics ----------
# Label order used for all per-class outputs (first-seen order from the loop).
label_list = list(all_label_set)

# Headline scores over all evaluated rows.
acc = accuracy_score(gold, pred)
f1_micro = f1_score(gold, pred, average="micro")
f1_macro = f1_score(gold, pred, average="macro")

# Per-class precision / recall / F1 / support, indexed in label_list order.
P, R, F1, S = precision_recall_fscore_support(
    gold,
    pred,
    labels=label_list,
    zero_division=0,
)

# Text report and confusion matrix (as nested lists so it is JSON-serialisable).
report = classification_report(
    gold,
    pred,
    labels=label_list,
    digits=4,
    zero_division=0,
)
cm = confusion_matrix(gold, pred, labels=label_list).tolist()

# ---------- Save + print ----------
outdir = Path("gpt5_eval_outputs"); outdir.mkdir(parents=True, exist_ok=True)
pred_path = outdir / f"predictions__{MODEL.replace('/','_')}__{DATASET.replace('/','__')}__{SPLIT}.tsv"

def _tsv_cell(value):
    """Return `value` as a single TSV-safe cell.

    Tabs and line breaks are each replaced by a space, so a value can never
    spill into another column or onto another row.
    """
    return re.sub(r"[\t\r\n]", " ", str(value))

# One row per example. The cell text is built outside of an f-string expression
# because a backslash there is a SyntaxError before Python 3.12.
with pred_path.open("w", encoding="utf-8") as f:
    f.write("text\tgold\tpred\n")
    for r in rows:
        f.write("\t".join(_tsv_cell(r[key]) for key in ("text", "gold", "pred")) + "\n")

# Per-class scores keyed by label name, converted to plain Python numbers so
# they can be serialised to JSON.
_per_class = {
    name: {
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1_val),
        "support": int(support),
    }
    for name, prec, rec, f1_val, support in zip(label_list, P, R, F1, S)
}

# Everything needed to interpret the run, written as one JSON document.
metrics = {
    "model": MODEL,
    "dataset": DATASET,
    "split": SPLIT,
    "n": len(gold),
    "labels": label_list,
    "accuracy": acc,
    "f1_micro": f1_micro,
    "f1_macro": f1_macro,
    "per_class": _per_class,
    "confusion_matrix_labels": label_list,
    "confusion_matrix": cm,
    "classification_report": report,
}

_metrics_name = f"metrics__{MODEL.replace('/','_')}__{DATASET.replace('/','__')}__{SPLIT}.json"
metrics_path = outdir / _metrics_name
metrics_path.write_text(json.dumps(metrics, indent=2, ensure_ascii=False), encoding="utf-8")

# Human-readable recap: headline scores, one line per class, then the paths of
# the files written by this run.
_summary_lines = [
    "\n=== Summary ===",
    f"Model: {MODEL}  Dataset: {DATASET} [{SPLIT}]  N={len(gold)}",
    f"Accuracy: {acc:.4f} | F1(micro): {f1_micro:.4f} | F1(macro): {f1_macro:.4f}",
    "\nPer-class:",
]
for c, _prec, _rec, _f1, _support in zip(label_list, P, R, F1, S):
    _summary_lines.append(f"{c:>12s}  P={_prec:.3f} R={_rec:.3f} F1={_f1:.3f} (support={_support})")
_summary_lines.append(f"\nSaved:\n- {pred_path}\n- {metrics_path}\n- cache: {cache_path}")
print("\n".join(_summary_lines))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

data/train-00000-of-00001-ab9a3b4799b095(…):   0%|          | 0.00/608k [00:00<?, ?B/s]

data/test-00000-of-00001-8bd1e21c671fb67(…):   0%|          | 0.00/188k [00:00<?, ?B/s]

data/valid-00000-of-00001-303e4ba2afe838(…):   0%|          | 0.00/154k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/970 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/776 [00:00<?, ? examples/s]

Columns: ['id', 'query', 'answer', 'text', 'choices', 'gold']


Querying gpt-5 on test (N=970): 100%|██████████| 970/970 [1:24:16<00:00,  5.21s/it]


=== Summary ===
Model: gpt-5  Dataset: TheFinAI/en-fpb [test]  N=970
Accuracy: 0.8227 | F1(micro): 0.8227 | F1(macro): 0.8228

Per-class:
    positive  P=0.716 R=0.845 F1=0.775 (support=277)
     neutral  P=0.916 R=0.780 F1=0.843 (support=577)
    negative  P=0.750 R=0.983 F1=0.851 (support=116)

Saved:
- gpt5_eval_outputs/predictions__gpt-5__TheFinAI__en-fpb__test.tsv
- gpt5_eval_outputs/metrics__gpt-5__TheFinAI__en-fpb__test.json
- cache: gpt5_eval_cache__TheFinAI__en-fpb__test.jsonl



