In [None]:
!pip -q install "openai>=1.40" datasets scikit-learn tqdm python-dotenv huggingface_hub

In [None]:
# Interactive Hugging Face Hub login; run this before any dataset is loaded.
# NOTE(review): presumably required so the dataset download is authorised —
# confirm whether the dataset used below is actually gated.
from huggingface_hub import notebook_login
notebook_login()  # paste token when prompted

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import getpass
import os

# Prompt without echoing the key, then expose it to the process via the environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt="OpenAI API key: ")

OpenAI API key: ··········


In [None]:
# ==== Evaluate OpenAI o3 on TheFinAI/en-fpb (MCQ schema) ====
# Requirements (in Colab):
# !pip -q install datasets scikit-learn tqdm openai
# OPENAI_API_KEY must be present in the environment before this cell runs.
# Do not paste the key into the notebook; read it interactively instead, e.g.:
# import os, getpass; os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

DATASET = "TheFinAI/en-fpb"   # keep as-is; uses columns: id, query, answer, text, choices, gold
SPLIT   = "test"              # preferred split; the loader falls back to "validation", "valid", "test", "train"
MODEL   = "o3"                # call_model falls back to "o3-mini" when "o3" returns HTTP 400/404
MAX_SAMPLES = 0               # 0 = evaluate FULL split; set N to limit to the first N rows (e.g., 200)
QPS_DELAY = 1.05              # polite throttle: seconds slept before each uncached request
MAX_RETRIES = 6               # retries with exponential backoff; one extra final attempt follows them
# NOTE(review): the cache file name depends on DATASET and SPLIT only, not on MODEL,
# so predictions cached for one model are reused when MODEL is changed. Delete the
# cache file (or key it by model) before comparing models.
CACHE_PATH = f"o3_eval_cache__{DATASET.replace('/','__')}__{SPLIT}.jsonl"

import os, json, time, random, re
from pathlib import Path
from typing import List, Tuple
from collections import OrderedDict

from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix
from tqdm import tqdm
from openai import OpenAI, APIStatusError, APIError, APIConnectionError, RateLimitError

# Fail fast with a readable message when the key has not been provided yet.
if not os.environ.get("OPENAI_API_KEY"):
    raise AssertionError("Set OPENAI_API_KEY first.")

# Shared API client used by every request in this notebook.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# ---------- Helpers for this dataset ----------
def row_text(row):
    """Build the prompt text for one dataset row.

    Reads the ``query`` and ``text`` fields (missing or falsy values count as
    empty), strips surrounding whitespace, and returns both joined by a
    ``Text:`` separator. When only one of them is non-empty, that one is
    returned alone; when both are empty the result is ``""``.
    """
    query, body = ((row.get(field) or "").strip() for field in ("query", "text"))
    if not query:
        return body
    if not body:
        return query
    return f"{query}\n\nText:\n{body}"

def normalize_choices(raw):
    """Flatten a ``choices`` field into a list of non-empty label strings.

    ``raw`` is expected to be a list whose items are strings, dicts, or other
    values. Strings are stripped; dicts contribute the first truthy value among
    the keys ``label``, ``text``, ``value``, ``choice``; anything else is
    converted with ``str``. Items that end up empty are dropped. Any ``raw``
    that is not a list yields ``[]``.
    """
    if not isinstance(raw, list):
        return []

    def _as_label(item):
        # Each item kind has its own way of producing the label text.
        if isinstance(item, str):
            return item.strip()
        if isinstance(item, dict):
            picked = (
                item.get("label")
                or item.get("text")
                or item.get("value")
                or item.get("choice")
                or ""
            )
            return str(picked).strip()
        return str(item).strip()

    return [label for label in map(_as_label, raw) if label]

def gold_to_label(row, choices: List[str]):
    """Resolve the reference answer of ``row`` to a label string.

    The ``gold`` field is used, falling back to ``answer`` when ``gold`` is
    missing or None.

    * int: treated as an index into ``choices``; out-of-range indices are
      returned as their string form.
    * str: stripped and matched case-insensitively against ``choices``; the
      matching choice is returned, otherwise the stripped string itself.
    * anything else: returned as ``str(value)``.
    """
    target = row.get("gold")
    if target is None:
        target = row.get("answer")

    if isinstance(target, int):
        in_range = 0 <= target < len(choices)
        return choices[target] if in_range else str(target)

    if not isinstance(target, str):
        return str(target)

    wanted = target.strip()
    folded = wanted.lower()
    # First choice that matches ignoring case wins; otherwise keep the raw text.
    return next((choice for choice in choices if choice.lower() == folded), wanted)

def call_model(prompt: str, model_pref: str):
    """Call the Responses API with exponential backoff and an o3 -> o3-mini fallback.

    Args:
        prompt: Text sent as the ``input`` of the request.
        model_pref: Model name to try first.

    Returns:
        The response object returned by ``client.responses.create``.

    Raises:
        APIStatusError: immediately when the API reports ``insufficient_quota``
            (retrying cannot help), or from the final attempt.
        Any other SDK error raised by the final attempt after ``MAX_RETRIES``
        failed tries.
    """
    def _once(m):
        return client.responses.create(model=m, input=prompt)

    def _is_quota_error(exc) -> bool:
        # Inspect the error payload defensively: the body may be missing or not JSON.
        resp = getattr(exc, "response", None)
        if resp is None:
            return False
        try:
            err = resp.json().get("error", {})
        except Exception:
            return False
        return isinstance(err, dict) and err.get("code") == "insufficient_quota"

    m = model_pref
    for k in range(MAX_RETRIES):
        try:
            return _once(m)
        except APIStatusError as e:
            # If o3 not available, fall back to o3-mini (no sleep, retry right away).
            if e.status_code in (400, 404) and m == "o3":
                m = "o3-mini"
                continue
            # Insufficient quota -> don't keep retrying. The re-raise must happen
            # outside any inner try/except, otherwise it is swallowed.
            if _is_quota_error(e):
                raise
            # Other status errors (rate limits are status errors too): back off.
            time.sleep((2 ** k) + random.random())
        except (RateLimitError, APIConnectionError, APIError):
            # Connection problems and any remaining SDK errors: back off as well.
            time.sleep((2 ** k) + random.random())
    # Final attempt; any error propagates to the caller.
    return _once(m)

def call_o3_plain(text: str, labels: List[str]) -> Tuple[str, float, dict]:
    """Ask the model to pick ONE label from ``labels`` for ``text``.

    Args:
        text: The text to classify.
        labels: Candidate labels; the returned label is always one of these.

    Returns:
        ``(label, confidence, raw)``. When the reply contains a JSON object
        with a valid label, ``raw`` is that parsed object and ``confidence``
        its ``confidence`` value (0.0 when absent). Otherwise the first label
        mentioned as a whole word in the reply is returned with confidence
        0.0; if none is mentioned, ``labels[0]`` is returned as a last resort.
        In both fallback cases ``raw`` holds the label and the raw reply text.
    """
    prompt = f"""
You are a precise financial classifier. Choose ONE label from this set:
{labels}

Return ONLY a compact JSON object with no explanation:
{{"label": "<one of {labels}>", "confidence": <0..1>}}

Text:
{text}
"""
    r = call_model(prompt, MODEL)

    # Extract text across SDK variants
    out = getattr(r, "output_text", None)
    if not out:
        try:
            content = (r.output or [])[0].content or []
            out = getattr(content[0], "text", "")
        except Exception:
            out = ""
    # The attribute may exist but be None; never call .strip() on None.
    out = (out or "").strip()

    # Replies are sometimes wrapped in markdown fences or surrounded by prose;
    # parse the outermost {...} span when there is one.
    embedded = re.search(r"\{.*\}", out, flags=re.DOTALL)
    candidate = embedded.group(0) if embedded else out

    # Case-insensitive lookup so "Positive" maps onto the label "positive",
    # matching what the salvage path below already does.
    canonical = {}
    for lab in labels:
        canonical.setdefault(lab.lower(), lab)

    # Parse JSON if possible, else salvage label token
    try:
        obj = json.loads(candidate)
        raw_label = obj.get("label")
        conf = float(obj.get("confidence", 0.0))
        if raw_label in labels:
            label = raw_label
        elif isinstance(raw_label, str):
            label = canonical.get(raw_label.strip().lower())
        else:
            label = None
        if label is None:
            raise ValueError
        return label, conf, obj
    except Exception:
        low = out.lower()
        for L in labels:
            if re.search(rf"\b{re.escape(L.lower())}\b", low):
                return L, 0.0, {"label": L, "raw": out}
        return labels[0], 0.0, {"label": labels[0], "raw": out}  # safe fallback

# ---------- Load dataset (with split fallbacks) ----------
# Try the configured split first; if that fails, try the common split names in
# order and remember which one actually loaded.
try:
    ds = load_dataset(DATASET, split=SPLIT)
except Exception as first_err:
    # Surface the original failure instead of hiding it behind the fallback.
    print(f"Could not load split {SPLIT!r}: {first_err!r}. Trying fallback splits.")
    loaded = False
    for sp in ["validation", "valid", "test", "train"]:
        try:
            ds = load_dataset(DATASET, split=sp)
        except Exception:
            continue
        SPLIT = sp
        loaded = True
        break
    if not loaded:
        raise RuntimeError("Failed to load any split.") from first_err
    # The cache file name embeds the split; rebuild it so predictions for the
    # split that was actually loaded are not stored under the requested one.
    CACHE_PATH = f"o3_eval_cache__{DATASET.replace('/','__')}__{SPLIT}.jsonl"

print("Columns:", ds.column_names)

# ---------- Prepare index / full dataset ----------
# Row indices to evaluate: the first MAX_SAMPLES rows when a positive limit is
# configured, otherwise every row of the split.
idx = list(range(len(ds)))
idx = idx[:MAX_SAMPLES] if (MAX_SAMPLES and MAX_SAMPLES > 0) else idx

# ---------- Resume-safe cache ----------
# One JSON object per line: {"id": <row id>, "pred": <predicted label>}.
# Later lines for the same id overwrite earlier ones.
cache = {}
cache_path = Path(CACHE_PATH)
if cache_path.exists():
    # Read with the same encoding the file is written with, and iterate real
    # lines so characters such as U+2028 inside a value cannot split a record.
    with cache_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                cache[obj["id"]] = obj["pred"]
            except (ValueError, KeyError, TypeError):
                # Skip corrupt or truncated records (e.g. from an interrupted
                # run) without swallowing KeyboardInterrupt/SystemExit.
                continue

def cache_append(row_id, pred):
    """Append one prediction to the on-disk cache as a JSON line."""
    with cache_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps({"id": row_id, "pred": pred}, ensure_ascii=False) + "\n")

# ---------- Evaluate ----------
# Walk the selected rows, reuse cached predictions where available, and query
# the model (throttled) for the rest. Every new prediction is persisted
# immediately so an interrupted run can resume.
gold, pred, rows = [], [], []
all_label_set = OrderedDict()  # insertion-ordered set of every label seen

pbar = tqdm(idx, desc=f"Querying {MODEL} on {SPLIT} (N={len(idx)})")
for i in pbar:
    row = ds[i]
    row_id = str(row.get("id", i))

    choices = normalize_choices(row.get("choices"))
    if not choices:
        choices = ["negative", "neutral", "positive"]
    g_label = gold_to_label(row, choices)

    # Register the choices first, then the gold label (it may not be among them).
    for seen_label in (*choices, g_label):
        all_label_set.setdefault(seen_label, None)

    if row_id not in cache:
        txt = row_text(row)
        time.sleep(QPS_DELAY)  # polite throttle
        guess, conf, raw = call_o3_plain(txt, choices)
        cache[row_id] = guess
        cache_append(row_id, guess)
    else:
        # Cached entries store only the label, so no confidence is available.
        guess, conf = cache[row_id], 0.0

    gold.append(g_label)
    pred.append(guess)
    rows.append({"text": row_text(row), "gold": g_label, "pred": guess})

# ---------- Metrics ----------
# Per-class numbers, the report and the confusion matrix all use label_list
# for ordering; accuracy and the averaged F1 scores use sklearn's defaults.
label_list = list(all_label_set)

acc = accuracy_score(gold, pred)
f1_micro, f1_macro = (f1_score(gold, pred, average=avg) for avg in ("micro", "macro"))

P, R, F1, S = precision_recall_fscore_support(
    gold, pred, labels=label_list, zero_division=0
)
report = classification_report(
    gold, pred, labels=label_list, digits=4, zero_division=0
)
cm = confusion_matrix(gold, pred, labels=label_list).tolist()

# ---------- Save + print ----------
# Writes a TSV of predictions and a JSON file of metrics, then prints a summary.
outdir = Path("o3_eval_outputs"); outdir.mkdir(parents=True, exist_ok=True)
pred_path = outdir / f"predictions__{MODEL.replace('/','_')}__{DATASET.replace('/','__')}__{SPLIT}.tsv"

# Tabs and line breaks inside a value would break the one-record-per-line TSV
# layout (the prompt text always contains newlines), so map each to a space.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

def _tsv_cell(value):
    """Return ``value`` as text that is safe to place in a single TSV cell."""
    return str(value).translate(_TSV_UNSAFE)

with pred_path.open("w", encoding="utf-8") as f:
    f.write("text\tgold\tpred\n")
    for r in rows:
        # Build the line outside an f-string: backslashes inside f-string
        # expressions are a SyntaxError before Python 3.12.
        f.write("\t".join(_tsv_cell(r[col]) for col in ("text", "gold", "pred")) + "\n")

metrics = {
    "model": MODEL, "dataset": DATASET, "split": SPLIT, "n": len(gold),
    "labels": label_list,
    "accuracy": acc, "f1_micro": f1_micro, "f1_macro": f1_macro,
    "per_class": {c: {"precision": float(P[i]), "recall": float(R[i]), "f1": float(F1[i]), "support": int(S[i])}
                  for i,c in enumerate(label_list)},
    "confusion_matrix_labels": label_list,
    "confusion_matrix": cm,
    "classification_report": report,
}
metrics_path = outdir / f"metrics__{MODEL.replace('/','_')}__{DATASET.replace('/','__')}__{SPLIT}.json"
metrics_path.write_text(json.dumps(metrics, indent=2, ensure_ascii=False), encoding="utf-8")

print("\n=== Summary ===")
print(f"Model: {MODEL}  Dataset: {DATASET} [{SPLIT}]  N={len(gold)}")
print(f"Accuracy: {acc:.4f} | F1(micro): {f1_micro:.4f} | F1(macro): {f1_macro:.4f}")
print("\nPer-class:")
for i,c in enumerate(label_list):
    print(f"{c:>12s}  P={P[i]:.3f} R={R[i]:.3f} F1={F1[i]:.3f} (support={S[i]})")
print(f"\nSaved:\n- {pred_path}\n- {metrics_path}\n- cache: {cache_path}")


Columns: ['id', 'query', 'answer', 'text', 'choices', 'gold']


Querying o3 on test (N=970): 100%|██████████| 970/970 [00:00<00:00, 11104.37it/s]


=== Summary ===
Model: o3  Dataset: TheFinAI/en-fpb [test]  N=970
Accuracy: 0.8113 | F1(micro): 0.8113 | F1(macro): 0.8144

Per-class:
    positive  P=0.681 R=0.884 F1=0.769 (support=277)
     neutral  P=0.941 R=0.740 F1=0.828 (support=577)
    negative  P=0.737 R=0.991 F1=0.846 (support=116)

Saved:
- o3_eval_outputs/predictions__o3__TheFinAI__en-fpb__test.tsv
- o3_eval_outputs/metrics__o3__TheFinAI__en-fpb__test.json
- cache: o3_eval_cache__TheFinAI__en-fpb__test.jsonl



