In [1]:
%pip -q install --upgrade --force-reinstall \
  openai>=1.30.0 datasets>=2.19.0 scikit-learn>=1.4.2 tqdm>=4.66.4 \
  pandas==2.2.2 numpy==2.0.2 "pyarrow<20" requests==2.32.4 "pydantic<2.12"


In [3]:


import os, getpass
from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder

# Hugging Face dataset repository evaluated by this notebook.
DATASET = "TheFinAI/flare-fiqasa"

# OpenAI key: prompt only when the environment does not already provide one.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Get a HF token. Lookup order: cached login, HF_TOKEN env var, Colab secret, prompt.
token = HfFolder.get_token() or os.getenv("HF_TOKEN")
if not token:
    try:
        from google.colab import userdata
        token = userdata.get("HF_TOKEN")
    except Exception:
        # Deliberate best effort: not running in Colab, or the secret is unavailable.
        pass
if not token:
    token = getpass.getpass("Hugging Face token (hf_...): ")
# Pasted tokens frequently carry stray whitespace, which would make authentication fail.
token = token.strip()

# Verify token has access to the gated dataset, then persist it
api = HfApi()
try:
    who = api.whoami(token=token).get("name", "<unknown>")
    api.dataset_info(DATASET, token=token)  # raises if no access
    print(f"HF user: {who} — access to {DATASET} OK ✅")
    HfFolder.save_token(token)
except Exception as e:
    # The message is built from DATASET so it stays correct if the constant changes;
    # `from e` keeps the underlying failure attached as the explicit cause.
    raise RuntimeError(
        f"This token does not have access to {DATASET}.\n"
        "On the dataset page, request/accept access with this account.\n"
        "For fine-grained tokens, enable:\n"
        "  • Permission: Read\n"
        "  • Access public gated repositories\n"
        f"  • Resources: All repos (or add {DATASET})\n"
        f"\nOriginal error: {e}"
    ) from e

# Load dataset with the verified token
ds_all = load_dataset(DATASET, token=token)
# Use the "train" split when present; otherwise fall back to the first available split.
SPLIT = "train" if "train" in ds_all else list(ds_all.keys())[0]
ds = ds_all[SPLIT]
print(f"Loaded {DATASET} [{SPLIT}] — n={len(ds)}")
print("Columns:", ds.column_names)


Hugging Face token (hf_...): ··········
HF user: Hanlin0914 — access to TheFinAI/flare-fiqasa OK ✅


data/train-00000-of-00001-d0f9b6513e12e0(…):   0%|          | 0.00/100k [00:00<?, ?B/s]

data/test-00000-of-00001-faca082021057ac(…):   0%|          | 0.00/35.8k [00:00<?, ?B/s]

data/valid-00000-of-00001-36997935dc03cb(…):   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/750 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/235 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/188 [00:00<?, ? examples/s]

Loaded TheFinAI/flare-fiqasa [train] — n=750
Columns: ['id', 'query', 'answer', 'text', 'choices', 'gold']


In [4]:
from typing import List
from collections import OrderedDict
import re

# Eval config — knobs read by the model-call helper and the evaluation loop.
MODEL        = "gpt-4o"   # model name passed to the chat completions call
MAX_SAMPLES  = None       # number of rows to evaluate; None (or 0) means every row of the split
QPS_DELAY    = 1.0        # seconds slept before each model call
MAX_RETRIES  = 6          # attempts per row before the last API error is re-raised

def row_text(row):
    """Build the text shown to the model for one dataset row.

    Reads the "query" and "text" fields (missing or falsy values count as empty),
    strips surrounding whitespace, and joins them as ``<query>\\n\\nText:\\n<text>``.
    When only one of the two is non-empty, that one is returned alone; when both
    are empty the result is an empty string.
    """
    question = (row.get("query") or "").strip()
    passage = (row.get("text") or "").strip()
    if not question:
        return passage
    if not passage:
        return question
    return f"{question}\n\nText:\n{passage}"

def normalize_choices(raw):
    """Turn the raw "choices" value into a list of non-empty, stripped strings.

    Only ``list`` input is processed; anything else yields ``[]``. String items are
    stripped. Dict items contribute the first truthy value among the keys "label",
    "text", "value", "choice". Any other item is converted with ``str()``. Items
    that end up empty are dropped.
    """
    if not isinstance(raw, list):
        return []

    def _as_text(item):
        # Convert one raw choice entry into its stripped string form.
        if isinstance(item, str):
            return item.strip()
        if isinstance(item, dict):
            picked = next(
                (v for v in map(item.get, ("label", "text", "value", "choice")) if v),
                "",
            )
            return str(picked).strip()
        return str(item).strip()

    return [text for text in map(_as_text, raw) if text]

def gold_to_label(row, choices: List[str]):
    """Resolve a row's gold annotation to a label string.

    The "gold" field is used, falling back to "answer" when "gold" is None.
    An integer is treated as an index into `choices`; an out-of-range index is
    returned as its string form. A string is stripped and matched against
    `choices` case-insensitively, returning the matching choice or, failing
    that, the stripped string itself. Any other value is returned via ``str()``.
    """
    target = row.get("gold")
    if target is None:
        target = row.get("answer")

    if isinstance(target, int):
        in_range = 0 <= target < len(choices)
        return choices[target] if in_range else str(target)

    if not isinstance(target, str):
        return str(target)

    cleaned = target.strip()
    folded = cleaned.lower()
    return next((option for option in choices if option.lower() == folded), cleaned)


In [5]:
import json, time, random
from typing import Tuple, List
from openai import OpenAI, APIStatusError, RateLimitError, APIConnectionError, APIError

# Shared OpenAI client for every model call in this notebook.
client = OpenAI()  # uses OPENAI_API_KEY

def _extract_json_object(raw: str):
    """Return the JSON object contained in `raw` as a dict, or None if there is none.

    The whole string is tried first, then the outermost ``{...}`` span, so replies
    wrapped in Markdown code fences or surrounded by prose still parse.
    """
    candidates = [raw]
    start, end = raw.find("{"), raw.rfind("}")
    if 0 <= start < end:
        candidates.append(raw[start:end + 1])
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _match_label(value, labels: List[str]):
    """Case-insensitively match `value` to one of `labels`; return that label or None."""
    if value is None:
        return None
    needle = str(value).strip().lower()
    return next((candidate for candidate in labels if candidate.lower() == needle), None)


def _parse_label_response(raw: str, labels: List[str]) -> Tuple[str, float, dict]:
    """Convert the model's reply text into (label, confidence, raw_obj)."""
    obj = _extract_json_object(raw)
    if obj is not None:
        lab = _match_label(obj.get("label"), labels)
        if lab is not None:
            try:
                conf = float(obj.get("confidence") or 0.0)
            except (TypeError, ValueError):
                # An unusable confidence should not throw away a valid label.
                conf = 0.0
            return lab, conf, obj

    # Salvage by token match: the first allowed label that occurs as a whole word.
    low = raw.lower()
    lab = next(
        (candidate for candidate in labels
         if re.search(rf"\b{re.escape(candidate.lower())}\b", low)),
        None,
    )
    if lab is None:
        # Last resort kept from the original contract: always return an allowed label.
        # The unparsed reply stays available under "raw" for later inspection.
        lab = labels[0]
    return lab, 0.0, {"label": lab, "raw": raw}


def call_gpt4_json(text: str, labels: List[str]) -> Tuple[str, float, dict]:
    """
    Ask GPT-4 (e.g., gpt-4o) to choose ONE label from `labels`.

    Args:
        text: The text to classify; inserted verbatim into the prompt.
        labels: Non-empty list of allowed label strings.

    Returns:
        (label, confidence, raw_obj). `label` is always one of `labels`.
        `raw_obj` is the parsed JSON object when the reply contained one with an
        allowed label; otherwise it is ``{"label": ..., "raw": <reply text>}`` and
        `confidence` is 0.0.

    Raises:
        ValueError: if `labels` is empty.
        openai API errors: the last error once MAX_RETRIES attempts are exhausted,
            or immediately for client errors that a retry cannot fix.
        RuntimeError: if no attempt was made (MAX_RETRIES < 1).
    """
    if not labels:
        raise ValueError("labels must be a non-empty list")

    system_prompt = "Return only valid JSON. Choose exactly one label from the provided set."
    user_prompt = f"""Choose ONE label from this set:
{labels}

Return ONLY this JSON (no prose):
{{"label": "<one of {labels}>", "confidence": <0..1>}}

Text:
{text}"""

    def _once(with_temp: bool):
        # Issue a single chat completion request, optionally pinning temperature to 0.
        kwargs = dict(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_prompt},
            ],
        )
        if with_temp:
            kwargs["temperature"] = 0
        return client.chat.completions.create(**kwargs)

    last_err = None
    for attempt in range(MAX_RETRIES):
        try:
            # try deterministic temp=0 first; if the model rejects it, retry once without temp
            try:
                r = _once(with_temp=True)
            except APIStatusError as e:
                if getattr(e, "status_code", None) == 400:
                    r = _once(with_temp=False)
                else:
                    raise
            # `content` may be None (e.g. a refusal); treat that as an empty reply.
            raw = (r.choices[0].message.content or "").strip()
            return _parse_label_response(raw, labels)

        except (APIStatusError, RateLimitError, APIConnectionError, APIError) as e:
            last_err = e
            status = getattr(e, "status_code", None)
            # Client errors other than timeout/conflict/rate-limit will not succeed on retry.
            if isinstance(status, int) and 400 <= status < 500 and status not in (408, 409, 429):
                raise
            # Exponential backoff with jitter; skip the pointless wait after the final attempt.
            if attempt < MAX_RETRIES - 1:
                time.sleep((2**attempt) + random.random())

    raise last_err or RuntimeError("call_gpt4_json failed")


In [6]:
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix

# Number of rows to evaluate. Capped at the split size so a MAX_SAMPLES larger than
# the dataset cannot index past the end; None/0 means "use every row".
N = min(MAX_SAMPLES, len(ds)) if MAX_SAMPLES else len(ds)
idx = list(range(N))

gold, pred, rows = [], [], []
# Ordered set of every label encountered (choices and gold), in first-seen order.
labels_seen = OrderedDict()

for i in tqdm(idx, desc=f"{MODEL} on {SPLIT}"):
    row = ds[i]
    # Fall back to the three sentiment classes when the row carries no usable choices.
    choices = normalize_choices(row.get("choices")) or ["negative","neutral","positive"]
    for c in choices:
        labels_seen.setdefault(c, None)
    g = gold_to_label(row, choices)
    labels_seen.setdefault(g, None)

    # Build the prompt text once and reuse it for both the call and the record.
    prompt_text = row_text(row)

    time.sleep(QPS_DELAY)  # throttle requests
    lab, conf, raw = call_gpt4_json(prompt_text, choices)

    gold.append(g)
    pred.append(lab)
    rows.append({"text": prompt_text, "gold": g, "pred": lab})

# Metrics
label_list = list(labels_seen.keys())
P,R,F1,S = precision_recall_fscore_support(gold, pred, labels=label_list, zero_division=0)
acc = accuracy_score(gold, pred)
# zero_division=0 matches the sibling metric calls and avoids undefined-metric warnings
# when a class is never predicted; the computed values are unchanged.
f1_micro = f1_score(gold, pred, average="micro", zero_division=0)
f1_macro = f1_score(gold, pred, average="macro", zero_division=0)
report = classification_report(gold, pred, labels=label_list, digits=4, zero_division=0)
cm = confusion_matrix(gold, pred, labels=label_list).tolist()

print({"n":len(gold), "accuracy":round(acc,4), "f1_micro":round(f1_micro,4), "f1_macro":round(f1_macro,4)})


gpt-4o on train: 100%|██████████| 750/750 [24:06<00:00,  1.93s/it]

{'n': 750, 'accuracy': 0.6907, 'f1_micro': 0.6907, 'f1_macro': 0.6413}



