In [23]:
!pip install -q "openai>=2.6.1" datasets pandas scikit-learn tqdm httpx==0.28.1 httpcore==1.0.5

import os, getpass, json, time
from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report

os.environ["OPENAI_API_KEY"] = getpass.getpass("Paste your OpenAI API key: ")
client = OpenAI()
MODEL = "gpt-5"
LIMIT = 100

ds = load_dataset("ChanceFocus/en-fpb", split="test").select(range(LIMIT))

def ask_model(sentence, choices, retries=3, sleep=1):
    fmt = {
        "type": "json_schema",
        "json_schema": {
            "name": "sentiment",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {"label": {"type": "string", "enum": choices}},
                "required": ["label"],
                "additionalProperties": False
            }
        }
    }
    prompt = f"Text: {sentence}\nOptions: {', '.join(choices)}\nReturn a JSON object with key 'label' only."
    last = None
    for _ in range(retries):
        try:
            r = client.responses.create(model=MODEL, input=prompt, response_format=fmt, max_output_tokens=32)
            t = r.output_text
            if t:
                j = json.loads(t)
                return j.get("label"), t
        except Exception as e:
            last = e
        time.sleep(sleep)
    raise last

rows, y_true, y_pred = [], [], []
for i in tqdm(range(len(ds))):
    x = ds[i]
    choices = list(x["choices"])
    gold = x["answer"]
    try:
        pred, raw = ask_model(x["text"], choices)
    except Exception:
        pred, raw = None, ""
    if not pred:
        pred = "UNKNOWN"
    rows.append({"id": x.get("id", i), "text": x["text"], "choices": "|".join(choices), "pred_raw": raw, "pred": pred, "label": gold})
    y_true.append(gold); y_pred.append(pred)

df = pd.DataFrame(rows)
df.to_csv("/content/fpb_predictions_gpt5_100.csv", index=False)
ok = df[df["pred"]!="UNKNOWN"]
print("Used for scoring:", len(ok), "/", len(df))
if len(ok) > 0:
    print("Accuracy:", round(accuracy_score(ok["label"], ok["pred"]), 4))
    print("Macro-F1:", round(f1_score(ok["label"], ok["pred"], average="macro"), 4))
    print("\nReport:\n", classification_report(ok["label"], ok["pred"]))


Paste your OpenAI API key: ··········


100%|██████████| 100/100 [05:00<00:00,  3.00s/it]

Used for scoring: 0 / 100



