In [None]:
%pip install -q "datasets>=2.19.0" "huggingface_hub>=0.24"
from huggingface_hub import login
login(add_to_git_credential=True)
%pip install datasets


In [None]:
from datasets import load_dataset, Dataset

# Canonical sentiment labels. `_norm_label` also uses the list position as the
# integer class id when a label arrives as a number.
LABELS = ["negative", "neutral", "positive"]

# Alternative spellings that are folded into the canonical labels.
_alias = dict(
    pos="positive",
    neg="negative",
    neu="neutral",
    bullish="positive",
    bearish="negative",
)

# Load the test split and show its size and columns.
ds_raw = load_dataset("TheFinAI/flare-fpb", split="test")
print("Loaded flare-fpb test:", len(ds_raw), "columns:", ds_raw.column_names)

def _norm_label(v):
    """Map a raw label value onto one of ``LABELS``; return ``None`` if impossible.

    Accepted inputs:
      * an integer (or integral float, or decimal-digit string, optionally
        padded with whitespace) that is a valid index into ``LABELS``;
      * a label string, case-insensitive, optionally padded with whitespace,
        either canonical or one of the spellings in ``_alias``.

    Anything else returns ``None``: ``None`` itself, booleans, NaN/inf,
    non-integral floats, out-of-range indices and unknown strings. The function
    never raises on odd input, so the caller's "unusable label" check can report
    the problem instead of the cell crashing.
    """
    # bool is a subclass of int; True/False must not be read as class ids 1/0.
    if v is None or isinstance(v, bool):
        return None
    if isinstance(v, float):
        # is_integer() is False for NaN/inf and for values such as 1.5, which
        # would otherwise raise in int() or be silently truncated.
        if not v.is_integer():
            return None
        v = int(v)
    s = str(v).strip().lower()
    # isdecimal() only accepts characters that int() can parse (unlike
    # isdigit(), which also accepts e.g. superscript digits). A negative number
    # carries a "-" and therefore falls through to the string lookup -> None.
    if s.isdecimal():
        i = int(s)
        return LABELS[i] if i < len(LABELS) else None
    s = _alias.get(s, s)
    return s if s in LABELS else None

def _map_row(x):
    """Project one raw dataset row onto the ``text`` / ``choices`` / ``answer`` schema.

    The text is the first truthy value among the ``text``, ``sentence``,
    ``content`` and ``input`` fields (empty string if none is). The label is read
    from ``label``, else ``labels``, else ``answer`` (by key presence) and passed
    through ``_norm_label``.
    """
    text = ""
    for field in ("text", "sentence", "content", "input"):
        candidate = x.get(field)
        if candidate:
            text = candidate
            break

    raw_label = x.get("label", x.get("labels", x.get("answer")))
    return {"text": text, "choices": LABELS, "answer": _norm_label(raw_label)}

# Keep every original column and overlay the normalised text/choices/answer fields.
ds = Dataset.from_list([r | _map_row(r) for r in ds_raw])

# Every row must carry a canonical label before any API calls are made.
bad = [idx for idx, row in enumerate(ds) if row["answer"] not in LABELS]
print("Samples with unusable label:", len(bad))
assert not bad, "Found unparseable labels; please check the field mapping."


In [None]:
# Runtime dependencies for the inference cells: the OpenAI SDK and its HTTP
# stack (httpx/httpcore) are pinned to exact versions; pandas, tqdm and
# requests only get lower bounds.
%pip install -q "openai==1.40.2" "httpx==0.27.2" "httpcore==1.0.5" \
               "pandas>=2.2.2" "tqdm>=4.66.4" "requests>=2.31.0"

In [None]:
import os, getpass, json, time, platform
from importlib.metadata import version, PackageNotFoundError

# Reuse LABELS from the data-prep cell when it exists in this kernel; otherwise
# fall back to the same canonical list.
try:
    LABELS
except NameError:
    LABELS = ["negative", "neutral", "positive"]

MODEL    = "o3"
BASE_URL = "https://api.openai.com/v1"

# Resolve the API key from the environment first, then interactively.
# The key is stripped because pasted secrets often carry a trailing newline or
# spaces, which would otherwise be sent verbatim in the Authorization header.
# A whitespace-only environment value is treated as "not set".
api_key = (os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY") or "").strip()
if not api_key:
    api_key = getpass.getpass("Paste your OpenAI API key: ").strip()
os.environ["OPENAI_API_KEY"] = api_key

# Output locations, namespaced by model so runs for different models do not collide.
run_tag   = f"flare_fpb_{MODEL.replace('/','_')}"
save_dir  = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions.csv"
meta_path = f"{save_dir}/{run_tag}_metadata.json"

def ver(pkg: str) -> str:
    """Return the installed version of ``pkg``, or ``"not-installed"`` if it is absent."""
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        installed = "not-installed"
    return installed

# Versions of the packages this run depends on, recorded for reproducibility.
pkg_versions = {
    "openai_sdk": ver("openai"),
    "httpx": ver("httpx"),
    "httpcore": ver("httpcore"),
    "datasets_version": ver("datasets"),
    "pandas": ver("pandas"),
    "tqdm": ver("tqdm"),
}

# Run metadata: what was evaluated, with which model/endpoint, and when.
meta = {
    "dataset": "TheFinAI/flare-fpb",
    "split": "test",
    "labels": list(LABELS),
    "model": MODEL,
    **pkg_versions,
    "time_utc": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
    "python": platform.python_version(),
    "base_url": BASE_URL,
    "note": "Responses API; strict text-JSON protocol; retries+checkpoint",
}

# Write the metadata file next to the predictions.
os.makedirs(save_dir, exist_ok=True)
with open(meta_path, "w") as fh:
    fh.write(json.dumps(meta, indent=2))

print("Meta saved ->", meta_path)
print("MODEL:", MODEL, "| BASE_URL:", BASE_URL)
print("OPENAI_API_KEY is set:", bool(os.environ.get("OPENAI_API_KEY")))


In [None]:
import requests, json, os, re, time
import pandas as pd
from tqdm import tqdm

def _strip_code_fences(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        s = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", s)
        s = re.sub(r"\s*```$", "", s)
    return s.strip()

def _extract_output_text(data: dict) -> str | None:
    t = data.get("output_text")
    if isinstance(t, str) and t.strip():
        return t
    for o in data.get("output", []):
        for p in o.get("content", []):
            if p.get("type") == "output_text":
                tt = p.get("text")
                if isinstance(tt, str) and tt.strip():
                    return tt
    return None

def _make_user_text(sentence: str, choices=("",)):
    return (
        "Task: classify the sentence into exactly one of these labels: "
        f"{', '.join(choices)}.\n\n"
        f"Sentence: {sentence}\n\n"
        "Return ONLY a JSON object on a single line, exactly in this form:\n"
        "{\"label\":\"negative|neutral|positive\"}\n"
        "No code fences, no extra text, no explanation."
    )


def safe_post(url, headers, json, max_retries=5, base_delay=2, timeout=300):
    """POST ``json`` to ``url``, retrying transient failures with exponential backoff.

    Retried conditions: HTTP status >= 500, ``requests`` timeouts and
    ``requests`` connection errors. Any response with status < 500 (including
    4xx) is returned to the caller untouched.

    Args:
        url: Target URL.
        headers: Request headers.
        json: JSON-serialisable request body (passed as ``requests.post(json=...)``).
        max_retries: Maximum number of attempts.
        base_delay: Initial delay in seconds; doubled after each failure, capped at 60.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``requests.Response`` of the first attempt with status < 500.

    Raises:
        RuntimeError: When every attempt failed. The message names the last
            failure, so an exhausted run of 5xx responses is no longer reported
            as a timeout/connection problem.
    """
    delay = base_delay
    last_failure = "no attempt was made"
    for attempt in range(max_retries):
        try:
            r = requests.post(url, headers=headers, json=json, timeout=timeout)
            if r.status_code < 500:
                return r
            last_failure = f"HTTP {r.status_code}"
            warning = f"[warn] Server error {r.status_code}"
        except requests.exceptions.Timeout:
            last_failure = "timeout"
            warning = f"[warn] Timeout (attempt {attempt+1})"
        except requests.exceptions.ConnectionError as e:
            last_failure = f"connection error: {e}"
            warning = f"[warn] Connection error: {e}"

        # No point in waiting after the final attempt: nothing follows it.
        if attempt + 1 >= max_retries:
            break
        print(f"{warning}, retrying in {delay}s...")
        time.sleep(delay)
        delay = min(delay * 2, 60)
    raise RuntimeError(
        f"Failed after {max_retries} attempts; last failure: {last_failure}."
    )

def ask_o3_textjson_once(sentence, choices=("negative","neutral","positive"), max_tok=128):
    """Send one classification request to ``{BASE_URL}/responses`` and return the label.

    Args:
        sentence: Text to classify.
        choices: Allowed labels; the returned label is guaranteed to be one of them.
        max_tok: Value sent as ``max_output_tokens``.

    Returns:
        The ``label`` value from the model's JSON answer.

    Raises:
        RuntimeError: For every protocol failure: transport exhaustion in
            ``safe_post``, non-200 status (``"Responses API error <code>: ..."``),
            an incomplete response (``"incomplete:<reason>"``), missing output
            text, output that is not valid JSON or not a JSON object, or a label
            outside ``choices``. Callers match on the first two message
            prefixes, so those are kept stable.
    """
    url = f"{BASE_URL.rstrip('/')}/responses"
    headers = {
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
        "Content-Type": "application/json",
    }
    user_text = _make_user_text(sentence, choices)
    payload = {
        "model": MODEL,
        "input": [{
            "role": "user",
            "content": [{"type": "input_text", "text": user_text}]
        }],
        "max_output_tokens": int(max_tok),
        "reasoning": {"effort": "low"},
    }

    r = safe_post(url, headers=headers, json=payload)

    if r.status_code != 200:
        raise RuntimeError(f"Responses API error {r.status_code}: {r.text[:500]}")
    data = r.json()
    if data.get("status") == "incomplete":
        reason = (data.get("incomplete_details") or {}).get("reason")
        raise RuntimeError(f"incomplete:{reason}")

    txt = _extract_output_text(data)
    if not isinstance(txt, str) or not txt.strip():
        raise RuntimeError(f"No output_text in response. Snippet: {json.dumps(data)[:400]}")

    txt = _strip_code_fences(txt)
    # Report malformed model output as a RuntimeError that includes the text,
    # like every other protocol failure here, instead of a bare JSONDecodeError.
    try:
        obj = json.loads(txt)
    except ValueError as e:
        raise RuntimeError(
            f"Model output is not valid JSON ({e}); raw text: {txt[:200]!r}"
        ) from e
    # A bare JSON string/list/number has no .get(); reject it explicitly.
    if not isinstance(obj, dict):
        raise RuntimeError(f"Expected a JSON object with a 'label' key; raw json: {obj!r}")
    lab = obj.get("label")
    if lab not in choices:
        raise RuntimeError(f"Invalid label {lab!r}; raw json: {obj}")
    return lab

def ask_o3_textjson(sentence, choices=("negative","neutral","positive")):
    """Classify ``sentence``, retrying with backoff and growing token budgets.

    Token budgets of 128, 256 and 512 are tried in order, with up to five
    attempts per budget. What happens after a failed attempt depends on the
    ``RuntimeError`` message from ``ask_o3_textjson_once``:

      * server-side error  -> sleep, double the delay (cap 30 s), retry;
      * HTTP 429           -> sleep, double the delay (cap 60 s), retry;
      * output truncated by ``max_output_tokens`` -> move on to the next budget;
      * anything else      -> re-raised unchanged.

    Returns the predicted label. Raises ``RuntimeError`` once all budgets and
    attempts are used up.
    """
    for budget in (128, 256, 512):
        backoff = 1.0
        tries_left = 5
        while tries_left:
            tries_left -= 1
            try:
                return ask_o3_textjson_once(sentence, choices, max_tok=budget)
            except RuntimeError as err:
                reason = str(err)
                if "Responses API error 5" in reason or "server_error" in reason:
                    backoff_cap = 30
                elif "Responses API error 429" in reason:
                    backoff_cap = 60
                elif "incomplete:max_output_tokens" in reason:
                    # This budget is too small; escalate to the next one.
                    break
                else:
                    raise
                time.sleep(backoff)
                backoff = min(backoff * 2, backoff_cap)
    raise RuntimeError("Exhausted retries and token budgets for this sample.")

# Output locations for this run (same naming scheme as the metadata file).
run_tag   = f"flare_fpb_{MODEL.replace('/','_')}"
save_dir  = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions.csv"
err_path  = f"{save_dir}/{run_tag}_errors.csv"

# Resume support: reload rows already predicted by an earlier run.
# Rows checkpointed as "UNKNOWN" are failures, not results. They are dropped
# here so the loop retries them; otherwise one bad run (e.g. a wrong API key)
# would mark every sample as permanently done.
rows_done = []
done_idx  = set()
if os.path.exists(pred_path):
    old = pd.read_csv(pred_path)
    if "row_idx" in old.columns:
        n_loaded = len(old)
        if "pred" in old.columns:
            old = old[old["pred"] != "UNKNOWN"]
        n_retry = n_loaded - len(old)
        rows_done = old.to_dict("records")
        done_idx  = set(old["row_idx"].tolist())
        print(f"[resume] loaded {len(done_idx)} completed rows.")
        if n_retry:
            print(f"[resume] {n_retry} previously failed rows will be retried.")

err_rows = []      # failures collected during this run
buf = []           # predictions produced during this run
save_every = 50    # checkpoint after this many new rows

def _write_outputs():
    """Write all predictions gathered so far (and any errors) to disk.

    Returns the predictions frame, sorted by ``row_idx``.
    """
    frame = pd.DataFrame(rows_done + buf).sort_values("row_idx")
    frame.to_csv(pred_path, index=False)
    if err_rows:
        pd.DataFrame(err_rows).to_csv(err_path, index=False)
    return frame


total = len(ds)
for i in tqdm(range(total)):
    # Rows restored from a previous checkpoint are not queried again.
    if i in done_idx:
        continue

    sample = ds[i]
    sentence = sample["text"]
    sample_id = sample.get("id", i)

    try:
        pred = ask_o3_textjson(sentence, LABELS)
        raw  = json.dumps({"label": pred})
    except Exception as e:
        # Keep going: record the failure and mark the prediction as unusable.
        pred = "UNKNOWN"
        raw  = f"ERROR: {type(e).__name__}: {e}"
        err_rows.append({"row_idx": i, "id": sample_id, "error": raw, "text": sentence})

    record = {
        "row_idx": i,
        "id": sample_id,
        "text": sentence,
        "pred_raw": raw,
        "pred": pred,
        "label": sample["answer"],
    }
    buf.append(record)

    # Periodic checkpoint so an interrupted run can be resumed.
    if len(buf) % save_every == 0:
        out = _write_outputs()
        print(f"[checkpoint] saved {len(out)}/{total} -> {pred_path}")

# Final flush of whatever was produced after the last checkpoint.
out = _write_outputs()
print(f"[done] saved -> {pred_path}")
if os.path.exists(err_path):
    print(f"[errors] logged -> {err_path}")


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Load predictions, keeping the latest record per row.
df = pd.read_csv(pred_path).sort_values("row_idx").drop_duplicates("row_idx", keep="last")
# Rows whose request failed carry pred == "UNKNOWN" and cannot be scored.
ok = df[df["pred"] != "UNKNOWN"].copy()

try:
    LABELS
except NameError:
    LABELS = ["negative","neutral","positive"]

# The metrics below cover only the scored rows, so state the coverage
# explicitly; silently dropping failures can overstate the result.
n_excluded = len(df) - len(ok)
print(f"Scored {len(ok)}/{len(df)} rows; {n_excluded} without a usable prediction were excluded.")
if ok.empty:
    raise ValueError("No usable predictions to score; check the errors file and re-run inference.")

f1ma = f1_score(ok["label"], ok["pred"], labels=LABELS, average="macro", zero_division=0)
acc  = accuracy_score(ok["label"], ok["pred"])

print(f"F1: {f1ma:.4f}, Accuracy: {acc:.4f}")