In [3]:
!pip install -q "openai>=2.6.1" "httpx==0.28.1" "httpcore==1.0.5" datasets pandas scikit-learn tqdm

import os, getpass, re, time
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report

os.environ["OPENAI_API_KEY"] = getpass.getpass("Paste your OpenAI API key: ")
MODEL = "o3"
print("Model:", MODEL)

ds = load_dataset("ChanceFocus/en-fpb", split="test")
print(len(ds), ds.column_names[:8])

import openai, httpx
print("openai:", openai.__version__)
print("httpx:", httpx.__version__)

client = OpenAI()

def normalize_to_choice(raw, choices):
    if not raw:
        return None
    s = re.split(r"[\r\n]", str(raw).strip())[0].strip().strip(".:;").lower()
    alias = {"pos":"positive","neg":"negative","neu":"neutral","bullish":"positive","bearish":"negative"}
    s = alias.get(s, s)
    for c in choices:
        if s == c.lower():
            return c
    for c in choices:
        if c.lower().startswith(s):
            return c
    return None

SYSTEM = "You are a financial sentiment classifier. Choose exactly one label from the options. Output ONLY the label."

def ask_model(sentence, choices, retries=3, sleep=1):
    user = f"Sentence: {sentence}\nOptions: {', '.join(choices)}\nAnswer with ONE of the options only."
    last_e = None
    for _ in range(retries):
        try:
            resp = client.responses.create(
                model=MODEL,
                input=user,
                instructions=SYSTEM,
                max_output_tokens=32
            )
            return resp.output_text
        except Exception as e:
            last_e = e
            time.sleep(sleep)
    raise last_e

ex = ds[0]
pred_raw = ask_model(ex["text"], list(ex["choices"]))
pred = normalize_to_choice(pred_raw, list(ex["choices"]))
print(ex["text"], ex["choices"], pred_raw, pred, ex["answer"])

N = len(ds)
rows, y_true, y_pred = [], [], []
for i in tqdm(range(N)):
    x = ds[i]
    choices = list(x["choices"])
    gold = x["answer"]
    raw = ask_model(x["text"], choices)
    pred = normalize_to_choice(raw, choices) or "UNKNOWN"
    rows.append({"id": x.get("id", i), "text": x["text"], "choices": "|".join(choices), "pred_raw": raw, "pred": pred, "label": gold})
    y_true.append(gold)
    y_pred.append(pred)

df = pd.DataFrame(rows)
df.to_csv("/content/fpb_predictions.csv", index=False)
print("Saved to /content/fpb_predictions.csv")

ok = df[df["pred"]!="UNKNOWN"]
print("Used for scoring:", len(ok), "/", len(df))
print("Accuracy:", round(accuracy_score(ok["label"], ok["pred"]), 4))
print("Macro-F1:", round(f1_score(ok["label"], ok["pred"], average="macro"), 4))
print("\nReport:\n", classification_report(ok["label"], ok["pred"]))


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hPaste your OpenAI API key: ··········
Model: o3
970 ['id', 'query', 'answer', 'text', 'choices', 'gold']
openai: 2.6.1
httpx: 0.28.1
The new agreement , which expands a long-established cooperation between the companies , involves the transfer of certain engineering and documentation functions from Larox to Etteplan . ['positive', 'neutral', 'negative']  None positive


100%|██████████| 970/970 [27:39<00:00,  1.71s/it]

Saved to /content/fpb_predictions.csv
Used for scoring: 26 / 970
Accuracy: 1.0
Macro-F1: 1.0

Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00        12
     neutral       1.00      1.00      1.00         9
    positive       1.00      1.00      1.00         5

    accuracy                           1.00        26
   macro avg       1.00      1.00      1.00        26
weighted avg       1.00      1.00      1.00        26




