In [1]:
# Step 1: Install dependencies & set Hugging Face token
%pip install -q "datasets>=2.19.0" "huggingface_hub>=0.24"
import os
import getpass

# 直接设置Hugging Face token，跳过登录界面
hf_token = getpass.getpass("Paste your Hugging Face token: ")
os.environ['HF_TOKEN'] = hf_token
os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token

print("Hugging Face token set successfully!")

Note: you may need to restart the kernel to use updated packages.


Paste your Hugging Face token:  ········


Hugging Face token set successfully!


In [2]:
# Step 2: Load FLARE-FPB test set and normalize labels
from datasets import load_dataset, Dataset

# Canonical label names used throughout the notebook.
LABELS = ["negative", "neutral", "positive"]

ds_raw = load_dataset("TheFinAI/flare-fpb", split="test")
print("Loaded flare-fpb test:", len(ds_raw), "columns:", ds_raw.column_names)

# Alternative spellings, each mapped onto one of the canonical names in LABELS.
_alias = dict(
    pos="positive",
    neg="negative",
    neu="neutral",
    bullish="positive",
    bearish="negative",
)

def _norm_label(v):
    if v is None: 
        return None
    if isinstance(v, (int, float)) or (isinstance(v, str) and v.isdigit()):
        i = int(v)
        return LABELS[i] if 0 <= i < len(LABELS) else None
    s = str(v).strip().lower()
    s = _alias.get(s, s)
    return s if s in LABELS else None

def _map_row(x):
    text = x.get("text") or x.get("sentence") or x.get("content") or x.get("input") or ""
    lab = _norm_label(x.get("label", x.get("labels", x.get("answer"))))
    return {"text": text, "choices": LABELS, "answer": lab}

# Overlay the normalized fields on every raw row and rebuild the dataset.
ds = Dataset.from_list([
    {**raw_row, **_map_row(raw_row)}
    for raw_row in ds_raw
])

# Collect the positions of rows whose normalized answer is not in LABELS;
# the notebook stops here if there are any.
bad = [idx for idx, row in enumerate(ds) if row["answer"] not in LABELS]
print("Samples with unusable label:", len(bad))
assert not bad, "Found unparseable labels; please check the field mapping."

Loaded flare-fpb test: 970 columns: ['id', 'query', 'answer', 'text', 'choices', 'gold']
Samples with unusable label: 0


In [3]:
# Step 3: Install dependencies, configure OpenAI, and record experiment metadata
%pip install -q "openai==1.40.2" "httpx==0.27.2" "httpcore==1.0.5" \
               "pandas>=2.2.2" "tqdm>=4.66.4" "requests>=2.31.0"

import os, getpass, json, time, platform
from importlib.metadata import version, PackageNotFoundError

# o3适配：使用标准版模型
MODEL = "o3"
BASE_URL = "https://api.openai.com/v1"

api_key = os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY")
if not api_key:
    api_key = getpass.getpass("Paste your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = api_key

# o3适配：调整文件命名以区分版本
run_tag = f"flare_fpb_{MODEL}_standard"
save_dir = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions.csv"
meta_path = f"{save_dir}/{run_tag}_metadata.json"

def ver(pkg: str) -> str:
    """Return the installed version of ``pkg``, or "not-installed" if it is absent."""
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        installed = "not-installed"
    return installed

# o3 adaptation: record the model variant together with the run configuration.
meta = {
    "dataset": "TheFinAI/flare-fpb",
    "split": "test",
    "labels": [*LABELS],
    "model": MODEL,
    "model_variant": "standard",  # o3 adaptation: explicitly marked as the standard variant
    # Installed package versions, stored under the metadata field on the left.
    **{
        field: ver(package)
        for field, package in (
            ("openai_sdk", "openai"),
            ("httpx", "httpx"),
            ("httpcore", "httpcore"),
            ("datasets_version", "datasets"),
            ("pandas", "pandas"),
            ("tqdm", "tqdm"),
        )
    },
    "time_utc": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
    "python": platform.python_version(),
    "base_url": BASE_URL,
    "note": "o3 standard model evaluation - adapted for standard version capabilities",
}

# Persist the metadata as indented JSON next to the prediction files.
os.makedirs(save_dir, exist_ok=True)
with open(meta_path, "w") as f:
    f.write(json.dumps(meta, indent=2))

print("Meta saved ->", meta_path)
print("MODEL:", MODEL, "(standard) | BASE_URL:", BASE_URL)
print("OPENAI_API_KEY is set:", bool(os.environ.get("OPENAI_API_KEY")))

Note: you may need to restart the kernel to use updated packages.


Paste your OpenAI API key:  ········


Meta saved -> /content/flare_fpb_o3_standard_metadata.json
MODEL: o3 (standard) | BASE_URL: https://api.openai.com/v1
OPENAI_API_KEY is set: True


In [4]:
# Step 4: Inference & evaluation loop (o3 standard adaptation)
import requests, json, os, re, time
import pandas as pd
from tqdm import tqdm

def _strip_code_fences(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        s = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", s)
        s = re.sub(r"\s*```$", "", s)
    return s.strip()

def _extract_output_text(data: dict) -> str | None:
    t = data.get("output_text")
    if isinstance(t, str) and t.strip():
        return t
    for o in data.get("output", []):
        for p in o.get("content", []):
            if p.get("type") == "output_text":
                tt = p.get("text")
                if isinstance(tt, str) and tt.strip():
                    return tt
    return None

def _make_user_text(sentence: str, choices=("",)):
    # o3适配：保持原有提示词结构，但为o3标准版优化token分配
    return (
        "Task: classify the sentence into exactly one of these labels: "
        f"{', '.join(choices)}.\n\n"
        f"Sentence: {sentence}\n\n"
        "Return ONLY a JSON object on a single line, exactly in this form:\n"
        "{\"label\":\"negative|neutral|positive\"}\n"
        "No code fences, no extra text, no explanation."
    )

def ask_o3_textjson_once(sentence, choices=("negative", "neutral", "positive"), max_tok=256):  # o3 adaptation: larger initial token limit
    """Send one classification request and return the predicted label.

    POSTs the prompt built by ``_make_user_text`` to ``{BASE_URL}/responses``
    with low reasoning effort and ``max_output_tokens=max_tok``, then parses
    the reply as a single JSON object of the form ``{"label": ...}``.

    Args:
        sentence: Text to classify.
        choices: Allowed labels; the returned label is one of these.
        max_tok: Output-token budget for this request.

    Returns:
        The label from the reply.

    Raises:
        RuntimeError: on a non-200 status ("Responses API error <code>: ..."),
            on a reply whose status is "incomplete" ("incomplete:<reason>"),
            when no output text is found, or when the parsed reply does not
            carry a label from ``choices`` (including replies that are valid
            JSON but not an object).
        json.JSONDecodeError: when the reply text is not valid JSON.
        Exceptions raised by ``requests.post`` itself are not caught here.
    """
    url = f"{BASE_URL.rstrip('/')}/responses"
    headers = {
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
        "Content-Type": "application/json",
    }
    user_text = _make_user_text(sentence, choices)
    payload = {
        "model": MODEL,
        "input": [{
            "role": "user",
            "content": [{"type": "input_text", "text": user_text}]
        }],
        "max_output_tokens": int(max_tok),
        "reasoning": {"effort": "low"},
    }
    # o3 adaptation: longer timeout for the standard model.
    r = requests.post(url, headers=headers, json=payload, timeout=180)
    if r.status_code != 200:
        raise RuntimeError(f"Responses API error {r.status_code}: {r.text[:500]}")
    data = r.json()
    if data.get("status") == "incomplete":
        # The reason text is matched by the retry wrapper to detect an
        # exhausted token budget.
        reason = (data.get("incomplete_details") or {}).get("reason")
        raise RuntimeError(f"incomplete:{reason}")

    txt = _extract_output_text(data)
    if not isinstance(txt, str) or not txt.strip():
        raise RuntimeError(f"No output_text in response. Snippet: {json.dumps(data)[:400]}")

    txt = _strip_code_fences(txt)
    obj = json.loads(txt)
    # Valid JSON need not be an object (e.g. a bare string or a list). Treat
    # that as a missing label so it is reported by the "Invalid label" error
    # below rather than as an AttributeError on .get().
    lab = obj.get("label") if isinstance(obj, dict) else None
    if lab not in choices:
        raise RuntimeError(f"Invalid label {lab!r}; raw json: {obj}")
    return lab

def ask_o3_textjson(sentence, choices=("negative", "neutral", "positive")):
    """Classify one sentence, retrying transient failures and growing the token budget.

    For each output-token budget (256, 512, 1024) up to six attempts are made
    with exponential back-off starting at 2 seconds:
      * 5xx / "server_error" replies: retry, delay capped at 45 s;
      * 429 replies: retry, delay capped at 90 s;
      * connection errors and timeouts (raised by ``requests`` or reported in
        a RuntimeError message): retry, delay capped at 60 s;
      * "incomplete:max_output_tokens": move on to the next, larger budget;
      * any other RuntimeError is re-raised immediately.

    Returns:
        The label returned by ``ask_o3_textjson_once``.

    Raises:
        RuntimeError: when every budget and attempt is used up; the message
            includes the last error seen, which is also chained as the cause.
    """
    last_error = None
    # o3 adaptation: several token-budget tiers.
    for max_tok in (256, 512, 1024):
        delay = 2.0  # o3 adaptation: larger initial delay
        for _ in range(6):  # o3 adaptation: more retries
            try:
                return ask_o3_textjson_once(sentence, choices, max_tok=max_tok)
            except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
                # Network failures from requests are not RuntimeError, so the
                # message-based branch below never saw them; retry them here
                # with the same back-off that branch uses.
                last_error = e
                time.sleep(delay)
                delay = min(delay * 2, 60)
                continue
            except RuntimeError as e:
                last_error = e
                msg = str(e)

                if "Responses API error 5" in msg or "server_error" in msg:
                    time.sleep(delay)
                    delay = min(delay * 2, 45)  # o3 adaptation: adjusted max delay
                    continue
                if "Responses API error 429" in msg:
                    time.sleep(delay)
                    delay = min(delay * 2, 90)  # o3 adaptation: longer delay when rate-limited
                    continue
                if "incomplete:max_output_tokens" in msg:
                    # Budget too small: stop retrying at this tier.
                    break
                # o3 adaptation: special handling for connection errors.
                if "connection" in msg.lower() or "timeout" in msg.lower():
                    time.sleep(delay)
                    delay = min(delay * 2, 60)
                    continue
                raise
    raise RuntimeError(
        f"Exhausted retries and token budgets for this sample. Last error: {last_error}"
    ) from last_error

# Output locations for this run (run_tag / save_dir / pred_path repeat the
# values assigned in Step 3).
run_tag = f"flare_fpb_{MODEL}_standard"
save_dir = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions.csv"
err_path = f"{save_dir}/{run_tag}_errors.csv"

# Resume support: reload the rows an earlier run of this cell already saved.
rows_done = []
done_idx = set()
if os.path.exists(pred_path):
    old = pd.read_csv(pred_path)
    if "row_idx" in old.columns:
        if "pred" in old.columns:
            # Rows saved as "UNKNOWN" are failed requests, not results. Drop
            # them so they are attempted again instead of being skipped.
            failed_before = int((old["pred"] == "UNKNOWN").sum())
            old = old[old["pred"] != "UNKNOWN"]
            if failed_before:
                print(f"[resume] retrying {failed_before} previously failed rows.")
        rows_done = old.to_dict("records")
        done_idx = set(old["row_idx"].tolist())
        print(f"[resume] loaded {len(done_idx)} completed rows.")

err_rows = []
buf = []
save_every = 30  # o3 adaptation: checkpoint less often to reduce I/O overhead


def _save_progress():
    """Write all finished rows, and this run's errors if any, to disk.

    Returns the DataFrame that was written (empty if there is nothing yet).
    """
    records = rows_done + buf
    if not records:
        # Nothing to sort or write; sort_values("row_idx") would fail on an
        # empty frame without that column.
        return pd.DataFrame()
    saved = pd.DataFrame(records).sort_values("row_idx")
    saved.to_csv(pred_path, index=False)
    if err_rows:
        pd.DataFrame(err_rows).to_csv(err_path, index=False)
    return saved


total = len(ds)
print(f"Starting o3 standard model evaluation on {total} samples...")

for i in tqdm(range(total)):
    if i in done_idx:
        continue
    x = ds[i]
    text = x["text"]
    gold = x["answer"]

    try:
        pred = ask_o3_textjson(text, LABELS)
        raw = json.dumps({"label": pred})
    except Exception as e:
        # Keep going: the failure is recorded as an "UNKNOWN" prediction and
        # logged with its error text.
        pred = "UNKNOWN"
        raw = f"ERROR: {type(e).__name__}: {e}"
        err_rows.append({"row_idx": i, "id": x.get("id", i), "error": raw, "text": text})

    buf.append({
        "row_idx": i,
        "id": x.get("id", i),
        "text": text,
        "pred_raw": raw,
        "pred": pred,
        "label": gold
    })

    if len(buf) % save_every == 0:
        out = _save_progress()
        print(f"[checkpoint] saved {len(out)}/{total} -> {pred_path}")

out = _save_progress()
print(f"[done] o3 standard evaluation completed -> {pred_path}")
# Report this run's errors only; an errors file left on disk by an earlier
# run would otherwise be counted as if it belonged to this one.
if err_rows:
    print(f"[errors] {len(err_rows)} errors logged -> {err_path}")
elif os.path.exists(err_path):
    print(f"[errors] none in this run (existing {err_path} is from an earlier run)")

[resume] loaded 970 completed rows.
Starting o3 standard model evaluation on 970 samples...


100%|████████████████████████████████████████████████████████████████████████████| 970/970 [00:00<00:00, 998889.00it/s]

[done] o3 standard evaluation completed -> /content/flare_fpb_o3_standard_predictions.csv





In [5]:
# Step 5: Install scikit-learn first
%pip install -q scikit-learn

# Then compute Macro-F1 and Accuracy
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# 加载预测结果
df = pd.read_csv(pred_path).sort_values("row_idx").drop_duplicates("row_idx", keep="last")
ok = df[df["pred"] != "UNKNOWN"].copy()

print(f"o3 Standard Model Evaluation Results:")
print(f"Total samples: {len(df)}")
print(f"Successful predictions: {len(ok)}")
print(f"Failed predictions: {len(df) - len(ok)}")

if len(ok) > 0:
    # 计算评估指标
    f1_macro = f1_score(ok["label"], ok["pred"], labels=LABELS, average="macro", zero_division=0)
    f1_micro = f1_score(ok["label"], ok["pred"], labels=LABELS, average="micro", zero_division=0)
    f1_weighted = f1_score(ok["label"], ok["pred"], labels=LABELS, average="weighted", zero_division=0)
    accuracy = accuracy_score(ok["label"], ok["pred"])
    
    print("\n" + "="*50)
    print("EVALUATION RESULTS - ChatGPT o3 Standard")
    print("="*50)
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"F1-Macro:  {f1_macro:.4f}")
    print(f"F1-Micro:  {f1_micro:.4f}")
    print(f"F1-Weighted: {f1_weighted:.4f}")
    
    # 详细分类报告
    print("\nDetailed Classification Report:")
    print(classification_report(ok["label"], ok["pred"], labels=LABELS, zero_division=0))
    
    # 混淆矩阵
    print("Confusion Matrix:")
    cm = confusion_matrix(ok["label"], ok["pred"], labels=LABELS)
    cm_df = pd.DataFrame(cm, index=LABELS, columns=LABELS)
    print(cm_df)
    
    # 保存评估结果
    eval_results = {
        "model": MODEL,
        "dataset": "TheFinAI/flare-fpb",
        "split": "test",
        "total_samples": len(df),
        "successful_predictions": len(ok),
        "failed_predictions": len(df) - len(ok),
        "accuracy": float(accuracy),
        "f1_macro": float(f1_macro),
        "f1_micro": float(f1_micro),
        "f1_weighted": float(f1_weighted),
        "evaluation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
        "confusion_matrix": cm.tolist(),
        "labels": LABELS
    }
    
    eval_path = f"{save_dir}/{run_tag}_evaluation_results.json"
    with open(eval_path, "w") as f:
        json.dump(eval_results, f, indent=2)
    print(f"\nEvaluation results saved -> {eval_path}")
    
else:
    print("No successful predictions to evaluate!")

Note: you may need to restart the kernel to use updated packages.
o3 Standard Model Evaluation Results:
Total samples: 970
Successful predictions: 970
Failed predictions: 0

EVALUATION RESULTS - ChatGPT o3 Standard
Accuracy:  0.8124
F1-Macro:  0.8136
F1-Micro:  0.8124
F1-Weighted: 0.8125

Detailed Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.97      0.87       116
     neutral       0.87      0.82      0.84       577
    positive       0.72      0.74      0.73       277

    accuracy                           0.81       970
   macro avg       0.79      0.84      0.81       970
weighted avg       0.82      0.81      0.81       970

Confusion Matrix:
          negative  neutral  positive
negative       112        4         0
neutral         25      471        81
positive         4       68       205

Evaluation results saved -> /content/flare_fpb_o3_standard_evaluation_results.json
