In [17]:
# Step 1: Install dependencies & set Hugging Face token
%pip install -q "datasets>=2.19.0" "huggingface_hub>=0.24"
import os
import getpass

# 直接设置Hugging Face token，跳过登录界面
hf_token = getpass.getpass("Paste your Hugging Face token: ")
os.environ['HF_TOKEN'] = hf_token
os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token

print("Hugging Face token set successfully!")

Note: you may need to restart the kernel to use updated packages.


Paste your Hugging Face token:  ········


Hugging Face token set successfully!


In [18]:
# Step 2: Load FLARE-FPB test set and normalize labels
from datasets import load_dataset, Dataset

LABELS = ["negative", "neutral", "positive"]

ds_raw = load_dataset("TheFinAI/flare-fpb", split="test")
print("Loaded flare-fpb test:", len(ds_raw), "columns:", ds_raw.column_names)

_alias = {"pos": "positive", "neg": "negative", "neu": "neutral",
          "bullish": "positive", "bearish": "negative"}

def _norm_label(v):
    if v is None: 
        return None
    if isinstance(v, (int, float)) or (isinstance(v, str) and v.isdigit()):
        i = int(v)
        return LABELS[i] if 0 <= i < len(LABELS) else None
    s = str(v).strip().lower()
    s = _alias.get(s, s)
    return s if s in LABELS else None

def _map_row(x):
    text = x.get("text") or x.get("sentence") or x.get("content") or x.get("input") or ""
    lab = _norm_label(x.get("label", x.get("labels", x.get("answer"))))
    return {"text": text, "choices": LABELS, "answer": lab}

ds = Dataset.from_list([{**r, **_map_row(r)} for r in ds_raw])
bad = [i for i, r in enumerate(ds) if r["answer"] not in LABELS]
print("Samples with unusable label:", len(bad))
assert len(bad) == 0, "Found unparseable labels; please check the field mapping."

Loaded flare-fpb test: 970 columns: ['id', 'query', 'answer', 'text', 'choices', 'gold']
Samples with unusable label: 0


In [19]:
# Step 3: Install dependencies, configure OpenAI, and record experiment metadata
%pip install -q "openai==1.40.2" "httpx==0.27.2" "httpcore==1.0.5" \
               "pandas>=2.2.2" "tqdm>=4.66.4" "requests>=2.31.0"

import os, getpass, json, time, platform
from importlib.metadata import version, PackageNotFoundError

# GPT-4o适配：使用GPT-4o模型
MODEL = "gpt-4o"
BASE_URL = "https://api.openai.com/v1"

api_key = os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY")
if not api_key:
    api_key = getpass.getpass("Paste your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = api_key

# GPT-4o适配：调整文件命名以区分版本
run_tag = f"flare_fpb_{MODEL.replace('-', '_')}"
save_dir = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions.csv"
meta_path = f"{save_dir}/{run_tag}_metadata.json"

def ver(pkg: str) -> str:
    try:
        return version(pkg)
    except PackageNotFoundError:
        return "not-installed"

# GPT-4o适配：在元数据中标注模型版本信息
meta = {
    "dataset": "TheFinAI/flare-fpb",
    "split": "test",
    "labels": list(LABELS),
    "model": MODEL,
    "model_variant": "omni",  # GPT-4o适配：标注为omni版本
    "openai_sdk": ver("openai"),
    "httpx": ver("httpx"),
    "httpcore": ver("httpcore"),
    "datasets_version": ver("datasets"),
    "pandas": ver("pandas"),
    "tqdm": ver("tqdm"),
    "time_utc": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
    "python": platform.python_version(),
    "base_url": BASE_URL,
    "note": "GPT-4o model evaluation - adapted for omni model capabilities"
}

os.makedirs(save_dir, exist_ok=True)
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)

print("Meta saved ->", meta_path)
print("MODEL:", MODEL, "(omni) | BASE_URL:", BASE_URL)
print("OPENAI_API_KEY is set:", bool(os.environ.get("OPENAI_API_KEY")))

Note: you may need to restart the kernel to use updated packages.
Meta saved -> /content/flare_fpb_gpt_4o_metadata.json
MODEL: gpt-4o (omni) | BASE_URL: https://api.openai.com/v1
OPENAI_API_KEY is set: True


In [20]:
# Step 4修正版: 使用Chat Completions API for GPT-4o
import openai
import pandas as pd
from tqdm import tqdm
import json
import time

# 初始化OpenAI客户端
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def ask_gpt4o_chatcompletion(sentence, choices=("negative", "neutral", "positive")):
    """使用Chat Completions API for GPT-4o"""
    
    user_text = (
        "Task: classify the financial sentence into exactly one of these sentiment labels: "
        f"{', '.join(choices)}.\n\n"
        f"Sentence: {sentence}\n\n"
        "Return ONLY a JSON object on a single line, exactly in this form:\n"
        "{\"label\":\"negative|neutral|positive\"}\n"
        "No code fences, no extra text, no explanation."
    )
    
    for attempt in range(3):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[{
                    "role": "user", 
                    "content": user_text
                }],
                max_tokens=50,
                temperature=0
            )
            
            content = response.choices[0].message.content.strip()
            
            # 清理可能的代码块标记
            if content.startswith("```"):
                content = content.replace("```json", "").replace("```", "").strip()
            
            # 解析JSON
            obj = json.loads(content)
            lab = obj.get("label")
            if lab not in choices:
                raise ValueError(f"Invalid label: {lab}")
            return lab
            
        except json.JSONDecodeError:
            # 如果JSON解析失败，尝试提取label
            import re
            match = re.search(r'"label"\s*:\s*"([^"]+)"', content)
            if match:
                lab = match.group(1)
                if lab in choices:
                    return lab
            if attempt < 2:
                time.sleep(2)
                continue
            raise RuntimeError(f"Failed to parse JSON from: {content}")
            
        except Exception as e:
            if attempt < 2:
                time.sleep(2)
                continue
            raise e
    
    raise RuntimeError("All retry attempts failed")

run_tag = f"flare_fpb_{MODEL.replace('-', '_')}"
save_dir = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions_corrected.csv"
err_path = f"{save_dir}/{run_tag}_errors_corrected.csv"

rows_done = []
done_idx = set()
if os.path.exists(pred_path):
    old = pd.read_csv(pred_path)
    if "row_idx" in old.columns:
        rows_done = old.to_dict("records")
        done_idx = set(old["row_idx"].tolist())
        print(f"[resume] loaded {len(done_idx)} completed rows.")

err_rows = []
buf = []
save_every = 20  # 更频繁的保存以便调试

total = len(ds)
print(f"Starting GPT-4o model evaluation (Corrected) on {total} samples...")

# 先测试一个样本
print("Testing one sample first...")
test_sample = ds[0]
try:
    test_pred = ask_gpt4o_chatcompletion(test_sample["text"], LABELS)
    print(f"Test successful! Prediction: {test_pred}, Expected: {test_sample['answer']}")
except Exception as e:
    print(f"Test failed: {e}")
    # 如果测试失败，显示更多调试信息
    print("Debug info - Model:", MODEL)
    print("Debug info - API key set:", bool(os.environ.get("OPENAI_API_KEY")))

# 如果测试成功，继续批量处理
for i in tqdm(range(total)):
    if i in done_idx:
        continue
    x = ds[i]
    text = x["text"]
    gold = x["answer"]

    try:
        pred = ask_gpt4o_chatcompletion(text, LABELS)
        raw = json.dumps({"label": pred})
    except Exception as e:
        pred = "UNKNOWN"
        raw = f"ERROR: {type(e).__name__}: {e}"
        err_rows.append({"row_idx": i, "id": x.get("id", i), "error": raw, "text": text[:100]})  # 只保存部分文本

    buf.append({
        "row_idx": i,
        "id": x.get("id", i),
        "text": text,
        "pred_raw": raw,
        "pred": pred,
        "label": gold
    })

    if len(buf) % save_every == 0:
        out = pd.DataFrame(rows_done + buf).sort_values("row_idx")
        out.to_csv(pred_path, index=False)
        if err_rows:
            pd.DataFrame(err_rows).to_csv(err_path, index=False)
        print(f"[checkpoint] saved {len(out)}/{total} -> {pred_path}")

out = pd.DataFrame(rows_done + buf).sort_values("row_idx")
out.to_csv(pred_path, index=False)
if err_rows:
    pd.DataFrame(err_rows).to_csv(err_path, index=False)
print(f"[done] GPT-4o evaluation completed -> {pred_path}")
if os.path.exists(err_path):
    err_count = len(pd.read_csv(err_path)) if os.path.getsize(err_path) > 0 else 0
    print(f"[errors] {err_count} errors logged -> {err_path}")

[resume] loaded 970 completed rows.
Starting GPT-4o model evaluation (Corrected) on 970 samples...
Testing one sample first...
Test successful! Prediction: neutral, Expected: positive


100%|███████████████████████████████████████████████████████████████████████████| 970/970 [00:00<00:00, 1141547.38it/s]

[done] GPT-4o evaluation completed -> /content/flare_fpb_gpt_4o_predictions_corrected.csv





In [21]:
# Step 4和Step 5之间的调试代码
import pandas as pd
import os

# 检查错误日志
err_path = "/content/flare_fpb_gpt_4o_errors_corrected.csv"
if os.path.exists(err_path):
    errors_df = pd.read_csv(err_path)
    print("错误类型统计:")
    print(errors_df['error'].value_counts().head(10))
    print("\n前几个错误示例:")
    print(errors_df.head()[['row_idx', 'error']])
else:
    print("错误日志文件不存在")

# 检查预测文件内容
pred_path = "/content/flare_fpb_gpt_4o_predictions_corrected.csv"
if os.path.exists(pred_path):
    pred_df = pd.read_csv(pred_path)
    print(f"\n预测文件样本数: {len(pred_df)}")
    print("预测结果统计:")
    print(pred_df['pred'].value_counts())
    print("\n前几个预测示例:")
    print(pred_df[['row_idx', 'pred', 'label']].head(10))
else:
    print("预测文件不存在")

错误日志文件不存在

预测文件样本数: 970
预测结果统计:
pred
neutral     564
positive    267
negative    139
Name: count, dtype: int64

前几个预测示例:
   row_idx      pred     label
0        0   neutral  positive
1        1  positive  positive
2        2  positive  positive
3        3  positive  positive
4        4  positive  positive
5        5  positive  positive
6        6   neutral  positive
7        7  positive  positive
8        8   neutral  positive
9        9  positive  positive


In [22]:
# Step 5: Compute Macro-F1 and Accuracy
%pip install -q scikit-learn

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import json
import time

# 重新定义变量
MODEL = "gpt-4o"
run_tag = f"flare_fpb_{MODEL.replace('-', '_')}"
save_dir = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions_corrected.csv"
LABELS = ["negative", "neutral", "positive"]

# 加载预测结果
df = pd.read_csv(pred_path).sort_values("row_idx").drop_duplicates("row_idx", keep="last")
ok = df[df["pred"] != "UNKNOWN"].copy()

print(f"GPT-4o Model Evaluation Results:")
print(f"Total samples: {len(df)}")
print(f"Successful predictions: {len(ok)}")
print(f"Failed predictions: {len(df) - len(ok)}")

if len(ok) > 0:
    # 计算评估指标
    f1_macro = f1_score(ok["label"], ok["pred"], labels=LABELS, average="macro", zero_division=0)
    f1_micro = f1_score(ok["label"], ok["pred"], labels=LABELS, average="micro", zero_division=0)
    f1_weighted = f1_score(ok["label"], ok["pred"], labels=LABELS, average="weighted", zero_division=0)
    accuracy = accuracy_score(ok["label"], ok["pred"])
    
    print("\n" + "="*50)
    print("EVALUATION RESULTS - GPT-4o")
    print("="*50)
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"F1-Macro:  {f1_macro:.4f}")
    print(f"F1-Micro:  {f1_micro:.4f}")
    print(f"F1-Weighted: {f1_weighted:.4f}")
    
    # 详细分类报告
    print("\nDetailed Classification Report:")
    print(classification_report(ok["label"], ok["pred"], labels=LABELS, zero_division=0))
    
    # 混淆矩阵
    print("Confusion Matrix:")
    cm = confusion_matrix(ok["label"], ok["pred"], labels=LABELS)
    cm_df = pd.DataFrame(cm, index=LABELS, columns=LABELS)
    print(cm_df)
    
    # 保存评估结果
    eval_results = {
        "model": MODEL,
        "dataset": "TheFinAI/flare-fpb",
        "split": "test",
        "total_samples": len(df),
        "successful_predictions": len(ok),
        "failed_predictions": len(df) - len(ok),
        "accuracy": float(accuracy),
        "f1_macro": float(f1_macro),
        "f1_micro": float(f1_micro),
        "f1_weighted": float(f1_weighted),
        "evaluation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
        "confusion_matrix": cm.tolist(),
        "labels": LABELS
    }
    
    eval_path = f"{save_dir}/{run_tag}_evaluation_results.json"
    with open(eval_path, "w") as f:
        json.dump(eval_results, f, indent=2)
    print(f"\nEvaluation results saved -> {eval_path}")
    
else:
    print("No successful predictions to evaluate!")

Note: you may need to restart the kernel to use updated packages.
GPT-4o Model Evaluation Results:
Total samples: 970
Successful predictions: 970
Failed predictions: 0

EVALUATION RESULTS - GPT-4o
Accuracy:  0.8330
F1-Macro:  0.8323
F1-Micro:  0.8330
F1-Weighted: 0.8321

Detailed Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.97      0.89       116
     neutral       0.87      0.85      0.86       577
    positive       0.76      0.74      0.75       277

    accuracy                           0.83       970
   macro avg       0.82      0.85      0.83       970
weighted avg       0.83      0.83      0.83       970

Confusion Matrix:
          negative  neutral  positive
negative       113        2         1
neutral         24      491        62
positive         2       71       204

Evaluation results saved -> /content/flare_fpb_gpt_4o_evaluation_results.json
