In [1]:
# Step 1: Install dependencies & set Hugging Face token
%pip install -q "datasets>=2.19.0" "huggingface_hub>=0.24"
import os
import getpass

# 直接设置Hugging Face token，跳过登录界面
hf_token = getpass.getpass("Paste your Hugging Face token: ")
os.environ['HF_TOKEN'] = hf_token
os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token

print("Hugging Face token set successfully!")

Note: you may need to restart the kernel to use updated packages.


Paste your Hugging Face token:  ········


Hugging Face token set successfully!


In [2]:
# Step 2: Load FLARE-FPB test set and normalize labels
from datasets import load_dataset, Dataset

LABELS = ["negative", "neutral", "positive"]

ds_raw = load_dataset("TheFinAI/flare-fpb", split="test")
print("Loaded flare-fpb test:", len(ds_raw), "columns:", ds_raw.column_names)

_alias = {"pos": "positive", "neg": "negative", "neu": "neutral",
          "bullish": "positive", "bearish": "negative"}

def _norm_label(v):
    if v is None: 
        return None
    if isinstance(v, (int, float)) or (isinstance(v, str) and v.isdigit()):
        i = int(v)
        return LABELS[i] if 0 <= i < len(LABELS) else None
    s = str(v).strip().lower()
    s = _alias.get(s, s)
    return s if s in LABELS else None

def _map_row(x):
    text = x.get("text") or x.get("sentence") or x.get("content") or x.get("input") or ""
    lab = _norm_label(x.get("label", x.get("labels", x.get("answer"))))
    return {"text": text, "choices": LABELS, "answer": lab}

ds = Dataset.from_list([{**r, **_map_row(r)} for r in ds_raw])
bad = [i for i, r in enumerate(ds) if r["answer"] not in LABELS]
print("Samples with unusable label:", len(bad))
assert len(bad) == 0, "Found unparseable labels; please check the field mapping."

Loaded flare-fpb test: 970 columns: ['id', 'query', 'answer', 'text', 'choices', 'gold']
Samples with unusable label: 0


In [3]:
# Step 3: Install dependencies, configure DeepSeek Reasoner, and record experiment metadata
%pip install -q "openai==1.40.2" "httpx==0.27.2" "httpcore==1.0.5" \
               "pandas>=2.2.2" "tqdm>=4.66.4" "requests>=2.31.0"

import os, getpass, json, time, platform
from importlib.metadata import version, PackageNotFoundError

# DeepSeek Reasoner适配：使用deepseek-reasoner模型
MODEL = "deepseek-reasoner"
BASE_URL = "https://api.deepseek.com/v1"

api_key = os.getenv("DEEPSEEK_API_KEY") or os.getenv("API_KEY")
if not api_key:
    api_key = getpass.getpass("Paste your DeepSeek API key: ")
os.environ["DEEPSEEK_API_KEY"] = api_key

# DeepSeek Reasoner适配：调整文件命名以区分模型
run_tag = f"flare_fpb_{MODEL.replace('-', '_')}"
save_dir = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions.csv"
meta_path = f"{save_dir}/{run_tag}_metadata.json"

def ver(pkg: str) -> str:
    try:
        return version(pkg)
    except PackageNotFoundError:
        return "not-installed"

# DeepSeek Reasoner适配：在元数据中标注模型版本信息
meta = {
    "dataset": "TheFinAI/flare-fpb",
    "split": "test",
    "labels": list(LABELS),
    "model": MODEL,
    "model_variant": "reasoner",
    "openai_sdk": ver("openai"),
    "httpx": ver("httpx"),
    "httpcore": ver("httpcore"),
    "datasets_version": ver("datasets"),
    "pandas": ver("pandas"),
    "tqdm": ver("tqdm"),
    "time_utc": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
    "python": platform.python_version(),
    "base_url": BASE_URL,
    "note": "DeepSeek Reasoner model evaluation with reasoning capabilities"
}

os.makedirs(save_dir, exist_ok=True)
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)

print("Meta saved ->", meta_path)
print("MODEL:", MODEL, "| BASE_URL:", BASE_URL)
print("DEEPSEEK_API_KEY is set:", bool(os.environ.get("DEEPSEEK_API_KEY")))

Note: you may need to restart the kernel to use updated packages.


Paste your DeepSeek API key:  ········


Meta saved -> /content/flare_fpb_deepseek_reasoner_metadata.json
MODEL: deepseek-reasoner | BASE_URL: https://api.deepseek.com/v1
DEEPSEEK_API_KEY is set: True


In [4]:
# Step 4: Inference & evaluation loop (DeepSeek Reasoner adaptation)
import requests, json, os, re, time
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

def _strip_code_fences(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        s = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", s)
        s = re.sub(r"\s*```$", "", s)
    return s.strip()

def _extract_final_answer(text: str) -> str:
    """从推理文本中提取最终答案"""
    # 查找JSON格式的答案
    json_pattern = r'\{[^{}]*"label"\s*:\s*"(negative|neutral|positive)"[^{}]*\}'
    match = re.search(json_pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).lower()
    
    # 查找明确的最终答案标记
    final_patterns = [
        r"final answer:\s*(negative|neutral|positive)",
        r"answer:\s*(negative|neutral|positive)",
        r"label:\s*(negative|neutral|positive)",
        r"所以(是|选择)?\s*(负面|中性|正面|negative|neutral|positive)",
        r"因此(是|选择)?\s*(负面|中性|正面|negative|neutral|positive)"
    ]
    
    for pattern in final_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            answer = match.group(2) or match.group(1)
            # 处理中文标签
            chinese_map = {"负面": "negative", "中性": "neutral", "正面": "positive"}
            return chinese_map.get(answer.lower(), answer.lower())
    
    # 如果找不到明确答案，在整个文本中搜索标签
    text_lower = text.lower()
    for label in LABELS:
        if label in text_lower:
            # 确保不是在其他上下文中提到的
            context_check = re.search(rf'\b{label}\b', text_lower)
            if context_check:
                return label
    
    return None

def _make_user_text(sentence: str, choices=("",)):
    # DeepSeek Reasoner适配：优化提示词以利用推理能力
    return (
        "You are a financial sentiment analysis expert. "
        "Analyze the following sentence and classify its sentiment into exactly one of these labels: "
        f"{', '.join(choices)}.\n\n"
        f"Sentence: {sentence}\n\n"
        "Please reason step by step about the sentiment, then provide your final answer.\n"
        "After your reasoning, return ONLY a JSON object on a single line, exactly in this form:\n"
        "{\"label\":\"negative|neutral|positive\"}\n"
        "No code fences, no extra text after the JSON."
    )

def ask_deepseek_reasoner_once(sentence, choices=("negative", "neutral", "positive")):
    client = OpenAI(
        api_key=os.environ['DEEPSEEK_API_KEY'],
        base_url=BASE_URL
    )
    
    user_text = _make_user_text(sentence, choices)
    
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "user", "content": user_text}
            ],
            max_tokens=512,  # 增加token限制以容纳推理过程
            temperature=0.1,
            stream=False
        )
        
        content = response.choices[0].message.content.strip()
        
        # 首先尝试直接解析JSON
        try:
            content_clean = _strip_code_fences(content)
            obj = json.loads(content_clean)
            lab = obj.get("label")
            if lab and lab in choices:
                return lab, content  # 返回标签和完整推理内容
        except json.JSONDecodeError:
            pass
        
        # 如果JSON解析失败，从推理文本中提取答案
        final_answer = _extract_final_answer(content)
        if final_answer and final_answer in choices:
            return final_answer, content
        
        raise RuntimeError(f"Could not extract valid label from response: {content}")
        
    except Exception as e:
        raise RuntimeError(f"API call failed: {str(e)}")

def ask_deepseek_reasoner(sentence, choices=("negative", "neutral", "positive")):
    # DeepSeek Reasoner适配：调整重试策略
    delay = 3.0  # 增加初始延迟
    for attempt in range(4):
        try:
            return ask_deepseek_reasoner_once(sentence, choices)
        except RuntimeError as e:
            msg = str(e)
            
            if "rate limit" in msg.lower() or "429" in msg:
                time.sleep(delay)
                delay = min(delay * 2, 60)
                continue
            if "server" in msg.lower() or "timeout" in msg.lower() or "busy" in msg.lower():
                time.sleep(delay)
                delay = min(delay * 2, 60)
                continue
            # 对于内容解析错误，直接重试
            if "Could not extract" in msg or "JSON" in msg:
                time.sleep(delay)
                delay = min(delay * 1.5, 30)
                continue
            raise
    
    raise RuntimeError("Exhausted retries for this sample.")

run_tag = f"flare_fpb_{MODEL.replace('-', '_')}"
save_dir = "/content"
pred_path = f"{save_dir}/{run_tag}_predictions.csv"
reasoning_path = f"{save_dir}/{run_tag}_reasoning.json"
err_path = f"{save_dir}/{run_tag}_errors.csv"

rows_done = []
done_idx = set()
if os.path.exists(pred_path):
    old = pd.read_csv(pred_path)
    if "row_idx" in old.columns:
        rows_done = old.to_dict("records")
        done_idx = set(old["row_idx"].tolist())
        print(f"[resume] loaded {len(done_idx)} completed rows.")

err_rows = []
reasoning_data = {}
buf = []
save_every = 20  # 减少保存频率，因为推理模型较慢

total = len(ds)
print(f"Starting DeepSeek Reasoner model evaluation on {total} samples...")

for i in tqdm(range(total)):
    if i in done_idx:
        continue
    x = ds[i]
    text = x["text"]
    gold = x["answer"]

    try:
        pred, reasoning = ask_deepseek_reasoner(text, LABELS)
        raw = json.dumps({"label": pred})
        # 保存推理过程
        reasoning_data[i] = {
            "text": text,
            "gold_label": gold,
            "predicted_label": pred,
            "reasoning": reasoning,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        }
    except Exception as e:
        pred = "UNKNOWN"
        raw = f"ERROR: {type(e).__name__}: {e}"
        err_rows.append({"row_idx": i, "id": x.get("id", i), "error": raw, "text": text})

    buf.append({
        "row_idx": i,
        "id": x.get("id", i),
        "text": text,
        "pred_raw": raw,
        "pred": pred,
        "label": gold
    })

    if len(buf) % save_every == 0:
        out = pd.DataFrame(rows_done + buf).sort_values("row_idx")
        out.to_csv(pred_path, index=False)
        # 保存推理数据
        with open(reasoning_path, "w") as f:
            json.dump(reasoning_data, f, indent=2, ensure_ascii=False)
        if err_rows:
            pd.DataFrame(err_rows).to_csv(err_path, index=False)
        print(f"[checkpoint] saved {len(out)}/{total} -> {pred_path}")

out = pd.DataFrame(rows_done + buf).sort_values("row_idx")
out.to_csv(pred_path, index=False)
# 最终保存推理数据
with open(reasoning_path, "w") as f:
    json.dump(reasoning_data, f, indent=2, ensure_ascii=False)
if err_rows:
    pd.DataFrame(err_rows).to_csv(err_path, index=False)
print(f"[done] DeepSeek Reasoner evaluation completed -> {pred_path}")
print(f"[reasoning] Reasoning data saved -> {reasoning_path}")
if os.path.exists(err_path):
    err_count = len(pd.read_csv(err_path)) if os.path.getsize(err_path) > 0 else 0
    print(f"[errors] {err_count} errors logged -> {err_path}")

Starting DeepSeek Reasoner model evaluation on 970 samples...


  2%|█▌                                                                            | 20/970 [18:18<14:05:01, 53.37s/it]

[checkpoint] saved 20/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


  4%|███▎                                                                           | 40/970 [29:59<5:25:55, 21.03s/it]

[checkpoint] saved 40/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


  6%|████▉                                                                          | 60/970 [40:29<6:08:56, 24.33s/it]

[checkpoint] saved 60/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


  8%|██████▌                                                                        | 80/970 [49:54<5:50:35, 23.64s/it]

[checkpoint] saved 80/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 10%|███████▋                                                                   | 100/970 [1:09:09<15:52:21, 65.68s/it]

[checkpoint] saved 100/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 12%|█████████▎                                                                 | 120/970 [1:22:08<12:42:21, 53.81s/it]

[checkpoint] saved 120/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 14%|██████████▊                                                                | 140/970 [1:34:50<13:40:13, 59.29s/it]

[checkpoint] saved 140/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 16%|████████████▌                                                               | 160/970 [1:46:57<4:13:21, 18.77s/it]

[checkpoint] saved 160/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 19%|██████████████                                                              | 180/970 [1:57:42<6:59:19, 31.85s/it]

[checkpoint] saved 180/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 21%|███████████████▋                                                            | 200/970 [2:11:31<6:12:02, 28.99s/it]

[checkpoint] saved 200/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 23%|█████████████████▏                                                          | 220/970 [2:19:11<4:25:19, 21.23s/it]

[checkpoint] saved 220/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 25%|██████████████████▊                                                         | 240/970 [2:27:28<5:28:30, 27.00s/it]

[checkpoint] saved 240/970 -> /content/flare_fpb_deepseek_reasoner_predictions.csv


 27%|████████████████████▎                                                       | 259/970 [2:36:40<7:10:06, 36.30s/it]


UnicodeEncodeError: 'gbk' codec can't encode character '\xf1' in position 31: illegal multibyte sequence

In [5]:
# Step 5: Install scikit-learn first
%pip install -q scikit-learn

# Then compute Macro-F1 and Accuracy
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# 加载预测结果
df = pd.read_csv(pred_path).sort_values("row_idx").drop_duplicates("row_idx", keep="last")
ok = df[df["pred"] != "UNKNOWN"].copy()

print(f"DeepSeek Reasoner Model Evaluation Results:")
print(f"Total samples: {len(df)}")
print(f"Successful predictions: {len(ok)}")
print(f"Failed predictions: {len(df) - len(ok)}")

if len(ok) > 0:
    # 计算评估指标
    f1_macro = f1_score(ok["label"], ok["pred"], labels=LABELS, average="macro", zero_division=0)
    f1_micro = f1_score(ok["label"], ok["pred"], labels=LABELS, average="micro", zero_division=0)
    f1_weighted = f1_score(ok["label"], ok["pred"], labels=LABELS, average="weighted", zero_division=0)
    accuracy = accuracy_score(ok["label"], ok["pred"])
    
    print("\n" + "="*50)
    print("EVALUATION RESULTS - DeepSeek Reasoner")
    print("="*50)
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"F1-Macro:  {f1_macro:.4f}")
    print(f"F1-Micro:  {f1_micro:.4f}")
    print(f"F1-Weighted: {f1_weighted:.4f}")
    
    # 详细分类报告
    print("\nDetailed Classification Report:")
    print(classification_report(ok["label"], ok["pred"], labels=LABELS, zero_division=0))
    
    # 混淆矩阵
    print("Confusion Matrix:")
    cm = confusion_matrix(ok["label"], ok["pred"], labels=LABELS)
    cm_df = pd.DataFrame(cm, index=LABELS, columns=LABELS)
    print(cm_df)
    
    # 保存评估结果
    eval_results = {
        "model": MODEL,
        "dataset": "TheFinAI/flare-fpb",
        "split": "test",
        "total_samples": len(df),
        "successful_predictions": len(ok),
        "failed_predictions": len(df) - len(ok),
        "accuracy": float(accuracy),
        "f1_macro": float(f1_macro),
        "f1_micro": float(f1_micro),
        "f1_weighted": float(f1_weighted),
        "evaluation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
        "confusion_matrix": cm.tolist(),
        "labels": LABELS,
        "reasoning_data_available": len(reasoning_data) if 'reasoning_data' in locals() else 0
    }
    
    eval_path = f"{save_dir}/{run_tag}_evaluation_results.json"
    with open(eval_path, "w") as f:
        json.dump(eval_results, f, indent=2)
    print(f"\nEvaluation results saved -> {eval_path}")
    
else:
    print("No successful predictions to evaluate!")

Note: you may need to restart the kernel to use updated packages.
DeepSeek Reasoner Model Evaluation Results:
Total samples: 260
Successful predictions: 223
Failed predictions: 37

EVALUATION RESULTS - DeepSeek Reasoner
Accuracy:  0.9417
F1-Macro:  0.3233
F1-Micro:  0.9417
F1-Weighted: 0.9700

Detailed Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         0
     neutral       0.00      0.00      0.00         0
    positive       1.00      0.94      0.97       223

    accuracy                           0.94       223
   macro avg       0.33      0.31      0.32       223
weighted avg       1.00      0.94      0.97       223

Confusion Matrix:
          negative  neutral  positive
negative         0        0         0
neutral          0        0         0
positive         1       12       210

Evaluation results saved -> /content/flare_fpb_deepseek_reasoner_evaluation_results.json
