In [1]:
%pip -q install --upgrade --force-reinstall \
  openai>=1.30.0 datasets>=2.19.0 scikit-learn>=1.4.2 tqdm>=4.66.4 huggingface_hub>=0.24.0 \
  pandas==2.2.2 numpy==2.0.2 "pyarrow<20" requests==2.32.4 "pydantic<2.12"


In [2]:
import os, getpass
from openai import OpenAI

# Set your DeepSeek API key securely
if not os.getenv("DEEPSEEK_API_KEY"):
    os.environ["DEEPSEEK_API_KEY"] = getpass.getpass("DeepSeek API key: ")

# DeepSeek uses an OpenAI-compatible endpoint
client = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")
MODEL_NAME = "deepseek-reasoner"   # DeepSeek-R1 reasoning model
print("Client ready ✅  Model:", MODEL_NAME)


DeepSeek API key: ··········
Client ready ✅  Model: deepseek-reasoner


In [5]:
# Verify HF token has gated access, then load en-fpb
import os, getpass
from huggingface_hub import HfApi, HfFolder, notebook_login
from datasets import load_dataset

token = HfFolder.get_token()
if not token:
    notebook_login()
    token = HfFolder.get_token()
if not token:
    token = os.getenv("HF_TOKEN") or getpass.getpass("Paste HF token (hf_...): ")

api = HfApi()
print("HF user:", api.whoami(token=token).get("name"))


info = api.dataset_info("TheFinAI/en-fpb", token=token)
print("Access OK. SHA:", info.sha[:8], "...")


ds_all = load_dataset("TheFinAI/en-fpb", token=token)
SPLIT = next((s for s in ["test","validation","valid","train"] if s in ds_all), list(ds_all.keys())[0])
ds = ds_all[SPLIT]
print(f"Loaded split: {SPLIT} (n={len(ds)})")


HF user: Hanlin0914
Access OK. SHA: 98f0a91c ...
Loaded split: test (n=970)


In [6]:
from typing import List, Tuple
import json, re

def row_text(row):
    q = (row.get("query") or "").strip()
    t = (row.get("text") or "").strip()
    return f"{q}\n\nText:\n{t}" if (q and t) else (q or t)

def normalize_choices(raw):
    out = []
    if isinstance(raw, list):
        for x in raw:
            if isinstance(x, str):
                s = x.strip()
            elif isinstance(x, dict):
                s = str(x.get("label") or x.get("text") or x.get("value") or x.get("choice") or "").strip()
            else:
                s = str(x).strip()
            if s: out.append(s)
    return out

def gold_to_label(row, choices: List[str]):
    g = row.get("gold", None) if row.get("gold", None) is not None else row.get("answer", None)
    if isinstance(g, int):
        return choices[g] if 0 <= g < len(choices) else str(g)
    if isinstance(g, str):
        for c in choices:
            if g.strip().lower() == c.lower():
                return c
        return g.strip()
    return str(g)


In [7]:
import time, random
from openai import APIError, APIConnectionError, RateLimitError

MAX_RETRIES = 6
QPS_DELAY   = 1.0  # seconds

def call_deepseek_json(text: str, labels: List[str]) -> Tuple[str, float, dict]:
    """Return (label, confidence, obj). Tries to parse JSON; falls back by token match."""
    sys = "Return only valid JSON. Choose exactly one label from the provided set."
    user = f"""Choose ONE label from this set:
{labels}

Return ONLY this JSON (no prose):
{{"label": "<one of {labels}>", "confidence": <0..1>}}

Text:
{text}"""

    last = None
    for k in range(MAX_RETRIES):
        try:
            r = client.chat.completions.create(
                model=MODEL_NAME,
                temperature=0,
                messages=[{"role":"system","content":sys},
                          {"role":"user","content":user}],
            )
            raw = r.choices[0].message.content.strip()
            try:
                obj = json.loads(raw)
                lab = obj.get("label"); conf = float(obj.get("confidence", 0.0))
                for L in labels:
                    if str(lab).lower() == L.lower(): lab = L
                if lab not in labels: lab = labels[0]
                return lab, conf, obj
            except Exception:
                low = raw.lower()
                for L in labels:
                    if re.search(rf"\b{re.escape(L.lower())}\b", low):
                        return L, 0.0, {"label": L, "raw": raw}
                return labels[0], 0.0, {"label": labels[0], "raw": raw}
        except (RateLimitError, APIConnectionError, APIError) as e:
            last = e
            time.sleep((2**k) + random.random())
    raise last or RuntimeError("DeepSeek call failed")


In [11]:

import math, json, re, time
from tqdm import tqdm
import pandas as pd
from collections import OrderedDict
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix

BATCH_SIZE   = 8
MAX_SAMPLES  = 0
idx = list(range(len(ds)))[: (MAX_SAMPLES or len(ds))]

def batch_prompt(batch):
    """
    batch = [{"id": i, "text": ..., "choices": [...]}, ...]
    """
    items = []
    for it in batch:
        items.append({
            "id": it["id"],
            "choices": it["choices"],
            "text": it["text"]
        })
    return (
        "For each item, choose EXACTLY one label from that item's choices.\n"
        "Return ONLY a JSON array of objects: [{\"id\": <int>, \"label\": <string>}].\n\n"
        f"Items:\n{json.dumps(items, ensure_ascii=False)}"
    )

def call_batch(batch):
    # Works with OpenAI/xAI/DeepSeek chat-completions
    sys = "Return ONLY valid JSON (no prose)."
    user = batch_prompt(batch)
    r = client.chat.completions.create(
        model=MODEL_NAME, temperature=0,
        messages=[{"role":"system","content":sys},{"role":"user","content":user}]
    )
    raw = r.choices[0].message.content.strip()
    try:
        arr = json.loads(raw)
        # {id, label}
        out = { int(x["id"]): str(x["label"]) for x in arr if "id" in x and "label" in x }
        # case-insensitive label alignment
        for it in batch:
            cset = it["choices"]
            lab = out.get(it["id"], cset[0])
            for C in cset:
                if lab.lower() == C.lower():
                    out[it["id"]] = C; break
        return out, raw
    except Exception:
        # salvage: match by token per item
        out = {}
        for it in batch:
            low = raw.lower()
            guess = next((C for C in it["choices"] if re.search(rf"\b{re.escape(C.lower())}\b", low)), it["choices"][0])
            out[it["id"]] = guess
        return out, raw

# build batches
examples = []
for i in idx:
    row = ds[i]
    choices = normalize_choices(row.get("choices")) or ["negative","neutral","positive"]
    examples.append({"id": i, "text": row_text(row), "choices": choices, "gold": gold_to_label(row, choices)})

rows = []
for b in tqdm(range(0, len(examples), BATCH_SIZE), desc="Micro-batching"):
    batch = examples[b:b+BATCH_SIZE]
    preds, raw = call_batch(batch)
    for it in batch:
        rows.append({"idx": it["id"], "text": it["text"], "gold": it["gold"], "pred": preds[it["id"]]})

# metrics
df = pd.DataFrame(sorted(rows, key=lambda r: r["idx"]))
label_set = list(OrderedDict.fromkeys(
    (df["gold"].tolist() + df["pred"].tolist())
))
df_ok = df[df["pred"].isin(label_set)]
if len(df_ok):
    P,R,F1,S = precision_recall_fscore_support(df_ok["gold"], df_ok["pred"], labels=label_set, zero_division=0)
    acc = accuracy_score(df_ok["gold"], df_ok["pred"])
    f1_micro = f1_score(df_ok["gold"], df_ok["pred"], average="micro")
    f1_macro = f1_score(df_ok["gold"], df_ok["pred"], average="macro")
    print({"n":len(df_ok),"acc":round(acc,4),"f1_micro":round(f1_micro,4),"f1_macro":round(f1_macro,4)})
else:
    print("No valid predictions (check parsing).")


Micro-batching: 100%|██████████| 122/122 [1:10:31<00:00, 34.69s/it]

{'n': 970, 'acc': 0.7165, 'f1_micro': 0.7165, 'f1_macro': 0.7448}



