In [None]:
%pip install -q "openai>=1.40.0" "httpx>=0.27.2" "datasets>=2.19.0" "huggingface_hub>=0.24" "tqdm>=4.66.4" "pandas>=2.2.2"

In [None]:
# Setup DeepSeek (OpenAI-compatible) + paths
import os, getpass, json, time, platform, re
from importlib.metadata import version, PackageNotFoundError

def ver(p):
    """Return the installed version string of distribution ``p``, or None if the lookup fails."""
    try:
        return version(p)
    except Exception:
        # Best-effort: any lookup failure (typically a missing package) is reported as None.
        return None

# Endpoint and model can both be overridden through environment variables.
BASE_URL = os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
# NOTE(review): confirm that "deepseek-v3" is a model id the endpoint above accepts;
# this value is what gets recorded in meta.json.
MODEL = os.environ.get("DEEPSEEK_MODEL", "deepseek-v3")

# Key lookup order: DEEPSEEK_API_KEY, then API_KEY, then an interactive prompt.
api_key = (
    os.environ.get("DEEPSEEK_API_KEY")
    or os.environ.get("API_KEY")
    or getpass.getpass("Paste your DeepSeek API key: ")
)
# Store the key in the process environment so later cells can read it from there.
os.environ["DEEPSEEK_API_KEY"] = api_key

# All run artefacts live under one results directory.
run_dir = "./results_deepseekv3"
os.makedirs(run_dir, exist_ok=True)
pred_path, meta_path = (
    os.path.join(run_dir, filename) for filename in ("predictions.csv", "meta.json")
)

# Provenance record: model, endpoint, interpreter, timestamp and key package versions.
meta = {
    "model": MODEL,
    "base_url": BASE_URL,
    "python": platform.python_version(),
    "time_utc": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
}
meta.update(openai_pkg=ver("openai"), httpx=ver("httpx"), datasets=ver("datasets"))

with open(meta_path, "w") as f:
    f.write(json.dumps(meta, indent=2))

print("MODEL:", MODEL)
print("BASE_URL:", BASE_URL)
print("pred_path ->", pred_path)


In [None]:
# Load FPB test split (TheFinAI/flare-fpb) and normalize labels
from datasets import load_dataset
import pandas as pd

# Canonical label vocabulary used throughout the notebook.
LABELS = ["negative","neutral","positive"]
# Short forms and market jargon mapped onto the canonical labels.
_alias = {"pos":"positive","neg":"negative","neu":"neutral",
          "bullish":"positive","bearish":"negative"}

ds_raw = load_dataset("TheFinAI/flare-fpb", split="test")
print("Loaded flare-fpb test:", len(ds_raw), "columns:", ds_raw.column_names)

# Locate the text and label fields: well-known column names first, then content-based fallbacks.
text_col_candidates = ["text","sentence","content","tweet","headline"]
label_col_candidates = ["label","labels","sentiment","target"]
cols = ds_raw.column_names

text_col = next((c for c in text_col_candidates if c in cols), None)
label_col = next((c for c in label_col_candidates if c in cols), None)

# The first row is only used to sniff column types; an empty split yields no string columns.
_first_row = ds_raw[0] if len(ds_raw) else {}
_string_cols = [c for c in cols if isinstance(_first_row.get(c), str)]

if text_col is None:
    # Default to the first string-typed column.
    text_col = _string_cols[0] if _string_cols else None
if text_col is None:
    # Fail here with a clear message instead of a KeyError when the frame is built.
    raise ValueError(f"Could not find a text column among {cols}")

if label_col is None:
    # Prefer a string column whose every value is already a known label or alias.
    # This avoids picking an integer index column whose encoding would have to be guessed later.
    _known = set(LABELS) | set(_alias)
    label_col = next(
        (c for c in _string_cols
         if c != text_col and all(str(v).strip().lower() in _known for v in ds_raw[c])),
        None,
    )
if label_col is None:
    # Last resort (original behaviour): assume the final column holds the labels.
    label_col = cols[-1]

print("Using text_col:", text_col, "| label_col:", label_col)

def norm_label(x):
    """Lower-case and trim ``x`` (stringified), then expand it via ``_alias`` when an alias matches."""
    key = str(x).strip().lower()
    if key in _alias:
        return _alias[key]
    # Not an alias: hand back the cleaned value unchanged.
    return key

# Build a pandas frame for easy iteration.
# Each column is read in a single pass; indexing ds_raw[i] would materialise a whole row
# per access (twice per index in the previous version).
_texts = list(ds_raw[text_col])
_labels = [norm_label(v) for v in ds_raw[label_col]]
df = pd.DataFrame({"row_idx": range(len(ds_raw)),
                   "text": _texts,
                   "label": _labels})
df.head()


In [None]:
            # Inference with DeepSeek using OpenAI-compatible Chat Completions
            from openai import OpenAI
            from tqdm import tqdm
            import pandas as pd, time

            # OpenAI SDK client aimed at BASE_URL; the key is read from the process environment.
            client = OpenAI(base_url=BASE_URL, api_key=os.environ["DEEPSEEK_API_KEY"])

            def _normalize_prediction(raw) -> str:
                """Map raw model text onto 'negative', 'neutral' or 'positive', else 'UNKNOWN'."""
                import re

                vocabulary = {"negative", "neutral", "positive"}
                aliases = {"pos": "positive", "neg": "negative", "neu": "neutral"}

                lowered = (raw or "").strip().lower()

                # Exact match after dropping periods/apostrophes, e.g. "Positive." or "'neg'".
                exact = lowered.replace(".", "").replace("'", "").strip()
                if exact in vocabulary:
                    return exact
                if exact in aliases:
                    return aliases[exact]

                # Otherwise accept a reply that mentions exactly one label
                # ("**Positive**", "Sentiment: neutral"); conflicting mentions stay UNKNOWN.
                mentioned = {
                    aliases.get(token, token)
                    for token in re.findall(r"\b(negative|neutral|positive|neg|neu|pos)\b", lowered)
                }
                return mentioned.pop() if len(mentioned) == 1 else "UNKNOWN"

            def classify(txt: str) -> str:
                """Classify one financial text with the chat model.

                The configured ``MODEL`` is tried first, followed by the fallback model ids.
                Each distinct id is tried at most once per call, and every failed attempt
                emits a ``RuntimeWarning`` so a model switch is visible in the output.

                Args:
                    txt: The text to classify.

                Returns:
                    'negative', 'neutral' or 'positive', or 'UNKNOWN' when every model
                    failed or the reply could not be mapped to a label.
                """
                import warnings

                prompt = (
                    "Classify the sentiment of the following financial text\n"
                    "as exactly one of: negative, neutral, positive.\n"
                    "Return ONLY the single word label.\n"
                    "\n"
                    "Text:\n"
                    f"{txt}\n"
                )
                msgs = [
                    {"role": "system", "content": "You are a careful, deterministic classifier."},
                    {"role": "user", "content": prompt},
                ]

                # dict.fromkeys keeps order and drops repeats, so a failing model id is
                # not queried a second time for the same text.
                candidates = list(dict.fromkeys(
                    [MODEL, "deepseek-v3", "deepseek-chat", "deepseek-reasoner"]
                ))

                for model_name in candidates:
                    try:
                        resp = client.chat.completions.create(
                            model=model_name,
                            messages=msgs,
                            temperature=0.0,
                            max_tokens=5,
                        )
                        raw = resp.choices[0].message.content
                        if raw is None:
                            raise ValueError("completion has no text content")
                    except Exception as exc:
                        # Best-effort fallback is kept, but no longer silent. Only the
                        # exception type is included so repeated warnings stay identical.
                        warnings.warn(
                            f"classify: model {model_name!r} failed "
                            f"({type(exc).__name__}); trying next candidate",
                            RuntimeWarning,
                            stacklevel=2,
                        )
                        continue
                    return _normalize_prediction(raw)

                # Every candidate failed.
                return "UNKNOWN"

            # Score every row of the dataset frame, keeping the gold label next to the prediction.
            preds = []
            for rec in tqdm(df.itertuples(index=False), total=len(df), desc="Scoring"):
                pred = classify(rec.text)
                preds.append({"row_idx": int(rec.row_idx), "label": rec.label, "pred": pred})

            # Persist the predictions, then preview the first rows.
            pred_df = pd.DataFrame(preds)
            pred_df.to_csv(pred_path, index=False)
            print("Saved predictions to:", pred_path)
            pred_df.head()


In [None]:
# Compute F1 (macro) and Accuracy, mirroring the user's evaluation cell
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Load predictions: one row per row_idx, keeping the last occurrence of any duplicate.
# NOTE(review): this rebinds `df` to the predictions table (row_idx/label/pred), while the
# scoring cell iterates `df` and reads row["text"]. Re-running that cell after this one
# requires re-running the dataset cell first.
df = (
    pd.read_csv(pred_path)
    .sort_values("row_idx")
    .drop_duplicates("row_idx", keep="last")
)
# Rows the model could not answer are excluded from scoring.
ok = df.loc[df["pred"].ne("UNKNOWN")].copy()

# Define LABELS when this cell runs without the dataset cell.
if "LABELS" not in globals():
    LABELS = ["negative", "neutral", "positive"]

# --- Clean & normalize labels/predictions ---
def clean_labels(df, col):
    """Normalise column ``col`` of ``df`` in place and return ``df``.

    Values are stringified, trimmed and lower-cased, then digit codes and short
    forms are replaced with full label names. Other values pass through unchanged.

    NOTE(review): the digit mapping assumes 0=negative, 1=neutral, 2=positive;
    confirm this matches the encoding of whatever produced the numeric labels.
    """
    canonical = {
        "0": "negative", "1": "neutral", "2": "positive",
        "neg": "negative", "neu": "neutral", "pos": "positive",
    }
    tidy = df[col].astype(str).str.strip().str.lower()
    df[col] = tidy.replace(canonical)
    return df

# Normalise the gold and predicted columns.
ok = clean_labels(clean_labels(ok, "label"), "pred")

# Keep only rows where both gold and prediction are in the label vocabulary.
is_valid = ok["label"].isin(LABELS) & ok["pred"].isin(LABELS)
ok = ok[is_valid]

# --- Compute metrics ---
# Macro-F1 is computed over the fixed LABELS list; zero_division=0 sets undefined scores to 0.
y_true, y_pred = ok["label"], ok["pred"]
f1ma = f1_score(y_true, y_pred, labels=LABELS, average="macro", zero_division=0)
acc  = accuracy_score(y_true, y_pred)

print(f"F1: {f1ma:.4f}, Accuracy: {acc:.4f}, kept {len(ok)}/{len(df)} rows")
