In [1]:
!pip -q install --upgrade openai datasets scikit-learn tqdm huggingface_hub


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m116.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarro

In [2]:
# General config
DATASET_NAME = "TheFinAI/en-fpb"
SPLIT = "train"      # the dataset provides a 'train' split (~3.1k examples)
SEED = 42
N_SAMPLES = 100      # set to None for full dataset

# Sentiment labels we expect from the model
ALLOWED = ["positive", "neutral", "negative"]

# System prompt to force single-word outputs
SYSTEM_PROMPT = (
    "You are a finance sentiment rater. "
    "Return exactly one word from this set: positive, neutral, negative. "
    "No explanation. If uncertain, choose neutral."
)


In [3]:
import os, getpass

# Clear any bad value first
os.environ.pop("DEEPSEEK_API_KEY", None)

key = getpass.getpass("Paste your REAL DeepSeek API key (hidden): ").strip()

# Basic sanity checks (don’t paste keys in chat)
if not key.startswith("sk-"):
    raise ValueError("This doesn't look like a DeepSeek API key (expected to start with 'sk-').")
if " " in key or "\n" in key or "\t" in key:
    raise ValueError("Your key contains whitespace characters. Paste exactly the key only.")
if len(key) < 30:
    raise ValueError("Key looks too short. Double-check you copied the full key.")

os.environ["DEEPSEEK_API_KEY"] = key
print("Key set. Last 6 chars:", key[-6:])


Paste your REAL DeepSeek API key (hidden): ··········
Key set. Last 6 chars: 1f573d


In [5]:
!pip -q install --upgrade huggingface_hub datasets
from huggingface_hub import login, whoami
import getpass

hf_token = getpass.getpass("Paste your Hugging Face token (hidden): ").strip()
login(token=hf_token)          # stores the token in your Colab env
print("Logged in as:", whoami().get("name", "unknown"))

from datasets import load_dataset

# defaults if you skipped earlier config cells
try: SEED
except NameError: SEED = 42
try: N_SAMPLES
except NameError: N_SAMPLES = 50  # set to None for full split

def load_en_fpb(hf_token):
    try:
        # Newer datasets versions
        return load_dataset("TheFinAI/en-fpb", split="train", token=hf_token)
    except TypeError:
        # Older datasets versions
        return load_dataset("TheFinAI/en-fpb", split="train", use_auth_token=hf_token)

ds = load_en_fpb(hf_token)
if N_SAMPLES is not None:
    ds = ds.shuffle(seed=SEED).select(range(min(N_SAMPLES, len(ds))))
print("Rows to evaluate:", len(ds))
ds[0]


Paste your Hugging Face token (hidden): ··········
Logged in as: Hanlin0914
Rows to evaluate: 100


{'id': 'fpb2204',
 'query': 'Analyze the sentiment of this statement extracted from a financial news article. Provide your answer as either negative, positive, or neutral.\nText: In August , Latvijas Finieris ordered all production lines for a new green veneer mill to be built in Ukmerge , Lithuania .\nAnswer:',
 'answer': 'neutral',
 'text': 'In August , Latvijas Finieris ordered all production lines for a new green veneer mill to be built in Ukmerge , Lithuania .',
 'choices': ['positive', 'neutral', 'negative'],
 'gold': 1}

In [8]:
# Cell 6 (self-contained): pick a DeepSeek model if missing, then run inference with caching
import os, requests, json, pathlib, re, time
from tqdm import tqdm

# ---- Guard: dataset must be loaded (Cell A2/B1) ----
assert 'ds' in globals(), "Dataset 'ds' not found. Run your dataset-loading cell first."

# ---- Auth / HTTP setup ----
if not os.environ.get("DEEPSEEK_API_KEY"):
    raise RuntimeError("Set DEEPSEEK_API_KEY first (the real key from your DeepSeek console).")

BASE_URL = "https://api.deepseek.com/v1"
HEADERS = {
    "Authorization": f"Bearer {os.environ['DEEPSEEK_API_KEY']}",
    "Content-Type": "application/json",
}

# ---- Pick a model if MODEL is not already defined ----
def _ping(model: str) -> bool:
    try:
        r = requests.post(
            f"{BASE_URL}/chat/completions",
            headers=HEADERS,
            json={
                "model": model,
                "messages": [
                    {"role":"system","content":"Say ok"},
                    {"role":"user","content":"ok"}
                ],
                "temperature": 0,
                "max_tokens": 2,
            },
            timeout=30,
        )
        return r.status_code == 200
    except Exception:
        return False

MODEL = globals().get("MODEL")
if not MODEL:
    for cand in ["deepseek-reasoner","deepseek-r1","deepseek-reasoner-70b",
                 "deepseek-reasoner-32b","deepseek-r1-distill","deepseek-reasoner-lite",
                 "deepseek-chat","deepseek-v3"]:
        if _ping(cand):
            MODEL = cand
            break
    if not MODEL:
        raise RuntimeError("No accessible DeepSeek model with this key (401/403). Check your key/billing.")

print("Using MODEL:", MODEL)

# ---- Prompting helpers ----
ALLOWED = ["positive", "neutral", "negative"]
SYSTEM_PROMPT = (
    "You are a finance sentiment rater. "
    "Return exactly one word from this set: positive, neutral, negative. "
    "No explanation. If uncertain, choose neutral."
)

def _normalize_label(s: str) -> str:
    if not s: return "neutral"
    s = s.strip().lower()
    for lab in ALLOWED:
        if re.search(rf"\b{lab}\b", s):
            return lab
    for tok in re.sub(r"[^a-z]", " ", s).split():
        if tok in ALLOWED:
            return tok
    return "neutral"

def _user_prompt(text: str) -> str:
    return (
        "Classify the sentiment of the following financial news statement as "
        "one of: positive, neutral, negative.\n\n"
        f"Statement: {text}\n\n"
        "Answer with only one word."
    )

def _chat_once(prompt: str, tries: int = 6) -> str:
    backoff = 1.5
    for _ in range(tries):
        r = requests.post(
            f"{BASE_URL}/chat/completions",
            headers=HEADERS,
            json={
                "model": MODEL,
                "messages": [
                    {"role":"system","content": SYSTEM_PROMPT},
                    {"role":"user","content": prompt}
                ],
                "temperature": 0,
                "max_tokens": 6
            },
            timeout=60
        )
        if r.status_code == 200:
            return r.json()["choices"][0]["message"]["content"]
        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep(backoff); backoff *= 1.8
            continue
        if r.status_code in (401, 403):
            raise RuntimeError("DeepSeek authentication failed (401/403). Double-check DEEPSEEK_API_KEY.")
        raise RuntimeError(f"API error {r.status_code}: {r.text[:250]}")
    raise RuntimeError("Exceeded retries to DeepSeek API.")

# ---- Inference with caching ----
CACHE_PATH = pathlib.Path(f"{MODEL.replace('/','_')}_en_fpb_predictions.jsonl")
pred_rows, seen_ids = [], set()

if CACHE_PATH.exists():
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            pred_rows.append(obj); seen_ids.add(obj["id"])
    print(f"Resumed from cache: {len(pred_rows)} rows")

for i, row in enumerate(tqdm(ds, total=len(ds))):
    rid = row.get("id", f"row_{i}")
    if rid in seen_ids:
        continue
    raw = _chat_once(_user_prompt(row["text"]))
    pred = _normalize_label(raw)
    gold = (row.get("answer") or row.get("label")).lower()
    out = {"id": rid, "text": row["text"], "gold": gold, "pred": pred, "raw": raw}
    pred_rows.append(out)
    with open(CACHE_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps(out, ensure_ascii=False) + "\n")

len(pred_rows)


Using MODEL: deepseek-reasoner


100%|██████████| 100/100 [02:21<00:00,  1.41s/it]


100

In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

df = pd.DataFrame(pred_rows)
y_true = df["gold"].str.lower().values
y_pred = df["pred"].str.lower().values

acc = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average="macro")

print(f"MODEL USED : {MODEL}")
print(f"Samples    : {len(df)}")
print(f"Accuracy   : {acc:.4f}")
print(f"Macro F1   : {macro_f1:.4f}\n")

print(classification_report(y_true, y_pred, labels=ALLOWED, digits=4, zero_division=0))
cm = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=ALLOWED), index=ALLOWED, columns=ALLOWED)
print("\nConfusion matrix (rows=true, cols=pred):")
cm


MODEL USED : deepseek-reasoner
Samples    : 100
Accuracy   : 0.5200
Macro F1   : 0.2281

              precision    recall  f1-score   support

    positive     0.0000    0.0000    0.0000        33
     neutral     0.5200    1.0000    0.6842        52
    negative     0.0000    0.0000    0.0000        15

    accuracy                         0.5200       100
   macro avg     0.1733    0.3333    0.2281       100
weighted avg     0.2704    0.5200    0.3558       100


Confusion matrix (rows=true, cols=pred):


Unnamed: 0,positive,neutral,negative
positive,0,33,0
neutral,0,52,0
negative,0,15,0
