News S&P 500 next-day direction

This builds a daily dataset from a CSV of headlines + close prices, then trains a small baseline classifier to predict whether the next day closes higher than the current day.

Two feature modes:
- bag-of-words: fast, shallow baseline on the concatenated daily headlines.
- FinBERT: uses a pretrained sentiment model to produce daily sentiment probabilities (neg/neu/pos) + headline count.


imports

In [None]:
import re
import hashlib
import os

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import matplotlib.pyplot as plt

Hashed bag-of-words BoW

We tokenize by extracting only letters (a–z, A–Z), lowercasing, then counting tokens into hash bins.
we use `log1p` to reduce the impact of frequent words.



In [41]:
word_re = re.compile(r"[a-zA-Z]+")

def toks(s):
    # Return list of lowercase word tokens from string
    if not isinstance(s, str):
        return []
    return word_re.findall(s.lower())

def stable_hash(word, bins):
    #Using md5 so it is stable across runs
    h = hashlib.md5(word.encode("utf-8")).hexdigest()
    return int(h[:8], 16) % bins

def make_hash_X(texts, bins=50000):
    # Build hashed BoW features for a list of texts
    X = np.zeros((len(texts), bins), dtype=np.float32)
    for i, t in enumerate(texts):
        for w in toks(t):
            X[i, stable_hash(w, bins)] += 1.0
    X = np.log1p(X)
    return torch.tensor(X, dtype=torch.float32)

Training loop, logistic regression

We treat the task as binary classification:
y = 1 if close_next > close
y = 0 otherwise

Model is very simple, it is one linear layer outputting a single logit.
Loss is defined as BCEWithLogitsLoss expects logits directly
Note, remmeber to delete print


In [42]:
def eval_acc(model, X, y):
    # Compute accuracy for a given model and data.
    dev = next(model.parameters()).device
    X, y = X.to(dev), y.to(dev)
    model.eval()
    with torch.no_grad():
        p = torch.sigmoid(model(X).squeeze(1))
        pred = (p >= 0.5).float()
        return (pred == y).float().mean().item()

def train_lr(Xtr, ytr, Xva, yva, epochs=15, lr=0.2, eval_every=1):
    #Train a logistic-regression style linear model
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    Xtr, ytr = Xtr.to(dev), ytr.to(dev)
    Xva, yva = Xva.to(dev), yva.to(dev)

    model = torch.nn.Linear(Xtr.shape[1], 1).to(dev)
    opt = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    for ep in range(1, epochs + 1):
        model.train()
        opt.zero_grad()
        logits = model(Xtr).squeeze(1)
        loss = loss_fn(logits, ytr)
        loss.backward()
        opt.step()

        if eval_every and (ep % eval_every == 0):
            va_acc = eval_acc(model, Xva, yva)
            print(f"ep {ep:02d} | loss {loss.item():.4f} | val_acc {va_acc:.4f}")
        else:
            print(f"ep {ep:02d} | loss {loss.item():.4f}")

    return model

def majority_baseline_acc(y_train, y_test):
    # Baseline: always predict the most common class in training
    p = 1.0 if np.mean(y_train) >= 0.5 else 0.0
    pred = np.full_like(y_test, p, dtype=float)
    acc = (pred == y_test).mean()
    print(f"majority_baseline_acc {acc:.4f}")
    return acc







FinBERT daily sentiment features, also caching.

For each day, run FinBERT on each headline then convert logits to probabilities with softmax (neg/neu/pos).
we then averege probabilities across headlines to get daily sentiment vector,
Add n_headlines as an extra feature as number of headlines. 

FinBERT is somehow slow. We cache the daily features to a CSV so repeated runs are fast. (it is fast running ones, but when testing 100 of times this is nice to have)



In [43]:
def finbert_feats(day_to_titles, model_name, bs=16, max_len=64):
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    tokz = AutoTokenizer.from_pretrained(model_name)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_name).to(dev)
    mdl.eval()

    dates = []
    feats = []
    n_news = []

    for d, titles in day_to_titles.items():
        titles = [t for t in titles if isinstance(t, str) and t.strip() != ""]
        n_news.append(len(titles))

        if len(titles) == 0:
            # if a day has no headlines, set neutral
            dates.append(d)
            feats.append([0.0, 1.0, 0.0])
            continue

        probs_all = []
        for i in range(0, len(titles), bs):
            batch = titles[i:i + bs]
            enc = tokz(
                batch,
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt",
            )
            enc = {k: v.to(dev) for k, v in enc.items()}
            with torch.no_grad():
                out = mdl(**enc)
                probs = torch.softmax(out.logits, dim=1)  # neg/neu/pos
                probs_all.append(probs.cpu())

        probs_all = torch.cat(probs_all, dim=0)
        feats.append(probs_all.mean(dim=0).tolist())
        dates.append(d)

    f = pd.DataFrame(feats, columns=["p_neg", "p_neu", "p_pos"])
    f["date"] = dates
    f["n_headlines"] = n_news
    return f

def get_finbert_daily(day_df, model_name, bs=16, max_len=64):
    safe = model_name.replace("/", "_")
    cache_path = f"finbert_daily_cache_{safe}_len{max_len}.csv"

    if os.path.exists(cache_path):
        f = pd.read_csv(cache_path)
        f["date"] = pd.to_datetime(f["date"]).dt.date
        return f

    day_map = dict(zip(day_df["date"], day_df["title_list"]))
    f = finbert_feats(day_map, model_name, bs=bs, max_len=max_len)
    f.to_csv(cache_path, index=False)
    return f




Load CSV and build dataset

Expected columns in the CSV:
- Date= date of the headline row
- cp= close price (or the price column you want to use)
- Title= headline text

Steps: 
1. Parse data
2. Aggregate by day (one groupby pass):
   - text: concatenated headlines (for BoW)
   - title_list: list of headlines (for FinBERT)
   - close: last close of that day
3. Create target y using the next day close
4. Split by year into train/val/test (time-based split)

OBS set bow of finbert manualy!!!


In [44]:
CSV_PATH = "sp500_headlines_2008_2024.csv"
MODE = "finbert"  # "bow" or finbert
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)

df = pd.read_csv(CSV_PATH)

df["date"] = pd.to_datetime(df["Date"], errors="coerce").dt.date
df["close"] = pd.to_numeric(df["CP"], errors="coerce")
df["title"] = df["Title"].astype(str)
df = df.dropna(subset=["date", "close", "title"]).copy()

# aggregate per day 
day = (
    df.sort_values("date")
      .groupby("date", as_index=False)
      .agg(
          text=("title", lambda s: " . ".join(s.tolist())),
          title_list=("title", lambda s: s.tolist()),
          close=("close", "last"),
      )
      .sort_values("date")
      .reset_index(drop=True)
)

# next day direction label
day["close_next"] = day["close"].shift(-1)
day = day.dropna(subset=["close_next"]).copy()
day["y"] = (day["close_next"] > day["close"]).astype(np.float32)

day["year"] = pd.to_datetime(day["date"]).dt.year

train = day[day["year"] <= 2018].copy()
val   = day[(day["year"] >= 2019) & (day["year"] <= 2021)].copy()
test  = day[day["year"] >= 2022].copy()

# baseline just to know if the model is doing anything useful
majority_baseline_acc(train["y"].values, test["y"].values)

ytr = torch.tensor(train["y"].values, dtype=torch.float32)
yva = torch.tensor(val["y"].values, dtype=torch.float32)
yte = torch.tensor(test["y"].values, dtype=torch.float32)

print("rows:", len(train), len(val), len(test))

majority_baseline_acc 0.4953
rows: 2222 747 537


Train depending on feature mode from earlier 

At the end we see test_acc


In [47]:
if MODE == "bow":
    Xtr = make_hash_X(train["text"].tolist(), bins=50000)
    Xva = make_hash_X(val["text"].tolist(), bins=50000)
    Xte = make_hash_X(test["text"].tolist(), bins=50000)

    model = train_lr(Xtr, ytr, Xva, yva, epochs=50, lr=0.2, eval_every=1)
    print(f"test_acc {eval_acc(model, Xte, yte):.4f}")

elif MODE == "finbert":
    FINBERT_MODEL = "ProsusAI/finbert"

    all_f = get_finbert_daily(day[["date", "title_list"]], FINBERT_MODEL, bs=16, max_len=64)

    ftr = train[["date"]].merge(all_f, on="date", how="left")
    fva = val[["date"]].merge(all_f, on="date", how="left")
    fte = test[["date"]].merge(all_f, on="date", how="left")

    Xtr = torch.tensor(ftr[["p_neg", "p_neu", "p_pos", "n_headlines"]].values, dtype=torch.float32)
    Xva = torch.tensor(fva[["p_neg", "p_neu", "p_pos", "n_headlines"]].values, dtype=torch.float32)
    Xte = torch.tensor(fte[["p_neg", "p_neu", "p_pos", "n_headlines"]].values, dtype=torch.float32)


    model = train_lr(Xtr, ytr, Xva, yva, epochs=500, lr=0.5, eval_every=2)
    print(f"test_acc {eval_acc(model, Xte, yte):.4f}")

else:
    raise ValueError("Mistakes were made")

ep 01 | loss 0.8361
ep 02 | loss 0.7046 | val_acc 0.5529
ep 03 | loss 0.7000
ep 04 | loss 0.6982 | val_acc 0.5502
ep 05 | loss 0.6973
ep 06 | loss 0.6967 | val_acc 0.5515
ep 07 | loss 0.6962
ep 08 | loss 0.6958 | val_acc 0.5582
ep 09 | loss 0.6954
ep 10 | loss 0.6950 | val_acc 0.5676
ep 11 | loss 0.6947
ep 12 | loss 0.6944 | val_acc 0.5689
ep 13 | loss 0.6941
ep 14 | loss 0.6938 | val_acc 0.5689
ep 15 | loss 0.6936
ep 16 | loss 0.6933 | val_acc 0.5703
ep 17 | loss 0.6931
ep 18 | loss 0.6929 | val_acc 0.5689
ep 19 | loss 0.6927
ep 20 | loss 0.6925 | val_acc 0.5730
ep 21 | loss 0.6924
ep 22 | loss 0.6922 | val_acc 0.5730
ep 23 | loss 0.6921
ep 24 | loss 0.6920 | val_acc 0.5743
ep 25 | loss 0.6918
ep 26 | loss 0.6917 | val_acc 0.5743
ep 27 | loss 0.6916
ep 28 | loss 0.6915 | val_acc 0.5743
ep 29 | loss 0.6914
ep 30 | loss 0.6913 | val_acc 0.5730
ep 31 | loss 0.6913
ep 32 | loss 0.6912 | val_acc 0.5743
ep 33 | loss 0.6911
ep 34 | loss 0.6910 | val_acc 0.5770
ep 35 | loss 0.6910
ep 36 | los

naive bayes experiment, not sure if i have the time to 

In [46]:
#naive bayes experiment

def _nb_make_hash_counts(texts, bins=50000):
    X = np.zeros((len(texts), bins), dtype=np.float32)
    for i, t in enumerate(texts):
        for w in toks(t):
            X[i, stable_hash(w, bins)] += 1.0
    return X

def _nb_fit_multinomial(X, y, alpha=1.0):
    y = np.asarray(y).astype(np.int64)
    n, d = X.shape

    n1 = int(y.sum())
    n0 = n - n1

    logp0 = np.log((n0 + 1e-12) / n)
    logp1 = np.log((n1 + 1e-12) / n)


    X0 = X[y == 0].sum(axis=0) + alpha
    X1 = X[y == 1].sum(axis=0) + alpha

    logphi0 = np.log(X0 / X0.sum())
    logphi1 = np.log(X1 / X1.sum())

    return {"logp0": logp0, "logp1": logp1, "logphi0": logphi0, "logphi1": logphi1}

def _nb_predict_p1(m, X):
    s0 = m["logp0"] + X @ m["logphi0"]
    s1 = m["logp1"] + X @ m["logphi1"]
    mx = np.maximum(s0, s1)
    p1 = np.exp(s1 - mx) / (np.exp(s0 - mx) + np.exp(s1 - mx))
    return p1

def _nb_acc(m, X, y, name="test"):
    y = np.asarray(y).astype(np.float32)
    p1 = _nb_predict_p1(m, X)
    pred = (p1 >= 0.5).astype(np.float32)
    acc = (pred == y).mean()
    print(f"nb_{name}_acc {acc:.4f}")
    return acc



try:
    #
    Xtr_nb = _nb_make_hash_counts(train["text"].tolist(), bins=50000)
    Xva_nb = _nb_make_hash_counts(val["text"].tolist(), bins=50000)
    Xte_nb = _nb_make_hash_counts(test["text"].tolist(), bins=50000)

    nb_model = _nb_fit_multinomial(Xtr_nb, train["y"].values, alpha=1.0)
    _nb_acc(nb_model, Xva_nb, val["y"].values, name="val")
    _nb_acc(nb_model, Xte_nb, test["y"].values, name="test")

except NameError:
    print("Mistakes were made again")


nb_val_acc 0.5382
nb_test_acc 0.4991
