In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm

# 1) HuggingFace / FinBERT imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer, util

# 2) Scikit‐learn for regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import yfinance as yf

# ============================================
#  STEP A: LOAD RAW DATA
# ============================================

# ---- A.1: News CSV ----
# Assumed format: "news_2022.csv" with columns:
#    Date (e.g. "2022-01-03 09:15:00 UTC"), 
#    Article_title (headline string)
news_df = pd.read_csv("nasdaq_2022.csv", parse_dates=["Date"])
news_df.rename(columns={"Date": "timestamp", "Article_title": "headline"}, inplace=True)

# If there is a 'Ticker' column in your CSV, you can keep it. 
# For simplicity below, we assume all headlines refer to **one** ticker. If you have multiple tickers,
# you can add a "Ticker" column and then groupby ticker as shown later.

# Extract just the calendar date for grouping (UTC → date)
news_df["date"] = news_df["timestamp"].dt.date

# ---- A.2: Returns CSV ----
# Assumed format: "returns_2022.csv" with at least columns:
#    Date (e.g. "2022-01-03"),
#    Close (float)
# Download daily adjusted close prices for Google (GOOG) for 2022
goog = yf.download("GOOG", start="2022-01-01", end="2023-01-01", progress=False)
ret_df = goog.reset_index()[["Date", "Close"]]
ret_df.rename(columns={"Date": "date", "Close": "close"}, inplace=True)
ret_df["date"] = ret_df["date"].dt.date

# Compute next‐day return per date. We'll align "news on D" → "return on D+1".
ret_df["next_day_return"] = (
    ret_df["close"].shift(-1) - ret_df["close"]
) / ret_df["close"]

# Drop the last row (it has no next‐day price)
ret_df = ret_df.iloc[:-1].copy()

# ============================================
#  STEP B: SET UP EMBEDDERS & FINBERT
# ============================================

# ---- B.1: Sentence‐Transformer for relevance ----
# We'll load a pretrained sentence‐transformer (e.g. all‐MPNet‐base‐v2) to embed headlines + a ticker description.
# If you prefer a financial‐domain SBERT (e.g. "patrickvonplaten/finbert-sentiment" or any custom model),
# just swap the model name below.
sbert_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Pre‐compute an embedding for your stock description. For example:
stock_description = (
    "Alphabet Inc. (Google) is a global technology company specializing in internet-related services and products, including search, advertising, cloud computing, software, and hardware. "
    "Its core products include Google Search, YouTube, Android, Google Cloud, and more."
)
stock_desc_emb = sbert_model.encode(stock_description, convert_to_tensor=True)
# If you have multiple tickers, you could do a dict { "AAPL": embAAPL, "TSLA": embTSLA, ... }.

# ---- B.2: FinBERT sentiment pipeline ----
# We will load the FinBERT model from HuggingFace. Note: this may download ~400 MB the first time.
finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
finbert_pipeline = pipeline(
    "sentiment-analysis",
    model=finbert_model,
    tokenizer=finbert_tokenizer,
    device=0  # Use GPU (cuda:0)
)

# Helper: take the pipeline output and return (P_pos, P_neg, P_neu)
def finbert_probs_for_text(text: str):
    """
    Returns a tuple (P_pos, P_neg, P_neu) for the given headline text.
    The FinBERT pipeline returns something like:
      [ {"label":"Positive","score":0.897}, {"label":"Neutral","score":0.070}, {"label":"Negative","score":0.033} ]
    We reorder or match appropriately.
    """
    raw = finbert_pipeline(text, truncation=True, max_length=128)
    # The raw output may be a list of 3 dicts; we map them into a dict label→score
    probs = { d["label"].lower(): d["score"] for d in raw }
    # Ensure all three keys exist; if missing, set to 0:
    Ppos = probs.get("positive", 0.0)
    Pneu = probs.get("neutral", 0.0)
    Pneg = probs.get("negative", 0.0)
    return Ppos, Pneg, Pneu

  from .autonotebook import tqdm as notebook_tqdm



YF.download() has changed argument auto_adjust default to True



1 Failed download:
['NVDA']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')
Device set to use cuda:0


In [5]:
# ============================================
#  STEP C: PROCESS EACH ARTICLE
# ============================================

# We'll build three new columns in news_df:
#   1) r_i = ## relevance (cosine between stock_desc_emb and headline_emb)
#   2) P_pos, P_neg, P_neu (from FinBERT)
#   3) sentiment_scalar = P_pos - P_neg
#   4) feat_pos = r_i * P_pos, feat_neg = r_i * P_neg, feat_neu = r_i * P_neu

relevance_list = []
Ppos_list = []
Pneg_list = []
Pneu_list = []
sent_scalar_list = []
feat_pos_list = []
feat_neg_list = []
feat_neu_list = []

# For speed, do all headline embeddings in batch:
all_headlines = news_df["headline"].tolist()
headline_embs = sbert_model.encode(all_headlines, convert_to_tensor=True, show_progress_bar=True)

for idx, row in tqdm(news_df.iterrows(), total=len(news_df)):
    h_emb = headline_embs[idx]  # torch.Tensor
    # 1) relevance = cosine(stock_desc_emb, headline_embs[idx])
    cos_sim = util.cos_sim(stock_desc_emb, h_emb).item()
    relevance_list.append(cos_sim)

    # 2) FinBERT probabilities:
    Ppos, Pneg, Pneu = finbert_probs_for_text(row["headline"])
    Ppos_list.append(Ppos)
    Pneg_list.append(Pneg)
    Pneu_list.append(Pneu)

    # 3) sentiment_scalar = P_pos - P_neg
    s_scalar = Ppos - Pneg
    sent_scalar_list.append(s_scalar)

    # 4) features for regression
    feat_pos_list.append(cos_sim * Ppos)
    feat_neg_list.append(cos_sim * Pneg)
    feat_neu_list.append(cos_sim * Pneu)

# Attach new columns
news_df["relevance"] = relevance_list
news_df["P_pos"] = Ppos_list
news_df["P_neg"] = Pneg_list
news_df["P_neu"] = Pneu_list
news_df["sentiment_scalar"] = sent_scalar_list
news_df["feat_pos"] = feat_pos_list
news_df["feat_neg"] = feat_neg_list
news_df["feat_neu"] = feat_neu_list

# (Optional) Drop very low‐relevance articles (e.g. r_i < 0.2)
relevance_threshold = 0.2
news_df = news_df[news_df["relevance"] >= relevance_threshold].copy()

Batches:   0%|          | 0/8762 [00:00<?, ?it/s]

100%|██████████| 280354/280354 [53:48<00:00, 86.83it/s] 


In [None]:
import pickle

## Save all relevant lists to disk using pickle
#with open("relevance_list.pkl", "wb") as f:
#    pickle.dump(relevance_list, f)
#
#with open("Ppos_list.pkl", "wb") as f:
#    pickle.dump(Ppos_list, f)
#
#with open("Pneg_list.pkl", "wb") as f:
#    pickle.dump(Pneg_list, f)
#
#with open("Pneu_list.pkl", "wb") as f:
#    pickle.dump(Pneu_list, f)
#
#with open("sent_scalar_list.pkl", "wb") as f:
#    pickle.dump(sent_scalar_list, f)
#
#with open("feat_pos_list.pkl", "wb") as f:
#    pickle.dump(feat_pos_list, f)
#
#with open("feat_neg_list.pkl", "wb") as f:
#    pickle.dump(feat_neg_list, f)
#
#with open("feat_neu_list.pkl", "wb") as f:
#    pickle.dump(feat_neu_list, f)

In [2]:
import pickle

with open("relevance_list.pkl", "rb") as f:
    relevance_list = pickle.load(f)

with open("Ppos_list.pkl", "rb") as f:
    Ppos_list = pickle.load(f)

with open("Pneg_list.pkl", "rb") as f:
    Pneg_list = pickle.load(f)

with open("Pneu_list.pkl", "rb") as f:
    Pneu_list = pickle.load(f)

with open("sent_scalar_list.pkl", "rb") as f:
    sent_scalar_list = pickle.load(f)

with open("feat_pos_list.pkl", "rb") as f:
    feat_pos_list = pickle.load(f)

with open("feat_neg_list.pkl", "rb") as f:
    feat_neg_list = pickle.load(f)

with open("feat_neu_list.pkl", "rb") as f:
    feat_neu_list = pickle.load(f)

In [29]:
# ============================================
#  STEP D: AGGREGATE TO DAILY “NormSentiment”
# ============================================

epsilon = 1e-6

# Ensure 'relevance' and 'sentiment_scalar' columns exist in news_df
if "relevance" not in news_df.columns or "sentiment_scalar" not in news_df.columns:
    news_df = news_df.copy()
    news_df["relevance"] = relevance_list
    news_df["sentiment_scalar"] = sent_scalar_list

# 1) Compute RawSentiment_D = sum_i (r_i * (P_pos_i - P_neg_i)) per date
# 2) Compute TotalRelevance_D = sum_i (r_i) per date
# 3) NormSentiment_D = RawSentiment_D / (TotalRelevance_D + ε)

grouped = news_df.groupby("date")
daily_raw  = grouped.apply(lambda df: np.sum(df["relevance"] * df["sentiment_scalar"])).rename("RawSentiment")
daily_rel  = grouped["relevance"].sum().rename("TotalRelevance")

daily_sentiment_df = pd.concat([daily_raw, daily_rel], axis=1).reset_index()
daily_sentiment_df["NormSentiment"] = (
    daily_sentiment_df["RawSentiment"] / (daily_sentiment_df["TotalRelevance"] + epsilon)
)

# Now `daily_sentiment_df` has columns:
#    date | RawSentiment | TotalRelevance | NormSentiment

# ============================================
#  STEP E: MERGE DAILY SENTIMENT WITH RETURNS
# ============================================

# Our ret_df has columns [date, close, next_day_return].
# We want to align “news on date D” → “return on date D+1”.
# We already computed ret_df["next_day_return"] via shift(-1). 
# So we can merge on `date`=D between daily_sentiment_df (signal on D) and ret_df (return on D).

# Flatten ret_df columns if they are MultiIndex
if isinstance(ret_df.columns, pd.MultiIndex):
    ret_df.columns = ['_'.join([str(i) for i in col if i]).strip('_') for col in ret_df.columns.values]

merged_daily = pd.merge(
    left = daily_sentiment_df,
    right = ret_df[["date", "next_day_return"]],
    on = "date",
    how = "inner"
)

# merged_daily now has: date | RawSentiment | TotalRelevance | NormSentiment | next_day_return
# Save daily_sentiment_df to CSV
daily_sentiment_df.to_csv("daily_sentiment_df.csv", index=False)

  daily_raw  = grouped.apply(lambda df: np.sum(df["relevance"] * df["sentiment_scalar"])).rename("RawSentiment")


In [11]:
# ============================================
#  STEP F: LEARN f VIA RIDGE REGRESSION
# ============================================

# We want to train on **article‐level** data. That is,
# each row is a single article_i, with (feat_pos, feat_neg, feat_neu)  → realized next_day_return_i.
# But we only know the next‐day return at the daily level. 
# So we attach each article_i (with `date`=D) to ret(D+1). Since we already computed ret_df["next_day_return"],
# we do an article‐level merge:
# Ensure feat_pos, feat_neg, feat_neu columns exist in news_df
if not all(col in news_df.columns for col in ["feat_pos", "feat_neg", "feat_neu"]):
    news_df = news_df.copy()
    news_df["feat_pos"] = feat_pos_list
    news_df["feat_neg"] = feat_neg_list
    news_df["feat_neu"] = feat_neu_list

articles_with_returns = pd.merge(
    left = news_df,
    right = ret_df[["date", "next_day_return"]],
    on = "date",
    how = "inner"
)
# Now each article_i has three features: feat_pos_i, feat_neg_i, feat_neu_i, 
# and label = next_day_return (same for all articles on date D).

X = articles_with_returns[["feat_pos", "feat_neg", "feat_neu"]]
y = articles_with_returns["next_day_return"]

# Split into train / test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Extract learned coefficients
coeffs_df = pd.DataFrame({
    "feature": ["r * P_pos", "r * P_neg", "r * P_neu"],
    "coefficient": ridge.coef_
})

print("\n=== Learned coefficients for f(r,P) ===")
print(coeffs_df.to_string(index=False))
print(f"\nTest R^2 score = {ridge.score(X_test, y_test):.4f}")

# ============================================
#  STEP G: HOW TO USE THE LEARNED f GOING FORWARD
# ============================================

w_pos, w_neg, w_neu = ridge.coef_

# Suppose you want to compute, for a brand‐new article with (r_new, P_pos_new, P_neg_new, P_neu_new):
#       article_score = w_pos * (r_new * P_pos_new)
#                     + w_neg * (r_new * P_neg_new)
#                     + w_neu * (r_new * P_neu_new)
#
# Then you can re‐aggregate all article_scores for date D:
#   LearnedRaw_D = Σ_i article_score_i 
# and optionally
#   LearnedNorm_D = LearnedRaw_D / ( Σ_i r_i + ε )
#
# If you store (r, P_pos, P_neg, P_neu) in a DataFrame again, you just do:

# Example (pseudocode) if you have a DataFrame `new_articles_df` for “tomorrow’s headlines”:
# new_articles_df["article_score"] = (
#     w_pos * (new_articles_df["relevance"] * new_articles_df["P_pos"])
#   + w_neg * (new_articles_df["relevance"] * new_articles_df["P_neg"])
#   + w_neu * (new_articles_df["relevance"] * new_articles_df["P_neu"])
# )
# Then per date D: 
# LearnedRaw = new_articles_df.groupby("date")["article_score"].sum()
# LearnedNorm = LearnedRaw / (new_articles_df.groupby("date")["relevance"].sum() + epsilon)

# ============================================
#  STEP H: SUMMARY OUTPUTS
# ============================================

# 1) daily_sentiment_df  → has NormSentiment per day
print("\n=== Sample of daily_sentiment_df ===")
print(daily_sentiment_df.head().to_string(index=False))

# 2) merged_daily → (NormSentiment, next_day_return) per day
print("\n=== Sample of merged_daily (NormSentiment vs. next_day_return) ===")
print(merged_daily.head().to_string(index=False))

# 3) coeffs_df → gives w_pos, w_neg, w_neu
print("\n=== Learned f coefficients ===")
print(coeffs_df.to_string(index=False))



=== Learned coefficients for f(r,P) ===
  feature  coefficient
r * P_pos    -0.003063
r * P_neg     0.002731
r * P_neu    -0.002993

Test R^2 score = 0.0000

=== Sample of daily_sentiment_df ===
      date  RawSentiment  TotalRelevance  NormSentiment
2022-01-01     -0.663641        8.145452      -0.081474
2022-01-02      0.202113       15.495500       0.013043
2022-01-03      7.190523       68.578552       0.104851
2022-01-04      4.512509       72.306265       0.062408
2022-01-05      4.750061       73.277095       0.064823

=== Sample of merged_daily (NormSentiment vs. next_day_return) ===
      date  RawSentiment  TotalRelevance  NormSentiment  next_day_return
2022-01-03      7.190523       68.578552       0.104851        -0.004536
2022-01-04      4.512509       72.306265       0.062408        -0.046830
2022-01-05      4.750061       73.277095       0.064823        -0.000745
2022-01-06      6.030978       74.090826       0.081400        -0.003973
2022-01-07      0.339609       75.9

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

# 1) Split data
articles = articles_with_returns  # from previous merge
X = articles[["feat_pos", "feat_neg", "feat_neu"]]
y = articles["next_day_return"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2) Tune hyperparameters via a small grid
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, None],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["auto", "sqrt"]
}
grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1
)
grid.fit(X_train, y_train)
best_rf = grid.best_estimator_

print("Best RF params:", grid.best_params_)
print("CV R2 (best):", grid.best_score_)
print("Test R2 :", best_rf.score(X_test, y_test))

# 3) Inspect feature importances
import pandas as pd
importances = best_rf.feature_importances_
feat_names = ["r * P_pos", "r * P_neg", "r * P_neu"]
feat_imp = pd.DataFrame({
    "feature": feat_names,
    "importance": importances
}).sort_values("importance", ascending=False)
print(feat_imp)

# 4) (Optional) Evaluate classification accuracy
#    You could binarize the target: up = (next_day_return > 0)
#    and train RandomForestClassifier on the same features.


54 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
38 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\thoma\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\thoma\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\thoma\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\thoma\AppData\Local\Programs\Python\Python312\Lib

Best RF params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 200}
CV R2 (best): 0.048969454771427
Test R2 : 0.10973861242436234
     feature  importance
2  r * P_neu    0.583131
0  r * P_pos    0.225721
1  r * P_neg    0.191147


In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import pandas as pd

X = merged_daily[['RawSentiment', 'TotalRelevance', 'NormSentiment']]
y = (merged_daily['next_day_return'] > 0).astype(int)

tscv = TimeSeriesSplit(n_splits=5)

clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

cv_scores = []
for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    
    acc = accuracy_score(y_val, y_pred)
    cv_scores.append(acc)
    
    print("Fold accuracy:", round(acc, 4))
    print(confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred))

print("\nAverage temporal CV accuracy:", round(np.mean(cv_scores), 4))


ValueError: Too many splits=5 for number of samples=250 with test_size=100 and gap=0.

In [25]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
import math
from sklearn.metrics import f1_score

# ------------------------------------------------------------------------------
# ASSUMPTIONS:
# 1) You already have `news_df` with columns:
#     ['date' (datetime.date),
#      'relevance' (float),
#      'P_pos', 'P_neg', 'P_neu' (floats)]
#
# 2) You already have `returns_df` with columns:
#     ['date' (datetime.date),
#      'next_day_return' (float)]
#
# 3) `news_df['date']` maps exactly to the same "date" field in `returns_df`.
# ------------------------------------------------------------------------------
# Example placeholders (remove these if you already loaded your data):
# news_df = pd.read_csv("news_processed.csv", parse_dates=["date"])
# returns_df = pd.read_csv("returns_processed.csv", parse_dates=["date"])
# returns_df["next_day_return"] = (returns_df["close"].shift(-1) - returns_df["close"]) / returns_df["close"]
# returns_df = returns_df.iloc[:-1]  # drop last row (no next‐day return)

# ------------------------------------------------------------------------------
# 1) Define a bunch of candidate f() functions
#    Each function takes a row of news_df (with r, P_pos, P_neg, P_neu) and returns a float
# ------------------------------------------------------------------------------

def f1(r, ppos, pneg, pneu):
    # Classic linear: r * (P_pos - P_neg)
    return r * (ppos - pneg)

def f2(r, ppos, pneg, pneu):
    # Only positive‐weighted by relevance
    return r * ppos

def f3(r, ppos, pneg, pneu):
    # Mix positive and neutral as a boost: r * (P_pos + 0.5 * P_neu)
    return r * (ppos + 0.5 * pneu)

def f4(r, ppos, pneg, pneu):
    # Drop relevance entirely, just “pos-neg”
    return (ppos - pneg)

def f5(r, ppos, pneg, pneu):
    # Exponential of the classic linear: exp( r * (P_pos - P_neg) )
    # (shifts everything >1, but we’ll still sum up per day)
    return math.exp(r * (ppos - pneg))

def f6(r, ppos, pneg, pneu):
    # r * exp(P_pos - P_neg)  (relevance times exponential of “net sentiment”)
    return r * math.exp(ppos - pneg)

def f7(r, ppos, pneg, pneu):
    # r * logistic( P_pos - P_neg )  to squash net sentiment into (0,1)
    z = ppos - pneg
    return r * (1 / (1 + math.exp(-z)))

def f8(r, ppos, pneg, pneu):
    # Polynomial: r * (P_pos - P_neg)^2  so that strong sentiment (pos or neg) is amplified
    return r * ((ppos - pneg) ** 2)

def f9(r, ppos, pneg, pneu):
    # “Hybrid” that penalizes neutral: r * (P_pos - P_neg) * (1 - P_neu)
    return r * (ppos - pneg) * (1 - pneu)

# You can add more variants here, e.g. f10, f11, etc.

f_functions = {
    "f1: r*(P_pos-P_neg)":      f1,
    "f2: r*P_pos":              f2,
    "f3: r*(P_pos+0.5P_neu)":    f3,
    "f4: (P_pos-P_neg)":         f4,
    "f5: exp(r*(P_pos-P_neg))":  f5,
    "f6: r*exp(P_pos-P_neg)":    f6,
    "f7: r*sigmoid(P_pos-P_neg)":f7,
    "f8: r*(P_pos-P_neg)^2":     f8,
    "f9: r*(P_pos-P_neg)*(1-P_neu)": f9
}

# ------------------------------------------------------------------------------
# 2) Loop over each f, aggregate daily, run time-series CV, print mean accuracy
# ------------------------------------------------------------------------------

results = {}

for name, func in f_functions.items():
    print(f"\n=== Evaluating {name} ===")
    
    # 2.a) Apply `func` to all rows of news_df to get an “article_score” column.
    #      We assume news_df has columns: ['date', 'relevance', 'P_pos', 'P_neg', 'P_neu'].
    news_df["article_score"] = news_df.apply(
        lambda row: func(row["relevance"], row["P_pos"], row["P_neg"], row["P_neu"]),
        axis=1
    )
    
    # 2.b) Aggregate per day: DailySentiment = sum of article_score for that date
    daily_sentiment = (
        news_df
        .groupby("date")["article_score"]
        .sum()
        .reset_index()
        .rename(columns={"article_score": "daily_sentiment"})
    )
    
    # 2.c) Merge with ret_df so that daily_sentiment(D) → next_day_return(D)
    merged = pd.merge(
        daily_sentiment, 
        ret_df[["date", "next_day_return"]],
        on="date",
        how="inner"
    )
    
    # 2.d) Build binary labels: 1 if next_day_return > 0, else 0
    merged["label"] = (merged["next_day_return"] > 0).astype(int)
    
    # 2.e) Prepare X, y
    X = merged[["daily_sentiment"]].copy()
    y = merged["label"].copy()
    
    # 2.f) TimeSeriesSplit CV
    tscv = TimeSeriesSplit(n_splits=5)
    fold_acc = []
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        clf = RandomForestClassifier(
            n_estimators=200, 
            max_depth=None, 
            min_samples_leaf=5, 
            max_features="sqrt", 
            random_state=42,
            n_jobs=-1
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        fold_acc.append(acc)
        
        # Optionally, print each fold’s confusion matrix / classification report:
        # from sklearn.metrics import confusion_matrix, classification_report
        # print(confusion_matrix(y_test, y_pred))
        # print(classification_report(y_test, y_pred))
    
    mean_acc = np.mean(fold_acc)
    std_acc  = np.std(fold_acc)
    print(f"→ Mean CV accuracy = {mean_acc:.4f}  (± {std_acc:.4f})")
    results[name] = (mean_acc, std_acc)

# ------------------------------------------------------------------------------
# 3) Summarize all results at once
# ------------------------------------------------------------------------------
print("\n=== Summary of Mean CV Accuracies ===")
for name, (mean_acc, std_acc) in results.items():
    print(f"{name:30s} →  {mean_acc:.4f}  ± {std_acc:.4f}")

print("\n=== Summary of Mean CV F1 Scores ===")
for name, func in f_functions.items():
	# Recompute daily sentiment and labels for each function
	news_df["article_score"] = news_df.apply(
		lambda row: func(row["relevance"], row["P_pos"], row["P_neg"], row["P_neu"]),
		axis=1
	)
	daily_sentiment = (
		news_df
		.groupby("date")["article_score"]
		.sum()
		.reset_index()
		.rename(columns={"article_score": "daily_sentiment"})
	)
	merged = pd.merge(
		daily_sentiment, 
		ret_df[["date", "next_day_return"]],
		on="date",
		how="inner"
	)
	merged["label"] = (merged["next_day_return"] > 0).astype(int)
	X = merged[["daily_sentiment"]].copy()
	y = merged["label"].copy()
	tscv = TimeSeriesSplit(n_splits=5)
	f1_scores = []
	for train_idx, test_idx in tscv.split(X):
		X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
		y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
		clf = RandomForestClassifier(
			n_estimators=200, 
			max_depth=None, 
			min_samples_leaf=5, 
			max_features="sqrt", 
			random_state=42,
			n_jobs=-1
		)
		clf.fit(X_train, y_train)
		y_pred = clf.predict(X_test)
		f1 = f1_score(y_test, y_pred, zero_division=0)
		f1_scores.append(f1)
	mean_f1 = np.mean(f1_scores)
	std_f1 = np.std(f1_scores)
	print(f"{name:30s} →  {mean_f1:.4f}  ± {std_f1:.4f}")


=== Evaluating f1: r*(P_pos-P_neg) ===
→ Mean CV accuracy = 0.5463  (± 0.0995)

=== Evaluating f2: r*P_pos ===
→ Mean CV accuracy = 0.4927  (± 0.0679)

=== Evaluating f3: r*(P_pos+0.5P_neu) ===
→ Mean CV accuracy = 0.4683  (± 0.0605)

=== Evaluating f4: (P_pos-P_neg) ===
→ Mean CV accuracy = 0.4976  (± 0.0293)

=== Evaluating f5: exp(r*(P_pos-P_neg)) ===
→ Mean CV accuracy = 0.4732  (± 0.0396)

=== Evaluating f6: r*exp(P_pos-P_neg) ===
→ Mean CV accuracy = 0.5268  (± 0.0867)

=== Evaluating f7: r*sigmoid(P_pos-P_neg) ===
→ Mean CV accuracy = 0.4341  (± 0.0697)

=== Evaluating f8: r*(P_pos-P_neg)^2 ===
→ Mean CV accuracy = 0.4878  (± 0.0309)

=== Evaluating f9: r*(P_pos-P_neg)*(1-P_neu) ===
→ Mean CV accuracy = 0.5463  (± 0.0995)

=== Summary of Mean CV Accuracies ===
f1: r*(P_pos-P_neg)            →  0.5463  ± 0.0995
f2: r*P_pos                    →  0.4927  ± 0.0679
f3: r*(P_pos+0.5P_neu)         →  0.4683  ± 0.0605
f4: (P_pos-P_neg)              →  0.4976  ± 0.0293
f5: exp(r*(P_pos-

In [28]:
# Save the daily sentiment time series for the first function (f1: r*(P_pos-P_neg))
# This is the daily_sentiment series for f1, already computed in cell 8

# Recompute to ensure correct values for f1
def f1(r, ppos, pneg, pneu):
    return r * (ppos - pneg)

news_df["article_score_f1"] = news_df.apply(
    lambda row: f1(row["relevance"], row["P_pos"], row["P_neg"], row["P_neu"]),
    axis=1
)
daily_sentiment_f1 = (
    news_df
    .groupby("date")["article_score_f1"]
    .sum()
    .reset_index()
    .rename(columns={"article_score_f1": "daily_sentiment_f1"})
)

# Save to CSV
daily_sentiment_f1.to_csv("daily_sentiment_f1.csv", index=False)