# Notebook 2 — Modeling, Hypothesis Tests, and Signal Discovery

**Candidate:** Mohammad_Kamran  
**Date:** 2025-08-29

This notebook:
1. Loads the merged data from Notebook 1.  
2. Runs statistical tests comparing *Fear* vs *Greed*.  
3. Trains predictive models for *probability of a winning trade*.  
4. Explores account clusters and leverage-size regimes.  
5. Performs an **event study** around Fear↔Greed regime changes.


In [None]:
# !pip -q install pandas numpy matplotlib scipy statsmodels scikit-learn plotly
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [None]:
CSV_DIR = "csv_files"
merged_path = os.path.join(CSV_DIR, "trades_with_sentiment.csv")
# Merged trades + sentiment table; trade_date is parsed to datetime on load.
df = pd.read_csv(merged_path, parse_dates=["trade_date"], low_memory=False)

# Basic sanitation: a trade is a "win" when its closed PnL is strictly positive.
# The column is looked up explicitly: a scalar fallback such as df.get(..., 0)
# has no .astype and would raise AttributeError when the column is absent.
if "closed_pnl" in df.columns:
    pnl = pd.to_numeric(df["closed_pnl"], errors="coerce")
else:
    pnl = pd.Series(0.0, index=df.index)
df["win"] = (pnl > 0).astype(int)  # missing/unparseable PnL compares False -> 0

# Encode sentiment with a case-insensitive exact match. astype(str) keeps the
# .str accessor usable even if the column is entirely missing; missing values
# become "nan" and match neither label.
# NOTE(review): labels other than exactly "greed"/"fear" (e.g. "Extreme Greed",
# if the data contains such labels) are flagged 0 in both columns -- confirm
# against the sentiment labels produced by Notebook 1.
sentiment_lc = df["sentiment"].astype(str).str.lower()
df["is_greed"] = (sentiment_lc == "greed").astype(int)
df["is_fear"] = (sentiment_lc == "fear").astype(int)

# Fill leverage/size with medians to avoid NaNs for modeling
for col in ["leverage", "size"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")
        col_median = df[col].median()
        # An all-missing column has a NaN median; fall back to 0.0 so the
        # feature is NaN-free, matching the absent-column default below.
        df[col] = df[col].fillna(0.0 if pd.isna(col_median) else col_median)
    else:
        df[col] = 0.0

# Side encoding: substring match so values like "BUY", "Open Long" are caught.
# An absent column is treated as all-empty strings (both flags 0).
if "side" in df.columns:
    side_lc = df["side"].astype(str).str.lower()
else:
    side_lc = pd.Series("", index=df.index, dtype=object)
df["side_long"] = side_lc.str.contains("buy|long").astype(int)
df["side_short"] = side_lc.str.contains("sell|short").astype(int)


## 1) Hypothesis tests (Fear vs Greed)


In [None]:
# Welch's t-tests (unequal variances) between Fear-day and Greed-day trades.
fear_rows = df["is_fear"].eq(1)
greed_rows = df["is_greed"].eq(1)
welch_opts = {"equal_var": False, "nan_policy": "omit"}

# Win indicator: Fear vs Greed
win_fear = df.loc[fear_rows, "win"].dropna()
win_greed = df.loc[greed_rows, "win"].dropna()
t_stat, p_val = stats.ttest_ind(win_fear, win_greed, **welch_opts)
print("Win-rate t-test Fear vs Greed: t=%.3f  p=%.3g" % (t_stat, p_val))

# Leverage: Fear vs Greed
lev_fear = df.loc[fear_rows, "leverage"].dropna()
lev_greed = df.loc[greed_rows, "leverage"].dropna()
t_stat2, p_val2 = stats.ttest_ind(lev_fear, lev_greed, **welch_opts)
print("Leverage t-test Fear vs Greed: t=%.3f  p=%.3g" % (t_stat2, p_val2))


## 2) Predictive modeling: P(win) ~ sentiment + leverage + size + side


In [None]:
# Design matrix and binary target (1 = winning trade).
features = ["is_greed", "leverage", "size", "side_long", "side_short"]
X = df.loc[:, features].copy()
y = df["win"].astype(int).copy()

# 75/25 split, stratified on the target so both parts keep the same win rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Logistic Regression
logit = LogisticRegression(max_iter=200)
logit.fit(X_train, y_train)
proba = logit.predict_proba(X_test)[:, 1]  # column 1 = P(win)
auc = roc_auc_score(y_test, proba)
print("Logit AUC:", round(auc, 4))
logit_pred = (proba > 0.5).astype(int)  # hard labels at a 0.5 cut-off
print(classification_report(y_test, logit_pred))

# Random Forest (nonlinear interactions)
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)
proba_rf = rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, proba_rf)
print("RF AUC:", round(auc_rf, 4))


In [None]:
# Feature importances of the fitted random forest, largest first
imp = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
print(imp)

fig, ax = plt.subplots()
imp.plot(kind="bar", ax=ax)
ax.set_title("Random Forest Feature Importances")
ax.set_ylabel("Importance")
fig.tight_layout()

# savefig does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs("outputs", exist_ok=True)
fig.savefig(os.path.join("outputs","rf_feature_importances.png"), dpi=160)


## 3) Account clusters (behavioral archetypes)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Per-account KPI table from Notebook 1; clustering is skipped when it is absent.
acct_path = os.path.join("csv_files","account_kpis_wide.csv")
if not os.path.exists(acct_path):
    acct = None
    print("Run Notebook 1 to generate account_kpis_wide.csv first.")
else:
    acct = pd.read_csv(acct_path).set_index("account")

    # Missing KPIs are treated as 0 before standardising every column.
    Xacct = acct.fillna(0.0).copy()
    scaler = StandardScaler()
    Z = scaler.fit_transform(Xacct)

    # Four clusters, fixed seed for repeatable labels.
    km = KMeans(n_clusters=4, random_state=42, n_init=10)
    labels = km.fit_predict(Z)

    # Persist the labelled accounts and show the per-cluster KPI means.
    acct["cluster"] = labels
    acct.to_csv(os.path.join("csv_files","account_clusters.csv"))
    display(acct.groupby("cluster").mean())


## 4) Event study around regime changes
Identify the days on which the sentiment regime switches (Fear→Greed or Greed→Fear), then measure the average closed PnL on each day of a ±3-day window around every switch.


In [None]:
# Build the sentiment series: one labelled observation per trade_date.
# Rows without a date or a sentiment label are dropped first; otherwise they
# would be flagged is_greed=0 while their label is later forward-filled,
# producing spurious regime switches. De-duplicating on trade_date alone keeps
# the index unique, which asfreq requires.
sent = (
    df[["trade_date", "sentiment"]]
    .dropna(subset=["trade_date", "sentiment"])
    .drop_duplicates(subset="trade_date")
    .sort_values("trade_date")
)
sent["is_greed"] = (sent["sentiment"].astype(str).str.lower() == "greed").astype(int)

# Expand to a gap-free daily calendar, carrying the last known regime forward.
# Skipped when there is no sentiment data at all (asfreq needs a date range).
if not sent.empty:
    sent = sent.set_index("trade_date").asfreq("D").ffill().reset_index()

# Identify change points
sent["prev"] = sent["is_greed"].shift(1)
sent["chg"] = sent["is_greed"] - sent["prev"]  # +1 fear->greed ; -1 greed->fear
# The first day has no predecessor (chg is NaN) and is never a change point.
cp = sent.loc[sent["chg"].isin([1, -1]), "trade_date"].tolist()

def window_pnl(center_date, days=3, data=None):
    """Daily mean closed PnL in a symmetric window around ``center_date``.

    Parameters
    ----------
    center_date : timestamp-like
        Centre of the window; coerced with ``pd.Timestamp``.
    days : int, default 3
        Half-width of the window in days. Both end points are included.
    data : pandas.DataFrame, optional
        Frame with ``trade_date`` (datetime) and ``closed_pnl`` columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``rel_day`` (whole-day offset of each ``trade_date`` from
        ``center_date``) and ``closed_pnl`` (mean closed PnL for that
        ``trade_date``). Dates with no trades are simply absent.
    """
    trades = df if data is None else data
    center = pd.Timestamp(center_date)
    half_width = pd.Timedelta(days=days)

    # Series.between is inclusive on both ends by default.
    in_window = trades["trade_date"].between(center - half_width, center + half_width)
    daily = (
        trades.loc[in_window]
        .groupby("trade_date")["closed_pnl"]
        .mean()
        .reset_index()
    )
    daily["rel_day"] = (daily["trade_date"] - center).dt.days
    return daily[["rel_day", "closed_pnl"]]

# Aggregate event windows: one +/-3-day PnL window per regime change point
all_win = [window_pnl(d, days=3) for d in cp]
if all_win:
    ev = pd.concat(all_win, ignore_index=True)
    # Average across events for each relative day.
    evg = ev.groupby("rel_day")["closed_pnl"].mean().reset_index()

    fig, ax = plt.subplots()
    ax.plot(evg["rel_day"], evg["closed_pnl"], marker="o")
    ax.axvline(0, linestyle="--")  # day of the switch
    ax.set_title("Event Study: Avg Closed PnL around Sentiment Regime Switches")
    ax.set_xlabel("Days from Switch")
    ax.set_ylabel("Avg Closed PnL")
    fig.tight_layout()

    # savefig does not create missing directories; make sure the target exists
    # so the cell also works on a fresh checkout.
    os.makedirs("outputs", exist_ok=True)
    fig.savefig(os.path.join("outputs","event_study_pnl.png"), dpi=160)
else:
    print("No regime switches found (check sentiment coverage).")
