# Notebook 2: Statistical Tests and Simple Models

This notebook examines relationships between sentiment and daily trading KPIs via:
- Non-parametric correlations
- OLS regression of `total_net_pnl_usd` on `fg_value` and controls
- Logistic regression for `positive_pnl_day` vs `fg_value`



In [7]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression

BASE_DIR = "/Users/kumarabhishek/Desktop/Assignment_task/ds_abhishek"
OUT_DATA = os.path.join(BASE_DIR, "outputs", "data")
OUT_FIG = os.path.join(BASE_DIR, "outputs", "figures")

merged_path = os.path.join(OUT_DATA, "daily_metrics_with_sentiment.csv")
df = pd.read_csv(merged_path, parse_dates=["date"]) 
print(df.shape)
df.head(3)


(480, 17)


Unnamed: 0,date,total_trade_value_usd,total_volume_usd,total_volume_tokens,avg_execution_price,total_fees_usd,total_pnl_usd,total_net_pnl_usd,avg_leverage,buy_volume_usd,sell_volume_usd,num_trades,num_accounts,num_coins,buy_sell_ratio,fg_value,classification
0,2023-05-01,477.0,477.0,0.2513,1898.133333,0.0,0.0,0.0,,477.0,0.0,3,1,1,,63.0,Greed
1,2023-12-05,50005.83,50005.83,11.80901,11038.3,12.501455,0.0,-12.501455,,25007.21,24998.62,9,1,2,1.000344,75.0,Extreme Greed
2,2023-12-14,113203.35,113203.35,4092.28901,8031.868818,28.300831,-205.434737,-233.735568,,51407.47,61795.88,11,1,4,0.831892,72.0,Greed


In [8]:
use = df.dropna(subset=["fg_value"]).copy()

rho, pval = stats.spearmanr(use["fg_value"], use["total_net_pnl_usd"], nan_policy="omit")
print(f"Spearman rho (FG vs Net PnL): {rho:.3f}, p={pval:.3g}")
controls = [c for c in ["total_volume_usd", "total_fees_usd", "num_trades", "avg_leverage"] if c in use.columns]

cols = ["total_net_pnl_usd", "fg_value"] + controls
ols_tmp = use[cols].apply(pd.to_numeric, errors="coerce")
ols_tmp = ols_tmp.replace([np.inf, -np.inf], np.nan).dropna()

if ols_tmp.empty:
    print("OLS skipped: no valid rows after cleaning")
else:
    y = ols_tmp["total_net_pnl_usd"]
    X = ols_tmp.drop(columns=["total_net_pnl_usd"]).copy()
    non_zero_var = [c for c in X.columns if X[c].std(ddof=0) > 0]
    X = X[non_zero_var]
    X = sm.add_constant(X, has_constant="add")
    if X.shape[1] < 2 or len(y) < 5:
        print(f"OLS skipped: insufficient predictors or rows. X shape={X.shape}, n={len(y)}")
    else:
        try:
            model = sm.OLS(y, X).fit(cov_type="HC3")
            print(model.summary())
        except Exception as e:
            print("OLS skipped due to error:", repr(e))


Spearman rho (FG vs Net PnL): 0.043, p=0.349
OLS skipped: no valid rows after cleaning


In [9]:
cls = use.copy()
cls["positive_pnl_day"] = (cls["total_net_pnl_usd"] > 0).astype(int)

features = ["fg_value"] + [c for c in ["total_volume_usd", "num_trades", "total_fees_usd"] if c in cls.columns]
cls_model = cls.dropna(subset=features + ["positive_pnl_day"]).copy()

for c in features:
    cls_model[c] = pd.to_numeric(cls_model[c], errors="coerce")
cls_model = cls_model.replace([np.inf, -np.inf], np.nan).dropna(subset=features + ["positive_pnl_day"]).copy()

X = cls_model[features]
y = cls_model["positive_pnl_day"]
non_zero = [c for c in X.columns if X[c].std(ddof=0) > 0]
X = X[non_zero]

if X.empty or len(np.unique(y)) < 2:
    print("Classification skipped: insufficient features or only one class present")
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    logit = LogisticRegression(max_iter=1000)
    try:
        logit.fit(X_train, y_train)
        proba = logit.predict_proba(X_test)[:, 1]
        if len(np.unique(y_test)) < 2:
            print("AUC skipped: y_test has a single class")
        else:
            auc = roc_auc_score(y_test, proba)
            print(f"Logistic AUC: {auc:.3f}")
        print(classification_report(y_test, (proba>0.5).astype(int)))
    except Exception as e:
        print("Classification skipped due to error:", repr(e))


Logistic AUC: 0.710
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.73      1.00      0.85        88

    accuracy                           0.73       120
   macro avg       0.37      0.50      0.42       120
weighted avg       0.54      0.73      0.62       120



In [10]:
#saving the dataset
use.to_csv(os.path.join(OUT_DATA, "daily_modeling_dataset.csv"), index=False)
print("Saved:", os.path.join(OUT_DATA, "daily_modeling_dataset.csv"))


Saved: /Users/kumarabhishek/Desktop/Assignment_task/ds_abhishek/outputs/data/daily_modeling_dataset.csv
