In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

# params
SYMBOLS = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT", "XRPUSDT", "DOGEUSDT"]
DATA_DIR = "binance_public_data"
START = "2018-01-01"
END   = "2025-12-31"

FREQ = 365
TEST_DAYS = 365

WIN_VOL = 20
LAM = 0.94
WIN_M1, WIN_M2, WIN_M3 = 5, 20, 60

H_STRESS = 5
Q_STRESS = 0.85

VOL_TARGET = 0.60
MAX_LEV = 1.0
WIN_PORTVOL = 60
COST_BPS = 10

# gate A
P_ON  = 0.70
P_OFF = 0.55
G_OFF = 0.20

def load_symbol(sym):
    fn = f"{DATA_DIR}/{sym}_1d_2021_2025.csv"  # adapte
    df = pd.read_csv(fn, parse_dates=["timestamp"], index_col="timestamp").sort_index()
    df = df.loc[START:END]
    df["ret"] = np.log(df["close"] / df["close"].shift(1))
    return df.dropna(subset=["ret"])

dfs = {s: load_symbol(s) for s in SYMBOLS}

# align
idx = None
for s in SYMBOLS:
    idx = dfs[s].index if idx is None else idx.intersection(dfs[s].index)

rets  = pd.DataFrame({s: dfs[s].loc[idx, "ret"] for s in SYMBOLS}).dropna()
close = pd.DataFrame({s: dfs[s].loc[rets.index, "close"] for s in SYMBOLS})
high  = pd.DataFrame({s: dfs[s].loc[rets.index, "high"] for s in SYMBOLS})
low   = pd.DataFrame({s: dfs[s].loc[rets.index, "low"] for s in SYMBOLS})
openp = pd.DataFrame({s: dfs[s].loc[rets.index, "open"] for s in SYMBOLS})
volu  = pd.DataFrame({s: dfs[s].loc[rets.index, "volume"] for s in SYMBOLS})

# vols / feats
vol_ewma = np.sqrt((rets**2).ewm(alpha=1-LAM).mean())
vol_roll = rets.rolling(WIN_VOL).std()

mom5  = close.pct_change(WIN_M1)
mom20 = close.pct_change(WIN_M2)
mom60 = close.pct_change(WIN_M3)

range_ = (high - low).replace(0, np.nan)
body_  = (close - openp).abs()
body_ratio = (body_ / range_)

vol_norm = volu / volu.rolling(20).mean()

# label stress (portfolio equal weight)
w_eq = pd.DataFrame(1.0, index=rets.index, columns=SYMBOLS)
w_eq = w_eq.div(w_eq.sum(axis=1), axis=0)

port_ret = (w_eq.shift(1) * rets).sum(axis=1)
realised_future_vol = port_ret.rolling(H_STRESS).std().shift(-1)

# split
cutoff_test = rets.index.max() - pd.Timedelta(days=TEST_DAYS)
train_mask = rets.index < cutoff_test

thr = realised_future_vol[train_mask].quantile(Q_STRESS)
y = (realised_future_vol >= thr).astype(int)

# X daily
X = pd.DataFrame(index=rets.index)
X["avg_vol20"] = vol_roll.mean(axis=1)
X["avg_ewma"]  = vol_ewma.mean(axis=1)
X["cs_disp"]   = vol_roll.std(axis=1)
X["btc_mom20"] = mom20["BTCUSDT"]
X["btc_mom60"] = mom60["BTCUSDT"]
X["avg_vol_norm"] = vol_norm.mean(axis=1)
X["avg_body_ratio"] = body_ratio.mean(axis=1)

ds = pd.concat([X, y.rename("stress")], axis=1).dropna()

X_all = ds.drop(columns=["stress"])
y_all = ds["stress"]

X_train = X_all[X_all.index < cutoff_test]
y_train = y_all[y_all.index < cutoff_test]
X_test  = X_all[X_all.index >= cutoff_test]
y_test  = y_all[y_all.index >= cutoff_test]

# model
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=4000, class_weight="balanced", C=1.0))
])

clf.fit(X_train, y_train)

p_train = clf.predict_proba(X_train)[:, 1]
p_test  = clf.predict_proba(X_test)[:, 1]

print("stress rate train:", float(y_train.mean()), "test:", float(y_test.mean()))
print("AUC test:", float(roc_auc_score(y_test, p_test)))
print("AP  test:", float(average_precision_score(y_test, p_test)))

p_stress = pd.Series(index=X_all.index, dtype=float)
p_stress.loc[X_train.index] = p_train
p_stress.loc[X_test.index]  = p_test
p_stress = p_stress.sort_index()

print("proba dates:", p_stress.index.min(), "->", p_stress.index.max(), "n", len(p_stress))

# weights base inv-vol
vol = rets.rolling(WIN_VOL).std()
w_inv = 1.0 / vol.replace(0, np.nan)
w_inv = w_inv.fillna(0.0)
w_inv = w_inv.div(w_inv.sum(axis=1).replace(0, np.nan), axis=0).fillna(0.0)

# vol targeting
w_lag = w_inv.shift(1).fillna(0.0)
port_ret_gross = (w_lag * rets).sum(axis=1)
port_vol = port_ret_gross.rolling(WIN_PORTVOL).std() * np.sqrt(FREQ)
scale = (VOL_TARGET / port_vol).clip(lower=0.0, upper=MAX_LEV).fillna(0.0)

# gate A (hystérésis)
p_use = p_stress.reindex(rets.index).ffill().fillna(0.0).clip(0.0, 1.0)

gate_ml = pd.Series(1.0, index=rets.index)
state = 0

for t in rets.index:
    p = float(p_use.loc[t])
    if state == 0 and p >= P_ON:
        state = 1
    elif state == 1 and p <= P_OFF:
        state = 0
    gate_ml.loc[t] = (G_OFF if state == 1 else 1.0)

print("avg gate:", float(gate_ml.mean()), "stress days:", int((gate_ml < 1.0).sum()))

# final weights
W_ml = w_inv.mul(scale, axis=0).mul(gate_ml, axis=0)
W_ml = W_ml.clip(lower=0.0)
W_ml = W_ml.div(W_ml.sum(axis=1).replace(0, np.nan), axis=0).fillna(0.0)

print("W_ml ready:", W_ml.index.min(), "->", W_ml.index.max(), "shape", W_ml.shape)


stress rate train: 0.13581129378127232 test: 0.10655737704918032

AUC test: 0.8448208264722027

AP  test: 0.5585310452959118

proba dates: 2021-03-03 00:00:00 -> 2025-12-31 00:00:00 n 1765

avg gate: 0.879013698630137 stress days: 276

W_ml ready: 2021-01-02 00:00:00 -> 2025-12-31 00:00:00 shape (1825, 6)

Sur la dernière année (test), le modèle arrive à repérer les jours de stress avec des scores corrects : AUC = 0.845 et AP = 0.559. Donc il capte un signal utile (pas parfait mais clairement > hasard).

La proportion de jours classés stress est cohérente avec le choix Q_STRESS = 0.85 : 13.6% en train et 10.7% en test (le test peut être un peu moins “volatile” que la période d’entraînement).

Le gate n’est pas activé tout le temps : gate moyen = 0.879 et 276 jours en mode stress, donc l’expo est réduite seulement quand le modèle “voit” un risque élevé.

Au final, on obtient une matrice de poids W_ml propre sur tout l’univers : 1825 jours × 6 cryptos (BTC, ETH, BNB, SOL, XRP, DOGE).