Comparer plusieurs modèles de classification (régression logistique, random forest, SVM) afin de prédire les mouvements extrêmes du lendemain sur un univers crypto (BTC, ETH, BNB, SOL, XRP, DOGE). On construit un dataset de features (momentum, volatilité, informations sur les bougies, volume), on réserve la dernière année comme jeu de test, puis on effectue une validation croisée temporelle sur le train pour choisir le meilleur modèle et régler quelques hyperparamètres.

Enfin, on évalue les performances sur la dernière année avec des métriques adaptées aux classes déséquilibrées (average precision, notée AP, et AUC), et on regarde aussi à quelle fréquence le modèle déclenche des signaux « forte confiance ».

In [None]:
import pandas as pd
import numpy as np

# --- Configuration -----------------------------------------------------------

# Trading pairs that make up the universe; also used to build the CSV names.
SYMBOLS = [
    "BTCUSDT",
    "ETHUSDT",
    "BNBUSDT",
    "SOLUSDT",
    "XRPUSDT",
    "DOGEUSDT",
]
# Directory that holds one daily CSV per symbol.
DATA_DIR = "binance_public_data"
# Date bounds used to slice every symbol's history after loading.
START, END = "2018-01-01", "2025-12-31"

# NOTE(review): presumably the number of periods per year used for
# annualization -- not used in the visible part of the file, confirm.
FREQ = 365
# Calendar days, counted back from the last available date, held out as test.
TEST_DAYS = 365
# NOTE(review): presumably the quantile that defines an "extreme move" label --
# thresholds are computed later, confirm against that code.
Q_MOVE = 0.30

# Rolling window (in rows) for the realized-volatility feature.
WIN_VOL = 20
# EWMA decay for the volatility estimate; the smoothing alpha is 1 - LAM.
LAM = 0.94
# Look-back horizons (in rows) for the three momentum features.
WIN_M1 = 5
WIN_M2 = 20
WIN_M3 = 60

def load_symbol(sym):
    """Load one symbol's daily candles and attach close-to-close log returns.

    Reads ``{DATA_DIR}/{sym}_1d_2021_2025.csv`` (the year suffix in the file
    name is hard-coded; adjust it if your files are named differently), indexes
    the rows by the parsed ``timestamp`` column in ascending order and keeps
    only the ``START``..``END`` window.

    Args:
        sym: Symbol name used to build the file name, e.g. ``"BTCUSDT"``.

    Returns:
        The windowed DataFrame with an extra ``ret`` column equal to
        ``log(close / previous close)``. Rows whose ``ret`` is NaN (at least
        the first row, which has no previous close) are dropped.
    """
    path = f"{DATA_DIR}/{sym}_1d_2021_2025.csv"
    candles = pd.read_csv(path, parse_dates=["timestamp"], index_col="timestamp")
    window = candles.sort_index().loc[START:END]
    log_ret = np.log(window["close"] / window["close"].shift(1))
    return window.assign(ret=log_ret).dropna(subset=["ret"])

# Per-symbol candle history, keyed by symbol name.
dfs = {}
for s in SYMBOLS:
    dfs[s] = load_symbol(s)

# Align: keep only the timestamps that every symbol has.
idx = None
for s in SYMBOLS:
    sym_index = dfs[s].index
    idx = sym_index if idx is None else idx.intersection(sym_index)

# Date x symbol matrix of log returns; any date with a missing value is dropped.
ret_by_symbol = {s: dfs[s].loc[idx, "ret"] for s in SYMBOLS}
rets = pd.DataFrame(ret_by_symbol).dropna()

# Date x symbol matrices of the raw candle fields, on the same dates as `rets`.
close, high, low, openp, volu = (
    pd.DataFrame({s: dfs[s].loc[rets.index, field] for s in SYMBOLS})
    for field in ("close", "high", "low", "open", "volume")
)

# Feature matrices (date x symbol).

# Volatility: EWMA of squared returns (alpha = 1 - LAM) and rolling std.
sq_rets = rets.pow(2)
vol_ewma = np.sqrt(sq_rets.ewm(alpha=1-LAM).mean())
vol_roll = rets.rolling(WIN_VOL).std()

# Momentum: simple close-to-close change over three horizons.
mom5, mom20, mom60 = (close.pct_change(w) for w in (WIN_M1, WIN_M2, WIN_M3))

# Candle shape: |close - open| as a fraction of the high-low range.
# A zero range becomes NaN so the ratio never divides by zero.
range_ = (high - low).replace(0, np.nan)
body_ = (close - openp).abs()
body_ratio = body_.div(range_)

# Volume relative to its own 20-row rolling mean.
vol_norm = volu.div(volu.rolling(20).mean())

# EWMA volatility relative to its own 252-row rolling mean.
vol_ratio = vol_ewma / vol_ewma.rolling(252).mean()

# Long format: one row per (date, symbol), one column per feature.
# Column order follows the insertion order of this mapping.
feature_mats = {
    "ret_1": rets,
    "ret_5": mom5,
    "ret_20": mom20,
    "ret_60": mom60,
    "vol_ewma": vol_ewma,
    "vol_roll20": vol_roll,
    "vol_ratio": vol_ratio,
    "body_ratio": body_ratio,
    "vol_norm": vol_norm,
}
stacked = [mat.stack().rename(name) for name, mat in feature_mats.items()]
# Keep only rows where every feature is available.
X = pd.concat(stacked, axis=1).dropna()

# Next-row return per (date, symbol), joined to the features of the same row.
fut = rets.shift(-1).stack().rename("ret_fut")
df_all = pd.concat([X, fut], axis=1).dropna()

# Only the raw future return is stored here; the move thresholds are meant to
# be computed later, on the training data only.

# Hold out the last TEST_DAYS calendar days as the final test set.
all_dates = df_all.index.get_level_values(0)
cutoff_test = all_dates.max() - pd.Timedelta(days=TEST_DAYS)

in_train = all_dates < cutoff_test
in_test = all_dates >= cutoff_test
train_full = df_all[in_train].copy()
test_final = df_all[in_test].copy()

# Report the date span and row count of each part.
for label, part in (("train dates:", train_full), ("test  dates:", test_final)):
    part_dates = part.index.get_level_values(0)
    print(label, part_dates.min(), "->", part_dates.max(), "n", len(part))

# Expanding-window time folds on the training dates (3 folds).
# Fold k trains on every date before fold_edges[k] and validates on the dates
# from fold_edges[k] up to, but excluding, fold_edges[k + 1].
train_dates = sorted(train_full.index.get_level_values(0).unique())
n_dates = len(train_dates)
fold_edges = [int(n_dates*0.5), int(n_dates*0.7), int(n_dates*0.85), n_dates]

folds = []
# Pair each edge with its successor by position rather than by value, so that
# coinciding edges (very short training history) still get the right
# validation end.
for tr_end, va_end in zip(fold_edges[:-1], fold_edges[1:]):
    tr_dates = train_dates[:tr_end]
    va_dates = train_dates[tr_end:va_end]
    folds.append((tr_dates, va_dates))

# (number of training dates, number of validation dates) for each fold.
print("folds:", [(len(t), len(v)) for t, v in folds])


train dates: 2021-09-10 00:00:00 -> 2024-12-29 00:00:00 n 7242

test  dates: 2024-12-30 00:00:00 -> 2025-12-30 00:00:00 n 2196

folds: [(603, 241), (844, 181), (1025, 182)]