In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid

# Label thresholds are estimated from the training frame only; the same
# cut-offs are later applied unchanged to the test frame by make_labels.
down_thr, up_thr = (
    train_full["ret_fut"].quantile(q) for q in (Q_MOVE, 1 - Q_MOVE)
)

def make_labels(df):
    """Build the two binary targets from the ``ret_fut`` column of ``df``.

    Uses the module-level thresholds ``down_thr`` and ``up_thr``.

    Args:
        df: DataFrame containing a ``ret_fut`` column.

    Returns:
        A ``(y_down, y_up)`` pair of integer Series indexed like ``df``:
        ``y_down`` is 1 where ``ret_fut <= down_thr`` and ``y_up`` is 1
        where ``ret_fut >= up_thr``; both are 0 elsewhere.
    """
    future_ret = df["ret_fut"]
    is_down = future_ret.le(down_thr)
    is_up = future_ret.ge(up_thr)
    return is_down.astype(int), is_up.astype(int)

# Binary targets for both tasks, on the training frame and the final test frame.
(y_down_train, y_up_train), (y_down_test, y_up_test) = (
    make_labels(frame) for frame in (train_full, test_final)
)

# Feature matrices: every column except the forward return used for labelling.
X_train, X_test = (
    frame.drop(columns=["ret_fut"]) for frame in (train_full, test_final)
)

# Candidate model families, keyed by the short name used in `grids` and in
# the benchmark table. The final pipeline step is named "clf" so that grid
# parameters can be addressed as "clf__<param>".
models = dict(
    logit=Pipeline(
        [
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=3000, class_weight="balanced")),
        ]
    ),
    rf=RandomForestClassifier(n_estimators=400, max_depth=6, random_state=42),
    svm_rbf=Pipeline(
        [
            ("scaler", StandardScaler()),
            ("clf", SVC(kernel="rbf", probability=True, class_weight="balanced")),
        ]
    ),
)

# cross-validated evaluation
def eval_model(model, X, y, folds):
    """Score ``model`` over date-based folds and average the results.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``. It is fitted
            in place on every scored fold (no copy is made here).
        X: Feature frame; rows are assigned to a fold by membership of their
            first index level in that fold's date collections.
        y: Target Series aligned with ``X``.
        folds: Iterable of ``(train_dates, validation_dates)`` pairs.

    Returns:
        ``(mean_average_precision, mean_roc_auc)`` over the scored folds, or
        ``(nan, nan)`` when no fold could be scored. A fold is skipped when
        its training or validation target has fewer than two distinct values.
    """
    ap_scores = []
    auc_scores = []

    for train_dates, valid_dates in folds:
        fold_dates = X.index.get_level_values(0)
        train_mask = fold_dates.isin(train_dates)
        valid_mask = fold_dates.isin(valid_dates)

        X_fit, y_fit = X.loc[train_mask], y.loc[train_mask]
        X_val, y_val = X.loc[valid_mask], y.loc[valid_mask]

        # Both metrics (and the fit itself) need two classes on each side.
        if min(len(np.unique(y_fit)), len(np.unique(y_val))) < 2:
            continue

        model.fit(X_fit, y_fit)
        proba_pos = model.predict_proba(X_val)[:, 1]
        ap_scores.append(average_precision_score(y_val, proba_pos))
        auc_scores.append(roc_auc_score(y_val, proba_pos))

    # Both lists are appended together, so one emptiness check covers both.
    if not ap_scores:
        return np.nan, np.nan
    return float(np.mean(ap_scores)), float(np.mean(auc_scores))

# Baseline benchmark: cross-validate every candidate family on both targets.
rows = []
for name, model in models.items():
    down_scores = eval_model(model, X_train, y_down_train, folds)
    up_scores = eval_model(model, X_train, y_up_train, folds)
    rows.append(
        {
            "model": name,
            "AP_down": down_scores[0],
            "AUC_down": down_scores[1],
            "AP_up": up_scores[0],
            "AUC_up": up_scores[1],
        }
    )

# Table ordered by the "down" task's average precision, best first.
bench = pd.DataFrame(rows).sort_values(by="AP_down", ascending=False)
print("=== CV benchmark (train only) ===", bench, sep="\n")

# For each task, keep the family with the highest mean CV average precision.
best_down_name = bench.sort_values(by="AP_down", ascending=False)["model"].iloc[0]
best_up_name = bench.sort_values(by="AP_up", ascending=False)["model"].iloc[0]

print("\nbest_down:", best_down_name, "best_up:", best_up_name)

# Small hyper-parameter grids, one per model family (same keys as `models`).
# Pipeline-based families address the final step through the "clf__" prefix.
grids = {
    "logit": [
        dict(clf__C=[0.1, 0.5, 1.0, 2.0]),
    ],
    "rf": [
        dict(
            n_estimators=[300, 600],
            max_depth=[3, 6, 10],
            min_samples_leaf=[1, 5, 10],
        ),
    ],
    "svm_rbf": [
        dict(
            clf__C=[0.5, 1.0, 2.0],
            clf__gamma=["scale", 0.1, 0.01],
        ),
    ],
}

def clone_model(name):
    """Return a fresh, unfitted copy of the estimator registered under ``name``.

    The previous implementation returned the registry object itself, so every
    caller shared (and mutated) one estimator: ``set_params`` calls leaked
    into ``models``, and when both tasks selected the same family the "down"
    and "up" final models were the same object, the second fit overwriting
    the first.

    Args:
        name: Key of the model family in the module-level ``models`` dict.

    Returns:
        A new estimator with the same constructor parameters as
        ``models[name]`` and no fitted state.

    Raises:
        KeyError: If ``name`` is not a key of ``models``.
    """
    # Imported locally so this fix does not depend on the cell's import block.
    from sklearn.base import clone

    return clone(models[name])

def grid_search(name, y):
    """Search the parameter grid of family ``name`` for the best CV score.

    Every combination from ``grids[name]`` is applied to an estimator obtained
    from ``clone_model(name)`` and scored with ``eval_model`` on ``X_train``
    using the module-level ``folds``.

    Args:
        name: Model family key, present in both ``models`` and ``grids``.
        y: Training target Series aligned with ``X_train``.

    Returns:
        Dict with keys ``"name"``, ``"params"``, ``"AP"`` and ``"AUC"`` for the
        combination with the highest mean average precision. When no
        combination yields a non-NaN score, ``"params"`` is ``None`` and both
        scores are ``-inf``.
    """
    top_params = None
    top_ap = -np.inf
    top_auc = -np.inf

    for candidate in ParameterGrid(grids[name]):
        estimator = clone_model(name)
        estimator.set_params(**candidate)
        ap, auc = eval_model(estimator, X_train, y, folds)
        # Strict ">" keeps the first combination encountered on ties.
        if not np.isnan(ap) and ap > top_ap:
            top_params, top_ap, top_auc = candidate, ap, auc

    return {"name": name, "params": top_params, "AP": top_ap, "AUC": top_auc}

# Tune the selected family of each task against that task's own target.
best_down = grid_search(name=best_down_name, y=y_down_train)
best_up = grid_search(name=best_up_name, y=y_up_train)

print("\n=== best after grid (train CV) ===")
for label, result in (("DOWN:", best_down), ("UP  :", best_up)):
    print(label, result)

# Final fit on the whole training frame, scored on the held-out test frame.
def fit_and_test(best, y_train, y_test):
    """Fit the selected configuration on ``X_train`` and score it on ``X_test``.

    Args:
        best: Dict as returned by ``grid_search``; ``best["name"]`` selects
            the family and ``best["params"]`` (may be ``None``) the
            parameters to apply before fitting.
        y_train: Target Series aligned with ``X_train``.
        y_test: Target Series aligned with ``X_test``.

    Returns:
        ``(model, proba, ap, auc)``: the fitted estimator, its positive-class
        probabilities on ``X_test``, and the test average precision and
        ROC AUC as floats.
    """
    estimator = clone_model(best["name"])
    chosen_params = best["params"]
    if chosen_params is not None:
        estimator.set_params(**chosen_params)

    estimator.fit(X_train, y_train)
    proba_pos = estimator.predict_proba(X_test)[:, 1]

    test_ap = float(average_precision_score(y_test, proba_pos))
    test_auc = float(roc_auc_score(y_test, proba_pos))
    return estimator, proba_pos, test_ap, test_auc

# One final model per task, each evaluated on its own held-out target.
m_down, p_down, ap_down, auc_down = fit_and_test(
    best=best_down, y_train=y_down_train, y_test=y_down_test
)
m_up, p_up, ap_up, auc_up = fit_and_test(
    best=best_up, y_train=y_up_train, y_test=y_up_test
)

print("\n=== FINAL TEST (last 1y) ===")
print("DOWN AP:", ap_down, "AUC:", auc_down)
print("UP   AP:", ap_up,   "AUC:", auc_up)

# Signal coverage diagnostics.
# The trigger level of each signal is the 95% quantile of that model's
# positive-class probabilities on the training frame; coverage is the share
# of test probabilities at or above that level.
train_pdown, train_pup = (
    m_down.predict_proba(X_train)[:, 1],
    m_up.predict_proba(X_train)[:, 1],
)

thr_down = float(np.quantile(train_pdown, 0.95))
thr_up = float(np.quantile(train_pup, 0.95))

cov_down = float(np.mean(p_down >= thr_down))
cov_up = float(np.mean(p_up >= thr_up))

print("thr_down:", thr_down, "coverage_down:", cov_down)
print("thr_up  :", thr_up,   "coverage_up  :", cov_up)

=== CV benchmark (train only) ===

     model   AP_down  AUC_down     AP_up    AUC_up
0    logit  0.346724  0.596781  0.345489  0.546129
1       rf  0.332014  0.585816  0.339219  0.558626
2  svm_rbf  0.329953  0.578053  0.331228  0.550104

best_down: logit best_up: logit

=== best after grid (train CV) ===

DOWN: {'name': 'logit', 'params': {'clf__C': 2.0}, 'AP': 0.34674515004628387, 'AUC': 0.5967473689866093}
UP  : {'name': 'logit', 'params': {'clf__C': 0.1}, 'AP': 0.34564588138708224, 'AUC': 0.5461791044376753}

=== FINAL TEST (last 1y) ===

DOWN AP: 0.37792683223062695 AUC: 0.5795918051363537
UP   AP: 0.33955081724059694 AUC: 0.568304707379135
thr_down: 0.6346630567397514 coverage_down: 0.02185792349726776
thr_up  : 0.6346630567397514 coverage_up  : 0.04052823315118397

Le benchmark CV montre que la régression logistique reste le meilleur choix parmi les trois familles testées, même face à des modèles plus complexes (RF, SVM). Sur la dernière année, les scores restent modestes (AUC autour de 0.57–0.58), ce qui confirme que prédire la direction des “gros moves” est difficile avec ces features simples.

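Un point important est la couverture très faible quand on déclenche uniquement les signaux à très haute confiance (quantile 95 % des probabilités prédites sur le train) : environ 2 % des observations pour le signal “down” et 4 % pour le signal “up”, à comparer aux 5 % attendus par construction si les probabilités du test suivaient la même distribution que celles du train. Donc le modèle peut être exploitable comme filtre “rare mais fort”, mais en pratique il s’activera très peu souvent, ce qui limite l’impact sur une stratégie d’allocation si on garde des seuils aussi stricts. Attention toutefois : les seuils affichés thr_down et thr_up sont strictement identiques (0.6347), ce qui indique qu’ils ont été calculés à partir du même estimateur ajusté ; la couverture du signal “down” est donc à interpréter avec prudence tant que ce point n’est pas corrigé.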