In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeClassifier,Ridge
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from catboost import CatBoostClassifier,CatBoostRegressor
from category_encoders import TargetEncoder
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

In [48]:
X = pd.read_csv('data/X_train.csv',index_col='ROW_ID')
X_test_final = pd.read_csv('data/X_test.csv',index_col='ROW_ID')
y = pd.read_csv('data/y_train.csv',index_col='ROW_ID')
y_bin = (y > 0).astype(int) #binariser l outcome pour faire de la classification et pas regression
sample_submission = pd.read_csv('data/sample_submission.csv',index_col='ROW_ID')

In [49]:

RET_features = [f'RET_{i}' for i in range(1, 20)]
SIGNED_VOLUME_features = [f'SIGNED_VOLUME_{i}' for i in range(1, 20)]
TURNOVER_features = ['AVG_DAILY_TURNOVER']

for i in [3, 5, 10, 15, 20]:
    X[f'AVERAGE_PERF_{i}'] = X[RET_features[:i]].mean(1)
    X[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')
    X_test_final[f'AVERAGE_PERF_{i}'] = X_test_final[RET_features[:i]].mean(1)
    X_test_final[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X_test_final.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')

features = RET_features + SIGNED_VOLUME_features + TURNOVER_features
features += [f'AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]
features += [f'ALLOCATIONS_AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]

In [50]:
X["RET_VOLATILITY_20"] = X[RET_features].std(axis=1)
X_test_final["RET_VOLATILITY_20"] = X_test_final[RET_features].std(axis=1)

X["RET_MOMENTUM"] = X["RET_1"] - X[RET_features].mean(axis=1)
X_test_final["RET_MOMENTUM"] = X_test_final["RET_1"] - X_test_final[RET_features].mean(axis=1)

# Ratio perf/vol (Sharpe-like)
X["RET_SHARPE"] = X["RET_MOMENTUM"] / (X["RET_VOLATILITY_20"] + 1e-6)
X_test_final["RET_SHARPE"] = X_test_final["RET_MOMENTUM"] / (X_test_final["RET_VOLATILITY_20"] + 1e-6)

# Liquidity volatility
X["SIGNED_VOLUME_VOL"] = X[SIGNED_VOLUME_features].std(axis=1)
X_test_final["SIGNED_VOLUME_VOL"] = X_test_final[SIGNED_VOLUME_features].std(axis=1)

X["TS_num"] = X["TS"].str.extract("(\d+)").astype(int)
X_test_final["TS_num"] = X_test_final["TS"].str.extract("(\d+)").astype(int)
X["RET_TREND_5"] = X["RET_20"] - X["RET_15"]
X["VOL_TREND_5"] = X["SIGNED_VOLUME_20"] - X["SIGNED_VOLUME_15"]

# Décalage du dernier retour
X["RET_LAG1"] = X["RET_1"]
X["RET_DIFF1"] = X["RET_1"] - X["RET_2"]

# Skewness et kurtosis sur les 20 derniers jours
X["RET_SKEW_20"] = X[RET_features].skew(axis=1)
X["RET_KURT_20"] = X[RET_features].kurt(axis=1)

# Ratio volume/performance (liquidité relative)
X["VOL_PERF_RATIO"] = X[SIGNED_VOLUME_features].mean(axis=1) / (X[RET_features].std(axis=1) + 1e-6)

# Moyenne pondérée des performances récentes (momentum lissé)
weights = np.linspace(1, 2, 19)
X["RET_WEIGHTED_MOMENTUM"] = (X[RET_features].values * weights).sum(axis=1) / weights.sum()




  X["TS_num"] = X["TS"].str.extract("(\d+)").astype(int)
  X_test_final["TS_num"] = X_test_final["TS"].str.extract("(\d+)").astype(int)


In [51]:
features = (
    RET_features
    + SIGNED_VOLUME_features
    + TURNOVER_features
    + [f'AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]
    + [f'ALLOCATIONS_AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]
    + ["RET_VOLATILITY_20", "RET_MOMENTUM", "RET_SHARPE", "SIGNED_VOLUME_VOL","VOL_TREND_5","RET_TREND_5"]
    +["TS_num"]
    +["VOL_PERF_RATIO","RET_KURT_20","RET_SKEW_20","RET_DIFF1","RET_LAG1"]
)

In [52]:
features=[
 'RET_SHARPE',
 'RET_15',
 'RET_TREND_5',
 'AVERAGE_PERF_20',
 'ALLOCATIONS_AVERAGE_PERF_5',
 'RET_1',
 'RET_6',
 'SIGNED_VOLUME_3',
 'SIGNED_VOLUME_19',
 'ALLOCATIONS_AVERAGE_PERF_10',
 'ALLOCATIONS_AVERAGE_PERF_3',
 'ALLOCATIONS_AVERAGE_PERF_15',
 'RET_MOMENTUM']

In [53]:
unique_dates = np.sort(X['TS'].unique())
n_splits = 5  # nombre de folds OOF
fold_size = len(unique_dates) // n_splits

In [54]:
folds = []
for i in range(n_splits):
    val_dates = unique_dates[i*fold_size : (i+1)*fold_size]
    train_dates = unique_dates[:i*fold_size].tolist() + unique_dates[(i+1)*fold_size:].tolist()
    folds.append((train_dates, val_dates))
models = {
    "Ridge": RidgeClassifier(alpha=1.0),
    "kNN": KNeighborsClassifier(n_neighbors=15, weights="distance", n_jobs=-1),
    #"RandomForest": RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1),
    #"LightGBM": LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=31, subsample=0.8, colsample_bytree=0.8, random_state=42)
}
oof_preds = {name: np.zeros(len(X)) for name in models.keys()}
y_bin_values = y_bin.values.ravel()

In [57]:
for fold_idx, (train_dates, val_dates) in enumerate(folds):
    print(f"\n=== Fold {fold_idx+1} ===")
    
    train_idx = X['TS'].isin(train_dates)
    val_idx = X['TS'].isin(val_dates)
    
    X_train_fold = X.loc[train_idx, features].fillna(0)
    y_train_fold = y_bin.loc[train_idx].values.ravel()
    X_val_fold = X.loc[val_idx, features].fillna(0)
    
    # Encodage allocation par fold pour éviter leakage
    enc = TargetEncoder()
    X_train_fold["alloc_enc"] = enc.fit_transform(X.loc[train_idx, "ALLOCATION"], y_train_fold)
    X_val_fold["alloc_enc"] = enc.transform(X.loc[val_idx, "ALLOCATION"])
    
    for name, model in models.items():
        # Standardiser Ridge
        if "Ridge" in name:
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_fold)
            X_val_scaled = scaler.transform(X_val_fold)
            model.fit(X_train_scaled, y_train_fold)
            preds = model.decision_function(X_val_scaled)
            oof_preds[name][val_idx] = preds
        else:
            model.fit(X_train_fold, y_train_fold)
            preds = model.predict_proba(X_val_fold)[:,1]
            oof_preds[name][val_idx] = preds


=== Fold 1 ===


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b



=== Fold 2 ===

=== Fold 3 ===


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b



=== Fold 4 ===


  ret = a @ b
  ret = a @ b
  ret = a @ b



=== Fold 5 ===


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [None]:
for name, preds in oof_preds.items():
    y_pred_bin = (preds > 0).astype(int)
    acc = accuracy_score(y_bin_values, y_pred_bin)
    print(f"{name} OOF Accuracy: {acc:.4f}")

Ridge OOF Accuracy: 0.5255
kNN OOF Accuracy: 0.5027

Ensemble OOF Accuracy: 0.49750062415046187
