In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
X = pd.read_csv('data/X_train.csv',index_col='ROW_ID')
X_test_final = pd.read_csv('data/X_test.csv',index_col='ROW_ID')
y_train = pd.read_csv('data/y_train.csv',index_col='ROW_ID')
y_bin = (y_train > 0).astype(int) #binariser l outcome pour faire de la classification et pas regression
sample_submission = pd.read_csv('data/sample_submission.csv',index_col='ROW_ID')

In [3]:

RET_features = [f'RET_{i}' for i in range(1, 20)]
SIGNED_VOLUME_features = [f'SIGNED_VOLUME_{i}' for i in range(1, 20)]
TURNOVER_features = ['AVG_DAILY_TURNOVER']

for i in [3, 5, 10, 15, 20]:
    X[f'AVERAGE_PERF_{i}'] = X[RET_features[:i]].mean(1)
    X[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')
    X_test_final[f'AVERAGE_PERF_{i}'] = X_test_final[RET_features[:i]].mean(1)
    X_test_final[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X_test_final.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')

features = RET_features + SIGNED_VOLUME_features + TURNOVER_features
features += [f'AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]
features += [f'ALLOCATIONS_AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]

In [4]:
unique_dates = np.sort(X['TS'].unique())
n = len(unique_dates)
train_dates = unique_dates[: int(0.6 * n)]
tuning_dates = unique_dates[int(0.6 * n): int(0.8 * n)]
test_dates = unique_dates[int(0.8 * n):]

train_idx = X['TS'].isin(train_dates)
tuning_idx = X['TS'].isin(tuning_dates)
test_idx = X['TS'].isin(test_dates)

X_train, y_train = X.loc[train_idx, features], y_bin.loc[train_idx]
X_tune, y_tune = X.loc[tuning_idx, features], y_bin.loc[tuning_idx]
X_test, y_test = X.loc[test_idx, features], y_bin.loc[test_idx]

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_tune_scaled = scaler.transform(X_tune)
X_test_scaled = scaler.transform(X_test)

In [6]:
models = {
    "RidgeClassifier": RidgeClassifier(alpha=1.0, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1),
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    ),
}

In [7]:
results = {}
for name, model in models.items():
    print(f"\n{'='*30}\nTraining {name}...\n{'='*30}")
    
    if name == "RidgeClassifier":
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train.values.ravel())
        y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"✅ Test Accuracy ({name}): {acc:.4f}")


Training RidgeClassifier...
✅ Test Accuracy (RidgeClassifier): 0.5146

Training RandomForest...


  y = column_or_1d(y, warn=True)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


✅ Test Accuracy (RandomForest): 0.5169

Training LightGBM...
[LightGBM] [Info] Number of positive: 54276, number of negative: 53819
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 108095, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502114 -> initscore=0.008456
[LightGBM] [Info] Start training from score 0.008456
✅ Test Accuracy (LightGBM): 0.5119


In [8]:
print("\n--- Résumé des modèles ---")
for name, acc in results.items():
    print(f"{name:20s}: {acc:.4f}")


--- Résumé des modèles ---
RidgeClassifier     : 0.5146
RandomForest        : 0.5169
LightGBM            : 0.5119
