In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import RidgeClassifier,Ridge
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from catboost import CatBoostClassifier,CatBoostRegressor

In [28]:
X = pd.read_csv('data/X_train.csv',index_col='ROW_ID')
X_test_final = pd.read_csv('data/X_test.csv',index_col='ROW_ID')
y = pd.read_csv('data/y_train.csv',index_col='ROW_ID')
y_bin = (y > 0).astype(int) #binariser l outcome pour faire de la classification et pas regression
sample_submission = pd.read_csv('data/sample_submission.csv',index_col='ROW_ID')

In [29]:

RET_features = [f'RET_{i}' for i in range(1, 20)]
SIGNED_VOLUME_features = [f'SIGNED_VOLUME_{i}' for i in range(1, 20)]
TURNOVER_features = ['AVG_DAILY_TURNOVER']

for i in [3, 5, 10, 15, 20]:
    X[f'AVERAGE_PERF_{i}'] = X[RET_features[:i]].mean(1)
    X[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')
    X_test_final[f'AVERAGE_PERF_{i}'] = X_test_final[RET_features[:i]].mean(1)
    X_test_final[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X_test_final.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')

features = RET_features + SIGNED_VOLUME_features + TURNOVER_features
features += [f'AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]
features += [f'ALLOCATIONS_AVERAGE_PERF_{i}' for i in [3, 5, 10, 15, 20]]

In [30]:
unique_dates = np.sort(X['TS'].unique())
n = len(unique_dates)
train_dates = unique_dates[: int(0.8 * n)]   # 80% train
test_dates = unique_dates[int(0.8 * n):]     # 20% test

train_idx = X['TS'].isin(train_dates)
test_idx = X['TS'].isin(test_dates)

X_train, y_train_bin, y_train_cont = X.loc[train_idx, features], y_bin.loc[train_idx], y.loc[train_idx]
X_test, y_test_bin, y_test_cont = X.loc[test_idx, features], y_bin.loc[test_idx], y.loc[test_idx]


In [31]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.fillna(0))
X_test_scaled = scaler.transform(X_test.fillna(0))

In [32]:
models_classif = {
    "RidgeClassifier": RidgeClassifier(alpha=1e-2, fit_intercept=False),
    "RandomForest": RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1),
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    ),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=500,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss',
    ),
    "CatBoost": CatBoostClassifier(
        iterations=500,
        depth=6,
        learning_rate=0.05,
        random_seed=42,
        verbose=0
    )
}

In [33]:
models_regress = {
    "Ridge": Ridge(alpha=1e-2, fit_intercept=False),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1),
    "LightGBMRegressor": lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    ),
    "XGBoostRegressor": xgb.XGBRegressor(
        n_estimators=500,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    ),
    "CatBoostRegressor": CatBoostRegressor(
        iterations=500,
        depth=6,
        learning_rate=0.05,
        random_seed=42,
        verbose=0
    )
}

In [35]:
results_classif = {}
for name, model in models_classif.items():
    print(f"\n{'='*30}\nTraining {name} (classification)...\n{'='*30}")
    if "Ridge" in name:
        model.fit(X_train_scaled, y_train_bin)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train.fillna(0), y_train_bin.values.ravel())
        y_pred = model.predict(X_test.fillna(0))
    
    acc = accuracy_score(y_test_bin, y_pred)
    results_classif[name] = acc
    print(f"✅ Test Accuracy ({name}): {acc:.4f}")



Training RidgeClassifier (classification)...
✅ Test Accuracy (RidgeClassifier): 0.5175

Training RandomForest (classification)...


  y = column_or_1d(y, warn=True)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


✅ Test Accuracy (RandomForest): 0.5178

Training LightGBM (classification)...
[LightGBM] [Info] Number of positive: 72375, number of negative: 71795
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016942 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 144170, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502012 -> initscore=0.008046
[LightGBM] [Info] Start training from score 0.008046
✅ Test Accuracy (LightGBM): 0.5133

Training XGBoost (classification)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Test Accuracy (XGBoost): 0.5168

Training CatBoost (classification)...
✅ Test Accuracy (CatBoost): 0.5176


In [36]:
results_regress = {}
for name, model in models_regress.items():
    print(f"\n{'='*30}\nTraining {name} (regression)...\n{'='*30}")
    if "Ridge" in name:
        model.fit(X_train_scaled, y_train_cont)
        y_pred_cont = model.predict(X_test_scaled)
    else:
        model.fit(X_train.fillna(0), y_train_cont.values.ravel())
        y_pred_cont = model.predict(X_test.fillna(0))
    
    # Convertir en classe binaire pour comparer accuracy
    y_pred_class = (y_pred_cont > 0).astype(int)
    acc = accuracy_score(y_test_bin, y_pred_class)
    results_regress[name] = acc
    print(f"✅ Test Accuracy ({name} - regression->class): {acc:.4f}")


Training Ridge (regression)...
✅ Test Accuracy (Ridge - regression->class): 0.5136

Training RandomForestRegressor (regression)...


  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


✅ Test Accuracy (RandomForestRegressor - regression->class): 0.5175

Training LightGBMRegressor (regression)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 144170, number of used features: 49
[LightGBM] [Info] Start training from score 0.000013
✅ Test Accuracy (LightGBMRegressor - regression->class): 0.5086

Training XGBoostRegressor (regression)...
✅ Test Accuracy (XGBoostRegressor - regression->class): 0.5135

Training CatBoostRegressor (regression)...
✅ Test Accuracy (CatBoostRegressor - regression->class): 0.5133


In [40]:
# Arrondir à 0.01
results_classif_rounded = {k: round(v, 3) for k, v in results_classif.items()}
results_regress_rounded = {k: round(v, 3) for k, v in results_regress.items()}

print("\nClassification results (arrondi à 0.01):")
print(results_classif_rounded)

print("\nRegression->Classification results (arrondi à 0.01):")
print(results_regress_rounded)



Classification results (arrondi à 0.01):
{'RidgeClassifier': 0.517, 'RandomForest': 0.518, 'LightGBM': 0.513, 'XGBoost': 0.517, 'CatBoost': 0.518}

Regression->Classification results (arrondi à 0.01):
{'Ridge': 0.514, 'RandomForestRegressor': 0.518, 'LightGBMRegressor': 0.509, 'XGBoostRegressor': 0.513, 'CatBoostRegressor': 0.513}
