In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

# 1. Load the data
# Read the feature table and the target table, then join them on their shared ROW_ID key.
X_train_df, y_train_df = (
    pd.read_csv(csv_name) for csv_name in ("X_train.csv", "y_train.csv")
)
data = pd.merge(X_train_df, y_train_df, on="ROW_ID")



XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/imade/Downloads/qrt-asset-allocation-performance-forecasting/.venv/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <E8D72161-CCD1-3423-9388-36D4CA0A7524> /Users/imade/Downloads/qrt-asset-allocation-performance-forecasting/.venv/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file)"]


In [None]:
# 2. Enriched feature-engineering function
def create_rich_features(df):
    """Add row-wise summary features of the return/volume lags to ``df``.

    The new columns are assigned directly on ``df`` (the frame is modified in
    place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``RET_1`` .. ``RET_20``,
        ``SIGNED_VOLUME_1`` .. ``SIGNED_VOLUME_20`` and ``TS``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added:
        ``RET_MEAN``, ``RET_STD``, ``RET_MEDIAN``, ``RET_SKEW``, ``RET_KURT``
        (row-wise statistics over the 20 ``RET_*`` columns);
        ``RET_MOMENTUM`` (least-squares slope of the 20 ``RET_*`` values
        against their position 0..19, missing values counted as 0);
        ``VOL_MEAN``, ``VOL_STD`` (row-wise statistics over ``SIGNED_VOLUME_*``);
        ``SHARPE_RATIO_PROXY`` (``RET_MEAN / (RET_STD + 1e-6)``);
        ``RET_AVG_5``, ``RET_AVG_10`` (row-wise mean of the first 5 / 10
        ``RET_*`` columns) and ``GLOBAL_AVG_RET_5``, ``GLOBAL_AVG_RET_10``
        (mean of the corresponding ``RET_AVG_w`` within each ``TS`` group).

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    ret_cols = [f'RET_{i}' for i in range(1, 21)]
    vol_cols = [f'SIGNED_VOLUME_{i}' for i in range(1, 21)]

    returns = df[ret_cols]
    df["RET_MEAN"] = returns.mean(axis=1)
    df["RET_STD"] = returns.std(axis=1)
    df["RET_MEDIAN"] = returns.median(axis=1)
    df["RET_SKEW"] = returns.skew(axis=1)
    df["RET_KURT"] = returns.kurtosis(axis=1)

    # Slope of a degree-1 least-squares fit, computed for every row at once:
    #   slope = sum((x - mean(x)) * y) / sum((x - mean(x)) ** 2)
    # This is the coefficient np.polyfit(x, y, 1)[0] returns, without one
    # Python-level polyfit call per row.
    x = np.arange(len(ret_cols), dtype=float)
    x_centered = x - x.mean()
    filled_returns = returns.fillna(0).to_numpy(dtype=float)
    df['RET_MOMENTUM'] = (filled_returns @ x_centered) / (x_centered @ x_centered)

    volumes = df[vol_cols]
    df["VOL_MEAN"] = volumes.mean(axis=1)
    df["VOL_STD"] = volumes.std(axis=1)

    # The small epsilon keeps the ratio finite when a row's returns have zero spread.
    df["SHARPE_RATIO_PROXY"] = df["RET_MEAN"] / (df["RET_STD"] + 1e-6)

    for w in [5, 10]:
        ret_window = ret_cols[:w]
        df[f"RET_AVG_{w}"] = df[ret_window].mean(axis=1)
        # Same value for every row sharing a TS: the group-wide mean of RET_AVG_w.
        df[f"GLOBAL_AVG_RET_{w}"] = df.groupby("TS")[f"RET_AVG_{w}"].transform("mean")

    return df

# Compute the engineered features on the merged training frame; the function
# assigns the new columns on the frame it receives and returns that same frame.
data = create_rich_features(data)



In [None]:
# 3. Final data preparation
# Binary label: 1 when TARGET is strictly positive, 0 otherwise.
data['y_binary'] = data['TARGET'].gt(0).astype(int)

# Use every engineered feature together with the original ones.
original_features = [
    *(f'RET_{i}' for i in range(1, 21)),
    *(f'SIGNED_VOLUME_{i}' for i in range(1, 21)),
    'AVG_DAILY_TURNOVER',
]
created_features = [
    'RET_MEAN', 'RET_STD', 'RET_MEDIAN', 'RET_SKEW', 'RET_KURT', 'RET_MOMENTUM',
    'VOL_MEAN', 'VOL_STD', 'SHARPE_RATIO_PROXY',
    'RET_AVG_5', 'GLOBAL_AVG_RET_5',
    'RET_AVG_10', 'GLOBAL_AVG_RET_10',
]

feature_cols = [*original_features, *created_features]
y = data['y_binary']

# Turn any +/-inf (possible after a division) into NaN, then impute every NaN
# with 0 -- a simple imputation to start from. The chained calls return a new
# frame, so `data` itself is left untouched.
X = (
    data[feature_cols]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# 4. Training with cross-validation and early stopping
N_SPLITS = 5

# The GLOBAL_AVG_RET_* features are aggregates over each "TS" group, so every
# row of a given TS carries the same group-level values. A plain shuffled split
# would put rows of one TS on both sides of a fold and let the validation score
# benefit from information shared with the training rows. Keeping each TS group
# inside a single fold (while still stratifying on the label) avoids that.
cv_groups = data["TS"]
skf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold predictions, filled fold by fold, plus the fitted model of each fold.
oof_preds = np.zeros(len(data))
models = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y, groups=cv_groups)):
    print(f"===== FOLD {fold+1} =====")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = xgb.XGBClassifier(
        n_estimators=1000,       # upper bound on trees; early stopping picks the actual count
        max_depth=4,             # fairly shallow trees to limit overfitting
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,               # a little extra regularisation
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50 # stop after 50 rounds without improvement on eval_set
    )

    # NOTE(review): the fold used for early stopping is the same fold that is
    # scored below, so the reported accuracy is somewhat optimistic.
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=False)

    val_preds = model.predict(X_val)
    oof_preds[val_idx] = val_preds
    models.append(model)
    print(f"Score Fold {fold+1}: {accuracy_score(y_val, val_preds)}")

# Overall cross-validation score computed on the out-of-fold predictions
print(f"\nScore CV global: {accuracy_score(y, oof_preds)}")