In [21]:
################ CHARGEMENT ################

import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score


X_train_df = pd.read_csv('X_train.csv')
y_train_df = pd.read_csv('y_train.csv')

train_data = X_train_df.merge(y_train_df, on='ROW_ID')
test_data = pd.read_csv('X_test.csv')

In [22]:
################ FEATURES part1 : mu et sigma ################

def mle_params_all_from_group(g, r_cols, delta_t=1.0):
    """Calcule les paramètres mu et sigma pour un groupe d'allocation."""
    arr = g[r_cols].to_numpy(dtype=float)
    mean_r = arr.mean()
    var_r = arr.var(ddof=0)
    sigma_hat = np.sqrt(var_r / delta_t)
    mu_hat = (mean_r / delta_t) + 0.5 * (sigma_hat**2)
    return pd.Series({"mu_alloc": mu_hat, "sigma_alloc": sigma_hat})

def compute_allocation_param(df, r_cols, delta_t=1.0):
    """Applique le calcul de mu et sigma à toutes les allocations d'un dataframe."""
    out = (df.groupby("ALLOCATION", observed=True)
           .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
           .reset_index()
           )
    return out

In [23]:
################ FEATURES part2 : ligne par ligne ################

def create_base_features(df):
    """Crée les features de base calculées par ligne (row-wise)."""
    ret_cols = [f'RET_{i}' for i in range(1, 21)]
    vol_cols = [f'SIGNED_VOLUME_{i}' for i in range(1, 21)]
    
    df['ret_mean_5d'] = df[ret_cols[:5]].mean(axis=1)
    # df['ret_std_5d'] = df[ret_cols[:5]].std(axis=1)
    df['ret_mean_20d'] = df[ret_cols].mean(axis=1)
    df['momentum_ratio_5_20'] = df['ret_mean_5d'] / (df['ret_mean_20d'] + 1e-6)

    df['vol_mean_3d'] = df[vol_cols[:3]].mean(axis=1)
    rolling_std = df[ret_cols].T.rolling(window=3).std().T
    df['meta_volatility'] = rolling_std.std(axis=1)
    df['interaction_ret_turnover'] = df['ret_mean_5d'] * df['AVG_DAILY_TURNOVER']
    return df

train_data = create_base_features(train_data)
test_data = create_base_features(test_data)
train_data['y_binary'] = (train_data['target'] > 0).astype(int)


In [25]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score


original_ts_cols = [f'RET_{i}' for i in range(1, 21)] + [f'SIGNED_VOLUME_{i}' for i in range(1, 21)]
final_features_to_use = [col for col in train_data.columns if col not in (['ROW_ID', 'target', 'y_binary'] + original_ts_cols)]

N_SPLITS = 20
gkf = GroupKFold(n_splits=N_SPLITS)
test_preds = [] 
oof_preds_proba = np.zeros(len(train_data))
return_cols = [f'RET_{i}' for i in range(1, 21)]

print(final_features_to_use)
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_data, train_data['y_binary'], groups=train_data['TS'])):
    train_fold_data = train_data.iloc[train_idx].copy()
    val_fold_data = train_data.iloc[val_idx].copy()
    
    y_train = train_fold_data['y_binary']
    y_val = val_fold_data['y_binary']
    
    alloc_params_fold = compute_allocation_param(train_fold_data, return_cols)
    train_fold_data = train_fold_data.merge(alloc_params_fold[['ALLOCATION', "mu_alloc", "sigma_alloc"]], on="ALLOCATION", how='left')
    val_fold_data = val_fold_data.merge(alloc_params_fold[['ALLOCATION', "mu_alloc", "sigma_alloc"]], on="ALLOCATION", how='left')
    
    features_to_process = ['ret_mean_5d', 'momentum_ratio_5_20', 'interaction_ret_turnover', 'mu_alloc', 'sigma_alloc', 'vol_mean_3d', 'ret_mean_20d']

    
    for feature in features_to_process:
        market_map = train_fold_data.groupby('TS', observed=False)[feature].mean()
        train_fold_data[f'diff_from_market_{feature}'] = train_fold_data[feature] - train_fold_data['TS'].map(market_map)
        val_fold_data[f'diff_from_market_{feature}'] = val_fold_data[feature] - val_fold_data['TS'].map(market_map)
        train_fold_data[f'{feature}_rank'] = train_fold_data.groupby('TS', observed=False)[feature].rank(pct=True)
        val_fold_data[f'{feature}_rank'] = val_fold_data.groupby('TS', observed=False)[feature].rank(pct=True)

    all_features_for_model = final_features_to_use + ['mu_alloc', 'sigma_alloc'] + [f'diff_from_market_{f}' for f in features_to_process] + [f'{f}_rank' for f in features_to_process]
    
    X_train_final = train_fold_data[all_features_for_model].copy()
    X_val_final = val_fold_data[all_features_for_model].copy()
    
    X_train_final.fillna(0, inplace=True)
    X_val_final.fillna(0, inplace=True)

    categorical_features = ['ALLOCATION', 'TS']
    model = cb.CatBoostClassifier(
        iterations=2000, learning_rate=0.015, depth=6, l2_leaf_reg=4,
        loss_function='Logloss', eval_metric='Accuracy',
        random_seed=fold, verbose=0, early_stopping_rounds=150
    )
    model.fit(X_train_final, y_train, cat_features=categorical_features, eval_set=(X_val_final, y_val))
    
    val_preds_proba = model.predict_proba(X_val_final)[:, 1]
    oof_preds_proba[val_idx] = val_preds_proba.flatten()
    print(f"Score du Fold {fold+1}: {accuracy_score(y_val, (val_preds_proba > 0.5).astype(int))}")
    
    # X_test
    X_test_fold = test_data.copy()
    X_test_fold = X_test_fold.merge(alloc_params_fold[['ALLOCATION', "mu_alloc", "sigma_alloc"]], on="ALLOCATION", how='left')
    for feature in features_to_process:
        market_map = train_fold_data.groupby('TS', observed=False)[feature].mean() # On utilise la map du train
        X_test_fold[f'diff_from_market_{feature}'] = X_test_fold[feature] - X_test_fold['TS'].map(market_map)
        X_test_fold[f'{feature}_rank'] = X_test_fold.groupby('TS', observed=False)[feature].rank(pct=True)
        
    X_test_final = X_test_fold[all_features_for_model].copy()
    X_test_final.fillna(0, inplace=True)

    preds = model.predict_proba(X_test_final)[:, 1]
    test_preds.append(preds)

final_predictions_binary = (oof_preds_proba > 0.5).astype(int)
final_score = accuracy_score(train_data['y_binary'], final_predictions_binary)
print(f"Score CV global sur {N_SPLITS} Folds (avec seuil 0.5) : {final_score}")


['TS', 'ALLOCATION', 'AVG_DAILY_TURNOVER', 'ret_mean_5d', 'ret_mean_20d', 'momentum_ratio_5_20', 'vol_mean_3d', 'meta_volatility', 'interaction_ret_turnover']


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 1: 0.5224128389596016


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 2: 0.5299391256225788


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 3: 0.5303818483674598


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 4: 0.5309352517985612


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 5: 0.5163254012174876


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 6: 0.5245157719977864


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 7: 0.5174322080796901


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 8: 0.5251798561151079


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 9: 0.5319313779745435


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 10: 0.5394576646375208


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 11: 0.5234089651355839


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 12: 0.5194244604316547


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 13: 0.5370226895406751


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 14: 0.5095875139353401


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 15: 0.5238573021181717


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 16: 0.5258639910813824


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 17: 0.5124860646599777


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 18: 0.5347826086956522


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 19: 0.5276477146042363


  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))


Score du Fold 20: 0.512263099219621
Score CV global sur 20 Folds (avec seuil 0.5) : 0.524752420316791


In [7]:
################ CHECKING FEATURE IMPORTANCE ################

feature_importances = model.get_feature_importance()
feature_names = model.feature_names_ 

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

print(importance_df)


                                      feature  importance
0                                          TS   37.946455
1                                  ALLOCATION   11.761531
15                  diff_from_market_mu_alloc    8.744635
22                              mu_alloc_rank    7.696413
23                           sigma_alloc_rank    7.374690
10                                   mu_alloc    6.147024
2                          AVG_DAILY_TURNOVER    4.392056
16               diff_from_market_sigma_alloc    4.220002
11                                sigma_alloc    3.496745
18              diff_from_market_ret_mean_20d    2.416715
24                           vol_mean_3d_rank    1.671145
7                                 vol_mean_3d    1.183842
12               diff_from_market_ret_mean_5d    1.167446
9                    interaction_ret_turnover    0.871049
17               diff_from_market_vol_mean_3d    0.618775
4                                  ret_std_5d    0.206240
14  diff_from_

In [9]:
################ SUBMIT ################

best_score = 0
best_proportion = 0.5
for proportion_to_predict_1 in np.arange(0.1, 0.9, 0.01):
    threshold = np.quantile(oof_preds_proba, 1 - proportion_to_predict_1)
    score = accuracy_score(train_data['y_binary'], oof_preds_proba > threshold)
    if score > best_score:
        best_score = score
        best_proportion = proportion_to_predict_1

print(f"Meilleur score CV avec seuil optimisé : {best_score}")
print(f"Correspond à une proportion de '1' de : {best_proportion}")


final_predictions_proba = np.mean(test_preds, axis=0)

final_threshold = np.quantile(final_predictions_proba, 1 - best_proportion)

print(f"Seuil final appliqué sur le test set : {final_threshold:.4f}")

final_predictions = (final_predictions_proba > final_threshold).astype(int)


submission = pd.DataFrame({'ROW_ID': test_data['ROW_ID'], 'prediction': final_predictions})
submission.to_csv('submission20_ret20diff_vol3d.csv', index=False)

print("Fichier de soumission créé !")
print(f"Nombre de 1 prédits : {(final_predictions == 1).sum()} sur {len(final_predictions)}")

Meilleur score CV avec seuil optimisé : 0.5258620211378956
Correspond à une proportion de '1' de : 0.4299999999999998
Seuil final appliqué sur le test set : 0.4984
Fichier de soumission créé !
Nombre de 1 prédits : 3326 sur 7735


In [None]:
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 1: 0.5220807969009408
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 2: 0.5263973436635307
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 3: 0.5291643608190371
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 4: 0.5329275041505257
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 5: 0.5189817376867737
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 6: 0.5251798561151079
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 7: 0.5227448810182623
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 8: 0.5238516878804649
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 9: 0.534366353071389
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 10: 0.5417819590481461
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 11: 0.5216380741560598
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 12: 0.5123408965135584
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 13: 0.5355838406198118
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 14: 0.5042363433667781
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 15: 0.525752508361204
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 16: 0.5229654403567447
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 17: 0.5324414715719064
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 18: 0.5385730211817168
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 19: 0.5284280936454849
/var/folders/yp/c6z3dm3s1s1dx6tz47yy15gc0000gn/T/ipykernel_25993/3182933904.py:15: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda g: mle_params_all_from_group(g, r_cols, delta_t))
Score du Fold 20: 0.5228539576365663

Score CV global sur 20 Folds (avec seuil 0.5) : 0.52612




                                      feature  importance
0                                          TS   37.946455
1                                  ALLOCATION   11.761531
15                  diff_from_market_mu_alloc    8.744635
22                              mu_alloc_rank    7.696413
23                           sigma_alloc_rank    7.374690
10                                   mu_alloc    6.147024
2                          AVG_DAILY_TURNOVER    4.392056
16               diff_from_market_sigma_alloc    4.220002
11                                sigma_alloc    3.496745
18              diff_from_market_ret_mean_20d    2.416715
24                           vol_mean_3d_rank    1.671145
7                                 vol_mean_3d    1.183842
12               diff_from_market_ret_mean_5d    1.167446
9                    interaction_ret_turnover    0.871049
17               diff_from_market_vol_mean_3d    0.618775
4                                  ret_std_5d    0.206240
14  diff_from_market_interaction_ret_turnover    0.076024
20                   momentum_ratio_5_20_rank    0.009213
21              interaction_ret_turnover_rank    0.000000
13       diff_from_market_momentum_ratio_5_20    0.000000
19                           ret_mean_5d_rank    0.000000
8                             meta_volatility    0.000000
6                         momentum_ratio_5_20    0.000000
5                                ret_mean_20d    0.000000
3                                 ret_mean_5d    0.000000
25                          ret_mean_20d_rank    0.000000
