In [16]:
################ CHARGEMENT ################
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score



X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

train_data = X_train.merge(y_train, on='ROW_ID')
test_data = pd.read_csv('X_test.csv')

In [17]:
################ FEATURES ################

def create_all_features(df):
    ret_cols = [f'RET_{i}' for i in range(1, 21)]
    vol_cols = [f'SIGNED_VOLUME_{i}' for i in range(1, 21)]
    df['ret_mean_5d'] = df[ret_cols[:5]].mean(axis=1) 
    df['ret_median_5d'] = df[ret_cols[:5]].median(axis=1) 
    df['ret_std_5d'] = df[ret_cols[:5]].std(axis=1) 

    df['vol_mean_5d'] = df[vol_cols[:5]].mean(axis=1) 
    df['ret_mean_20d'] = df[ret_cols].mean(axis=1) 

    df['momentum_ratio_5_20'] = df['ret_mean_5d'] / (df['ret_mean_20d'] + 1e-6) 
    rolling_std = df[ret_cols].T.rolling(window=3).std().T # une espece de volatilité sur fenetre glissante de 3 jours
    df['meta_volatility'] = rolling_std.std(axis=1) # la vola de cette vola : indice de stabilité
    df['interaction_ret_turnover'] = df['ret_mean_5d'] * df['AVG_DAILY_TURNOVER']

  
    return df



train_data = create_all_features(train_data)
test_data = create_all_features(test_data)



In [18]:
################ TRAINING ################

train_data['y_binary'] = (train_data['target'] > 0).astype(int)

# features = les RET_i + les SIGNED_VOLUME_i + ds la partie FEATURES
original_ts_cols = [f'RET_{i}' for i in range(1, 21)] + [f'SIGNED_VOLUME_{i}' for i in range(1, 21)]
features_to_use = [col for col in train_data.columns if col not in (['ROW_ID', 'target', 'y_binary'] + original_ts_cols)]

# features_to_use = [
#     "TS", "ALLOCATION", "AVG_DAILY_TURNOVER", "ret_mean_5d", "ret_std_5d",
#     'ret_mean_20d', 'momentum_ratio_5_20', 'meta_volatility',
#     'interaction_ret_turnover', "vol_mean_5d", "vol_std_5d"
# ]

X = train_data[features_to_use].copy()
y = train_data['y_binary']
groups = train_data['TS']
X_test = test_data[features_to_use].copy()

X.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(features_to_use)
N_SPLITS = 20 # TO CHECK : à augmenter? 
gkf = GroupKFold(n_splits=N_SPLITS)
test_preds = []
oof_preds = np.zeros(len(X))
oof_preds_proba = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx].copy()
    X_val, y_val = X.iloc[val_idx].copy(), y.iloc[val_idx].copy()
    
    features_to_process = ['ret_mean_5d', 'momentum_ratio_5_20', 'interaction_ret_turnover', "ret_median_5d"]
    for feature in features_to_process:
        market_map = X_train.groupby('TS')[feature].mean()
        # ici on va comparer X_train à toutes les lignes d'une meme TS 
        X_train[f'diff_from_market_{feature}'] = X_train[feature] - X_train['TS'].map(market_map)
        X_val[f'diff_from_market_{feature}'] = X_val[feature] - X_val['TS'].map(market_map)
        X_train[f'{feature}_rank'] = X_train.groupby('TS')[feature].rank(pct=True)
        X_val[f'{feature}_rank'] = X_val.groupby('TS')[feature].rank(pct=True)

    X_train.fillna(0, inplace=True)
    X_val.fillna(0, inplace=True)
    
    categorical_features = ['ALLOCATION', 'TS']
    # TO DO : Optuna ?
    model = cb.CatBoostClassifier(
        iterations=2000, learning_rate=0.015, depth=6, l2_leaf_reg=4,
        loss_function='Logloss', eval_metric='Accuracy',
        random_seed=fold, verbose=0, early_stopping_rounds=150
    )
    model.fit(X_train, y_train, cat_features=categorical_features, eval_set=(X_val, y_val))
    
    val_preds_proba = model.predict_proba(X_val)[:, 1]
    oof_preds_proba[val_idx] = val_preds_proba.flatten()

    
    val_preds = (val_preds_proba > 0.5).astype(int)
    oof_preds[val_idx] = val_preds.flatten() 
    print(f"Score du Fold {fold+1}: {accuracy_score(y_val, val_preds)}")
    # X_test
    X_test_fold = X_test.copy()
    for feature in features_to_process:
        market_map = X_train.groupby('TS')[feature].mean()
        X_test_fold[f'diff_from_market_{feature}'] = X_test_fold[feature] - X_test_fold['TS'].map(market_map)
        X_test_fold[f'{feature}_rank'] = X_test_fold.groupby('TS')[feature].rank(pct=True)
    X_test_fold.fillna(0, inplace=True)

    preds = model.predict_proba(X_test_fold)[:, 1]
    test_preds.append(preds)

final_score = accuracy_score(y, oof_preds)
print(f"Score CV global sur {N_SPLITS} Folds : {final_score}")

['TS', 'ALLOCATION', 'AVG_DAILY_TURNOVER', 'ret_mean_5d', 'ret_median_5d', 'ret_std_5d', 'vol_mean_5d', 'ret_mean_20d', 'momentum_ratio_5_20', 'meta_volatility', 'interaction_ret_turnover']
Score du Fold 1: 0.5237410071942447
Score du Fold 2: 0.5247371333702269
Score du Fold 3: 0.53480907581627
Score du Fold 4: 0.5224128389596016
Score du Fold 5: 0.5268400664084117
Score du Fold 6: 0.5250691754288876
Score du Fold 7: 0.5275041505257333
Score du Fold 8: 0.5140011068068622
Score du Fold 9: 0.5366906474820143
Score du Fold 10: 0.5432208079690094
Score du Fold 11: 0.5244050913115661
Score du Fold 12: 0.5105700055340343
Score du Fold 13: 0.5352517985611511
Score du Fold 14: 0.5096989966555184
Score du Fold 15: 0.5292084726867335
Score du Fold 16: 0.5229654403567447
Score du Fold 17: 0.5306577480490524
Score du Fold 18: 0.5301003344481605
Score du Fold 19: 0.5287625418060201
Score du Fold 20: 0.5082497212931996
Score CV global sur 20 Folds : 0.5254514688340869


In [19]:
################ CHECKING FEATURE IMPORTANCE ################

feature_importances = model.get_feature_importance()
feature_names = model.feature_names_ 

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

print(importance_df)


                                      feature  importance
0                                          TS   40.096762
2                          AVG_DAILY_TURNOVER   13.551751
1                                  ALLOCATION   10.916264
7                                ret_mean_20d    9.262371
6                                 vol_mean_5d    5.627927
16              interaction_ret_turnover_rank    4.629109
12                           ret_mean_5d_rank    4.319184
11               diff_from_market_ret_mean_5d    2.987413
10                   interaction_ret_turnover    2.624745
3                                 ret_mean_5d    2.292685
17             diff_from_market_ret_median_5d    0.993238
18                         ret_median_5d_rank    0.773982
15  diff_from_market_interaction_ret_turnover    0.702441
4                               ret_median_5d    0.544687
9                             meta_volatility    0.278506
5                                  ret_std_5d    0.259987
8             

In [None]:
################ SUBMIT ################

best_score = 0
best_proportion = 0.5
for proportion_to_predict_1 in np.arange(0.1, 0.9, 0.01):
    threshold = np.quantile(oof_preds_proba, 1 - proportion_to_predict_1)
    score = accuracy_score(y, oof_preds_proba > threshold)
    if score > best_score:
        best_score = score
        best_proportion = proportion_to_predict_1

print(f"Meilleur score CV avec seuil optimisé : {best_score}")
print(f"Correspond à une proportion de '1' de : {best_proportion}")


final_predictions_proba = np.mean(test_preds, axis=0)

final_threshold = np.quantile(final_predictions_proba, 1 - best_proportion)

print(f"Seuil final appliqué sur le test set : {final_threshold}")

final_predictions = (final_predictions_proba > final_threshold).astype(int)


submission = pd.DataFrame({'ROW_ID': test_data['ROW_ID'], 'prediction': final_predictions})
submission.to_csv('submission20_catboost_vol3d.csv', index=False)


print((final_predictions == 1).sum())


Meilleur score CV avec seuil optimisé : 0.52560
Correspond à une proportion de '1' de : 0.44
Seuil final appliqué sur le test set : 0.4981

Fichier de soumission créé !
Nombre de 1 prédits : 3403 sur 7735


In [None]:
# ['TS', 'ALLOCATION', 'AVG_DAILY_TURNOVER', 'ret_mean_5d', 'ret_median_5d', 'ret_std_5d', 'vol_mean_5d', 'ret_mean_20d', 'momentum_ratio_5_20', 'meta_volatility', 'interaction_ret_turnover']
# Score du Fold 1: 0.5237410071942447
# Score du Fold 2: 0.5247371333702269
# Score du Fold 3: 0.53480907581627
# Score du Fold 4: 0.5224128389596016
# Score du Fold 5: 0.5268400664084117
# Score du Fold 6: 0.5250691754288876
# Score du Fold 7: 0.5275041505257333
# Score du Fold 8: 0.5140011068068622
# Score du Fold 9: 0.5366906474820143
# Score du Fold 10: 0.5432208079690094
# Score du Fold 11: 0.5244050913115661
# Score du Fold 12: 0.5105700055340343
# Score du Fold 13: 0.5352517985611511
# Score du Fold 14: 0.5096989966555184
# Score du Fold 15: 0.5292084726867335
# Score du Fold 16: 0.5229654403567447
# Score du Fold 17: 0.5306577480490524
# Score du Fold 18: 0.5301003344481605
# Score du Fold 19: 0.5287625418060201
# Score du Fold 20: 0.5082497212931996
# Score CV global sur 20 Folds : 0.5254514688340869


# ['TS', 'ALLOCATION', 'AVG_DAILY_TURNOVER', 'ret_mean_5d', 'ret_median_5d', 'ret_std_5d', 'vol_mean_3d', 'ret_mean_20d', 'momentum_ratio_5_20', 'meta_volatility', 'interaction_ret_turnover']
# Score du Fold 1: 0.5278361925843941
# Score du Fold 2: 0.5249584947426674
# Score du Fold 3: 0.5369120088544549
# Score du Fold 4: 0.5380188157166574
# Score du Fold 5: 0.5268400664084117
# Score du Fold 6: 0.5225235196458218
# Score du Fold 7: 0.5223021582733813
# Score du Fold 8: 0.5178749308245711
# Score du Fold 9: 0.5337022689540675
# Score du Fold 10: 0.5408965135583841
# Score du Fold 11: 0.5240730492529053
# Score du Fold 12: 0.5105700055340343
# Score du Fold 13: 0.5344770337576092
# Score du Fold 14: 0.5051282051282051
# Score du Fold 15: 0.5292084726867335
# Score du Fold 16: 0.5249721293199554
# Score du Fold 17: 0.5302118171683389
# Score du Fold 18: 0.5302118171683389
# Score du Fold 19: 0.5308807134894091
# Score du Fold 20: 0.5004459308807135
# Score CV global sur 20 Folds : 0.5256123609531471