In [12]:
################ CHARGEMENT ################
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score



X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

train_data = X_train.merge(y_train, on='ROW_ID')
test_data = pd.read_csv('X_test.csv')

In [13]:
################ FEATURES ################

def create_all_features(df):
    ret_cols = [f'RET_{i}' for i in range(1, 21)]
    df['ret_mean_5d'] = df[ret_cols[:5]].mean(axis=1) 
    df['ret_std_5d'] = df[ret_cols[:5]].std(axis=1) 
    df['ret_mean_20d'] = df[ret_cols].mean(axis=1) 
    df['momentum_ratio_5_20'] = df['ret_mean_5d'] / (df['ret_mean_20d'] + 1e-6) 
    rolling_std = df[ret_cols].T.rolling(window=3).std().T # une espece de volatilité sur fenetre glissante de 3 jours
    df['meta_volatility'] = rolling_std.std(axis=1) # la vola de cette vola : indice de stabilité
    df['interaction_ret_turnover'] = df['ret_mean_5d'] * df['AVG_DAILY_TURNOVER']
    return df

train_data = create_all_features(train_data)
test_data = create_all_features(test_data)


In [7]:
train_data.columns

Index(['ROW_ID', 'TS', 'ALLOCATION', 'RET_20', 'RET_19', 'RET_18', 'RET_17',
       'RET_16', 'RET_15', 'RET_14', 'RET_13', 'RET_12', 'RET_11', 'RET_10',
       'RET_9', 'RET_8', 'RET_7', 'RET_6', 'RET_5', 'RET_4', 'RET_3', 'RET_2',
       'RET_1', 'SIGNED_VOLUME_20', 'SIGNED_VOLUME_19', 'SIGNED_VOLUME_18',
       'SIGNED_VOLUME_17', 'SIGNED_VOLUME_16', 'SIGNED_VOLUME_15',
       'SIGNED_VOLUME_14', 'SIGNED_VOLUME_13', 'SIGNED_VOLUME_12',
       'SIGNED_VOLUME_11', 'SIGNED_VOLUME_10', 'SIGNED_VOLUME_9',
       'SIGNED_VOLUME_8', 'SIGNED_VOLUME_7', 'SIGNED_VOLUME_6',
       'SIGNED_VOLUME_5', 'SIGNED_VOLUME_4', 'SIGNED_VOLUME_3',
       'SIGNED_VOLUME_2', 'SIGNED_VOLUME_1', 'AVG_DAILY_TURNOVER', 'target',
       'ret_mean_5d', 'ret_std_5d', 'ret_mean_20d', 'momentum_ratio_5_20',
       'meta_volatility', 'interaction_ret_turnover', 'y_binary'],
      dtype='object')

In [14]:
################ TRAINING ################

train_data['y_binary'] = (train_data['target'] > 0).astype(int)

# features = les RET_i + les SIGNED_VOLUME_i + ds la partie FEATURES
original_ts_cols = [f'RET_{i}' for i in range(1, 21)] + [f'SIGNED_VOLUME_{i}' for i in range(1, 21)]
features_to_use = [col for col in train_data.columns if col not in (['ROW_ID', 'target', 'y_binary'] + original_ts_cols)]

# features_to_use = [
#     "TS", "ALLOCATION", "AVG_DAILY_TURNOVER", "ret_mean_5d", "ret_std_5d",
#     'ret_mean_20d', 'momentum_ratio_5_20', 'meta_volatility',
#     'interaction_ret_turnover', "vol_mean_5d", "vol_std_5d"
# ]

X = train_data[features_to_use].copy()
y = train_data['y_binary']
groups = train_data['TS']
X_test = test_data[features_to_use].copy()

X.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(features_to_use)
N_SPLITS = 20 # TO CHECK : à augmenter? 
gkf = GroupKFold(n_splits=N_SPLITS)
test_preds = []
oof_preds = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx].copy()
    X_val, y_val = X.iloc[val_idx].copy(), y.iloc[val_idx].copy()
    
    features_to_process = ['ret_mean_5d', 'momentum_ratio_5_20', 'interaction_ret_turnover']
    for feature in features_to_process:
        market_map = X_train.groupby('TS')[feature].mean()
        # ici on va comparer X_train à toutes les lignes d'une meme TS 
        X_train[f'diff_from_market_{feature}'] = X_train[feature] - X_train['TS'].map(market_map)
        X_val[f'diff_from_market_{feature}'] = X_val[feature] - X_val['TS'].map(market_map)
        X_train[f'{feature}_rank'] = X_train.groupby('TS')[feature].rank(pct=True)
        X_val[f'{feature}_rank'] = X_val.groupby('TS')[feature].rank(pct=True)

    X_train.fillna(0, inplace=True)
    X_val.fillna(0, inplace=True)
    
    categorical_features = ['ALLOCATION', 'TS']
    # TO DO : Optuna ?
    model = cb.CatBoostClassifier(
        iterations=2000, learning_rate=0.015, depth=6, l2_leaf_reg=4,
        loss_function='Logloss', eval_metric='Accuracy',
        random_seed=fold, verbose=0, early_stopping_rounds=150
    )
    model.fit(X_train, y_train, cat_features=categorical_features, eval_set=(X_val, y_val))
    
    val_preds = model.predict(X_val)
    oof_preds[val_idx] = val_preds.flatten()
    print(f"Score du Fold {fold+1}: {accuracy_score(y_val, val_preds)}")
    
    # pareil pour X_test
    X_test_fold = X_test.copy()
    for feature in features_to_process:
        market_map = X_train.groupby('TS')[feature].mean()
        X_test_fold[f'diff_from_market_{feature}'] = X_test_fold[feature] - X_test_fold['TS'].map(market_map)
        X_test_fold[f'{feature}_rank'] = X_test_fold.groupby('TS')[feature].rank(pct=True)
    X_test_fold.fillna(0, inplace=True)

    preds = model.predict_proba(X_test_fold)[:, 1]
    test_preds.append(preds)

final_score = accuracy_score(y, oof_preds)
print(f"Score CV global sur {N_SPLITS} Folds : {final_score}")

['TS', 'ALLOCATION', 'AVG_DAILY_TURNOVER', 'ret_mean_5d', 'ret_std_5d', 'ret_mean_20d', 'momentum_ratio_5_20', 'meta_volatility', 'interaction_ret_turnover']
Score du Fold 1: 0.5249584947426674
Score du Fold 2: 0.5280575539568345


KeyboardInterrupt: 

In [None]:
['TS', 'ALLOCATION', 'AVG_DAILY_TURNOVER', 'ret_mean_5d', 'ret_std_5d', 'ret_mean_20d', 'momentum_ratio_5_20', 'meta_volatility', 'interaction_ret_turnover']
Score du Fold 1: 0.5249584947426674
Score du Fold 2: 0.5280575539568345
Score du Fold 3: 0.5337022689540675
Score du Fold 4: 0.5327061427780853
Score du Fold 5: 0.521306032097399
Score du Fold 6: 0.5176535694521306
Score du Fold 7: 0.52174875484228
Score du Fold 8: 0.5158826784726065
Score du Fold 9: 0.5315993359158827
Score du Fold 10: 0.543884892086331
Score du Fold 11: 0.5227448810182623
Score du Fold 12: 0.5167681239623686
Score du Fold 13: 0.533923630326508
Score du Fold 14: 0.5100334448160535
Score du Fold 15: 0.5307692307692308
Score du Fold 16: 0.5216276477146042
Score du Fold 17: 0.5220735785953178
Score du Fold 18: 0.526644370122631
Score du Fold 19: 0.5296544035674471
Score du Fold 20: 0.5099219620958751
Score CV global sur 20 Folds : 0.5247912563455297

In [None]:
feature_importances = model.get_feature_importance()
importance_df = pd.DataFrame({
    'feature': model.feature_names_,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

print(importance_df)

                                      feature  importance
0                                          TS   41.217718
2                          AVG_DAILY_TURNOVER   15.632429
1                                  ALLOCATION   11.788091
5                                ret_mean_20d   10.004636
10                           ret_mean_5d_rank    5.086727
14              interaction_ret_turnover_rank    4.816476
9                diff_from_market_ret_mean_5d    4.803442
8                    interaction_ret_turnover    1.948992
3                                 ret_mean_5d    1.854604
13  diff_from_market_interaction_ret_turnover    1.626853
4                                  ret_std_5d    0.419053
7                             meta_volatility    0.372108
6                         momentum_ratio_5_20    0.251686
12                   momentum_ratio_5_20_rank    0.119882
11       diff_from_market_momentum_ratio_5_20    0.057302


In [9]:
################ SUBMIT ################

final_predictions_proba = np.mean(test_preds, axis=0)
final_predictions = (final_predictions_proba > 0.5).astype(int)

submission = pd.DataFrame({'ROW_ID': test_data['ROW_ID'], 'prediction': final_predictions})

submission.to_csv('submission20_with_new_features.csv', index=False) 


In [None]:
(submission["prediction"]==1).sum()