In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import lightgbm as lgbm
from autogluon.tabular import TabularPredictor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _load_table(path):
    """Read one CSV file, using its ROW_ID column as the index."""
    return pd.read_csv(path, index_col='ROW_ID')


# Training features, final (unlabelled) test features, raw targets and the
# submission template all share the ROW_ID index.
X = _load_table('data/X_train.csv')
X_test_final = _load_table('data/X_test.csv')
y_train = _load_table('data/y_train.csv')
sample_submission = _load_table('data/sample_submission.csv')

# Binarise the outcome (1 when strictly positive, else 0) so the task is
# treated as classification rather than regression.
y_bin = y_train.gt(0).astype(int)

In [3]:
# Column-name groups for the raw inputs. range(1, 20) yields lags 1..19.
RET_features = [f'RET_{lag}' for lag in range(1, 20)]
SIGNED_VOLUME_features = [f'SIGNED_VOLUME_{lag}' for lag in range(1, 20)]
TURNOVER_features = ['AVG_DAILY_TURNOVER']

# Apply the same derived features to the training frame and to the final
# test frame (both are modified in place).
for frame in (X, X_test_final):
    for window in [3, 5, 10, 15, 20]:
        perf_col = f'AVERAGE_PERF_{window}'
        # Row-wise mean of the first `window` return columns.
        # NOTE: RET_features holds 19 names, so window=20 averages all 19.
        frame[perf_col] = frame[RET_features[:window]].mean(axis=1)
        # Mean of that average over every row sharing the same TS value.
        frame[f'ALLOCATIONS_AVERAGE_PERF_{window}'] = (
            frame.groupby('TS')[perf_col].transform('mean')
        )


In [4]:
# Final model inputs: raw returns, signed volumes, turnover, then the derived
# per-row averages followed by their per-TS means (same order as before).
_perf_windows = [3, 5, 10, 15, 20]
features = [
    *RET_features,
    *SIGNED_VOLUME_features,
    *TURNOVER_features,
    *(f'AVERAGE_PERF_{window}' for window in _perf_windows),
    *(f'ALLOCATIONS_AVERAGE_PERF_{window}' for window in _perf_windows),
]

In [5]:
# Split train / tuning / test by whole TS groups to avoid data leakage:
# every row sharing a TS value lands in the same subset.
unique_dates = np.sort(X['TS'].unique())
n = len(unique_dates)

# First 60% of the sorted TS values -> train, next 20% -> tuning, rest -> test.
train_end, tuning_end = int(0.6 * n), int(0.8 * n)
train_dates, tuning_dates, test_dates = np.split(unique_dates, [train_end, tuning_end])

# Boolean row masks, one per subset.
train_idx = X['TS'].isin(train_dates)
tuning_idx = X['TS'].isin(tuning_dates)
test_idx = X['TS'].isin(test_dates)

# Materialise independent copies of the features and binarised targets.
# Note that y_train is rebound here to the binarised training subset.
X_train, y_train = X.loc[train_idx].copy(), y_bin.loc[train_idx].copy()
X_tune, y_tune = X.loc[tuning_idx].copy(), y_bin.loc[tuning_idx].copy()
X_test, y_test = X.loc[test_idx].copy(), y_bin.loc[test_idx].copy()

In [6]:

# Assemble the frames AutoGluon expects: the selected features plus the
# binarised label stored under the 'target' column.
train_data = X_train[features].copy()
train_data['target'] = y_train

tuning_data = X_tune[features].copy()
tuning_data['target'] = y_tune

save_path = "Autogluon_TS_grouped"
predictor = TabularPredictor(label='target', eval_metric='accuracy', path=save_path, verbosity=3, problem_type='binary')

# 'best_quality' together with num_bag_folds enables bagging. In bagged mode
# AutoGluon only accepts an explicit `tuning_data` when it is told to use it
# as a holdout set, hence use_bag_holdout=True. This keeps the TS-grouped
# tuning split as the validation data instead of mixing TS groups.
predictor.fit(
    train_data=train_data,
    tuning_data=tuning_data,
    use_bag_holdout=True,
    presets='best_quality',
    time_limit=3600,  # seconds
    num_bag_folds = 7, 
    num_stack_levels = 4, 
    auto_stack=True,
    dynamic_stacking=True  
)

Verbosity: 3 (Detailed Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:40 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T8132
CPU Count:          10
GPU Count:          1
Memory Avail:       3.19 GB / 16.00 GB (19.9%)
Disk Space Avail:   40.69 GB / 228.27 GB (17.8%)
Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': True,
 'num_bag_folds': 7,
 'num_bag_sets': 1,
 'num_stack_levels': 4}
Full kwargs:
{'_experimental_dynamic_hyperparameters': False,
 '_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': True,
 'calibrate': 'auto',
 'delay_bag_sets': False,
 'ds_args': {'clean_up_fits': True,
             'detection_time_frac': 0.25,
             'enable_callbacks': False,
             'enable_ray_logging': True,
             'holdout_data': None

KeyboardInterrupt: 

In [None]:
# Labelled copy of the internal test split: leaderboard() and
# feature_importance() need the label column in `data` to compute scores.
test_data = X_test[features].copy()
test_data[predictor.label] = y_test

y_pred = predictor.predict(test_data[features])
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy (grouped by TS): {acc*100:.2f}%")

# leaderboard & feature importances
lb = predictor.leaderboard(test_data, silent=True)
# When `data` is supplied the inference-time column is 'pred_time_test'.
print(lb[['model','score_val','score_test','fit_time','pred_time_test']])

fi = predictor.feature_importance(test_data)
print(fi.head(30))

In [None]:
preds = predictor.predict(X_test_final[features])

# Turn the predictions into a DataFrame aligned on the submission ROW_IDs.
# Alignment is done explicitly so that a ROW_ID missing from the predictions
# is detected instead of silently becoming NaN (and then 0 in the binary file).
preds_df = preds.reindex(sample_submission.index).to_frame('target')
n_missing = int(preds_df['target'].isna().sum())
if n_missing:
    raise ValueError(
        f"{n_missing} ROW_ID(s) of sample_submission have no prediction; "
        "X_test_final and sample_submission indexes do not match"
    )

# Export the predictions, then a copy thresholded at 0.5 and cast to int.
preds_df.to_csv("preds_autogluon.csv")
(preds_df > 0.5).astype(int).to_csv("preds_autogluon_binaire.csv")
