In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import lightgbm as lgbm
from autogluon.tabular import TabularPredictor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [3]:
# Load the training features, the features to predict for the submission, and
# the raw training targets. Every file is indexed by its ROW_ID column.
X = pd.read_csv('data/X_train.csv', index_col='ROW_ID')
X_test_final = pd.read_csv('data/X_test.csv', index_col='ROW_ID')
y_train = pd.read_csv('data/y_train.csv', index_col='ROW_ID')

# Binarize the outcome (1 where strictly positive, 0 otherwise) so the task is
# treated as classification rather than regression.
y_bin = y_train.gt(0).astype(int)

# Submission template; only its ROW_ID index is needed later on.
sample_submission = pd.read_csv('data/sample_submission.csv', index_col='ROW_ID')

In [4]:
# Names of the raw input columns, grouped by kind.
RET_features = [f'RET_{i}' for i in range(1,20)]
SIGNED_VOLUME_features = [f'SIGNED_VOLUME_{i}' for i in range(1,20)]
TURNOVER_features = ['AVG_DAILY_TURNOVER']


def _add_average_perf_features(frame, ret_columns, windows=(3, 5, 10, 15, 20)):
    """Add average-return features to ``frame`` in place.

    For every window ``w`` two columns are written:

    * ``AVERAGE_PERF_{w}``: row-wise mean of the columns ``ret_columns[:w]``.
    * ``ALLOCATIONS_AVERAGE_PERF_{w}``: mean of ``AVERAGE_PERF_{w}`` over all
      rows sharing the same ``TS`` value, broadcast back onto each row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the ``ret_columns`` and a ``TS`` column; modified in place.
    ret_columns : list of str
        Ordered return column names; the first ``w`` are averaged per window.
    windows : iterable of int
        Window sizes to compute.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    for w in windows:
        frame[f'AVERAGE_PERF_{w}'] = frame[ret_columns[:w]].mean(axis=1)
        frame[f'ALLOCATIONS_AVERAGE_PERF_{w}'] = (
            frame.groupby('TS')[f'AVERAGE_PERF_{w}'].transform('mean')
        )
    return frame


# NOTE(review): RET_features holds 19 names (RET_1..RET_19), so the window of
# 20 averages those same 19 columns. Confirm whether the data has a RET_20
# column that should be included; behaviour is left unchanged here.
# The same features are computed on the training table and on the table used
# for the final submission so that both expose identical columns.
for _frame in (X, X_test_final):
    _add_average_perf_features(_frame, RET_features)


In [5]:
# Full list of model inputs, in order: raw returns, signed volumes, turnover,
# then the per-row averages and the per-TS averages for each window.
features = [
    *RET_features,
    *SIGNED_VOLUME_features,
    *TURNOVER_features,
    *[f'AVERAGE_PERF_{i}' for i in [3,5,10,15,20]],
    *[f'ALLOCATIONS_AVERAGE_PERF_{i}' for i in [3,5,10,15,20]],
]

In [6]:
# Distinct TS values in sorted order.
# NOTE(review): np.sort orders TS by value; if TS is a string label the order
# is lexicographic rather than numeric. Confirm this is the intended order.
unique_dates = np.sort(X['TS'].unique())
n = len(unique_dates)

# First 60 % of the sorted TS values for training, next 20 % for tuning,
# last 20 % for testing.
train_dates = unique_dates[:int(0.6 * n)]
tuning_dates = unique_dates[int(0.6 * n):int(0.8 * n)]
test_dates = unique_dates[int(0.8 * n):]

# Split train/tuning/test by whole TS groups to avoid data leakage: all rows
# of a given TS end up in the same subset.
train_idx = X['TS'].isin(train_dates)
tuning_idx = X['TS'].isin(tuning_dates)
test_idx = X['TS'].isin(test_dates)

# The boolean masks are built from X and also applied to y_bin.
# NOTE(review): this relies on y_bin carrying the same ROW_ID index as X.
# Also note that `y_train` is rebound here from the raw targets to the
# binarized training subset.
X_train, y_train = X.loc[train_idx].copy(), y_bin.loc[train_idx].copy()
X_tune, y_tune = X.loc[tuning_idx].copy(), y_bin.loc[tuning_idx].copy()
X_test, y_test = X.loc[test_idx].copy(), y_bin.loc[test_idx].copy()

In [8]:

# AutoGluon takes features and label in a single table, so attach the
# binarized outcome as a 'target' column to each subset.
# NOTE(review): assumes y_train / y_tune are single-column frames aligned on
# the same index as X_train / X_tune.
train_data = X_train[features].assign(target=y_train)
tuning_data = X_tune[features].assign(target=y_tune)

# Models and logs are written under this directory.
save_path = "Autogluon_TS_grouped"
predictor = TabularPredictor(
    label='target',
    problem_type='binary',
    eval_metric='accuracy',
    path=save_path,
    verbosity=3,
)

# Fit with the tuning block supplied explicitly; the bagging and stacking
# options are passed through to AutoGluon as given.
predictor.fit(
    train_data=train_data,
    tuning_data=tuning_data,
    presets='best_quality',
    time_limit=3600,
    num_bag_folds=7,
    num_stack_levels=4,
    auto_stack=True,
    dynamic_stacking=True,
    use_bag_holdout=True,
)

Verbosity: 3 (Detailed Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:40 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T8132
CPU Count:          10
GPU Count:          1
Memory Avail:       3.07 GB / 16.00 GB (19.2%)
Disk Space Avail:   38.78 GB / 228.27 GB (17.0%)
Presets specified: ['best_quality']
User Specified kwargs:
{'auto_stack': True,
 'num_bag_folds': 7,
 'num_bag_sets': 1,
 'num_stack_levels': 4,
 'use_bag_holdout': True}
Full kwargs:
{'_experimental_dynamic_hyperparameters': False,
 '_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': True,
 'calibrate': 'auto',
 'delay_bag_sets': False,
 'ds_args': {'clean_up_fits': True,
             'detection_time_frac': 0.25,
             'enable_callbacks': False,
             'enable_ray_logging': True,
       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x339f6a1b0>

In [11]:
# Accuracy of the fitted predictor on the held-out TS block.
y_pred = predictor.predict(X_test[features])
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy (grouped by TS): {acc*100:.2f}%")

# Labelled copy of the held-out block, needed by the two reports below.
test_data = X_test[features].assign(target=y_test)

# Per-model scores on the held-out block.
lb = predictor.leaderboard(test_data, silent=True)
print(lb.head(30))

# Feature importance computed on the same held-out block.
fi = predictor.feature_importance(test_data)
print(fi.head(30))

Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/ExtraTreesEntr_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/ExtraTreesGini_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/LightGBM_r131_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/RandomForestEntr_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/WeightedEnsemble_L2/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/LightGBMXT_BAG_L1/model.pkl


Test accuracy (grouped by TS): 51.95%


Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/LightGBM_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/RandomForestGini_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/RandomForestEntr_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/CatBoost_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/ExtraTreesGini_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/ExtraTreesEntr_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/NeuralNetFastAI_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/

                        model  score_test  score_val eval_metric  \
0         WeightedEnsemble_L2    0.519473   0.523382    accuracy   
1       ExtraTreesGini_BAG_L1    0.518559   0.516563    accuracy   
2        LightGBM_r131_BAG_L1    0.516452   0.521358    accuracy   
3     RandomForestEntr_BAG_L1    0.513957   0.516008    accuracy   
4       ExtraTreesEntr_BAG_L1    0.512737   0.513292    accuracy   
5     RandomForestGini_BAG_L1    0.509356   0.512516    accuracy   
6        LightGBMLarge_BAG_L1    0.509161   0.512017    accuracy   
7   NeuralNetTorch_r79_BAG_L1    0.507942   0.508552    accuracy   
8           LightGBMXT_BAG_L1    0.506556   0.514595    accuracy   
9             LightGBM_BAG_L1    0.506500   0.512072    accuracy   
10     NeuralNetFastAI_BAG_L1    0.506085   0.502204    accuracy   
11       CatBoost_r177_BAG_L1    0.505502   0.512821    accuracy   
12      NeuralNetTorch_BAG_L1    0.504948   0.500984    accuracy   
13            CatBoost_BAG_L1    0.503035   0.51

Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/ExtraTreesGini_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/LightGBM_r131_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/RandomForestEntr_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/WeightedEnsemble_L2/model.pkl
	311.57s	= Expected runtime (62.31s per shuffle set)
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/ExtraTreesEntr_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/ExtraTreesGini_BAG_L1/model.pkl
Loading: /Users/olivierf/stanford/QRT-Asset-Allocation-Performance-forecasting/Autogluon_TS_grouped/models/LightGB

                             importance    stddev   p_value  n  p99_high  \
RET_15                          0.00524  0.003772  0.018000  5  0.013007   
AVG_DAILY_TURNOVER              0.00364  0.005268  0.098600  5  0.014486   
RET_1                           0.00360  0.003234  0.033779  5  0.010259   
AVERAGE_PERF_15                 0.00300  0.001667  0.007910  5  0.006433   
RET_2                           0.00292  0.004374  0.104895  5  0.011926   
RET_3                           0.00268  0.002726  0.046419  5  0.008293   
RET_16                          0.00264  0.003061  0.063002  5  0.008942   
RET_14                          0.00244  0.002251  0.036241  5  0.007075   
AVERAGE_PERF_10                 0.00208  0.002496  0.067956  5  0.007220   
ALLOCATIONS_AVERAGE_PERF_5      0.00204  0.004199  0.169183  5  0.010685   
SIGNED_VOLUME_7                 0.00204  0.001757  0.030156  5  0.005658   
RET_13                          0.00188  0.002091  0.057367  5  0.006185   
ALLOCATIONS_

In [None]:
# Predict for the rows of the final test table.
preds = predictor.predict(X_test_final[features])

# Convert to a DataFrame laid out in the submission template's ROW_ID order.
# The column is named explicitly so the result does not depend on the name
# carried by the returned Series.
preds_df = preds.rename('target').to_frame().reindex(sample_submission.index)

# Reindexing fills any ROW_ID that has no prediction with NaN, and the
# threshold below would silently turn those NaN into 0. Fail loudly instead.
_n_missing = int(preds_df['target'].isna().sum())
if _n_missing:
    raise ValueError(
        f"{_n_missing} ROW_ID(s) of sample_submission have no prediction; "
        "check that X_test_final and sample_submission share the same index."
    )

# Export the predictions and a 0/1 thresholded copy.
# NOTE(review): `predict` is used here rather than `predict_proba`; if it
# returns class labels, both files contain the same values. Confirm whether
# the first file was meant to hold probabilities.
preds_df.to_csv("preds_autogluon.csv")
(preds_df > 0.5).astype(int).to_csv("preds_autogluon_binaire.csv")
