In [1]:
import polars as pl
import optuna
import lightgbm as lgb
import numpy as np

from src.util.constants import DATA_PATH, META_MODEL_PERFORMANCE
from src.util.common import load_from_pickle, mean_grouped_spearman_correlation, save_as_pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Column names of the selected features; the list is concatenated with the
# bookkeeping columns below and reused for every frame loaded in this cell.
# NOTE(review): load_from_pickle presumably unpickles the file - only point it at
# files produced by this project.
selected_features = load_from_pickle(DATA_PATH / 'results/selected_features.pkl')
required_columns = ['era', 'target'] + selected_features

# One training frame and one validation frame per fold, indexed by fold number.
df_train_list = []
df_validate_list = []
for fold in range(2):
    # Scan lazily and select *before* collecting so that only the required columns
    # are read from the parquet file, instead of materialising the whole frame
    # and dropping most of it afterwards.
    df_train_list.append(
        pl.scan_parquet(f"{DATA_PATH}/folds/df_train_{fold}.parquet")
        .select(required_columns)
        .collect()
    )
    df_validate_list.append(
        pl.scan_parquet(f"{DATA_PATH}/folds/df_validate_{fold}.parquet")
        .select(required_columns)
        .collect()
    )


# Frame that additionally carries the 'numerai_meta_model' column, used to measure
# how strongly a model's predictions correlate with the meta model.
df_meta_model = (
    pl.scan_parquet(f'{DATA_PATH}/folds/df_meta_model.parquet')
    .select(required_columns + ['numerai_meta_model'])
    .collect()
)

In [3]:
def performance_approximation(model: lgb.Booster, fold: int) -> tuple[float, float, float, float]:
    """Score a trained booster on the validation data of one fold.

    Reads the notebook-level ``df_validate_list``, ``df_meta_model`` and
    ``selected_features``.

    Args:
        model: Trained LightGBM booster used to predict from ``selected_features``.
        fold: Index into ``df_validate_list`` and ``META_MODEL_PERFORMANCE``.

    Returns:
        A tuple ``(performance, corr, corr_w_mm, mmc_approximation)`` where

        * ``corr`` is the result of ``mean_grouped_spearman_correlation`` for the
          validation predictions, the target and the era column,
        * ``corr_w_mm`` is the mean over eras of the Spearman correlation between
          the predictions on ``df_meta_model`` and its ``numerai_meta_model`` column,
        * ``mmc_approximation = corr - corr_w_mm * META_MODEL_PERFORMANCE[fold]``,
        * ``performance = 0.5 * corr + 2 * mmc_approximation``.
    """
    validation = df_validate_list[fold]
    validation_predictions = pl.Series(model.predict(validation[selected_features].to_numpy()))
    corr = mean_grouped_spearman_correlation(
        validation_predictions,
        validation['target'],
        validation['era'],
    )

    meta_model_predictions = pl.Series(model.predict(df_meta_model[selected_features].to_numpy()))
    # 'explode' yields one correlation value per era rather than one per row,
    # so the mean below weights every era equally.
    per_era_corr_with_mm = pl.DataFrame({
        'prediction': meta_model_predictions,
        'numerai_meta_model': df_meta_model['numerai_meta_model'],
        'era': df_meta_model['era'],
    }).select(
        pl.corr("prediction", "numerai_meta_model", method="spearman")
        .over('era', mapping_strategy='explode')
    )
    corr_w_mm = per_era_corr_with_mm.mean().item()

    mmc_approximation = corr - corr_w_mm * META_MODEL_PERFORMANCE[fold]
    performance = .5 * corr + 2 * mmc_approximation
    return performance, corr, corr_w_mm, mmc_approximation

In [4]:
# LightGBM parameters that are held constant across all trials; the tuned
# parameters are merged on top of these inside the objective.
fixed_parameters = dict(
    objective='regression',
    # the string 'None' (not the Python None) is passed on purpose; model quality
    # is measured by the notebook's own performance function instead
    metric='None',
    # current number of cores on my Mac - set this to hardware cores, not virtual threads
    n_jobs=12,
    subsample_freq=1,
    verbose=-1,
)

In [5]:
def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: train one LightGBM model per fold and return the mean performance.

    Samples a hyper-parameter set from ``trial``, trains a booster on every fold in
    ``df_train_list`` and scores it with ``performance_approximation``. The per-fold
    scores are stored as user attributes on the trial and printed.

    Args:
        trial: The Optuna trial that supplies the hyper-parameter suggestions and
            receives the intermediate reports.

    Returns:
        The mean of the per-fold ``performance`` values as a built-in ``float``.

    Raises:
        optuna.exceptions.TrialPruned: After the first fold, when the trial number
            is greater than 10 and the study's pruner asks to stop the trial.
    """
    # we use these exponents to create a log scale that includes zero
    min_sum_hessian_in_leaf_exponent = trial.suggest_int("min_sum_hessian_in_leaf_exponent", 0, 10)
    lambda_l1_exponent = trial.suggest_int("lambda_l1_exponent", 0, 8)
    lambda_l2_exponent = trial.suggest_int("lambda_l2_exponent", 0, 12)

    # The suggest calls are kept in their original order so that the sequence of
    # sampled parameters is unchanged.
    learning_rate = trial.suggest_float("learning_rate", .001, .2, log=True)
    # The number of boosting rounds is handed to lgb.train() explicitly and is
    # therefore kept out of the params dict (previously it was passed twice).
    num_boost_round = trial.suggest_int("num_boost_round", 10, 5000)
    parameter_sample = {
        "learning_rate": learning_rate,
        'max_bin': trial.suggest_int('max_bin', 2 ** 3 - 1, 2 ** 10 - 1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2 ** 2 - 1, 2 ** 12 - 1, log=True),
        "bagging_fraction": trial.suggest_float("bagging_fraction", .1, 1.0),
        "feature_fraction": trial.suggest_float("feature_fraction", .1, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 500000, log=True),
        # exponent 0 maps to exactly 0, larger exponents to (10 ** e - 1) / 10 ** 5
        "min_sum_hessian_in_leaf": (10 ** min_sum_hessian_in_leaf_exponent - 1) / 10 ** 5,
        "lambda_l1": (10 ** lambda_l1_exponent - 1) / 10 ** 5,
        "lambda_l2": (10 ** lambda_l2_exponent - 1) / 10 ** 5
    }
    # use default for max_depth = -1 (tree complexity is regularised via num_leaves)

    parameters = {
        **fixed_parameters,
        **parameter_sample
    }

    result_performance = []

    # Follow the number of folds that were actually loaded instead of hard-coding it.
    for fold in range(len(df_train_list)):
        lgb_train = lgb.Dataset(
            df_train_list[fold][selected_features].to_numpy(),
            label=df_train_list[fold]['target'].to_numpy()
        )

        model = lgb.train(
            params=parameters,
            train_set=lgb_train,
            num_boost_round=num_boost_round
        )

        performance, corr, corr_w_mm, mmc_approximation = performance_approximation(model, fold)
        trial.set_user_attr(f"performance_{fold}", performance)
        trial.set_user_attr(f"correlation_{fold}", corr)
        trial.set_user_attr(f"correlation_with_meta_model_{fold}", corr_w_mm)
        trial.set_user_attr(f"mmc_approximation_{fold}", mmc_approximation)
        print(f'Results for fold {fold}: Performance {performance:.5f}, correlation: {corr:.5f}, correlation with meta model: {corr_w_mm:.5f}, MMC (approx.): {mmc_approximation:.5f}')

        # stop trial if it does not show promise (by default, median pruner is used)
        trial.report(performance, step=fold)
        # Pruning is only considered after the first fold and once more than ten
        # trials have been started.
        if fold == 0 and trial.number > 10 and trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        result_performance.append(performance)

    # np.mean returns a NumPy scalar; convert so the value matches the annotation.
    return float(np.mean(result_performance))

In [9]:
# Seed the sampler so that the sequence of suggested hyper-parameters can be
# reproduced when the trials are run sequentially. TPESampler is the sampler
# Optuna falls back to for single-objective studies when none is given, so only
# the seeding changes.
SAMPLER_SEED = 42
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)

[I 2025-06-30 12:09:11,666] A new study created in memory with name: no-name-b5c8d4ea-2b18-4a6e-a2bd-bb91662e3f42


In [7]:
# Run five trials of `objective` on `study`; trials already stored in the study
# are kept, so re-running this cell adds another five.
study.optimize(
    objective,
    n_trials=5,
    show_progress_bar=True,
)

  0%|          | 0/5 [00:00<?, ?it/s]

Results for fold 0: Performance 0.03452, correlation: 0.04147, correlation with meta model: 0.67810, MMC (approx.): 0.00689


Best trial: 73. Best value: 0.0355224:  20%|██        | 1/5 [29:37<1:58:31, 1777.92s/it]

Results for fold 1: Performance 0.02915, correlation: 0.03906, correlation with meta model: 0.71350, MMC (approx.): 0.00481
[I 2025-07-01 21:14:00,054] Trial 95 finished with value: 0.03183312609901347 and parameters: {'min_sum_hessian_in_leaf_exponent': 6, 'lambda_l1_exponent': 4, 'lambda_l2_exponent': 9, 'learning_rate': 0.014542122914774934, 'num_boost_round': 4645, 'max_bin': 139, 'num_leaves': 58, 'bagging_fraction': 0.5375919162365881, 'feature_fraction': 0.3999116676657809, 'min_data_in_leaf': 12}. Best is trial 73 with value: 0.0355223881469549.
Results for fold 0: Performance 0.03561, correlation: 0.04170, correlation with meta model: 0.67287, MMC (approx.): 0.00738


Best trial: 73. Best value: 0.0355224:  40%|████      | 2/5 [1:02:52<1:35:16, 1905.36s/it]

Results for fold 1: Performance 0.03089, correlation: 0.03960, correlation with meta model: 0.70948, MMC (approx.): 0.00555
[I 2025-07-01 21:47:14,631] Trial 96 finished with value: 0.03325100189668005 and parameters: {'min_sum_hessian_in_leaf_exponent': 7, 'lambda_l1_exponent': 4, 'lambda_l2_exponent': 9, 'learning_rate': 0.014550666952116275, 'num_boost_round': 4975, 'max_bin': 137, 'num_leaves': 54, 'bagging_fraction': 0.44022011175217773, 'feature_fraction': 0.47678198593151977, 'min_data_in_leaf': 22}. Best is trial 73 with value: 0.0355223881469549.
Results for fold 0: Performance 0.03366, correlation: 0.04184, correlation with meta model: 0.69552, MMC (approx.): 0.00637


Best trial: 73. Best value: 0.0355224:  60%|██████    | 3/5 [1:42:58<1:11:07, 2133.99s/it]

Results for fold 1: Performance 0.03250, correlation: 0.04082, correlation with meta model: 0.72448, MMC (approx.): 0.00604
[I 2025-07-01 22:27:20,684] Trial 97 finished with value: 0.03307967579391572 and parameters: {'min_sum_hessian_in_leaf_exponent': 7, 'lambda_l1_exponent': 4, 'lambda_l2_exponent': 10, 'learning_rate': 0.021789711636117003, 'num_boost_round': 4863, 'max_bin': 195, 'num_leaves': 111, 'bagging_fraction': 0.42935449300010037, 'feature_fraction': 0.4769195945621156, 'min_data_in_leaf': 21}. Best is trial 73 with value: 0.0355223881469549.
Results for fold 0: Performance 0.03182, correlation: 0.04145, correlation with meta model: 0.70385, MMC (approx.): 0.00555


Best trial: 73. Best value: 0.0355224:  80%|████████  | 4/5 [2:21:29<36:43, 2203.71s/it]  

Results for fold 1: Performance 0.03081, correlation: 0.04061, correlation with meta model: 0.73671, MMC (approx.): 0.00525
[I 2025-07-01 23:05:51,277] Trial 98 finished with value: 0.031314289195949224 and parameters: {'min_sum_hessian_in_leaf_exponent': 7, 'lambda_l1_exponent': 4, 'lambda_l2_exponent': 10, 'learning_rate': 0.01495086312240265, 'num_boost_round': 4880, 'max_bin': 192, 'num_leaves': 106, 'bagging_fraction': 0.3980783396037334, 'feature_fraction': 0.47159908024663494, 'min_data_in_leaf': 22}. Best is trial 73 with value: 0.0355223881469549.


Best trial: 73. Best value: 0.0355224: 100%|██████████| 5/5 [2:34:50<00:00, 1858.12s/it]

Results for fold 0: Performance 0.02821, correlation: 0.03980, correlation with meta model: 0.69902, MMC (approx.): 0.00415
[I 2025-07-01 23:19:12,722] Trial 99 pruned. 





In [15]:
# visualise result: one slice plot for every hyper-parameter of the best trial
for param_name in study.best_params:
    slice_figure = optuna.visualization.plot_slice(study, params=[param_name])
    slice_figure.show()

In [8]:
# Persist the study object (all trials recorded so far) next to the other results.
save_as_pickle(
    study,
    DATA_PATH / 'results/study.pkl',
)