In [1]:
import polars as pl
import lightgbm as lgb

from datetime import datetime

from src.util.constants import DATA_PATH
from src.util.common import mean_grouped_spearman_correlation

In [2]:
# Load the meta-model evaluation frame and, in the same chain, discard every column whose
# name contains "target_" together with the "data_type" and "id" columns.
df_meta_model = (
    pl.read_parquet(f"{DATA_PATH}/folds/df_meta_model.parquet")
    .pipe(
        lambda frame: frame.drop(
            [name for name in frame.columns if "target_" in name] + ["data_type", 'id']
        )
    )
)

In [3]:
# LightGBM settings shared by every configuration of the grid search.
fixed_parameters = dict(
    objective='regression',
    metric='None',
    # current number of cores on my Mac - set this to hardware cores, not virtual threads
    n_jobs=12,
    # NOTE(review): no subsample / bagging_fraction is set alongside this; per the LightGBM
    # parameter docs the frequency alone should not enable bagging - confirm intent.
    subsample_freq=1,
    verbose=-1,
)

# Search space: number of boosting rounds, and tree sizes of the form 4**k - 1
# (15, 63, 255, 1023, 4095).
num_boost_round_space = [10, 50, 200]
num_leaves_space = [4**exponent - 1 for exponent in range(2, 7)]

In [4]:
# Grid search over `num_boost_round_space` x `num_leaves_space`.
#
# For every combination, one LightGBM model is trained per fold and scored on that fold's
# validation frame. The model of the last fold is additionally scored on `df_meta_model`:
#   - corr_mm_set: mean_grouped_spearman_correlation of its predictions against `target`
#   - corr_w_mm:   mean over eras of the Spearman correlation between its predictions and
#                  `numerai_meta_model`
#
# Outputs (both refreshed after every combination, so partial results survive an interrupt):
#   - df_result:      one row per combination holding only the five correlation columns
#   - df_grid_result: the same rows plus the `num_boost_round` / `num_leaves` that produced
#                     them, so each row can be traced back to its configuration
#
# NOTE(review): the fold parquet files are re-read for every combination. Caching them would
# save I/O but keeps all folds in memory at once - revisit if memory allows.
n_folds = 3  # number of df_train_{fold} / df_validate_{fold} parquet pairs read below
result_columns = [f'corr_{fold}' for fold in range(n_folds)] + ['corr_mm_set', 'corr_w_mm']

df_grid_result = pl.DataFrame()
df_result = pl.DataFrame()

for num_boost_round in num_boost_round_space:
    for num_leaves in num_leaves_space:

        # `num_boost_round` is an alias of LightGBM's `num_iterations`; it is handed to
        # `lgb.train` only as the keyword argument instead of being duplicated in `params`.
        train_params = {**fixed_parameters, 'num_leaves': num_leaves}
        # Full configuration of this combination, kept under the original name.
        parameters = {**train_params, 'num_boost_round': num_boost_round}

        corr = []
        for fold in range(n_folds):
            df_train = pl.read_parquet(f'{DATA_PATH}/folds/df_train_{fold}.parquet')
            df_validate = pl.read_parquet(f'{DATA_PATH}/folds/df_validate_{fold}.parquet')
            feature_names = [x for x in df_train.columns if 'feature' in x]

            lgb_train = lgb.Dataset(df_train[feature_names].to_numpy(), label=df_train['target'].to_numpy())

            model = lgb.train(
                params=train_params,
                train_set=lgb_train,
                num_boost_round=num_boost_round
            )

            corr.append(mean_grouped_spearman_correlation(
                pl.Series(model.predict(df_validate[feature_names].to_numpy())),
                df_validate['target'],
                df_validate['era']
            ))

            print(f'{datetime.now().strftime("%H:%M:%S")} . . . Training and evaluating model for num_leaves={num_leaves}, num_boost_round={num_boost_round}, and fold {fold} done.')

        # After the fold loop `model` and `feature_names` belong to the last fold; that model
        # is the one compared against the Numerai meta model.
        df_meta_model = df_meta_model.with_columns(
            prediction=pl.Series(model.predict(df_meta_model[feature_names].to_numpy()))
        )
        corr_mm_set = mean_grouped_spearman_correlation(
            df_meta_model['prediction'].to_numpy(),
            df_meta_model['target'],
            df_meta_model['era'].to_numpy()
        )
        corr_w_mm = df_meta_model.select(
            pl.corr("prediction", "numerai_meta_model", method="spearman").over('era', mapping_strategy='explode')
        ).mean()[0, 0]

        df_grid_result = df_grid_result.vstack(
            pl.DataFrame({
                **{f'corr_{fold}': fold_corr for fold, fold_corr in enumerate(corr)},
                'corr_mm_set': corr_mm_set,
                'corr_w_mm': corr_w_mm,
                'num_boost_round': num_boost_round,
                'num_leaves': num_leaves,
            })
        )
        # Downstream cells index `df_result` by position, so it keeps exactly the five
        # correlation columns in their original order.
        df_result = df_grid_result.select(result_columns)

22:39:25 . . . Training and evaluating model for num_leaves=15, num_boost_round=10, and fold 0 done.
22:42:17 . . . Training and evaluating model for num_leaves=15, num_boost_round=10, and fold 1 done.
22:47:06 . . . Training and evaluating model for num_leaves=15, num_boost_round=10, and fold 2 done.
shape: (1, 5)
┌──────────┬──────────┬──────────┬─────────────┬───────────┐
│ corr_0   ┆ corr_1   ┆ corr_2   ┆ corr_mm_set ┆ corr_w_mm │
│ ---      ┆ ---      ┆ ---      ┆ ---         ┆ ---       │
│ f64      ┆ f64      ┆ f64      ┆ f64         ┆ f64       │
╞══════════╪══════════╪══════════╪═════════════╪═══════════╡
│ 0.013796 ┆ 0.022798 ┆ 0.009786 ┆ 0.010884    ┆ 0.420501  │
└──────────┴──────────┴──────────┴─────────────┴───────────┘
22:48:57 . . . Training and evaluating model for num_leaves=63, num_boost_round=10, and fold 0 done.
22:51:39 . . . Training and evaluating model for num_leaves=63, num_boost_round=10, and fold 1 done.
22:56:26 . . . Training and evaluating model for num_l

In [5]:
# Column-wise median of the grid-search results (one input row per hyperparameter combination);
# shown as the cell output.
df_result.median()

corr_0,corr_1,corr_2,corr_mm_set,corr_w_mm
f64,f64,f64,f64,f64
0.016831,0.021903,0.011901,0.016311,0.420501


In [6]:
# Score of the Numerai meta model itself on the meta-model frame, computed with the same
# helper that scores the grid-searched models.
meta_model_corr = mean_grouped_spearman_correlation(
    df_meta_model['numerai_meta_model'],
    df_meta_model['target'],
    df_meta_model['era'],
)
# Median score of the grid-searched models on the same frame.
median_grid_corr = df_result['corr_mm_set'].median()

# How many times better the meta model scores than the median grid-searched model.
performance_ratio_median = meta_model_corr / median_grid_corr
print(performance_ratio_median)

2.210682073090773


In [7]:
# Median per-fold validation correlations (the first three result columns), scaled by the
# meta-model performance ratio.
fold_corr_medians = df_result.median()[0, 0:3]
print(fold_corr_medians * performance_ratio_median)

shape: (1, 3)
┌──────────┬─────────┬─────────┐
│ corr_0   ┆ corr_1  ┆ corr_2  │
│ ---      ┆ ---     ┆ ---     │
│ f64      ┆ f64     ┆ f64     │
╞══════════╪═════════╪═════════╡
│ 0.037209 ┆ 0.04842 ┆ 0.02631 │
└──────────┴─────────┴─────────┘
