In [1]:
import polars as pl
import lightgbm as lgb
import numpy as np

from datetime import datetime

from src.util.constants import DATA_PATH, META_MODEL_PERFORMANCE
from src.util.common import load_from_pickle, mean_grouped_spearman_correlation
from util.common import save_as_pickle

In [2]:
# Feature names chosen in an earlier step; every frame below is trimmed to these
# plus the era / target columns to keep memory usage down.
selected_features = load_from_pickle(DATA_PATH / 'results/selected_features.pkl')
required_columns = ['era', 'target'] + selected_features

# One validation frame per fold (folds 0, 1 and 2), restricted to the required columns.
df_validate_list = [
    pl.read_parquet(f"{DATA_PATH}/folds/df_validate_{fold}.parquet").select(required_columns)
    for fold in range(3)
]

# The meta-model frame additionally keeps the meta model's own predictions.
df_meta_model = pl.read_parquet(f'{DATA_PATH}/folds/df_meta_model.parquet').select(
    required_columns + ['numerai_meta_model']
)

In [3]:
def performance_approximation(df_validate_with_prediction: pl.DataFrame, df_meta_model_with_prediction: pl.DataFrame, fold: int) -> tuple[float, float, float, float]:
    """Approximate the payout-style performance of a set of predictions.

    Args:
        df_validate_with_prediction: frame with 'prediction', 'target' and 'era' columns.
        df_meta_model_with_prediction: frame with 'prediction', 'numerai_meta_model'
            and 'era' columns.
        fold: key used to look up the meta model's performance in META_MODEL_PERFORMANCE.

    Returns:
        A tuple ``(performance, corr, corr_w_mm, mmc_approximation)`` where
        ``mmc_approximation = corr - corr_w_mm * META_MODEL_PERFORMANCE[fold]`` and
        ``performance = 0.5 * corr + 2 * mmc_approximation``.
    """
    # Presumably the per-era Spearman correlation with the target, averaged over eras
    # (helper is defined outside this notebook) — TODO confirm.
    corr = mean_grouped_spearman_correlation(
        df_validate_with_prediction['prediction'],
        df_validate_with_prediction['target'],
        df_validate_with_prediction['era']
    )

    # Spearman correlation with the meta model, evaluated per 'era' window and then averaged.
    corr_w_mm_per_era = df_meta_model_with_prediction.select(
        pl.corr("prediction", "numerai_meta_model", method="spearman")
        .over('era', mapping_strategy='explode')
    )
    corr_w_mm = corr_w_mm_per_era.mean().item()

    # Subtract the part of the correlation that is attributed to the meta model.
    mmc_approximation = corr - corr_w_mm * META_MODEL_PERFORMANCE[fold]

    return .5 * corr + 2 * mmc_approximation, corr, corr_w_mm, mmc_approximation

In [4]:
def get_linear_component(df: pl.DataFrame) -> pl.Series:
    """Return the ordinary-least-squares fit of 'prediction' on the selected features.

    The design matrix is the `selected_features` columns of `df` (module-level
    list) with an intercept column of ones prepended. The returned series holds
    the fitted values, i.e. the part of the prediction that is linear in the
    features, in the same row order as `df`.

    Args:
        df: frame containing every column in `selected_features` and 'prediction'.

    Returns:
        A polars Series with one fitted value per row of `df`.
    """
    features = df[selected_features].to_numpy()
    y = df['prediction'].to_numpy()
    X = np.hstack([np.ones((features.shape[0], 1)), features])

    # Least squares via lstsq instead of solving the normal equations
    # (X'X) b = X'y: forming X'X squares the condition number and np.linalg.solve
    # raises LinAlgError when features are collinear. lstsq returns the
    # minimum-norm solution in that case, and the fitted values X @ b are still
    # the projection of y onto the column space of X.
    beta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)

    return pl.Series(X @ beta_hat)

In [5]:
# One (initially empty) frame per fold; a 'model_<index>' column is added to each
# as the individual models are evaluated.
df_prediction_list_validate = [pl.DataFrame() for _ in range(3)]
df_prediction_list_meta_model = [pl.DataFrame() for _ in range(3)]

# maintain_order=True keeps the eras in order of first appearance. Without it
# polars gives no ordering guarantee, and the per-era results that are later
# concatenated in this order and attached back to the meta-model frame row by row
# could be assigned to the wrong rows.
eras_meta_model = df_meta_model['era'].unique(maintain_order=True).to_list()

In [6]:
# forecast with each model for each fold

# Min-max scaling of the raw predictions within each era.
# NOTE(review): an era whose predictions are all identical yields 0/0 here — confirm
# this cannot happen for the trained models.
prediction_range = pl.col('prediction').max() - pl.col('prediction').min()
prediction_normalised = ((pl.col('prediction') - pl.col('prediction').min()) / prediction_range).over('era')

# The meta-model feature matrix is the same for every model and fold, so it is
# converted to numpy once instead of once per iteration.
features_meta_model = df_meta_model[selected_features].to_numpy()

for index in range(20):
    for fold in range(3):
        model = lgb.Booster(model_file=f"{DATA_PATH}/models/lgb/lgb_model_{index}_{fold}.txt")
        features_validate = df_validate_list[fold][selected_features].to_numpy()

        # Raw predictions first, then the per-era normalised version.
        df_validate_with_prediction = df_validate_list[fold].with_columns(
            prediction=pl.Series(model.predict(features_validate))
        ).with_columns(
            prediction_normalised=prediction_normalised
        )
        df_meta_model_with_prediction = df_meta_model.with_columns(
            prediction=pl.Series(model.predict(features_meta_model))
        ).with_columns(
            prediction_normalised=prediction_normalised
        )

        # Store the normalised predictions as column 'model_<index>' of the fold's frame.
        df_prediction_list_validate[fold] = df_prediction_list_validate[fold].with_columns(
            df_validate_with_prediction['prediction_normalised'].alias(f'model_{index}')
        )
        df_prediction_list_meta_model[fold] = df_prediction_list_meta_model[fold].with_columns(
            df_meta_model_with_prediction['prediction_normalised'].alias(f'model_{index}')
        )

        print(f'{datetime.now().strftime("%H:%M:%S")} . . . Prediction for model {index}, fold {fold} done.')

11:08:24 . . . Prediction for model 0, fold 0 done.
11:08:51 . . . Prediction for model 0, fold 1 done.
11:09:17 . . . Prediction for model 0, fold 2 done.
11:10:03 . . . Prediction for model 1, fold 0 done.
11:10:47 . . . Prediction for model 1, fold 1 done.
11:11:33 . . . Prediction for model 1, fold 2 done.
11:12:24 . . . Prediction for model 2, fold 0 done.
11:13:15 . . . Prediction for model 2, fold 1 done.
11:14:03 . . . Prediction for model 2, fold 2 done.
11:15:02 . . . Prediction for model 3, fold 0 done.
11:15:59 . . . Prediction for model 3, fold 1 done.
11:16:56 . . . Prediction for model 3, fold 2 done.
11:18:19 . . . Prediction for model 4, fold 0 done.
11:19:43 . . . Prediction for model 4, fold 1 done.
11:21:06 . . . Prediction for model 4, fold 2 done.
11:21:59 . . . Prediction for model 5, fold 0 done.
11:22:51 . . . Prediction for model 5, fold 1 done.
11:23:43 . . . Prediction for model 5, fold 2 done.
11:24:24 . . . Prediction for model 6, fold 0 done.
11:25:04 . .

In [7]:
# Models picked so far, in selection order (a model may be picked more than once).
ensembled_models = []

# Per-fold frames holding one prediction column per selected ensemble member.
df_prediction_ensemble_list_validate = [pl.DataFrame() for _ in range(3)]
df_prediction_ensemble_list_meta_model = [pl.DataFrame() for _ in range(3)]

# Early-stopping bookkeeping for the greedy search: the search ends once
# `stop_counter` reaches `stopping_condition`.
stopping_condition = 3
stop_counter = 0
best_score_2 = -1

# Iteration counter, used to give every added ensemble column a unique name.
i = 0

In [8]:
# Greedy forward selection: in every round, add the model that maximises the mean
# approximate performance on folds 0 and 1; fold 2 is only used for the stopping
# rule. The loop ends after `stopping_condition` consecutive rounds without a new
# best fold-2 score.
while stop_counter < stopping_condition:
    # Start below every attainable score so that the best candidate is always
    # selected, even when all candidates score <= 0.
    best_score_0_and_1 = float('-inf')
    add = -1

    for index in range(20):
        performance_list = []
        # Candidates are scored on folds 0 and 1 only, so only those two
        # candidate ensembles need to be built.
        for fold in range(2):
            candidate_validate = df_prediction_ensemble_list_validate[fold].with_columns(
                model_under_test=df_prediction_list_validate[fold][f'model_{index}']
            )
            candidate_meta_model = df_prediction_ensemble_list_meta_model[fold].with_columns(
                model_under_test=df_prediction_list_meta_model[fold][f'model_{index}']
            )

            # The ensemble prediction is the plain row-wise mean of its members.
            df_validate_with_prediction = df_validate_list[fold].with_columns(
                prediction=candidate_validate.mean_horizontal()
            )
            df_meta_model_with_prediction = df_meta_model.with_columns(
                prediction=candidate_meta_model.mean_horizontal()
            )
            performance, _, _, _ = performance_approximation(
                df_validate_with_prediction,
                df_meta_model_with_prediction,
                fold
            )
            performance_list.append(performance)
        score_0_and_1 = np.mean(performance_list)

        if score_0_and_1 > best_score_0_and_1:
            best_score_0_and_1 = score_0_and_1
            add = index

    if add == -1:
        # Only reachable when no candidate produced a comparable score (e.g. all NaN).
        raise RuntimeError('No candidate model produced a valid score on folds 0 and 1.')

    # Commit the winner to the ensemble of every fold; the `_i` suffix keeps the
    # column name unique when the same model is selected again.
    ensembled_models.append(add)
    for fold in range(3):
        df_prediction_ensemble_list_validate[fold] = df_prediction_ensemble_list_validate[fold].with_columns(
            df_prediction_list_validate[fold][f'model_{add}'].alias(f'model_{add}_{i}')
        )
        df_prediction_ensemble_list_meta_model[fold] = df_prediction_ensemble_list_meta_model[fold].with_columns(
            df_prediction_list_meta_model[fold][f'model_{add}'].alias(f'model_{add}_{i}')
        )

    # Score the updated ensemble on the held-out fold 2.
    df_validate_with_prediction = df_validate_list[2].with_columns(
        prediction=df_prediction_ensemble_list_validate[2].mean_horizontal()
    )
    df_meta_model_with_prediction = df_meta_model.with_columns(
        prediction=df_prediction_ensemble_list_meta_model[2].mean_horizontal()
    )
    score_2, corr_2, corr_w_mm_2, mmc_approximation_2 = performance_approximation(
        df_validate_with_prediction,
        df_meta_model_with_prediction,
        2
    )

    # Reset the patience counter on a new best fold-2 score, otherwise count the miss.
    if score_2 > best_score_2:
        best_score_2 = score_2
        stop_counter = 0
    else:
        stop_counter = stop_counter + 1

    print(
        f'{datetime.now().strftime("%H:%M:%S")} . . . Added model {add}. Score on val 0 & 1: {best_score_0_and_1}. Score on val 2: {score_2:.5f}, correlation: {corr_2:.5f}, correlation with meta model: {corr_w_mm_2:.5f}, MMC (approx.): {mmc_approximation_2:.5f}.')
    i += 1

12:07:54 . . . Model 0 score on val 0 & 1: 0.0355223881469549
12:07:55 . . . Model 1 score on val 0 & 1: 0.03392802885905142
12:07:55 . . . Model 2 score on val 0 & 1: 0.03389716407158171
12:07:55 . . . Model 3 score on val 0 & 1: 0.03325100189668005
12:07:56 . . . Model 4 score on val 0 & 1: 0.03307967579391572
12:07:56 . . . Model 5 score on val 0 & 1: 0.03290794630283056
12:07:56 . . . Model 6 score on val 0 & 1: 0.03275800328965682
12:07:57 . . . Model 7 score on val 0 & 1: 0.03267116247147015
12:07:57 . . . Model 8 score on val 0 & 1: 0.03257229002834622
12:07:57 . . . Model 9 score on val 0 & 1: 0.03235837688451197
12:07:58 . . . Model 10 score on val 0 & 1: 0.03212908761224006
12:07:58 . . . Model 11 score on val 0 & 1: 0.03183312609901347
12:07:58 . . . Model 12 score on val 0 & 1: 0.0315422281991812
12:07:59 . . . Model 13 score on val 0 & 1: 0.03150828320349001
12:07:59 . . . Model 14 score on val 0 & 1: 0.03142385490716504
12:07:59 . . . Model 15 score on val 0 & 1: 0.031314

In [51]:
# Drop the last `stopping_condition` entries: the selection loop only stops after
# that many consecutive additions failed to improve the best fold-2 score, so
# those trailing models are not part of the best ensemble.
# NOTE(review): this cell is not idempotent — every re-run removes another
# `stopping_condition` models. Re-run the selection loop before re-running it.
ensembled_models = ensembled_models[:-stopping_condition]

[0, 0, 1]


In [44]:
# Start from empty frames; one prediction column per selected model is added below.
df_prediction_ensemble_validate, df_prediction_ensemble_meta_model = pl.DataFrame(), pl.DataFrame()

In [45]:
# Collect the fold-2 predictions of every selected model; the position suffix keeps
# column names unique when a model was selected more than once.
for i, index in enumerate(ensembled_models):
    ensemble_column = f'model_{index}_{i}'
    df_prediction_ensemble_validate = df_prediction_ensemble_validate.with_columns(
        df_prediction_list_validate[2][f'model_{index}'].alias(ensemble_column)
    )
    df_prediction_ensemble_meta_model = df_prediction_ensemble_meta_model.with_columns(
        df_prediction_list_meta_model[2][f'model_{index}'].alias(ensemble_column)
    )

# The ensemble prediction is the row-wise mean over all selected models.
ensemble_prediction_validate = df_prediction_ensemble_validate.mean_horizontal()
ensemble_prediction_meta_model = df_prediction_ensemble_meta_model.mean_horizontal()

df_validate_with_prediction = df_validate_list[2].with_columns(
    prediction=ensemble_prediction_validate
)
df_meta_model_with_prediction = df_meta_model.with_columns(
    prediction=ensemble_prediction_meta_model
)

In [46]:
def _linear_component_by_era(df: pl.DataFrame, eras: list) -> pl.Series:
    """Compute the linear component of 'prediction' separately for each era.

    Each era's fitted values are written back to the row positions that belong to
    that era, so the result lines up with `df` row by row regardless of the order
    of `eras` and regardless of whether an era's rows are contiguous in `df`.
    Rows whose era is not listed in `eras` are left as NaN.
    """
    regression_columns = selected_features + ["prediction"]
    linear_component = np.full(df.height, np.nan)
    for era in eras:
        # filter() keeps the original row order, so the filtered rows correspond
        # one-to-one to the True positions of the mask.
        era_mask = (df['era'] == era).to_numpy()
        df_era = df.filter(pl.col('era') == era)[regression_columns]
        linear_component[era_mask] = get_linear_component(df_era).to_numpy()
    return pl.Series(linear_component)


# computing the linear component vectorised via Polars structs leads to extremely slow performance

eras_val = df_validate_with_prediction['era'].unique(maintain_order=True).to_list()
linear_component_validate = _linear_component_by_era(df_validate_with_prediction, eras_val)
linear_component_meta_model = _linear_component_by_era(df_meta_model_with_prediction, eras_meta_model)

df_validate_with_prediction = df_validate_with_prediction.with_columns(
    linear_component=linear_component_validate
)
df_meta_model_with_prediction = df_meta_model_with_prediction.with_columns(
    linear_component=linear_component_meta_model
)

In [47]:
# Baseline: score the plain ensemble on fold 2, without any linear-component adjustment.
performance, corr, corr_w_mm, mmc_approximation = performance_approximation(
    df_validate_with_prediction,
    df_meta_model_with_prediction,
    2
)

# First row of the result table; multiplier 0.0 marks the unadjusted prediction.
df_result = pl.DataFrame(dict(
    multiplier=0.0,
    performance=performance,
    corr=corr,
    corr_w_mm=corr_w_mm,
    mmc_approximation=mmc_approximation
))

In [48]:
# Sweep over multipliers: add `multiplier * linear_component` to the ensemble
# prediction (negative values subtract part of the linear component) and score the
# adjusted prediction on fold 2. Each result is appended to `df_result`.
for multiplier in [-.4, -.5, -.6, -.7, -.8, -.9, -1.0]:
    adjusted_prediction = pl.col('prediction') + pl.col('linear_component') * multiplier

    df_validate_with_prediction_copy = df_validate_with_prediction.with_columns(
        prediction=adjusted_prediction
    )
    df_meta_model_with_prediction_copy = df_meta_model_with_prediction.with_columns(
        prediction=adjusted_prediction
    )

    performance, corr, corr_w_mm, mmc_approximation = performance_approximation(
        df_validate_with_prediction_copy,
        df_meta_model_with_prediction_copy,
        2
    )

    result_row = pl.DataFrame({
        'multiplier': multiplier,
        'performance': performance,
        'corr': corr,
        'corr_w_mm': corr_w_mm,
        'mmc_approximation': mmc_approximation
    })
    df_result = df_result.vstack(result_row)

In [49]:
# Display the sweep results: one row per tested multiplier (0.0 is the unadjusted ensemble).
df_result

multiplier,performance,corr,corr_w_mm,mmc_approximation
f64,f64,f64,f64,f64
0.0,0.01152,0.024823,0.721981,-0.000446
-0.4,0.014189,0.024545,0.67392,0.000958
-0.5,0.01498,0.02424,0.651723,0.00143
-0.6,0.015776,0.023803,0.624744,0.001937
-0.7,0.0166,0.023243,0.592946,0.002489
-0.8,0.01737,0.022532,0.55657,0.003052
-0.9,0.018006,0.021657,0.516223,0.003589
-1.0,0.018596,0.020675,0.472733,0.004129


I will use $-0.75$ as the multiplier to balance the results across the three folds.

In [None]:
# save parameters

study = load_from_pickle(DATA_PATH / 'results/study.pkl')

# De-duplicate the selected model indices. sorted() is used instead of list(set(...))
# because set iteration order is not guaranteed, and the order of the saved
# parameter list should be reproducible.
ensembled_models_unique = sorted(set(ensembled_models))

# Rank the trials best-first; trials without a value are placed last.
# NOTE(review): this assumes model index k was trained with the parameters of the
# k-th best trial — confirm against the training notebook.
sorted_trials = sorted(
    study.trials,
    key=lambda trial: trial.value if trial.value is not None else float('-inf'),
    reverse=True
)
trials = [sorted_trials[i] for i in ensembled_models_unique]
parameters_list = [trial.params for trial in trials]

save_as_pickle(parameters_list, DATA_PATH / 'results/parameters_list.pkl')