In [1]:
import polars as pl
import lightgbm as lgb
import numpy as np

from datetime import datetime

from src.util.common import mean_grouped_spearman_correlation
from src.util.constants import DATA_PATH, FIXED_LGB_PARAMETERS

In [2]:
# Load the meta-model frame once; it is reused by every model evaluated below.
df_meta_model = pl.read_parquet(f'{DATA_PATH}/folds/df_meta_model.parquet')

# Feature columns are recognised by the substring 'feature' in their name.
feature_names = [column for column in df_meta_model.columns if 'feature' in column]

# Design matrix for the linear model: a leading column of ones (intercept term)
# followed by the raw feature values.
X_meta_model = df_meta_model[feature_names].to_numpy()
X_meta_model = np.concatenate(
    (np.ones((len(X_meta_model), 1)), X_meta_model),
    axis=1,
)

### linear model

In [34]:
def _with_intercept(features):
    """Return `features` (2-D array) with a leading column of ones prepended."""
    return np.hstack([np.ones((features.shape[0], 1)), features])


# One result row per fold is stacked onto this (initially empty) frame.
df_result_linear = pl.DataFrame()

for fold in range(3):
    # --- Fit: least squares via the normal equations (X'X) beta = X'y ---
    df_train = pl.read_parquet(f'{DATA_PATH}/folds/df_train_{fold}.parquet')
    y = df_train['target'].to_numpy()
    X = df_train[feature_names].to_numpy()
    del df_train  # release the training frame before building the design matrix
    X = _with_intercept(X)

    beta_hat = np.linalg.solve(X.T @ X, X.T @ y)
    del X, y

    # --- Predict on the fold's validation split and on the meta-model frame ---
    df_validate = pl.read_parquet(f'{DATA_PATH}/folds/df_validate_{fold}.parquet')
    X_validate = _with_intercept(df_validate[feature_names].to_numpy())
    df_validate = df_validate.with_columns(
        pl.Series('prediction', X_validate @ beta_hat)
    )
    df_meta_model_with_prediction = df_meta_model.with_columns(
        pl.Series('prediction', X_meta_model @ beta_hat)
    )

    # --- Score ---
    # Correlation of the prediction with the target on the validation split.
    corr = mean_grouped_spearman_correlation(
        df_validate['prediction'],
        df_validate['target'],
        df_validate['era'],
    )
    # Spearman correlation with the meta model, computed within each era and
    # then averaged over the eras.
    per_era_corr_w_mm = df_meta_model_with_prediction.select(
        pl.corr("prediction", "numerai_meta_model", method="spearman")
        .over('era', mapping_strategy='explode')
    )
    corr_w_mm = per_era_corr_w_mm.mean()[0, 0]

    fold_result = pl.DataFrame({
        'fold': fold,
        'corr': corr,
        'corr_w_mm': corr_w_mm
    })
    df_result_linear = df_result_linear.vstack(fold_result)

    del df_validate, df_meta_model_with_prediction

    print(f'Fold {fold} done.')

Fold 2 done.


In [35]:
# Show the linear-model scores collected above (one row per fold).
df_result_linear

fold,corr,corr_w_mm
i64,f64,f64
0,0.022727,0.461625
1,0.021811,0.49834
2,0.013165,0.522116


### Numerai models

In [6]:
# Hyper-parameters of the Numerai reference models
# (compare https://docs.numer.ai/numerai-tournament/models).
# The outer keys double as the model-type labels used in the result tables.
parameter_dict = dict(
    standard_large_lgbm=dict(
        learning_rate=0.001,
        max_depth=6,
        num_leaves=64,  # 2 ** 6
        colsample_bytree=0.1,
    ),
    deep_lgbm=dict(
        learning_rate=0.001,
        max_depth=10,
        num_leaves=2 ** 10,
        colsample_bytree=0.1,
        min_data_in_leaf=10_000,
    ),
)

# Number of boosting rounds per model type (same keys as `parameter_dict`).
num_boost_round_dict = dict(
    standard_large_lgbm=20_000,
    deep_lgbm=30_000,
)

In [7]:
# One result row per (model type, fold) is stacked onto this frame.
df_result_numerai_lgb = pl.DataFrame()
# Holds the 'deep_lgbm' model trained on fold 2 once the loop has produced it.
model_deep_2 = None

for model_type, model_parameters in parameter_dict.items():
    print(f' Running model {model_type}...')
    # Model-specific settings override the shared fixed settings on key clashes.
    parameters = {**FIXED_LGB_PARAMETERS, **model_parameters}
    n_boost_rounds = num_boost_round_dict[model_type]

    for fold in range(3):
        # --- Train on the fold's training split ---
        df_train = pl.read_parquet(f'{DATA_PATH}/folds/df_train_{fold}.parquet')
        feature_names = [column for column in df_train.columns if 'feature' in column]
        lgb_train = lgb.Dataset(
            df_train[feature_names].to_numpy(),
            label=df_train['target'].to_numpy(),
        )

        model = lgb.train(
            params=parameters,
            train_set=lgb_train,
            num_boost_round=n_boost_rounds,
        )
        del lgb_train, df_train  # free the training data before loading validation data

        # --- Predict on the validation split and on the meta-model frame ---
        df_validate = pl.read_parquet(f'{DATA_PATH}/folds/df_validate_{fold}.parquet')

        df_validate = df_validate.with_columns(
            pl.Series('prediction', model.predict(df_validate[feature_names].to_numpy()))
        )
        df_meta_model_with_prediction = df_meta_model.with_columns(
            pl.Series('prediction', model.predict(df_meta_model[feature_names].to_numpy()))
        )

        # --- Score ---
        corr = mean_grouped_spearman_correlation(
            df_validate['prediction'],
            df_validate['target'],
            df_validate['era'],
        )
        # Spearman correlation with the meta model, computed within each era
        # and then averaged over the eras.
        per_era_corr_w_mm = df_meta_model_with_prediction.select(
            pl.corr("prediction", "numerai_meta_model", method="spearman")
            .over('era', mapping_strategy='explode')
        )
        corr_w_mm = per_era_corr_w_mm.mean()[0, 0]

        fold_result = pl.DataFrame({
            'type': model_type,
            'fold': fold,
            'corr': corr,
            'corr_w_mm': corr_w_mm
        })
        df_result_numerai_lgb = df_result_numerai_lgb.vstack(fold_result)

        # Keep the deep model of the last fold; a later cell predicts with it.
        if (model_type, fold) == ('deep_lgbm', 2):
            model_deep_2 = model

        del df_validate, df_meta_model_with_prediction

        timestamp = datetime.now().strftime("%H:%M:%S")
        print(f'{timestamp} . . . Type {model_type}, fold {fold} done.')

 Running model standard_large_lgbm_params...
Type standard_large_lgbm_params, fold 0 done.
Type standard_large_lgbm_params, fold 1 done.
Type standard_large_lgbm_params, fold 2 done.
 Running model deep_lgbm_params...
Type deep_lgbm_params, fold 0 done.
Type deep_lgbm_params, fold 1 done.
Type deep_lgbm_params, fold 2 done.


In [41]:
# Show the LightGBM scores collected above (one row per model type and fold).
df_result_numerai_lgb

type,fold,corr,corr_w_mm
str,i64,f64,f64
"""standard_large_lgbm_params""",0,0.036744,0.754296
"""standard_large_lgbm_params""",1,0.036229,0.802252
"""standard_large_lgbm_params""",2,0.024072,0.841545
"""deep_lgbm_params""",0,0.040446,0.794359
"""deep_lgbm_params""",1,0.038057,0.84461
"""deep_lgbm_params""",2,0.02745,0.886149


In [42]:
# MMC approximation
# Score the meta model's own predictions against the target on the meta-model frame.
performance_meta_model = mean_grouped_spearman_correlation(
    *(df_meta_model[name] for name in ('numerai_meta_model', 'target', 'era'))
)

# Score the fold-2 deep model on exactly the same rows.
df_meta_model_with_prediction = df_meta_model.with_columns(
    pl.Series('prediction', model_deep_2.predict(df_meta_model[feature_names].to_numpy()))
)
performance_deep_model = mean_grouped_spearman_correlation(
    *(df_meta_model_with_prediction[name] for name in ('prediction', 'target', 'era'))
)

# Factor by which the meta model outperforms the deep model on these rows.
ratio = performance_meta_model / performance_deep_model

# Scale the deep model's per-fold validation correlations by that factor.
deep_lgbm_corr = df_result_numerai_lgb.filter(pl.col('type') == 'deep_lgbm').select('corr')
deep_lgbm_corr * ratio

corr
f64


In [44]:
# Scale the deep model's per-fold validation correlation by `ratio` to
# approximate the meta model's per-fold performance.
# The filter value must be the key under which the deep model is registered in
# `parameter_dict` ('deep_lgbm'); the former label 'deep_lgbm_params' no longer
# occurs in the 'type' column and would select an empty frame.
df_result_numerai_lgb.filter(pl.col('type') == 'deep_lgbm').select('corr') * ratio

corr
f64
0.051137
0.048117
0.034706


Using the "deep" Numerai model (which has the highest correlation with the meta model) and its performance on the meta-model set (.0285) to approximate meta-model performance on the three folds, we get .051, .048, and .035.

In [45]:
# Approximate meta-model performance per fold; the list index is the fold number.
meta_model_performance = [.051, .048, .035]

# Look up the approximate meta-model performance that belongs to each row's fold.
meta_model_corr_for_fold = pl.col('fold').map_elements(
    lambda fold_idx: meta_model_performance[fold_idx],
    return_dtype=pl.Float64,
)

# calculate performance with updated meta-model performance approximation
# (two chained steps: `performance` needs the `mmc_approximation` column to exist)
df_result_numerai_lgb = (
    df_result_numerai_lgb
    .with_columns(
        mmc_approximation=pl.col('corr') - pl.col('corr_w_mm') * meta_model_corr_for_fold
    )
    .with_columns(
        performance=.5 * pl.col('corr') + 2 * pl.col('mmc_approximation')
    )
)
df_result_numerai_lgb

type,fold,corr,corr_w_mm,mmc_approximation,performance
str,i64,f64,f64,f64,f64
"""standard_large_lgbm_params""",0,0.036744,0.754296,-0.001725,0.014923
"""standard_large_lgbm_params""",1,0.036229,0.802252,-0.00228,0.013555
"""standard_large_lgbm_params""",2,0.024072,0.841545,-0.005382,0.001271
"""deep_lgbm_params""",0,0.040446,0.794359,-6.7e-05,0.02009
"""deep_lgbm_params""",1,0.038057,0.84461,-0.002484,0.014061
"""deep_lgbm_params""",2,0.02745,0.886149,-0.003565,0.006594


In [46]:
# let's also check the linear model
# Look up the approximate meta-model performance that belongs to each row's fold.
meta_model_corr_for_fold = pl.col('fold').map_elements(
    lambda fold_idx: meta_model_performance[fold_idx],
    return_dtype=pl.Float64,
)

# calculate performance with updated meta-model performance approximation
# (two chained steps: `performance` needs the `mmc_approximation` column to exist)
df_result_linear = (
    df_result_linear
    .with_columns(
        mmc_approximation=pl.col('corr') - pl.col('corr_w_mm') * meta_model_corr_for_fold
    )
    .with_columns(
        performance=.5 * pl.col('corr') + 2 * pl.col('mmc_approximation')
    )
)
df_result_linear

fold,corr,corr_w_mm,mmc_approximation,performance
i64,f64,f64,f64,f64
0,0.022727,0.461625,-0.000816,0.009731
1,0.021811,0.49834,-0.002109,0.006688
2,0.013165,0.522116,-0.005109,-0.003635


In [47]:
# Persist both result tables as parquet files under <DATA_PATH>/results,
# creating the directory (and any missing parents) first.
_results_dir = DATA_PATH / 'results'
_results_dir.mkdir(parents=True, exist_ok=True)

for _file_stem, _frame in (
    ('df_result_linear', df_result_linear),
    ('df_result_numerai_lgb', df_result_numerai_lgb),
):
    _frame.write_parquet(f'{DATA_PATH}/results/{_file_stem}.parquet')