In [1]:
import polars as pl
import lightgbm as lgb
import numpy as np

from datetime import datetime

from src.util.common import mean_grouped_spearman_correlation
from src.util.constants import DATA_PATH, FIXED_LGB_PARAMETERS

In [2]:
# Load the meta-model frame and keep every column whose name contains 'feature'.
meta_model_path = f'{DATA_PATH}/folds/df_meta_model.parquet'
df_meta_model = pl.read_parquet(meta_model_path)
feature_names = [column for column in df_meta_model.columns if 'feature' in column]

### Linear model

In [3]:
def _prepend_intercept(values):
    """Return `values` with a leading column of ones (the intercept term)."""
    return np.hstack([np.ones((values.shape[0], 1)), values])


# Fit a least-squares linear model per fold via the normal equations, then
# score it on the fold's validation split and against the meta-model frame.
df_result_linear = pl.DataFrame()

for fold in range(3):
    df_train = pl.read_parquet(f'{DATA_PATH}/folds/df_train_{fold}.parquet')

    # Columns whose training variance is below 1e-10 are excluded for this fold.
    constant_features = [
        feature for feature in feature_names
        if df_train[feature].var() < 1e-10
    ]
    feature_names_fold = [feature for feature in feature_names if feature not in constant_features]

    X = df_train[feature_names_fold].to_numpy()
    y = df_train['target'].to_numpy()
    del df_train  # release the frame before the design matrix is widened
    X = _prepend_intercept(X)

    # Normal equations: (X'X) beta = X'y
    gram = X.T @ X
    moment = X.T @ y
    del X, y
    beta_hat = np.linalg.solve(gram, moment)
    del gram, moment

    # Predictions on the validation split of this fold.
    df_validate = pl.read_parquet(f'{DATA_PATH}/folds/df_validate_{fold}.parquet')
    X_validate = _prepend_intercept(df_validate[feature_names_fold].to_numpy())
    df_validate = df_validate.with_columns(prediction=X_validate @ beta_hat)

    # Predictions on the meta-model frame, using the same fold-specific columns.
    X_meta_model = _prepend_intercept(df_meta_model[feature_names_fold].to_numpy())
    df_meta_model_with_prediction = df_meta_model.with_columns(prediction=X_meta_model @ beta_hat)

    corr = mean_grouped_spearman_correlation(
        df_validate['prediction'],
        df_validate['target'],
        df_validate['era'],
    )

    # Spearman correlation with the meta model, computed within each era and then averaged.
    era_corr_with_meta_model = pl.corr(
        "prediction", "numerai_meta_model", method="spearman"
    ).over('era', mapping_strategy='explode')
    corr_w_mm = df_meta_model_with_prediction.select(era_corr_with_meta_model).mean()[0, 0]

    fold_scores = pl.DataFrame({
        'fold': fold,
        'corr': corr,
        'corr_w_mm': corr_w_mm
    })
    df_result_linear = df_result_linear.vstack(fold_scores)

    del df_validate, df_meta_model_with_prediction

    print(f'Fold {fold} done.')

Fold 0 done.
Fold 1 done.
Fold 2 done.


In [4]:
# Display the per-fold scores ('corr', 'corr_w_mm') collected for the linear model.
df_result_linear

fold,corr,corr_w_mm
i64,f64,f64
0,0.016962,0.344646
1,0.016439,0.383734
2,0.010562,0.40612


### Numerai models

In [5]:
# Parameters from Numerai (compare https://docs.numer.ai/numerai-tournament/models)
# One entry per model type; the keys of both dictionaries must line up because
# the boosting-round count is looked up by the same model-type name.
parameter_dict = dict(
    standard_large_lgbm=dict(
        learning_rate=0.001,
        max_depth=6,
        num_leaves=2 ** 6,
        colsample_bytree=0.1,
    ),
    deep_lgbm=dict(
        learning_rate=0.001,
        max_depth=10,
        num_leaves=2 ** 10,
        colsample_bytree=0.1,
        min_data_in_leaf=10000,
    ),
)
num_boost_round_dict = dict(
    standard_large_lgbm=20000,
    deep_lgbm=30000,
)

In [6]:
# Train every Numerai reference LightGBM configuration on each fold and record,
# per (type, fold), the validation score and the correlation with the meta model.
df_result_numerai_lgb = pl.DataFrame()
model_deep_2 = None  # filled in below with the deep_lgbm booster trained on fold 2

for model_type, model_overrides in parameter_dict.items():
    print(f' Running model {model_type}...')
    # Model-specific settings take precedence over the fixed ones.
    parameters = {**FIXED_LGB_PARAMETERS, **model_overrides}

    for fold in range(3):
        df_train = pl.read_parquet(f'{DATA_PATH}/folds/df_train_{fold}.parquet')
        feature_names = [x for x in df_train.columns if 'feature' in x]
        lgb_train = lgb.Dataset(
            df_train[feature_names].to_numpy(),
            label=df_train['target'].to_numpy(),
        )

        model = lgb.train(
            params=parameters,
            train_set=lgb_train,
            num_boost_round=num_boost_round_dict[model_type],
        )
        del lgb_train, df_train  # training data is no longer needed once the booster exists

        # Predictions on the validation split of this fold.
        df_validate = pl.read_parquet(f'{DATA_PATH}/folds/df_validate_{fold}.parquet')
        validate_prediction = model.predict(df_validate[feature_names].to_numpy())
        df_validate = df_validate.with_columns(prediction=validate_prediction)

        # Predictions on the meta-model frame.
        meta_model_prediction = model.predict(df_meta_model[feature_names].to_numpy())
        df_meta_model_with_prediction = df_meta_model.with_columns(prediction=meta_model_prediction)

        corr = mean_grouped_spearman_correlation(
            df_validate['prediction'],
            df_validate['target'],
            df_validate['era'],
        )

        # Spearman correlation with the meta model, computed within each era and then averaged.
        era_corr_with_meta_model = pl.corr(
            "prediction", "numerai_meta_model", method="spearman"
        ).over('era', mapping_strategy='explode')
        corr_w_mm = df_meta_model_with_prediction.select(era_corr_with_meta_model).mean()[0, 0]

        fold_scores = pl.DataFrame({
            'type': model_type,
            'fold': fold,
            'corr': corr,
            'corr_w_mm': corr_w_mm
        })
        df_result_numerai_lgb = df_result_numerai_lgb.vstack(fold_scores)

        # Keep this one booster around; a later cell predicts with it.
        if (model_type, fold) == ('deep_lgbm', 2):
            model_deep_2 = model

        del df_validate, df_meta_model_with_prediction
        del validate_prediction, meta_model_prediction

        print(f'{datetime.now().strftime("%H:%M:%S")} . . . Type {model_type}, fold {fold} done.')

 Running model standard_large_lgbm...
01:08:05 . . . Type standard_large_lgbm, fold 0 done.
03:51:52 . . . Type standard_large_lgbm, fold 1 done.
07:08:14 . . . Type standard_large_lgbm, fold 2 done.
 Running model deep_lgbm...
11:43:30 . . . Type deep_lgbm, fold 0 done.
18:12:10 . . . Type deep_lgbm, fold 1 done.
02:34:02 . . . Type deep_lgbm, fold 2 done.


In [None]:
# Display the per-type, per-fold scores ('corr', 'corr_w_mm') collected for the LightGBM models.
df_result_numerai_lgb

In [None]:
# MMC approximation
# Score the meta model and the fold-2 deep model on the same frame, then scale
# the deep model's per-fold 'corr' values by the ratio of the two scores.
performance_meta_model = mean_grouped_spearman_correlation(
    *(df_meta_model[name] for name in ('numerai_meta_model', 'target', 'era'))
)

deep_model_prediction = model_deep_2.predict(df_meta_model[feature_names].to_numpy())
df_meta_model_with_prediction = df_meta_model.with_columns(prediction=deep_model_prediction)
performance_deep_model = mean_grouped_spearman_correlation(
    *(df_meta_model_with_prediction[name] for name in ('prediction', 'target', 'era'))
)

ratio = performance_meta_model / performance_deep_model

deep_lgbm_corr = df_result_numerai_lgb.filter(pl.col('type') == 'deep_lgbm').select('corr')
deep_lgbm_corr * ratio

In [None]:
meta_model_performance = [.044, .037, .025]  # TODO: automate

# calculate performance with updated meta-model performance approximation
# The fold number indexes into the per-fold meta-model performance list above.
fold_meta_model_performance = pl.col('fold').map_elements(
    lambda fold_index: meta_model_performance[fold_index],
    return_dtype=pl.Float64,
)
# Two chained steps: 'performance' reads the 'mmc_approximation' column created by the first.
df_result_numerai_lgb = (
    df_result_numerai_lgb
    .with_columns(
        mmc_approximation=pl.col('corr') - (pl.col('corr_w_mm') * fold_meta_model_performance)
    )
    .with_columns(
        performance=.75 * pl.col('corr') + 2.25 * pl.col('mmc_approximation')
    )
)
df_result_numerai_lgb

In [None]:
# let's also check the linear model
# calculate performance with updated meta-model performance approximation
# mmc_approximation = corr - corr_w_mm * (meta-model performance of the row's fold);
# the fold number indexes into `meta_model_performance`.
df_result_linear = df_result_linear.with_columns(
    mmc_approximation = (pl.col('corr') - (pl.col('corr_w_mm') * pl.col('fold').map_elements(lambda x: meta_model_performance[x], return_dtype=pl.Float64)))
)
# Use the same CORR/MMC weights (.75 / 2.25) as the LightGBM results so the
# 'performance' columns of both result tables are directly comparable.
# (Previously .5 / 2, which put the linear model on a different scale.)
df_result_linear = df_result_linear.with_columns(
    performance = .75 * pl.col('corr') + 2.25 * pl.col('mmc_approximation')
)
df_result_linear

In [27]:
# Persist both result tables under DATA_PATH/results, creating the directory if it is missing.
results_dir = DATA_PATH / 'results'
results_dir.mkdir(parents=True, exist_ok=True)

df_result_linear.write_parquet(f'{DATA_PATH}/results/df_result_linear.parquet')
df_result_numerai_lgb.write_parquet(f'{DATA_PATH}/results/df_result_numerai_lgb.parquet')