In [1]:
import polars as pl
import lightgbm as lgb

from datetime import datetime

from src.util.constants import PATH_RAW_TRAIN_SET, PATH_RAW_VALIDATE_SET
from src.util.common import mean_grouped_spearman_correlation

In [2]:
df_train = pl.read_parquet(PATH_RAW_TRAIN_SET)
df_validate = pl.read_parquet(PATH_RAW_VALIDATE_SET)

In [3]:
df_train = df_train.filter(pl.col("target").is_not_null())
df_validate = df_validate.filter(pl.col("target").is_not_null())

df_train = df_train.with_columns(pl.col("era").cast(pl.Int16))
df_validate = df_validate.with_columns(pl.col("era").cast(pl.Int16))

In [4]:
df_train = df_train.drop([col for col in df_train.columns if "target_" in col] + ["data_type", 'id'])
df_validate = df_validate.drop([col for col in df_train.columns if "target_" in col] + ["data_type", 'id'])

In [5]:
df_train = df_train.with_columns(
    pl.col('target').rank(method='random').over('era').cast(pl.Int64).alias('target_ranking')
)

In [6]:
feature_names = [x for x in df_train[0].columns if 'feature' in x]

X_train = df_train[feature_names].to_numpy()
y_train = df_train['target'].to_numpy()
y_train_ranking = df_train['target_ranking'].to_numpy()
groups_train = df_train.group_by('era').len().select('len').to_series().to_list()

X_validate = df_validate[feature_names].to_numpy()
y_validate = df_validate['target'].to_numpy()
eras_validate = df_validate['era'].to_numpy()

del df_train, df_validate

In [10]:
fixed_parameters = {
    'metric': 'None',
    "n_jobs": 12,  # current number of cores on my Mac - set this to hardware cores, not virtual threads
    "subsample_freq": 1,
    "verbose": -1
}

df_result = pl.DataFrame()

for num_boost_round in [10, 50, 200]:
    for num_leaves in [2**3 - 1, 2**6 - 1, 2**9 - 1, 2**12 - 1]:

        parameters = {
            **fixed_parameters,
            'num_leaves': num_leaves,
            'num_boost_round': num_boost_round
        }

        parameters_regression = {
            **parameters,
            'objective': 'regression'
        }

        lgb_train_regression = lgb.Dataset(X_train, label=y_train)

        model_regression = lgb.train(
            params=parameters_regression,
            train_set=lgb_train_regression,
            num_boost_round=parameters['num_boost_round']
        )

        corr_regression = mean_grouped_spearman_correlation(model_regression.predict(X_validate), y_validate, eras_validate)
        print(f'{datetime.now().strftime("%H:%M:%S")} . . . Training for regression done.')

        parameters_ranking = {
            **parameters,
            'objective': 'lambdarank',
            'label_gain': list(range(max(groups_train) + 1)),  # linear loss, more than max number of tickers per era
            'lambdarank_truncation_level': max(groups_train)  # evaluate sorting across all tickers
        }

        lgb_train_ranking = lgb.Dataset(X_train, label=y_train_ranking, group=groups_train)

        model_ranking = lgb.train(
            params=parameters_ranking,
            train_set=lgb_train_ranking,
            num_boost_round=parameters['num_boost_round']
        )

        corr_ranking = mean_grouped_spearman_correlation(model_ranking.predict(X_validate), y_validate, eras_validate)
        print(f'{datetime.now().strftime("%H:%M:%S")} . . . Training for ranking done.')

        df_result = df_result.vstack(pl.DataFrame({
            'num_leaves': num_leaves,
            'num_boost_round': num_boost_round,
            'corr_regression': corr_regression,
            'corr_ranking': corr_ranking
        }))

        print(f'{datetime.now().strftime("%H:%M:%S")} . . . Training models for num_leaves={num_leaves} and num_boost_round={num_boost_round} done. Correlation is {corr_regression:5f} for regression and {corr_ranking:5f} for ranking.')

23:45:03 . . . Training for ranking done. Correlation: 0.01782571834148132
23:52:38 . . . Training for ranking done. Correlation: 0.018679465340324837
00:16:55 . . . Training for ranking done. Correlation: 0.02256481429828645
00:43:03 . . . Training for ranking done. Correlation: 0.02046592133459756


In [11]:
df_result.with_columns(diff=pl.col('corr_regression')-pl.col('corr_ranking')).sort('corr_ranking', descending=True)

ColumnNotFoundError: corr_regression

Regression consistently outperforms ranking.