In [1]:
import polars as pl
import lightgbm as lgb
import random
import numpy as np
import warnings

from sklearn.metrics import make_scorer
from sklearn.inspection import permutation_importance
from datetime import datetime

from src.util.constants import DATA_PATH, FIXED_LGB_PARAMETERS
from src.util.common import mean_grouped_spearman_correlation, save_as_pickle

For feature selection, we use simple correlation scoring. For hyperparameter tuning, we'll switch to performance approximation.

In [2]:
# Only the column names are needed here, so read just the parquet schema instead
# of materialising the whole training fold in memory.
train_schema = pl.read_parquet_schema(f"{DATA_PATH}/folds/df_train_0.parquet")
feature_names = [column for column in train_schema if 'feature' in column]
del train_schema

# Roughly one shadow (shuffled) copy per ten features. Keep at least one so the
# maximum shadow importance used as the selection threshold is always defined.
number_of_shadow_features: int = max(1, round(len(feature_names) / 10))

# Per-fold frames; filled when the folds are loaded.
df_train_list = []
df_validate_list = []

SHADOW_SEED: int = 42  # fixed so the sampled features and their shuffles survive a kernel restart


def add_shadow_features(df: pl.DataFrame, _feature_names: list[str], seed: int | None = None) -> pl.DataFrame:
    """Append a row-shuffled "shadow" copy of each requested column to ``df``.

    Every shadow column contains the values of its source column in shuffled row
    order and is named ``<column>_shadow``.

    Args:
        df: Frame that contains every column listed in ``_feature_names``.
        _feature_names: Names of the columns to create shadow copies of.
        seed: Optional base seed. Column ``i`` is shuffled with ``seed + i`` so the
            result is reproducible while the columns do not share one seed.
            ``None`` (the default) leaves the shuffles unseeded.

    Returns:
        A new frame: ``df`` plus one ``*_shadow`` column per requested feature.
    """
    shadow_df = df.select([
        pl.col(col_name)
        .shuffle(seed=None if seed is None else seed + offset)
        .alias(f'{col_name}_shadow')
        for offset, col_name in enumerate(_feature_names)
    ])

    return pl.concat([df, shadow_df], how="horizontal")


# A dedicated generator keeps the draw reproducible without touching the global `random` state.
shadow_rng = random.Random(SHADOW_SEED)
random_features = shadow_rng.sample(feature_names, number_of_shadow_features)

for fold in range(2):
    df_train: pl.DataFrame = pl.read_parquet(f"{DATA_PATH}/folds/df_train_{fold}.parquet")
    df_validate: pl.DataFrame = pl.read_parquet(f"{DATA_PATH}/folds/df_validate_{fold}.parquet")

    # Each frame gets its own base seed, drawn from the seeded generator.
    df_train = add_shadow_features(df_train, random_features, seed=shadow_rng.randrange(2**32))
    df_validate = add_shadow_features(df_validate, random_features, seed=shadow_rng.randrange(2**32))

    df_train_list.append(df_train)
    df_validate_list.append(df_validate)
    # Drop the loop-local names; the frames stay alive through the two lists.
    del df_train, df_validate

In [4]:
# Iterative shadow-feature selection.
#
# Each round trains a small grid of LightGBM models per fold, averages the
# permutation importances over all fits and keeps only the features whose mean
# importance beats the best shadow feature. The loop ends when a round scores
# worse than the best one so far, or when a round drops nothing.
feature_list_to_test = feature_names + [f'{feature}_shadow' for feature in random_features]
selected_features = feature_names

# run on a small set of hyperparameters
num_boost_round_space = [50, 200]
num_leaves_space = [2**(x*2) - 1 for x in range(2, 5)]  # 15, 63, 255

PERMUTATION_SEED: int = 42  # n_repeats=1 is noisy; at least make the permutations repeatable

# Checkpoint location for the importances collected so far; created once up front.
checkpoint_dir = DATA_PATH / 'tmp'
checkpoint_dir.mkdir(parents=True, exist_ok=True)


def make_era_scorer(era: pl.Series):
    """Build a scikit-learn scorer that evaluates predictions grouped by ``era``.

    ``era`` is bound explicitly so the scorer does not depend on whichever frame a
    global variable points to when the scorer is eventually called.
    The score is whatever ``mean_grouped_spearman_correlation`` returns for
    (prediction, target, era); higher is treated as better.
    """
    def mean_correlation_by_era(target: np.ndarray, prediction: np.ndarray) -> float:
        return mean_grouped_spearman_correlation(pl.Series(prediction), pl.Series(target), era)

    return make_scorer(mean_correlation_by_era, greater_is_better=True)


active = True
best_mean_corr = -1.0
best_max_corr = -1.0
features_to_keep = feature_list_to_test
# Feature set evaluated in the previous round; this is what gets restored when a
# round turns out worse than its predecessor.
features_to_keep_last = features_to_keep
while active:
    result_permutation_importance = []
    corrs = []

    for fold in range(len(df_train_list)):
        df_train = df_train_list[fold]
        df_validate = df_validate_list[fold]

        # These only depend on the fold and the current feature list, so convert
        # once per fold instead of once (or twice) per hyperparameter combination.
        x_train = df_train[feature_list_to_test].to_numpy()
        y_train = df_train['target'].to_numpy()
        x_validate = df_validate[feature_list_to_test].to_numpy()
        y_validate = df_validate['target'].to_numpy()
        score = make_era_scorer(df_validate['era'])

        for num_boost_round in num_boost_round_space:
            for num_leaves in num_leaves_space:
                # `num_boost_round` is a LightGBM alias of `n_estimators`. Pass the
                # value once (as n_estimators) so LightGBM does not receive the
                # same setting under two names.
                parameters = {**FIXED_LGB_PARAMETERS, 'num_leaves': num_leaves}
                parameters.pop('num_boost_round', None)

                # use sklearn to calculate permutation feature importances (the fastest way I found so far, faster than lleaves)
                model = lgb.LGBMRegressor(
                    n_estimators=num_boost_round,
                    **parameters
                )

                # noinspection PyTypeChecker
                model.fit(X=x_train, y=y_train)

                corr = mean_grouped_spearman_correlation(
                    pl.Series(model.predict(x_validate)),
                    df_validate['target'],
                    df_validate['era']
                )
                corrs.append(corr)

                # Suppress the false-positive UserWarning only around this call
                # rather than installing a kernel-wide filter on every iteration.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=UserWarning)
                    importances = permutation_importance(
                        model,
                        x_validate,
                        y_validate,
                        scoring=score,
                        n_repeats=1,
                        random_state=PERMUTATION_SEED
                    )['importances_mean']
                result_permutation_importance.append(importances)

                # Checkpoint after every fit so a crash does not lose hours of work.
                save_as_pickle(result_permutation_importance, checkpoint_dir / 'result_permutation_importance.pkl')

                # Timestamp is formatted separately: nesting double quotes inside
                # an f-string only parses on Python 3.12+.
                timestamp = datetime.now().strftime('%H:%M:%S')
                print(f"{timestamp} . . . Fold {fold} with parameters num_boost_round={num_boost_round} and num_leaves={num_leaves} done. Current performance: {corr:.5f}.")

    # Average the importances over every (fold, hyperparameter) fit of this round.
    df_feature_importance = pl.DataFrame({
        'feature': pl.Series(feature_list_to_test),
        'importance_permutation': pl.Series(np.array(result_permutation_importance).mean(axis=0))
    })

    max_shadow_importance = df_feature_importance.filter(pl.col('feature').str.ends_with('_shadow'))['importance_permutation'].max()

    mean_corr = np.mean(corrs)
    max_corr = np.max(corrs)
    if (mean_corr + max_corr) >= (best_mean_corr + best_max_corr):
        best_mean_corr = mean_corr
        best_max_corr = max_corr
    else:
        print('No performance improvement. Stopping early and using feature set of last iteration.')
        selected_features = [feature for feature in features_to_keep_last if 'shadow' not in feature]
        active = False
        break

    features_to_keep_last = features_to_keep
    # Strictly greater than the best shadow feature, so no shadow column survives
    # the filter; the full shadow set is re-attached for the next round.
    features_to_keep = df_feature_importance.filter((pl.col('importance_permutation') > max_shadow_importance))['feature'].to_list()
    features_to_keep = features_to_keep + [f'{feature}_shadow' for feature in random_features]

    # Both lists contain every shadow feature, so the difference counts real features only.
    number_of_features_to_drop = len(feature_list_to_test) - len(features_to_keep)
    print(f"Dropping {number_of_features_to_drop} of {len(feature_list_to_test) - number_of_shadow_features} features")

    if number_of_features_to_drop <= 0:
        active = False
        print('No more features to drop. Stopping and using all surviving features.')
        selected_features = [feature for feature in features_to_keep if 'shadow' not in feature]
    else:
        feature_list_to_test = features_to_keep



16:51:49 . . . Fold 0 with parameters num_boost_round=5 and num_leaves=3 done. Current performance: 0.00931.
16:54:53 . . . Fold 0 with parameters num_boost_round=5 and num_leaves=15 done. Current performance: 0.01665.
16:57:59 . . . Fold 0 with parameters num_boost_round=15 and num_leaves=3 done. Current performance: 0.01131.
17:01:23 . . . Fold 0 with parameters num_boost_round=15 and num_leaves=15 done. Current performance: 0.02088.
17:04:24 . . . Fold 1 with parameters num_boost_round=5 and num_leaves=3 done. Current performance: 0.01144.
17:07:29 . . . Fold 1 with parameters num_boost_round=5 and num_leaves=15 done. Current performance: 0.02346.
17:10:39 . . . Fold 1 with parameters num_boost_round=15 and num_leaves=3 done. Current performance: 0.01429.
17:14:00 . . . Fold 1 with parameters num_boost_round=15 and num_leaves=15 done. Current performance: 0.02549.
Carry on...
Dropping 144 of 39 features
17:14:57 . . . Fold 0 with parameters num_boost_round=5 and num_leaves=3 done. C

In [19]:
# Persist the final feature subset. The target directory is created first so the
# write cannot fail on a fresh checkout (save_as_pickle is not shown here;
# presumably it does not create missing directories - TODO confirm).
selected_features_path = DATA_PATH / 'results/selected_features.pkl'
selected_features_path.parent.mkdir(parents=True, exist_ok=True)
save_as_pickle(selected_features, selected_features_path)