In [1]:
import warnings
import random
from datetime import datetime

import numpy as np
import polars as pl
import xgboost as xgb
from sklearn.metrics import make_scorer
from sklearn.inspection import permutation_importance

from src.util.constants import DATA_PATH
from src.util.common import mean_grouped_spearman_correlation, save_as_pickle

For feature selection, we use simple correlation scoring. For hyperparameter tuning, we'll switch to performance approximation.

In [2]:
# Only the column names of fold 0 are needed at this point, so read the parquet
# schema instead of loading (and immediately deleting) the whole training frame.
fold_0_schema = pl.read_parquet_schema(f"{DATA_PATH}/folds/df_train_0.parquet")
feature_names = [x for x in fold_0_schema if 'feature' in x]

# One shadow feature is generated for roughly every ten real features.
number_of_shadow_features: int = round(len(feature_names) / 10)

# Per-fold frames (index = fold number), filled by the loading loop below.
df_train_list = []
df_validate_list = []

# Fixed seed so that "Restart Kernel -> Run All" picks the same shadow features
# and shuffles them the same way on every run.
SHADOW_SEED: int = 42


def add_shadow_features(df: pl.DataFrame, _feature_names: list[str], seed: "int | None" = None) -> pl.DataFrame:
    """Append a shuffled copy of each listed column to ``df``.

    Every column named in ``_feature_names`` is shuffled and added as a new
    column called ``<name>_shadow``; the original columns are left untouched.

    Args:
        df: Frame containing at least the columns in ``_feature_names``.
        _feature_names: Names of the columns to create shadow copies of.
        seed: Optional base seed. When given, the i-th listed column is shuffled
            with ``seed + i`` so each shadow column gets its own reproducible
            permutation. ``None`` (the default) keeps the shuffle unseeded.

    Returns:
        A new frame with the shadow columns appended to the right of ``df``.
    """
    shadow_df = df.select([
        pl.col(col_name)
        .shuffle(seed=None if seed is None else seed + offset)
        .alias(f'{col_name}_shadow')
        for offset, col_name in enumerate(_feature_names)
    ])

    return pl.concat([df, shadow_df], how="horizontal")


# A single seeded generator drives both the choice of features to shadow and the
# per-frame shuffle seeds, so the whole cell is deterministic.
shadow_rng = random.Random(SHADOW_SEED)
random_features = shadow_rng.sample(feature_names, number_of_shadow_features)

for fold in range(2):
    df_train: pl.DataFrame = pl.read_parquet(f"{DATA_PATH}/folds/df_train_{fold}.parquet")
    df_validate: pl.DataFrame = pl.read_parquet(f"{DATA_PATH}/folds/df_validate_{fold}.parquet")

    # Each frame gets its own base seed drawn from the shared generator.
    df_train = add_shadow_features(df_train, random_features, seed=shadow_rng.randrange(2 ** 32))
    df_validate = add_shadow_features(df_validate, random_features, seed=shadow_rng.randrange(2 ** 32))

    df_train_list.append(df_train)
    df_validate_list.append(df_validate)
    # Drop the loop-local references; the frames stay alive inside the two lists.
    del df_train, df_validate

In [None]:
# Iterative shadow-feature selection.
#
# Each pass trains one model per (fold, num_boost_round, max_depth) combination,
# averages the permutation importances over all of them and keeps only the real
# features whose importance is strictly greater than that of the best shadow
# (shuffled) feature. The loop stops when the combined mean+max validation
# correlation no longer improves, or when a pass drops too few features.
feature_list_to_test = feature_names + [f'{feature}_shadow' for feature in random_features]
selected_features = feature_names

# run on a small set of hyperparameters
num_boost_round_space = [50, 200]
max_depth_space = [4, 6, 8]

# Seeded generator shared by every permutation_importance call: each call draws
# a fresh seed from it, so the permutations differ between calls but are the
# same on every re-run of this cell.
PERMUTATION_SEED = 42
permutation_rng = np.random.RandomState(PERMUTATION_SEED)

# Loop-invariant setup, done once instead of once per trained model.
warnings.filterwarnings("ignore", category=UserWarning)  # suppress false positive warning
(DATA_PATH / 'tmp').mkdir(parents=True, exist_ok=True)

best_mean_corr = -1.0
best_max_corr = -1.0
features_to_keep = feature_list_to_test
while True:
    result_permutation_importance = []

    corrs = []
    for fold in range(2):
        df_train = df_train_list[fold]
        df_validate = df_validate_list[fold]

        # Everything below depends only on the fold and the current feature
        # list, not on the hyperparameters, so build it once per fold.
        train_features = df_train[feature_list_to_test]
        train_target = df_train['target']
        validate_features = df_validate[feature_list_to_test]
        validate_features_np = validate_features.to_numpy()
        validate_target_np = df_validate['target'].to_numpy()
        era_validate = df_validate['era']


        def mean_correlation_by_era(target: np.ndarray, prediction: np.ndarray) -> float:
            """Score predictions against the current fold's validation eras.

            References only the era column (not the whole validation frame) so
            less data travels with the scorer to the parallel workers.
            """
            return mean_grouped_spearman_correlation(pl.Series(prediction), pl.Series(target), era_validate)


        score = make_scorer(mean_correlation_by_era, greater_is_better=True)

        for num_boost_round in num_boost_round_space:
            for max_depth in max_depth_space:
                parameters = {
                    # NOTE(review): "device_type" does not look like an XGBoost
                    # parameter (newer releases use "device") - confirm and rename.
                    "device_type": "cpu",
                    "nthread": 12,
                    "objective": "reg:squarederror",
                    "verbosity": 0,
                    "max_depth": max_depth,
                    # The scikit-learn wrapper takes the number of boosting rounds
                    # as `n_estimators`; the former "num_round" key was ignored,
                    # so every model was trained with the default tree count.
                    "n_estimators": num_boost_round
                }

                # use sklearn to calculate permutation feature importances (the fastest way I found so far, faster than lleaves)
                model = xgb.XGBRegressor(
                    **parameters
                )

                # noinspection PyTypeChecker
                model.fit(
                    X=train_features,
                    y=train_target
                )

                print("Model trained.")

                corr = mean_grouped_spearman_correlation(
                    pl.Series(model.predict(validate_features)),
                    df_validate['target'],
                    era_validate
                )
                corrs.append(corr)

                result_permutation_importance.append(
                    permutation_importance(
                        model,
                        validate_features_np,
                        validate_target_np,
                        scoring=score,
                        n_repeats=1,
                        n_jobs=-1,  # parallelise
                        random_state=permutation_rng
                    )['importances_mean']
                )

                # Checkpoint the importances gathered so far in this pass.
                save_as_pickle(result_permutation_importance, DATA_PATH / 'tmp/result_permutation_importance.pkl')

                print(f"{datetime.now().strftime('%H:%M:%S')} . . . Fold {fold} with parameters num_boost_round={num_boost_round} and max_depth={max_depth} done. Current performance: {corr:.5f}.")


    # Average the importances over every model trained in this pass.
    df_feature_importance = pl.DataFrame({
        'feature': pl.Series(feature_list_to_test),
        'importance_permutation': pl.Series(np.array(result_permutation_importance).mean(axis=0))
    })

    max_shadow_importance = df_feature_importance.filter(pl.col('feature').str.contains('_shadow'))['importance_permutation'].max()

    mean_corr = np.mean(corrs)
    max_corr = np.max(corrs)
    # Written as ">=" and negated (rather than "<") so that a NaN score still
    # counts as "no improvement" and stops the loop.
    improved = (mean_corr + max_corr) >= (best_mean_corr + best_max_corr)
    if not improved:
        # selected_features still holds the feature set of the previous, better pass.
        print('No performance improvement. Stopping early and using feature set of last iteration.')
        break

    best_mean_corr = mean_corr
    best_max_corr = max_corr
    selected_features = [feature for feature in feature_list_to_test if 'shadow' not in feature]

    # Keep real features that beat the best shadow feature. The strict ">" drops
    # every shadow column, so the full shadow set is re-appended afterwards.
    features_to_keep = df_feature_importance.filter((pl.col('importance_permutation') > max_shadow_importance))['feature'].to_list()
    features_to_keep = features_to_keep + [f'{feature}_shadow' for feature in random_features]

    # Both lists contain all shadow features, so the difference counts real features only.
    number_of_features_to_drop = len(feature_list_to_test) - len(features_to_keep)
    print(f"Dropping {number_of_features_to_drop} of {len(feature_list_to_test) - number_of_shadow_features} features")

    if number_of_features_to_drop <= int(number_of_shadow_features / 10):
        print('Not enough features to drop. Stopping and using all surviving features.')
        selected_features = [feature for feature in features_to_keep if 'shadow' not in feature]
        break

    feature_list_to_test = features_to_keep

In [None]:
# Persist the final feature selection. The target directory is created first,
# mirroring how the 'tmp' directory is prepared before the checkpoint pickles
# are written, so a missing 'results' folder cannot fail the save after the
# long selection run.
(DATA_PATH / 'results').mkdir(parents=True, exist_ok=True)
save_as_pickle(selected_features, DATA_PATH / 'results/selected_features.pkl')