In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from shaphypetune import BoostSearch, BoostRFE, BoostRFA, BoostBoruta

# load files

In [None]:
# Load the base training table from parquet; every later cell reads its
# columns (including "target") from this DataFrame.
train = pd.read_parquet("../input/train_base.parquet")

# define loss and metric

In [None]:
def amex_metric_mod(y_true, y_pred):
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose true label is 0 carry a weight of 20, all other rows a weight
    of 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 for the negative class).
    y_pred : array-like
        Predicted scores, same length as ``y_true``; higher means more
        likely positive.

    Returns
    -------
    float
        ``0.5 * (gini_of_predictions / gini_of_perfect_ranking + top_four)``.
    """
    # Column 0 holds the true label, column 1 the predicted score.
    pairs = np.array([y_true, y_pred]).T

    # Top-4% capture: rank rows by predicted score (descending) and keep the
    # leading rows whose cumulative weight stays within 4% of the total weight.
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    score_weights = np.where(by_score[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(score_weights))
    captured = by_score[np.cumsum(score_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    def _weighted_gini(sort_col):
        # Rank by the requested column (1 = prediction, 0 = true label) and
        # accumulate the weighted gap between the Lorentz curve and the
        # random (cumulative weight share) curve.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        row_weights = np.where(ranked[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(row_weights / np.sum(row_weights))
        weighted_hits = ranked[:, 0] * row_weights
        lorentz_curve = np.cumsum(weighted_hits) / np.sum(weighted_hits)
        return np.sum((lorentz_curve - random_curve) * row_weights)

    model_gini = _weighted_gini(1)
    # Ranking by the true label gives the best achievable Gini, used to normalize.
    ideal_gini = _weighted_gini(0)

    return 0.5 * (model_gini / ideal_gini + top_four)

In [None]:
def amex_metric(y_hat, y_true):
    """Return the pair ``('amex_metric', score)`` for predictions ``y_hat``.

    Note the argument order: predictions come first here, while
    ``amex_metric_mod`` takes the true labels first, so the arguments are
    swapped in the call below.

    NOTE(review): the (name, value) tuple looks like the shape of a custom
    evaluation callback; this function is not called anywhere in the visible
    notebook -- confirm the intended caller passes plain arrays.
    """
    score = amex_metric_mod(y_true, y_hat)
    return 'amex_metric', score

# define training config

In [None]:
# Reproducibility and cross-validation settings.
seed = 42
n_folds = 5

# Name of the label column.
target = "target"

# Columns that must never be offered to the model as inputs.
exclude_features = [
    "customer_ID",
    "target",
    "number_of_observations",
]

# Candidate features: every remaining column, in the DataFrame's column order.
# Their positions in this list are what the selection masks refer to.
features = [col for col in train.columns if col not in exclude_features]

# Base names of the categorical columns; the "_last" suffix is appended below.
cat_features_base = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]
cat_features = [f"{base_name}_last" for base_name in cat_features_base]

def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value used for both generators; its string form is written to the
        ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    
# Apply the configured seed once, before any fold splitting or model fitting.
seed_everything(seed)

# RFE

In [None]:
# Run SHAP-based recursive feature elimination (BoostRFE) once per CV fold and
# collect each fold's boolean feature mask in `fold_features`.
kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=seed
)

# StratifiedKFold.split yields *positional* row indices. Selecting with
# label-based `.loc` is only correct when `train` has a default RangeIndex,
# so rows (and columns) are selected positionally with `.iloc` instead.
feature_positions = train.columns.get_indexer(features)
target_values = train[target]

fold_features = []

for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, target_values)):

    print("RFE for fold {}".format(fold))

    x_train = train.iloc[trn_ind, feature_positions]
    y_train = target_values.iloc[trn_ind]
    x_val = train.iloc[val_ind, feature_positions]
    y_val = target_values.iloc[val_ind]

    # NOTE(review): gpu_id=1 pins training to the second GPU -- confirm this
    # matches the machine the notebook is run on.
    estimator = xgb.XGBRegressor(
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.6,
        eval_metric="logloss",
        objective="binary:logistic",
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        random_state=seed
    )
    model = BoostRFE(
        estimator,                              # LGBModel or XGBModel
        min_features_to_select=None,            # the minimum number of features to be selected
        step=40,                                # number of features to remove at each iteration
        param_grid=None,                        # parameters to be optimized
        greater_is_better=False,                # minimize or maximize the monitored score
        importance_type="shap_importances",     # which importance measure to use: default or shap
        train_importance=False,                 # where to compute the shap feature importance
        n_iter=None,                            # number of sampled parameter configurations
        sampling_seed=seed,                     # the seed used for parameter sampling
        verbose=1,                              # verbosity mode
        n_jobs=None                             # number of jobs to run in parallel
    )

    model.fit(
        x_train,
        y_train,
        eval_set=[(x_val, y_val)],
        early_stopping_rounds=6,
        verbose=1
    )

    # Boolean mask over `features` for this fold.
    fold_features.append(model.support_)

    # Release the fold's data and models before the next fold starts.
    del x_train, y_train, x_val, y_val, estimator, model
    _ = gc.collect()

In [20]:
# Reuse the per-fold masks produced by the RFE cell when it has run in this
# session; otherwise fall back to the masks cached on disk by an earlier run.
# Loading unconditionally would replace freshly computed masks with the stale
# cache (or fail on a missing file) before the dump cell below can persist them.
# NOTE: joblib.load unpickles the file -- only load caches you created yourself.
_fresh_masks = globals().get("fold_features")
if not (isinstance(_fresh_masks, list) and len(_fresh_masks) > 0):
    fold_features = load("feature_mask.pkl")
del _fresh_masks

In [21]:
# Inspect the per-fold boolean selection masks (one array per fold).
fold_features

[array([ True,  True,  True, ...,  True,  True, False]),
 array([ True,  True,  True, ...,  True,  True, False]),
 array([False,  True,  True, ..., False, False, False]),
 array([False,  True,  True, ..., False, False, False]),
 array([False,  True,  True, ...,  True,  True, False])]

In [None]:
# Count the True entries in the mask at index 4, i.e. how many features that
# fold's RFE run kept. Only valid while fold_features is still the list of masks.
np.sum(fold_features[4])

In [None]:
# Persist the per-fold masks so later sessions can skip the RFE run.
# NOTE(review): this must run while fold_features is still the list of masks,
# i.e. before the stacking/averaging cells rebind the name.
dump(fold_features, "feature_mask.pkl")

In [22]:
# Stack the 1-D per-fold masks side by side: one row per feature, one column per fold.
# NOTE(review): this rebinds fold_features from a list to a 2-D array, so the
# cell is not idempotent -- reload the masks before re-running it.
fold_features = np.stack(fold_features, axis=1)

In [23]:
# Average across the fold axis: each entry becomes the fraction of folds
# in which that feature was selected.
fold_features = fold_features.mean(axis=1)

In [24]:
# Inspect the per-feature selection rates.
fold_features

array([0.4, 1. , 1. , ..., 0.6, 0.6, 0. ])

In [25]:
# Keep the position of every feature whose selection rate is strictly above 0.2.
# With the 5 folds configured in this notebook, that means features kept by at
# least two folds.
selected_feature_idxes = [
    position
    for position, selection_rate in enumerate(fold_features)
    if selection_rate > 0.2
]

In [26]:
# Number of features that passed the selection-rate threshold.
len(selected_feature_idxes)

1346

In [27]:
# Persist the selected positions. They index into the `features` list, so they
# are only meaningful together with the same column order of `train`.
dump(selected_feature_idxes, "selected_feature_idxes.pkl")

['selected_feature_idxes.pkl']