In [1]:
import os
import gc
from joblib import dump, load, Parallel, delayed
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# load files

In [2]:
# Load the precomputed training feature table. Later cells read its
# "customer_ID" and "target" columns and treat every other column as a feature.
train = pd.read_parquet("../input/train_full_features.parquet")

# define loss and metric

In [3]:
def amex_metric_mod(y_true, y_pred):
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose true label equals 0 carry weight 20; every other row carries
    weight 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores; larger scores are ranked first.

    Returns
    -------
    float
        ``0.5 * (gini(y_pred ranking) / gini(y_true ranking) + top_four)``.
    """
    # Column 0 holds the true label, column 1 the predicted score.
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked_with_weights(sort_col):
        # Rows ordered by the chosen column, largest first, plus their weights.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        return ranked, np.where(ranked[:, 0] == 0, 20, 1)

    def _weighted_gini(sort_col):
        # Weighted area between the cumulative share of positives found and
        # the cumulative share of weight consumed, for the given ordering.
        ranked, row_weights = _ranked_with_weights(sort_col)
        random_share = np.cumsum(row_weights / np.sum(row_weights))
        weighted_pos = ranked[:, 0] * row_weights
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * row_weights)

    # Share of all positives found inside the first 4% of total weight when
    # rows are ordered by predicted score.
    by_pred, pred_weights = _ranked_with_weights(1)
    weight_budget = int(0.04 * np.sum(pred_weights))
    captured = by_pred[np.cumsum(pred_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the predicted ordering, normalized by the Gini of the ordering
    # induced by the true labels themselves.
    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)

    return 0.5 * (gini_pred / gini_true + top_four)

# define training config

In [4]:
# Columns that identify a row or hold the label; everything else in `train`
# is treated as a model feature.
exclude_features = [
    "customer_ID",
    "target",
]

features = [col for col in train.columns if col not in exclude_features]

# Base names of the categorical columns.
cat_features_base = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# Every feature whose name contains one of the base names (substring match).
# One entry is produced per (feature, base) match, so a feature matching
# several bases appears several times.
cat_features = [
    feature
    for feature in features
    for cat_feature_base in cat_features_base
    if cat_feature_base in feature
]

target = "target"

In [5]:
# Lists of feature names to drop, one pickle per feature family. Element [0]
# of each pickle is the list of names (the same [names, scores] layout this
# notebook writes in its final cell).
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# that were produced by this project.
# NOTE(review): "count_duplicate_min_agg_to_del.pkl" is also the file written
# by the last cell of this notebook, so it has to exist already (from an
# earlier run) before this cell can execute on a fresh kernel.
basic_agg_to_del = load("basic_agg_to_del.pkl")[0]
count_agg_to_del = load("count_agg_to_del.pkl")[0]
count_duplicate_min_agg_to_del = load("count_duplicate_min_agg_to_del.pkl")[0]
sma_agg_to_del = load("sma_agg_to_del.pkl")[0]
mean_std_diff_agg_to_del = load("mean_std_diff_agg_to_del.pkl")[0]
drawdown_drawup_agg_to_del = []

# Build the exclusion set once. Concatenating the lists inside the
# comprehension would rebuild and linearly scan them for every feature.
_features_to_drop = set(
    basic_agg_to_del
    + count_agg_to_del
    + count_duplicate_min_agg_to_del
    + sma_agg_to_del
    + mean_std_diff_agg_to_del
    + drawdown_drawup_agg_to_del
)

features = [feature for feature in features if feature not in _features_to_drop]

In [6]:
# Persist the feature list that remains after the exclusions above.
dump(features, "selected_features.pkl")

['selected_features.pkl']

In [None]:
# Candidates for the permutation test further down: only the features whose
# name contains "duplicate_min".
new_features = [feature for feature in features if "duplicate_min" in feature]

In [None]:
# Number of candidate features that will be permuted.
len(new_features)

In [None]:
# Reproducibility seed and number of cross-validation folds.
seed, n_folds = 42, 5

# Booster parameters shared by every fold.
# NOTE(review): gpu_id=1 selects the device with index 1, so the machine needs
# at least two GPUs -- confirm this matches the target environment.
xgb_parms = dict(
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    eval_metric="logloss",
    objective="binary:logistic",
    tree_method="gpu_hist",
    gpu_id=1,
    predictor="gpu_predictor",
    random_state=seed,
)

def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed`` and stored, as
        a string, in the ``PYTHONHASHSEED`` environment variable.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # assignment presumably only affects processes launched afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)
    
# Seed the global RNGs once with the notebook-wide seed.
seed_everything(seed)

In [None]:
def training(train):
    """Train one XGBoost booster per stratified fold and collect out-of-fold predictions.

    Reads the notebook-level settings ``features``, ``target``, ``n_folds``,
    ``seed`` and ``xgb_parms``.

    Side effects: saves each fold's booster to ``../ckpt/xgb_{fold}_{seed}.xgb``
    (creating ``../ckpt`` if needed), writes the out-of-fold predictions to
    ``xgb_oof_{seed}.parquet`` and prints per-fold and overall scores.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a "customer_ID" column, the ``target`` column and every
        column named in ``features``.

    Returns
    -------
    importances : list of pandas.DataFrame
        One frame per fold with columns "feature" and "importance_{fold}".
    oof_df : pandas.DataFrame
        Columns "customer_ID", "target" and "prediction".

    Raises
    ------
    KeyError
        If a name in ``features`` is not a column of ``train``.
    """
    importances = []

    # Out-of-fold predictions, addressed by row position.
    oof_predictions = np.zeros(len(train))

    # StratifiedKFold.split yields *positional* row numbers. Selecting with
    # label-based .loc is only correct when the frame carries a default
    # 0..n-1 index, so rows and columns are selected by position instead.
    labels = train[target]
    feature_pos = train.columns.get_indexer(features)
    if (feature_pos < 0).any():
        missing = [name for name, pos in zip(features, feature_pos) if pos < 0]
        raise KeyError("features missing from train: {}".format(missing))

    # Make sure the checkpoint directory exists before any long training run.
    os.makedirs("../ckpt", exist_ok=True)

    kfold = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=seed
    )

    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, labels)):

        print("#"*100)
        print("Training fold {} with {} features...".format(fold, len(features)))

        x_train = train.iloc[trn_ind, feature_pos]
        y_train = labels.iloc[trn_ind]
        x_val = train.iloc[val_ind, feature_pos]
        y_val = labels.iloc[val_ind]

        xgb_train = xgb.DMatrix(data=x_train, label=y_train)
        xgb_val = xgb.DMatrix(data=x_val, label=y_val)

        model = xgb.train(
            xgb_parms,
            dtrain=xgb_train,
            evals=[(xgb_train,"train"),(xgb_val,"valid")],
            num_boost_round=9999,
            early_stopping_rounds=100,
            verbose_eval=100
        )
        model.save_model("../ckpt/xgb_{}_{}.xgb".format(fold, seed))

        # Per-fold split counts; collected so the caller can average them.
        importance = model.get_score(importance_type="weight")
        importances.append(pd.DataFrame({
            "feature": list(importance.keys()),
            "importance_{}".format(fold): list(importance.values()),
        }))

        # NOTE(review): with early stopping the returned booster may contain
        # rounds past the best one; whether predict() limits itself to the
        # best iteration depends on the installed XGBoost version -- confirm.
        oof_preds = model.predict(xgb_val)
        score = amex_metric_mod(y_val.values, oof_preds)
        print("fold {} score is {}".format(fold, score))

        # val_ind is positional, matching the layout of oof_predictions.
        oof_predictions[val_ind] = oof_preds

        del x_train, y_train, x_val, y_val, xgb_train, xgb_val, model, importance, oof_preds
        _ = gc.collect()

    # Overall score across all out-of-fold predictions.
    score = amex_metric_mod(labels, oof_predictions)
    print("oof score is {}".format(score))

    # Persist the out-of-fold predictions next to their IDs and labels.
    oof_df = pd.DataFrame({"customer_ID": train["customer_ID"], "target": labels, "prediction": oof_predictions})
    oof_df.to_parquet("xgb_oof_{}.parquet".format(seed))

    return importances, oof_df

In [None]:
# Run the cross-validated training; this also writes the fold checkpoints and
# the out-of-fold parquet file.
importances, oof_df = training(train)

In [None]:
# Combine the per-fold importance tables into one frame, one column per fold.
# Booster.get_score leaves out features that were never used in a split, so a
# feature can be absent from some folds. The outer join keeps every feature
# seen in any fold, and a missing entry is counted as 0 splits so the mean is
# taken over all folds rather than only the folds that used the feature.
importance = importances[0].copy()
for k in range(1, n_folds):
    importance = importance.merge(importances[k], on="feature", how="outer")

fold_columns = [col for col in importance.columns if col != "feature"]
importance[fold_columns] = importance[fold_columns].fillna(0)

importance["importance"] = importance[fold_columns].mean(axis=1)
importance = importance.sort_values("importance", ascending=False)

In [None]:
# Show the averaged importance rows of every column whose name contains "duplicate".
count_features = [col for col in train.columns if "duplicate" in col]
importance.loc[importance["feature"].isin(count_features)]

In [None]:
n_features = 50

# Horizontal bar chart of the first n_features rows of the sorted importance
# table, with the first row drawn at the top.
bar_positions = np.arange(n_features, 0, -1)

fig, ax = plt.subplots(figsize=(10, n_features))
ax.barh(bar_positions, importance["importance"].values[:n_features])
ax.set_yticks(bar_positions)
ax.set_yticklabels(importance["feature"].values[:n_features])
ax.set_title("xgb feature importance - Top {}".format(n_features))
plt.show()

# permutation importance

In [None]:
# Rebuild the folds with the same n_splits / shuffle / random_state used in
# training, then load each fold's saved booster together with its validation
# rows.
kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=seed
)
k_splits = kfold.split(train, train[target])

all_val_inds = []
all_models = []
all_features = []
all_targets = []

# kfold.split yields *positional* row numbers. Label-based .loc is only
# correct for a default 0..n-1 index, so rows and columns are taken by position.
feature_pos = train.columns.get_indexer(features)
if (feature_pos < 0).any():
    raise KeyError("features missing from train: {}".format(
        [name for name, pos in zip(features, feature_pos) if pos < 0]
    ))

for fold, (trn_ind, val_ind) in enumerate(k_splits):

    all_val_inds.append(val_ind)

    # Booster saved for this fold and seed.
    model = xgb.Booster()
    model.load_model("../ckpt/xgb_{}_{}.xgb".format(fold, seed))
    all_models.append(model)

    features_fold = train.iloc[val_ind, feature_pos]
    target_fold = train[target].iloc[val_ind]

    all_features.append(features_fold)
    all_targets.append(target_fold)

In [None]:
def permutation(feature):
    """Return the out-of-fold metric obtained after shuffling one feature.

    For every fold, the values of ``feature`` in that fold's validation rows
    are permuted, the fold's booster predicts on the altered rows, and the
    predictions are written into a full-length out-of-fold array that is then
    scored with ``amex_metric_mod``.

    Reads the notebook-level ``train``, ``target``, ``all_val_inds``,
    ``all_features``, ``all_targets`` and ``all_models``.

    Parameters
    ----------
    feature : str
        Name of the column to shuffle.

    Returns
    -------
    tuple
        ``(feature, score)``.
    """
    shuffled_oof = np.zeros(len(train))

    for fold, val_ind in enumerate(all_val_inds):
        # Work on copies so the cached validation data stays untouched.
        fold_frame = all_features[fold].copy()
        fold_labels = all_targets[fold].copy()

        # A fresh generator with a fixed seed per fold gives the same
        # permutation on every call.
        shuffler = np.random.RandomState(seed=42)
        fold_frame[feature] = shuffler.permutation(fold_frame[feature])

        fold_matrix = xgb.DMatrix(data=fold_frame, label=fold_labels)
        shuffled_oof[val_ind] = all_models[fold].predict(fold_matrix)

    # Score the combined out-of-fold predictions over all rows.
    return (feature, amex_metric_mod(train[target], shuffled_oof))

In [None]:
# Evaluate every candidate feature with up to 10 joblib workers; the result
# is a list of (feature, permuted_score) tuples.
permutation_importances = Parallel(n_jobs=10, verbose=1)(
        delayed(permutation)(feature) for feature in new_features)

In [None]:
# Order by permuted score, lowest first.
permutation_importances = sorted(permutation_importances, key=lambda x : x[1])

In [None]:
# Number of evaluated features.
len(permutation_importances)

In [None]:
# Display the full sorted list of (feature, permuted_score) tuples.
permutation_importances

In [None]:
# Persist the sorted (feature, permuted_score) list.
dump(permutation_importances, "count_duplicate_min_agg_permutation_importances.pkl")

In [None]:
# Select the features whose permuted out-of-fold score is above 0.79315 and
# keep their names and scores in two parallel lists.
# NOTE(review): 0.79315 looks like the un-permuted baseline score; confirm and
# consider naming it.
flagged = [
    (name, permuted_score)
    for name, permuted_score in permutation_importances
    if permuted_score > 0.79315
]
features_to_del = [name for name, _ in flagged]
feature_scores_to_del = [permuted_score for _, permuted_score in flagged]

In [None]:
# Number of features selected for removal.
len(features_to_del)

In [None]:
# Persist [names, scores]. The exclusion cell near the top of this notebook
# loads this file and reads element [0].
dump([features_to_del, feature_scores_to_del], "count_duplicate_min_agg_to_del.pkl")