In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# load files

In [2]:
# Read the precomputed training table from parquet. Later cells pick model
# inputs from it by column name and read the label from its `target` column.
train = pd.read_parquet("../input/train_full_features.parquet")

# define loss and metric

In [3]:
def amex_metric_mod(y_true, y_pred):
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose true label equals 0 carry a weight of 20; every other row
    carries a weight of 1.

    Args:
        y_true: 1-D sequence of ground-truth labels.
        y_pred: 1-D sequence of predicted scores, same length as ``y_true``.

    Returns:
        ``0.5 * (gini_pred / gini_true + top_four)`` where ``gini_pred`` and
        ``gini_true`` are the weighted Gini sums obtained when rows are
        ordered by prediction and by true label respectively, and
        ``top_four`` is the share of all positives found among the
        highest-scored rows whose cumulative weight stays within 4% of the
        total weight.
    """
    # Column 0 holds the label, column 1 holds the predicted score.
    pairs = np.array([y_true, y_pred]).T

    def _sorted_desc(column):
        # Rows ordered from the largest to the smallest value in `column`.
        return pairs[pairs[:, column].argsort()[::-1]]

    def _row_weights(rows):
        return np.where(rows[:, 0] == 0, 20, 1)

    def _weighted_gini(column):
        rows = _sorted_desc(column)
        w = _row_weights(rows)
        weighted_pos = rows[:, 0] * w
        random_curve = np.cumsum(w / np.sum(w))
        lorentz_curve = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz_curve - random_curve) * w)

    # Capture rate inside the top 4% of cumulative weight, ranked by score.
    by_score = _sorted_desc(1)
    score_weights = _row_weights(by_score)
    weight_budget = int(0.04 * np.sum(score_weights))
    captured = by_score[np.cumsum(score_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)

    return 0.5 * (gini_pred / gini_true + top_four)

# define training config

In [4]:
# Feature names to train on, deserialized with joblib.
# NOTE(review): joblib uses pickle under the hood; only load files you trust.
features = load("selected_features.pkl")

# Base column names used to derive the categorical feature list below.
cat_features_base = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# Keep the "<base>_last" variant of each base column, but only when it is
# part of the loaded feature set.
cat_features = [
    name
    for name in map("{}_last".format, cat_features_base)
    if name in features
]

# Name of the label column in the training table.
target = "target"

In [5]:
# Preview the first rows of the loaded training table.
train.head()

Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_nunique,B_38_count,B_38_last,B_38_nunique,D_114_count,D_114_last,D_114_nunique,...,D_137_drawup_duration,D_138_drawup_duration,D_139_drawup_duration,D_140_drawup_duration,D_141_drawup_duration,D_142_drawup_duration,D_143_drawup_duration,D_144_drawup_duration,D_145_drawup_duration,target
0,-9223358381327749917,13,0,2,13,6,3,13,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1
1,-9223193039457028513,13,0,1,13,0,1,13,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,-9223189665817919541,13,0,1,13,0,1,13,0,1,...,0.0,0.0,0.0,0.0,7.0,12.0,0.0,9.0,0.0,0
3,-9223188534444851899,13,0,1,13,0,1,13,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0
4,-9223173911659837606,13,1,1,13,6,2,13,1,2,...,4.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1


In [7]:
# Run-level settings.
seed = 1024       # seeds the CV split, XGBoost and the Python/NumPy RNGs
n_folds = 5       # number of stratified CV folds
target_fold = 0   # index of the single fold that gets trained

# Booster parameters handed to xgb.train.
xgb_parms = dict(
    max_depth=8,
    learning_rate=0.01,
    subsample=0.5,
    colsample_bytree=0.2,
    # gamma=1.5,
    # reg_lambda=70,
    min_child_weight=0,
    eval_metric="logloss",
    objective="binary:logistic",
    tree_method="gpu_hist",
    gpu_id=1,
    # booster="dart",
    # rate_drop=0.1,
    # skip_drop=0.5,
    num_parallel_tree=5,
    predictor="gpu_predictor",
    random_state=seed,
)


def seed_everything(seed):
    """Seed Python's ``random``, NumPy's global RNG and set ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all three.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(seed)

In [8]:
importances = []

# Out-of-fold predictions, aligned positionally with the rows of `train`.
# Rows that belong to folds skipped below keep their initial value of 0.
oof_predictions = np.zeros(len(train))

kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=seed
)

# Create the checkpoint directory up front so a missing folder cannot make
# save_model fail after a long training run.
os.makedirs("../ckpt", exist_ok=True)

# StratifiedKFold.split yields *positional* row indices, so rows and columns
# are selected with .iloc. Label-based .loc only gives the same rows when the
# frame happens to carry a default 0..n-1 index.
feature_pos = train.columns.get_indexer(features)
if (feature_pos < 0).any():
    missing = [name for name, pos in zip(features, feature_pos) if pos < 0]
    raise KeyError("features missing from train: {}".format(missing))
target_pos = train.columns.get_loc(target)

for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[target])):

    # Only the configured fold is trained in this notebook run.
    if fold != target_fold:
        continue

    print("#"*100)
    print("Training fold {} with {} features...".format(fold, len(features)))

    x_train = train.iloc[trn_ind, feature_pos]
    y_train = train.iloc[trn_ind, target_pos]
    x_val = train.iloc[val_ind, feature_pos]
    y_val = train.iloc[val_ind, target_pos]

    # xgb_train = xgb.DeviceQuantileDMatrix(xy_train, max_bin=256)
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_val = xgb.DMatrix(data=x_val, label=y_val)

    # Early stopping monitors the last entry of `evals`, i.e. "valid".
    model = xgb.train(
        xgb_parms,
        dtrain=xgb_train,
        evals=[(xgb_train, "train"), (xgb_val, "valid")],
        num_boost_round=12000,
        early_stopping_rounds=100,
        verbose_eval=100
    )
    model.save_model("../ckpt/xgb_{}_{}.xgb".format(fold, seed))

    # importance
    importance = model.get_score(importance_type="weight")
    # Materialize the dict views so DataFrame construction does not depend on
    # how a given pandas version treats dict_keys / dict_values.
    importances.append(
        pd.DataFrame({
            "feature": list(importance.keys()),
            "importance_{}".format(fold): list(importance.values()),
        })
    )

    # oof
    # The native Booster.predict uses every tree, including the rounds grown
    # after the best validation score. Restrict it to the best iteration so
    # the reported score matches the early-stopped model.
    best_round = getattr(model, "best_iteration", None)
    if best_round is not None:
        oof_preds = model.predict(xgb_val, iteration_range=(0, best_round + 1))
    else:
        oof_preds = model.predict(xgb_val)
    oof_predictions[val_ind] = oof_preds

    score = amex_metric_mod(y_val.values, oof_preds)
    print("fold {} score is {}".format(fold, score))

####################################################################################################
Training fold 0 with 1950 features...
[0]	train-logloss:0.68680	valid-logloss:0.68685
[100]	train-logloss:0.36652	valid-logloss:0.37013
[200]	train-logloss:0.27488	valid-logloss:0.28092
[300]	train-logloss:0.24173	valid-logloss:0.24960
[400]	train-logloss:0.22760	valid-logloss:0.23698
[500]	train-logloss:0.22026	valid-logloss:0.23110
[600]	train-logloss:0.21537	valid-logloss:0.22776
[700]	train-logloss:0.21156	valid-logloss:0.22551
[800]	train-logloss:0.20841	valid-logloss:0.22390
[900]	train-logloss:0.20575	valid-logloss:0.22273
[1000]	train-logloss:0.20340	valid-logloss:0.22180
[1100]	train-logloss:0.20123	valid-logloss:0.22104
[1200]	train-logloss:0.19918	valid-logloss:0.22044
[1300]	train-logloss:0.19726	valid-logloss:0.21992
[1400]	train-logloss:0.19542	valid-logloss:0.21948
[1500]	train-logloss:0.19364	valid-logloss:0.21912
[1600]	train-logloss:0.19193	valid-logloss:0.21878
[1700]

# feature importance

In [None]:
# Rank the first (and, with a single trained fold, only) importance table
# from most to least important.
# NOTE(review): the column name hard-codes fold 0; it must match the fold
# that was actually trained.
importance = importances[0].sort_values(by="importance_0", ascending=False)

# importance.to_csv("xgb_feature_importance.csv",index=False)

In [None]:
# Display the sorted importance table.
importance

In [None]:
n_features = 50

# Booster.get_score leaves out features that were never used in a split, so
# the table can hold fewer than `n_features` rows. Clamp to the rows actually
# available so the bar positions, bar widths and tick labels always have the
# same length.
top_rows = importance.head(n_features)
n_shown = len(top_rows)
bar_positions = np.arange(n_shown, 0, -1)  # highest importance at the top

# One inch of figure height per bar; keep the height positive when empty.
plt.figure(figsize=(10, max(n_shown, 1)))
plt.barh(bar_positions, top_rows.importance_0.values)
plt.yticks(bar_positions, top_rows.feature.values)
plt.title("xgb feature importance - Top {}".format(n_shown))
plt.show()

In [None]:
# Show the 50 highest-ranked rows of the importance table.
importance.head(50)

In [None]:
# Collect the feature names in the importance table that contain the
# substring "target".
# NOTE(review): the variable name P_2 does not reflect the "target" filter;
# confirm which of the two is intended.
P_2 = [col for col in importance["feature"] if "target" in col]

In [None]:
# Show the importance rows for the feature names collected in P_2.
importance.loc[importance["feature"].isin(P_2)]