In [None]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# load files

In [None]:
# Load the training table (features plus the target column) from parquet.
train = pd.read_parquet("../input/train_full_features.parquet")

# define evaluation metric

In [None]:
def amex_metric_mod(y_true, y_pred):
    """Return the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose label is 0 carry weight 20, all other rows weight 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 / 1), one per row.
    y_pred : array-like
        Predicted scores, same length as ``y_true``; higher means more
        likely positive.

    Returns
    -------
    float
        ``0.5 * (gini_by_prediction / gini_by_label + top_four)``.
    """

    def _ranked(sort_col):
        # Stack (label, score) pairs and order them by the requested column,
        # largest value first.
        pairs = np.array([y_true, y_pred]).T
        order = np.argsort(pairs[:, sort_col])[::-1]
        return pairs[order]

    def _weighted_gini(sort_col):
        # Area between the weighted Lorentz curve of positives and the
        # "random" diagonal, for rows ranked by `sort_col`.
        ranked = _ranked(sort_col)
        row_weight = np.where(ranked[:, 0] == 0, 20, 1)
        random_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ranked[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * row_weight)

    # Share of all positives found among the highest-scored rows whose
    # cumulative weight stays within 4% of the total weight.
    by_pred = _ranked(1)
    row_weight = np.where(by_pred[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(row_weight))
    captured = by_pred[np.cumsum(row_weight) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # Gini when ranking by prediction (column 1), normalized by the Gini of
    # the ideal ranking by label (column 0).
    normalized_gini = _weighted_gini(1) / _weighted_gini(0)

    return 0.5 * (normalized_gini + top_four)

# define training config

In [None]:
# Feature names to train on, restored from a joblib dump.
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
features = load("selected_features.pkl")

# Name of the label column.
target = "target"

# Raw categorical column names.
cat_features_base = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# "<name>_last" variants of the categorical columns, kept only when they are
# part of the selected feature set.
cat_features = [
    last_name
    for last_name in map("{}_last".format, cat_features_base)
    if last_name in features
]

In [None]:
# Preview the first rows of the loaded training table.
train.head()

# split dataset

In [None]:
n_folds = 5       # number of stratified CV folds
target_fold = 0   # index of the single fold used for tuning
seed = 1024       # shared random seed (fold shuffling, RNG seeding, XGBoost)

In [None]:
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed`` and written,
        as a string, to the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes spawned after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
    
# Seed the global RNGs once, before any fold splitting or training.
seed_everything(seed)

In [None]:
# Stratified, shuffled K-fold splitter; only fold `target_fold` is used for tuning.
kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=seed
)

# Column positions of the selected features, so rows and columns can be taken
# in one positional selection (get_loc raises KeyError for a missing column).
feature_pos = [train.columns.get_loc(name) for name in features]

for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[target])):

    if fold != target_fold:
        continue

    print("#"*100)
    print("Training fold {} with {} features...".format(fold, len(features)))

    # kfold.split yields positional row indices, so select with iloc; .loc
    # would interpret them as index labels and is only equivalent when
    # `train` has a default RangeIndex.
    x_train = train.iloc[trn_ind, feature_pos]
    y_train = train[target].iloc[trn_ind]
    x_val = train.iloc[val_ind, feature_pos]
    y_val = train[target].iloc[val_ind]

    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_val = xgb.DMatrix(data=x_val, label=y_val)

    # Stop here so that `trn_ind` / `val_ind` keep describing `target_fold`
    # instead of being overwritten by the remaining folds.
    break
else:
    # The loop finished without ever reaching `target_fold`.
    raise ValueError(
        "target_fold={} is not a valid fold index for n_folds={}".format(
            target_fold, n_folds
        )
    )

# The DMatrix objects hold their own copy of the data; release the frames.
del x_train, y_train, x_val, y_val
gc.collect()

In [None]:
# Re-derive the validation rows of `target_fold` directly from the splitter
# instead of relying on whatever `val_ind` the split loop left behind; the
# split is reproducible because `kfold` was built with a fixed random_state.
val_ind = list(kfold.split(train, train[target]))[target_fold][1]

# The split indices are positional, so select rows with iloc rather than
# label-based .loc (get_loc raises KeyError for a missing feature column).
x_val = train.iloc[val_ind, [train.columns.get_loc(name) for name in features]]
y_val = train[target].iloc[val_ind]

In [None]:
# hyperopt search space. The quniform parameters are sampled as floats on a
# grid (last argument = step), so integer-valued ones are cast when used.
space = dict(
    # tree shape
    max_depth=hp.quniform("max_depth", 6, 12, 1),
    min_child_weight=hp.quniform("min_child_weight", 0, 10, 1),
    num_parallel_tree=hp.quniform("num_parallel_tree", 5, 10, 1),
    # row / column sampling
    subsample=hp.uniform("subsample", 0.1, 0.8),
    colsample_bytree=hp.uniform("colsample_bytree", 0.1, 0.8),
    # regularisation
    gamma=hp.uniform("gamma", 0, 4),
    reg_lambda=hp.quniform("reg_lambda", 1, 160, 5),
)

In [None]:
def objective(space):
    """Train one XGBoost model for a sampled hyperparameter set and score it.

    Uses the module-level ``xgb_train`` / ``xgb_val`` DMatrix objects and
    ``seed``.

    Parameters
    ----------
    space : dict
        One sample from the hyperopt search space; must provide
        ``max_depth``, ``subsample``, ``colsample_bytree``, ``gamma``,
        ``reg_lambda``, ``min_child_weight`` and ``num_parallel_tree``.

    Returns
    -------
    dict
        ``{"loss": -score, "status": STATUS_OK}`` where ``score`` is
        ``amex_metric_mod`` on the validation DMatrix; the score is negated
        because ``fmin`` minimises the loss.
    """

    xgb_parms = {
        "max_depth": int(space["max_depth"]),
        "learning_rate": 0.01,
        "subsample": space["subsample"],
        "colsample_bytree": space["colsample_bytree"],
        "gamma": space["gamma"],
        "reg_lambda": int(space["reg_lambda"]),
        "min_child_weight": int(space["min_child_weight"]),
        "num_parallel_tree": int(space["num_parallel_tree"]),
        "eval_metric": "logloss",
        "objective": "binary:logistic",
        "tree_method":"gpu_hist",
        "gpu_id": 1,
        "predictor": "gpu_predictor",
        "random_state": seed
    }

    # Log exactly the values handed to XGBoost (gamma is a float and must not
    # be truncated in the log).
    for name in (
        "max_depth",
        "subsample",
        "colsample_bytree",
        "gamma",
        "reg_lambda",
        "min_child_weight",
        "num_parallel_tree",
    ):
        print("{}: {}".format(name, xgb_parms[name]))

    model = xgb.train(
        xgb_parms,
        dtrain=xgb_train,
        evals=[(xgb_train,"train"),(xgb_val,"valid")],
        num_boost_round=12000,
        early_stopping_rounds=100,
        verbose_eval=100
    )

    # Score the early-stopped best iteration when the booster exposes it,
    # rather than the extra rounds trained after it.
    best_iteration = getattr(model, "best_iteration", None)
    predict_kwargs = {}
    if best_iteration is not None:
        predict_kwargs["iteration_range"] = (0, best_iteration + 1)

    # Booster.predict needs a DMatrix, and taking the labels from the same
    # DMatrix guarantees predictions and labels refer to the same rows.
    pred = model.predict(xgb_val, **predict_kwargs)
    y_true = xgb_val.get_label().astype(np.float64)

    score = amex_metric_mod(y_true, pred)
    print("score is {}".format(score))

    return {"loss": -score, "status": STATUS_OK }

In [None]:
# Run 100 TPE-guided evaluations of `objective`; `trials` records every
# evaluation and `best_hyperparams` receives the result returned by fmin.
trials = Trials()
best_hyperparams = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

In [None]:
# Report the result of the hyperparameter search.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)