In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# load files

In [2]:
train = pd.read_parquet("../input/train_full_features.parquet")

# define loss and metric

In [3]:
def amex_metric_mod(y_true, y_pred):

    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)

# define training config

In [4]:
features = load("selected_features.pkl")

cat_features_base = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
] 
cat_features = [
    "{}_last".format(feature) for feature in cat_features_base
]
cat_features = [feature for feature in cat_features if feature in features]
            
target = "target"

In [5]:
train.head()

Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_nunique,B_38_count,B_38_last,B_38_nunique,D_114_count,D_114_last,D_114_nunique,...,D_137_drawup_duration,D_138_drawup_duration,D_139_drawup_duration,D_140_drawup_duration,D_141_drawup_duration,D_142_drawup_duration,D_143_drawup_duration,D_144_drawup_duration,D_145_drawup_duration,target
0,-9223358381327749917,13,0,2,13,6,3,13,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1
1,-9223193039457028513,13,0,1,13,0,1,13,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,-9223189665817919541,13,0,1,13,0,1,13,0,1,...,0.0,0.0,0.0,0.0,7.0,12.0,0.0,9.0,0.0,0
3,-9223188534444851899,13,0,1,13,0,1,13,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0
4,-9223173911659837606,13,1,1,13,6,2,13,1,2,...,4.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1


In [6]:
train = train[["customer_ID"] + features + ["target"]]

In [8]:
float_cols = train[features].select_dtypes(include=[np.float64]).columns
for float_col in tqdm(float_cols):
    train[float_col] = train[float_col].astype(np.float32)

int_cols = train[features].select_dtypes(include=[np.int64]).columns
for int_col in tqdm(int_cols):
    train[int_col] = train[int_col].astype(np.int8)

100%|██████████| 705/705 [04:41<00:00,  2.50it/s]
100%|██████████| 232/232 [00:31<00:00,  7.46it/s]


In [9]:
for col in features:
    print(train[col].dtype)

int8
int8
int8
int8
int8
int8
int8
int8
int8
int8
int8
int8
int8
int8
int8
int8
float32
float32
float32
float32
float32
float32
float32
int16
int16
int16
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
int16
int16
int16
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
int8
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
int8
int8
float32
float32
float32
float32
float32
float32
int8
int8
int8
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float3

float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32


In [10]:
train.to_parquet("../input/train_select_features.parquet")

In [11]:
seed = 42
n_folds = 5

xgb_parms = { 
    "max_depth":4, 
    "learning_rate":0.05, 
    "subsample":0.8,
    "colsample_bytree":0.6, 
    "eval_metric":"logloss",
    "objective":"binary:logistic",
    "tree_method":"gpu_hist",
    "gpu_id":1,
    "predictor":"gpu_predictor",
    "random_state":seed
}

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    
seed_everything(seed)

In [12]:
# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.features = features
        self.target = target
        self.df = df
        self.it = 0 # set iterator to 0
        self.batch_size = batch_size
        self.batches = int( np.ceil( len(df) / self.batch_size ) )
        super().__init__()

    def reset(self):
        '''Reset the iterator'''
        self.it = 0

    def next(self, input_data):
        '''Yield next batch of data.'''
        if self.it == self.batches:
            return 0 # Return 0 when there's no more batch.
        
        a = self.it * self.batch_size
        b = min( (self.it + 1) * self.batch_size, len(self.df) )
        dt = pd.DataFrame(self.df.iloc[a:b])
        input_data(data=dt[self.features], label=dt[self.target]) #, weight=dt['weight'])
        self.it += 1
        return 1

In [13]:
def training(train):

    importances = []
    
    # create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))

    kfold = StratifiedKFold(
        n_splits=n_folds, 
        shuffle=True, 
        random_state=seed
    )
    
    for fold,(trn_ind, val_ind) in enumerate(kfold.split(train, train[target])):

        print("#"*100)
        print("Training fold {} with {} features...".format(fold, len(features)))
        
        x_train= train.loc[trn_ind, features]
        y_train= train.loc[trn_ind, target]
        x_val = train.loc[val_ind, features]
        y_val = train.loc[val_ind, target]

        # xgb_train = xgb.DeviceQuantileDMatrix(xy_train, max_bin=256)
        xgb_train = xgb.DMatrix(data=x_train, label=y_train)
        xgb_val = xgb.DMatrix(data=x_val, label=y_val)

        model = xgb.train(
            xgb_parms, 
            dtrain=xgb_train,
            evals=[(xgb_train,"train"),(xgb_val,"valid")],
            num_boost_round=9999,
            early_stopping_rounds=100,
            verbose_eval=100
        ) 
        model.save_model("../ckpt/xgb_{}_{}.xgb".format(fold, seed))

        # importance
        importance = model.get_score(importance_type="weight")
        importances.append(pd.DataFrame({"feature":importance.keys(), "importance_{}".format(fold):importance.values()}))

        # oof
        oof_preds = model.predict(xgb_val)
        score = amex_metric_mod(y_val.values, oof_preds)
        print("fold {} score is {}".format(fold, score))
        
        # add to out of folds array
        oof_predictions[val_ind] = oof_preds

        del x_train, y_train, x_val, y_val, xgb_train, xgb_val, model, importance, oof_preds
        _ = gc.collect()
    
    # compute oof
    score = amex_metric_mod(train[target], oof_predictions)
    print("oof score is {}".format(score))
    
    # create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({"customer_ID": train["customer_ID"], "target": train[target], "prediction": oof_predictions})
    oof_df.to_parquet("xgb_oof_{}.parquet".format(seed))
    
    return importances, oof_df

In [14]:
importances, oof_df = training(train)

####################################################################################################
Training fold 0 with 1950 features...
[0]	train-logloss:0.66257	valid-logloss:0.66262
[100]	train-logloss:0.23578	valid-logloss:0.23881
[200]	train-logloss:0.22149	valid-logloss:0.22699
[300]	train-logloss:0.21541	valid-logloss:0.22329
[400]	train-logloss:0.21124	valid-logloss:0.22138
[500]	train-logloss:0.20785	valid-logloss:0.22026
[600]	train-logloss:0.20493	valid-logloss:0.21954
[700]	train-logloss:0.20235	valid-logloss:0.21900
[800]	train-logloss:0.19985	valid-logloss:0.21853
[900]	train-logloss:0.19755	valid-logloss:0.21811
[1000]	train-logloss:0.19538	valid-logloss:0.21787
[1100]	train-logloss:0.19326	valid-logloss:0.21772
[1200]	train-logloss:0.19121	valid-logloss:0.21756
[1300]	train-logloss:0.18923	valid-logloss:0.21747
[1400]	train-logloss:0.18720	valid-logloss:0.21740
[1500]	train-logloss:0.18532	valid-logloss:0.21736
[1600]	train-logloss:0.18348	valid-logloss:0.21729
[1700]

# feature importance

In [None]:
importance = importances[0].copy()
for k in range(1, n_folds): 
    importance = importance.merge(importances[k], on="feature", how="left")
    
importance["importance"] = importance.iloc[:,1:].mean(axis=1)
importance = importance.sort_values("importance",ascending=False)

# importance.to_csv("xgb_feature_importance.csv",index=False)

In [None]:
importance

In [None]:
n_features = 50

plt.figure(figsize=(10, n_features))
plt.barh(np.arange(n_features, 0, -1), importance.importance.values[:n_features])
plt.yticks(np.arange(n_features,0,-1), importance.feature.values[:n_features])
plt.title("xgb feature importance - Top {}".format(n_features))
plt.show()

In [None]:
importance.head(50)

In [None]:
P_2 = [col for col in importance["feature"] if "target" in col]

In [None]:
importance.loc[importance["feature"].isin(P_2)]