In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt,gc,os
from tqdm.auto import tqdm
import itertools
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#print('RAPIDS version',cudf.__version__)

In [2]:
# Probe for the RAPIDS stack: GPU ends up True only when both cupy and cudf
# can be imported in this environment, and False otherwise.
try:
    import cupy, cudf
except ImportError:
    GPU = False
else:
    GPU = True

In [3]:
# VERSION NAME FOR SAVED MODEL FILES
# Embedded in the names of the saved model files and of the OOF csv.
VER = 1

# TRAIN RANDOM SEED
# Used for the stratified fold split and for the optional train subsample.
SEED = 42

# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5

# NOTE: the GPU flag is intentionally NOT assigned here. It is set by the
# import probe in the previous cell (True only when cupy/cudf import);
# forcing it to True in this cell silently discarded that detection.

# Partitioning knobs for chunked processing.
# NOTE(review): none of these three are read in the visible cells - confirm
# they are still needed by the inference part of the notebook.
TRAIN_NUM_PARTS = 2
TEST_SECTIONS = 2
TEST_NUM_PARTS = 2

In [4]:
# Location of the pre-engineered training table (feather format); the whole
# file is read into a pandas DataFrame named `train`.
TRAIN_PATH = '../input/amexfeatureengineering/770_FE_train.feather'
train = pd.read_feather(TRAIN_PATH)

In [5]:
# Show one randomly chosen row as a quick sanity check of the loaded columns.
train.sample()

Unnamed: 0,customer_ID,B_30_nunique,B_38_last,B_38_nunique,D_114_last,D_117_last,D_120_last,D_120_nunique,D_66_last,D_66_nunique,...,D_53_last_mean_diff,B_28_last_mean_diff,S_22_last_mean_diff,B_3_last_mean_diff,D_56_last_mean_diff,D_130_last_mean_diff,S_7_last_mean_diff,total_data_count,total_data_last,target
190839,-1551089096557598217,1,3,2,1,2,1,2,0,1,...,-0.002375,0.01181,0.013428,0.021942,0.0,0.000619,0.032318,2285.0,174.0,0


In [6]:
# FEATURES
# Model inputs = every column except the first and the last one. In the sample
# row displayed earlier those two are `customer_ID` (first) and `target` (last).
# The slice is positional, so it relies on that column order being preserved.
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

There are 770 features!


In [7]:
# LOAD XGB LIBRARY
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# XGB MODEL PARAMETERS
# Baseline booster settings. The training cell later overwrites
# 'num_parallel_tree' and adds 'learning_rate' on this same dict for every
# pyramid layer, so the values here are only the starting point.
xgb_params = {
    # Raw margins (no sigmoid) so predictions can be fed back as base margins.
    'objective': 'binary:logitraw',
    # GPU training / prediction back-ends.
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    # Tree shape and sampling.
    'max_depth': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.25,
    # Regularisation.
    'gamma': 1.5,
    'min_child_weight': 8,
    'lambda': 10,
    # Reproducibility and built-in evaluation metric.
    'random_state': 42,
    'eval_metric': 'auc',
    # Trees grown per boosting round (1 = plain boosting).
    'num_parallel_tree': 1,
}

In [8]:
# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    """Serve a pandas DataFrame to XGBoost one row-batch at a time.

    Every batch is a contiguous positional slice of ``df`` that is converted
    to a ``cudf.DataFrame`` before its feature columns and label column are
    passed to the callback supplied by XGBoost.

    Parameters
    ----------
    df : DataFrame
        Source rows. Despite the ``None`` default it is required, because
        ``len(df)`` is evaluated in the constructor.
    features : column selector
        Columns passed as ``data``.
    target : column label
        Column passed as ``label``.
    batch_size : int
        Maximum number of rows per batch (default 256*1024).
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        # Number of slices needed to cover every row; the last may be short.
        self.batches = int(np.ceil(len(df) / self.batch_size))
        # Index of the batch that the next call to `next` will serve.
        self.it = 0
        super().__init__()

    def reset(self):
        """Rewind so that the next call to `next` serves the first batch."""
        self.it = 0

    def next(self, input_data):
        """Pass the next batch to ``input_data``.

        Returns 1 after a batch has been handed over and 0 once all batches
        have been served.
        """
        if self.it == self.batches:
            return 0

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        chunk = cudf.DataFrame(self.df.iloc[start:stop])
        # No sample weights are passed; only features and label.
        input_data(data=chunk[self.features], label=chunk[self.target])
        self.it += 1
        return 1

In [9]:
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """NumPy implementation of the competition metric.

    Returns the mean of two terms:
      * the share of positives found in the top 4% of total weight when rows
        are ranked by ``y_pred`` (label 0 weighs 20, any other label weighs 1);
      * the weighted gini obtained when ranking by ``y_pred`` divided by the
        weighted gini obtained when ranking by ``y_true`` itself.

    ``y_true`` and ``y_pred`` are equal-length 1-D sequences.
    """
    # Column 0 holds the label, column 1 the score.
    pairs = np.array([y_true, y_pred]).T

    def _ranked_by(col):
        # Rows ordered by the chosen column, largest value first.
        return pairs[pairs[:, col].argsort()[::-1]]

    def _weighted_gini(col):
        ranked = _ranked_by(col)
        w = np.where(ranked[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = ranked[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * w)

    # Positives captured within the first 4% of cumulative weight.
    by_score = _ranked_by(1)
    score_w = np.where(by_score[:, 0] == 0, 20, 1)
    budget = int(0.04 * np.sum(score_w))
    captured = by_score[np.cumsum(score_w) <= budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)
    return 0.5 * (gini_pred / gini_true + top_four)

In [10]:
def xgb_amex(y_pred, y_true):
    """Custom evaluation callback returning ``('amex', score)``.

    ``y_true`` is not a label array: it must expose ``get_label()``, whose
    result is scored against ``y_pred`` with ``amex_metric_np``.
    """
    labels = y_true.get_label()
    score = amex_metric_np(y_pred, labels)
    return 'amex', score
def lgb_amex_metric(y_pred, y_true):
    """Custom evaluation callback returning ``('amex', score, True)``.

    ``y_true`` must expose ``get_label()``; its labels are scored against
    ``y_pred`` with ``amex_metric_np``. The trailing ``True`` is returned
    as-is (presumably the "higher is better" flag - confirm against the
    consuming library).
    """
    labels = y_true.get_label()
    score = amex_metric_np(y_pred, labels)
    return 'amex', score, True

# code by @https://www.kaggle.com/yunchonggan
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    """Single-sort NumPy implementation of the competition metric.

    Parameters
    ----------
    preds : np.ndarray
        1-D scores; only their ordering matters.
    target : np.ndarray
        1-D labels aligned with ``preds``. The weight formula gives label 0 a
        weight of 20 and label 1 a weight of 1.

    Returns
    -------
    float
        ``0.5 * (g + d)`` where ``d`` is the share of positives inside the
        top 4% of cumulative normalised weight and ``g`` is the weighted gini
        divided by a closed-form maximum.
    """
    pos_count = np.sum(target)
    neg_count = target.shape[0] - pos_count

    # Rank labels by descending score.
    order = np.argsort(preds)[::-1]
    ranked = target[order]

    weight = 20.0 - ranked * 19.0
    cum_share = (weight * (1 / weight.sum())).cumsum()

    # d: positives captured before 4% of the total weight is consumed.
    in_top_four = cum_share <= 0.04
    d = np.sum(ranked[in_top_four]) / pos_count

    # g: weighted gini normalised by the closed-form maximum below.
    lorentz = (ranked * (1 / pos_count)).cumsum()
    gini = ((lorentz - cum_share) * weight).sum()
    gini_max = 10 * neg_count * (1 - 19 / (pos_count + 20 * neg_count))
    g = gini / gini_max

    return 0.5 * (g + d)

In [11]:
import pandas as pd
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Pandas implementation of the competition metric.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column, index-aligned with ``y_true``.

    Returns
    -------
    float
        Mean of the normalised weighted gini and the share of positives
        captured in the top 4% of weight.
    """

    def _ranked_with_weights(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, rank by score (highest first) and attach the
        # per-row weight: 20 for target == 0, otherwise 1.
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(y_true, y_pred)
        cutoff = int(0.04 * ranked['weight'].sum())
        inside = ranked['weight'].cumsum() <= cutoff
        hits = (ranked.loc[inside, 'target'] == 1).sum()
        return hits / (ranked['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(y_true, y_pred)
        random_curve = (ranked['weight'] / ranked['weight'].sum()).cumsum()
        weighted_pos = ranked['target'] * ranked['weight']
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_curve) * ranked['weight']).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Denominator: gini obtained when the labels themselves are the scores.
        labels_as_scores = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, labels_as_scores)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [12]:
# Cross-validated "pyramid" training: for every fold a sequence of XGBoost
# models is trained, each one starting from the (optionally down-weighted)
# margins predicted by the previous one. Produces `oof`, a DataFrame indexed
# by customer_ID with columns `target` and `oof_pred`.
importances = []
oof_folds = []  # one out-of-fold frame per fold, concatenated into `oof` below
#train = train.to_pandas() # free GPU memory
TRAIN_SUBSAMPLE = 1.0
gc.collect()

# XGB MODEL PARAMETERS
BASE_LEARNING_RATE = 0.015
PYRAMID_W = [0.5, 2/3, 0.75, 0.875, 1, 0]

# PYRAMID: Smoothly go from diverse forest of early trees into focused boosted trees correcting residuals.
#   final layer must have w==0
#   columns:    forest|boost|adj_eta|w
# Defined once here because it does not depend on the fold.
pyramid_layers = [( 40,  10,  1.56,  0.5),
                  (  8,  50,  1.3,   2/3),
                  (  1, 400,  1.25,  0.75),
                  (  1, 400,  1.125, 0.875),
                  (  1,1200,  1.0,   1),
                  (  1,5200,  0.5,   0)]
assert PYRAMID_W == [layer[-1] for layer in pyramid_layers]
# The OOF prediction after the layer loop uses the model of the last layer,
# which is only kept alive when that layer has w == 0.
assert pyramid_layers[-1][-1] == 0, 'final pyramid layer must have w == 0'

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE < 1.0:
        # A local generator draws the same sample as seeding the global RNG
        # with SEED, without touching (or later un-seeding) global state.
        rng = np.random.RandomState(SEED)
        train_idx = rng.choice(train_idx,
                               int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    # TRAIN, VALID, TEST FOR FOLD K
    # skf.split yields positional indices, hence .iloc rather than .loc.
    Xy_train = IterLoadForDMatrix(train.iloc[train_idx], FEATURES, 'target')
    valid_rows = train.iloc[valid_idx]
    X_valid = valid_rows[FEATURES]
    y_valid = valid_rows['target']
    oof_fold = valid_rows[['customer_ID','target']].copy()
    del valid_rows

    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

    # TRAIN MODEL FOLD K
    for layer, (n_trees, n_rounds, adj_learning, w) in enumerate(pyramid_layers):
        ## Load the manual parameters from the pyramid layer
        ## (this mutates the shared xgb_params dict, as before).
        xgb_params['num_parallel_tree'] = n_trees
        xgb_params['learning_rate'] = n_trees*adj_learning*BASE_LEARNING_RATE

        ## No early stopping except on final round. This is important since the weighting causes the model to go backwards for a time at the start of the next layer.
        early_stop = 1800 if w == 0 else None

        model = xgb.train(xgb_params,
                    dtrain=dtrain,
                    evals=[(dtrain,'train'),(dvalid,'valid')],
                    num_boost_round=n_rounds,
                    early_stopping_rounds=early_stop,
                    custom_metric=xgb_amex,
                    maximize=True,
                    verbose_eval=200//n_trees)

        model.save_model(f'XGB_v{VER}_fold{fold}_layer{layer}.xgb')

        ## predict to load the predictions on the next model layer
        ## Don't set base margin on final layer. w = 0 is used as an encoded way to skip this step.
        if w != 0:
            ptrain = model.predict(dtrain, output_margin=True)
            pvalid = model.predict(dvalid, output_margin=True)

            ## reduce the impact of all model layers so far by w. This should be another way to reduce over-specialization, without the computational cost of DART
            if w < 1.0:
                ptrain = ptrain * w
                pvalid = pvalid * w

            ## This set_base_margin on the DMatrix data is what informs the next layer of the prior training.
            ## See code example from official demos: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/boost_from_prediction.py
            dtrain.set_base_margin(ptrain)
            dvalid.set_base_margin(pvalid)

            del model, ptrain, pvalid
            gc.collect()

    del dtrain
    _ = gc.collect()

    # INFER OOF FOLD K
    # `model` is the final-layer booster; dvalid still carries the base margin
    # accumulated from the earlier layers.
    # NOTE(review): whether predict() honours the early-stopping best iteration
    # depends on the installed xgboost version - confirm.
    oof_preds = model.predict(dvalid)
    acc = amex_metric_mod(y_valid.values, oof_preds)
    print('Kaggle Metric =',acc,'\n')

    # SAVE OOF
    oof_fold['oof_pred'] = oof_preds
    oof_folds.append(oof_fold)

    del Xy_train, oof_fold
    del X_valid, y_valid, dvalid
    _ = gc.collect()

print('#'*25)
oof = pd.concat(oof_folds,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric =',acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
[0]	train-auc:0.95201	train-amex:0.75805	valid-auc:0.95011	valid-amex:0.74701
[5]	train-auc:0.96339	train-amex:0.79929	valid-auc:0.95867	valid-amex:0.77834
[9]	train-auc:0.96728	train-amex:0.81756	valid-auc:0.96033	valid-amex:0.78591
[0]	train-auc:0.96692	train-amex:0.81583	valid-auc:0.96018	valid-amex:0.78462
[25]	train-auc:0.96744	train-amex:0.81787	valid-auc:0.96033	valid-amex:0.78496
[49]	train-auc:0.97014	train-amex:0.83132	valid-auc:0.96123	valid-amex:0.78895
[0]	train-auc:0.97011	train-amex:0.83119	valid-auc:0.96122	valid-amex:0.78871
[200]	train-auc:0.97033	train-amex:0.83201	valid-auc:0.96115	valid-amex:0.78831
[399]	train-auc:0.97246	train-amex:0.84318	valid-auc:0.96168	valid-amex:0.79035
[0]	train-auc:0.97244	train-amex:0.84307	valid-auc:0.96167	valid-amex:0.79015
[200]	train-auc:0.97257	train-amex:0.84344	valid-auc:0.96161	valid-amex:0.79

In [13]:
# CLEAN RAM
# Drop the training frame and force a garbage-collection pass. After this
# cell `train` no longer exists, so earlier cells that use it cannot be re-run
# without reloading it first.
del train
_ = gc.collect()

In [14]:
# Attach the original string customer_ID to every OOF row and write the result
# to csv. `oof` is indexed by an integer id, so the last 16 hex characters of
# each string id are parsed as an integer and cast to int64 to build the join
# key.
oof_xgb = (
    pd.read_parquet(
        '../input/amex-data-integer-dtypes-parquet-format/train.parquet',
        columns=['customer_ID'],
    )
    .drop_duplicates()
    .assign(
        customer_ID_hash=lambda frame: frame['customer_ID']
        .apply(lambda x: int(x[-16:],16))
        .astype('int64')
    )
    .set_index('customer_ID_hash')
    # Index-on-index join with the default inner behaviour of merge.
    .merge(oof, left_index=True, right_index=True)
    .sort_index()
    .reset_index(drop=True)
)
oof_xgb.to_csv(f'oof_xgb_v{VER}.csv',index=False)
oof_xgb.head()

Unnamed: 0,customer_ID,target,oof_pred
0,20eac26171c3d251c55fc78204e59fab1c15fc2bc96d0c...,1,1.028872
1,aea50fdf9b974ccec95fa177c3225a0f913483b457de6e...,0,-8.107467
2,32cd2d41aef737b69089882754395925c96eaee1f4a859...,0,-6.588468
3,8daa6d5dc2655a8a437531e6b8b96829113cdfe9bf6cae...,0,-3.595696
4,0ceba351a3851202542feb49d7385bcef32f6037fc57c7...,1,2.096829


In [15]:
# Display the merged OOF table (customer_ID, target, oof_pred).
oof_xgb

Unnamed: 0,customer_ID,target,oof_pred
0,20eac26171c3d251c55fc78204e59fab1c15fc2bc96d0c...,1,1.028872
1,aea50fdf9b974ccec95fa177c3225a0f913483b457de6e...,0,-8.107467
2,32cd2d41aef737b69089882754395925c96eaee1f4a859...,0,-6.588468
3,8daa6d5dc2655a8a437531e6b8b96829113cdfe9bf6cae...,0,-3.595696
4,0ceba351a3851202542feb49d7385bcef32f6037fc57c7...,1,2.096829
...,...,...,...
458908,295e48fc8327a967b857b36f767014bf3a13040d33edea...,0,0.403101
458909,c50efe7fde092c75efc7a6cd84689de399d7ae4f01aaed...,1,4.076441
458910,cabc752e0a8f813e720383aef769ffb48a1528d20dcb10...,1,2.209568
458911,4051ddc52bd37987009270ec48e55805021ccfd6e78f8e...,0,-4.863576
