In [1]:
import pandas as pd
import numpy as np
import cupy,cudf 
import gc

In [2]:
# Notebook-wide configuration constants.
SEED = 42   # random seed constant
FOLDS = 5   # number of StratifiedKFold splits used for cross-validation

In [3]:
# Training table stored as a Feather file (presumably pre-engineered features, judging by
# the file name — confirm against the upstream feature-engineering notebook).
TRAIN_PATH = '../input/amexfeatureengineering/770_FE_train.feather'
train = pd.read_feather(TRAIN_PATH)

In [4]:
# Sanity check: (rows, columns) of the loaded frame.
train.shape

(458913, 772)

In [5]:
# Peek at five randomly drawn rows (no random_state is passed, so the rows differ between runs).
train.sample(5)

Unnamed: 0,customer_ID,B_30_nunique,B_38_last,B_38_nunique,D_114_last,D_117_last,D_120_last,D_120_nunique,D_66_last,D_66_nunique,...,D_53_last_mean_diff,B_28_last_mean_diff,S_22_last_mean_diff,B_3_last_mean_diff,D_56_last_mean_diff,D_130_last_mean_diff,S_7_last_mean_diff,total_data_count,total_data_last,target
239875,418336341001300877,1,2,1,2,5,1,1,0,1,...,0.0,-0.025208,0.014008,-0.000659,0.011551,0.0,-0.244141,2280.0,175.0,0
400335,6868324700994531223,1,3,1,2,1,1,1,0,1,...,0.0,-0.107605,0.005424,-0.108459,0.0,0.0,-0.013939,2266.0,174.0,0
428880,8014137752067627019,1,1,1,1,5,2,2,0,1,...,-0.003323,0.012474,0.443604,-0.002375,0.0,0.0,0.027573,1704.0,176.0,1
342441,4546763288999499503,1,2,1,2,5,1,1,0,1,...,0.0,-0.036438,0.001087,0.002991,0.0,0.0,-0.013756,2261.0,173.0,0
435076,8258162537431698827,1,3,3,1,1,1,1,0,1,...,0.0,0.033661,0.034637,0.021561,0.0,0.0,0.0672,2236.0,174.0,1


In [6]:
# Model inputs: every column except the customer identifier and the label,
# kept in the frame's original column order.
is_feature = ~train.columns.isin(['customer_ID', 'target'])
FEATURES = train.columns[is_feature].tolist()
print(f"There are {len(FEATURES)} features!")

There are 770 features!


In [7]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# Booster parameters passed to xgb.train for every fold.
xgb_params = {
            'objective': 'binary:logitraw',  # binary logistic loss; predictions are raw margins, not probabilities
            'tree_method': 'gpu_hist',       # histogram-based tree construction on the GPU
            'predictor':'gpu_predictor',     # run prediction on the GPU as well
            'max_depth': 7,                  # maximum depth of each tree
            'subsample':0.9,                 # fraction of rows sampled per boosting round
            'colsample_bytree': 0.25,        # fraction of columns sampled per tree
            'gamma':1.5,                     # minimum loss reduction required to make a split
            'min_child_weight':8,            # minimum sum of instance hessian required in a child
            'lambda': 10,                    # L2 regularisation on leaf weights
            'random_state':42,               # fixed booster seed (literal, independent of the fold-split seed)
            'eval_metric':'auc',             # metric reported on the eval sets and used for early stopping
            'learning_rate':0.015,           # shrinkage applied to each boosting step
            'num_parallel_tree':1,           # one tree per round (plain boosting, no boosted forest)
    }

In [8]:
class IterLoadForDMatrix(xgb.core.DataIter):
    """Serve a host DataFrame to XGBoost one row-batch at a time.

    Every batch is a contiguous slice of ``df`` that is converted to a
    ``cudf.DataFrame`` before its feature and label columns are handed to the
    callback XGBoost supplies to ``next``.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the target column.
    features : list of feature column names.
    target : name of the label column.
    batch_size : maximum number of rows per batch.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        self.it = 0  # index of the next batch to serve
        # Ceiling division: a trailing partial batch still counts as one batch.
        whole_batches, leftover_rows = divmod(len(df), self.batch_size)
        self.batches = whole_batches + (1 if leftover_rows else 0)
        super().__init__()

    def reset(self):
        '''Rewind to the first batch.'''
        self.it = 0

    def next(self, input_data):
        '''Pass the next batch to ``input_data``; return 1 if a batch was served, 0 when exhausted.'''
        if self.it != self.batches:
            start = self.it * self.batch_size
            stop = min(start + self.batch_size, len(self.df))
            gpu_batch = cudf.DataFrame(self.df.iloc[start:stop])
            input_data(data=gpu_batch[self.features], label=gpu_batch[self.target])
            self.it += 1
            return 1
        return 0  # no batches left

In [9]:
def amex_metric_mod(y_true, y_pred):
    """Return 0.5 * (normalised weighted Gini + weighted top-4% capture rate).

    Rows with label 0 carry weight 20 and all other rows weight 1.

    Parameters
    ----------
    y_true : array-like of labels (0 / 1).
    y_pred : array-like of scores; higher means more likely positive.

    Returns
    -------
    float
        Mean of (a) the weighted Gini of ``y_pred`` divided by the weighted
        Gini of a perfect ordering and (b) the share of positives found among
        the highest-scored rows that fit inside 4% of the total weight.
    """
    # Column 0 = label, column 1 = score.
    pairs = np.array([y_true, y_pred]).T

    def _sorted_desc(col):
        # Rows ordered by the given column, largest first.
        return pairs[pairs[:, col].argsort()[::-1]]

    def _row_weights(rows):
        return np.where(rows[:, 0] == 0, 20, 1)

    def _weighted_gini(col):
        rows = _sorted_desc(col)
        wts = _row_weights(rows)
        cum_weight_share = np.cumsum(wts / np.sum(wts))
        weighted_pos = rows[:, 0] * wts
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * wts)

    # Capture rate: positives inside the top 4% of cumulative weight.
    by_score = _sorted_desc(1)
    score_weights = _row_weights(by_score)
    inside_cutoff = np.cumsum(score_weights) <= int(0.04 * np.sum(score_weights))
    top_four = np.sum(by_score[inside_cutoff][:, 0]) / np.sum(by_score[:, 0])

    # Gini of the prediction ordering, normalised by the Gini of the label ordering.
    gini_pred = _weighted_gini(1)
    gini_ideal = _weighted_gini(0)

    return 0.5 * (gini_pred / gini_ideal + top_four)

# NOTE(review): nothing in the visible cells appends to or reads this list — confirm it is still needed.
importances = []

In [10]:
# Seed-averaged stratified K-fold training.
# For every seed the data is re-split into FOLDS stratified folds, one XGBoost model is
# trained and saved per fold, and its predictions on the held-out fold are stored as
# out-of-fold (OOF) predictions. The per-seed OOF vectors are averaged into `oof`.
SEEDS = [41, 42, 43]       # one complete K-fold run per entry
TRAIN_SUBSAMPLE = 1.0      # fraction of each training fold actually used (1.0 = all rows)

oof = np.zeros(len(train))  # seed-averaged OOF prediction for every training row
gc.collect()

# `seed` is a dedicated loop variable so the notebook-level SEED constant is not overwritten.
for seed in SEEDS:
    oof_seed = np.zeros(len(train))
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

        # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
        if TRAIN_SUBSAMPLE < 1.0:
            # A private legacy generator yields the same draw as seeding the global
            # NumPy RNG would, without mutating global random state.
            rng = np.random.RandomState(seed)
            train_idx = rng.choice(train_idx,
                                   int(len(train_idx) * TRAIN_SUBSAMPLE),
                                   replace=False)

        print('#'*25)
        print('### SEED', seed)
        print('### Fold', fold+1)
        print('### Train size', len(train_idx), 'Valid size', len(valid_idx))
        print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
        print('#'*25)

        # TRAIN, VALID FOR FOLD K
        # skf.split yields positional indices, so rows are selected with .iloc; label-based
        # .loc would only be equivalent while `train` keeps a default RangeIndex.
        Xy_train = IterLoadForDMatrix(train.iloc[train_idx], FEATURES, 'target')
        valid_rows = train.iloc[valid_idx]
        X_valid = valid_rows[FEATURES]
        y_valid = valid_rows['target']
        del valid_rows

        dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
        dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

        # TRAIN MODEL FOLD K
        # Early stopping monitors the last entry of `evals`, i.e. the validation fold.
        model = xgb.train(xgb_params,
                          dtrain=dtrain,
                          evals=[(dtrain, 'train'), (dvalid, 'valid')],
                          num_boost_round=9999,
                          early_stopping_rounds=100,
                          verbose_eval=100)
        model.save_model(f'XGB_seed{seed}_fold{fold}.xgb')

        # INFER OOF FOLD K
        # NOTE(review): this predicts with the booster exactly as returned by xgb.train;
        # confirm whether trees past model.best_iteration should be excluded via iteration_range.
        oof_preds = model.predict(dvalid)
        acc = amex_metric_mod(y_valid.values, oof_preds)
        print('Kaggle Metric =', acc, '\n')

        # SAVE OOF
        oof_seed[valid_idx] = oof_preds

        # Release per-fold host and device memory before the next fold.
        del dtrain, Xy_train
        del X_valid, y_valid, dvalid, model
        _ = gc.collect()

    # Average over however many seeds were run (not a hard-coded count).
    oof += oof_seed / len(SEEDS)


print('#'*25)

acc = amex_metric_mod(train.target.values, oof)
print('OVERALL CV Kaggle Metric =', acc)

#########################
### SEED 41
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
[0]	train-auc:0.93947	valid-auc:0.93679
[100]	train-auc:0.95700	valid-auc:0.95355
[200]	train-auc:0.96016	valid-auc:0.95583
[300]	train-auc:0.96250	valid-auc:0.95738
[400]	train-auc:0.96433	valid-auc:0.95846
[500]	train-auc:0.96600	valid-auc:0.95928
[600]	train-auc:0.96737	valid-auc:0.95981
[700]	train-auc:0.96856	valid-auc:0.96021
[800]	train-auc:0.96964	valid-auc:0.96051
[900]	train-auc:0.97059	valid-auc:0.96074
[1000]	train-auc:0.97149	valid-auc:0.96093
[1100]	train-auc:0.97235	valid-auc:0.96108
[1200]	train-auc:0.97315	valid-auc:0.96122
[1300]	train-auc:0.97388	valid-auc:0.96133
[1400]	train-auc:0.97463	valid-auc:0.96143
[1500]	train-auc:0.97531	valid-auc:0.96153
[1600]	train-auc:0.97600	valid-auc:0.96162
[1700]	train-auc:0.97661	valid-auc:0.96167
[1800]	train-auc:0.97723	valid-auc:0.96173
[1900]	train-auc:0.97783	valid-auc:0.96177
[2

In [11]:
# Inspect the averaged OOF predictions (raw margins, since the objective is binary:logitraw).
oof

array([ 1.15419364, -7.93937588, -6.1991326 , ...,  2.27935576,
       -4.61087211,  2.17345047])

In [12]:
# Persist the OOF predictions next to the true target, keyed by customer_ID.
oof = pd.DataFrame({
    'customer_ID': train.customer_ID,
    'target': train.target,
    'oof_pred': oof,
}).set_index('customer_ID')
# customer_ID lives in the index, so the index has to be written; passing index=False
# here would silently drop the ID column from the CSV.
oof.to_csv('oof_transfer_learning.csv')