In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get a full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv
/kaggle/input/amex-credit/918_features_wo_imputation_2.parquet
/kaggle/input/amex-credit/918_features_wo_imputation.parquet
/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr


In [None]:
# NOTE(review): this cell has no execution count (In [None]), and the following cell
# rebinds `train` from the same parquet file via cudf, so this pandas load looks
# redundant — confirm whether it should be kept.
train = pd.read_parquet("/kaggle/input/amex-credit/918_features_wo_imputation_2.parquet")

In [2]:
import cudf

# Read the pre-computed feature table from parquet as a cuDF DataFrame.
train=cudf.read_parquet("/kaggle/input/amex-credit/918_features_wo_imputation_2.parquet")

In [3]:
# Use every column except the first and the last as model features.
# NOTE(review): presumably the first column is `customer_ID` and the last is `target`
# (both are read by name in the training cell) — confirm the parquet column order,
# otherwise the label could end up among the features.
FEATURES = train.columns[1:-1]

In [4]:
from sklearn.model_selection import KFold
import xgboost as xgb
import gc

In [5]:
# Seed reused by the booster parameters below.
SEED = 42

# Booster parameters handed to xgb.train() in the training cell.
xgb_parms = {
    # Tree shape and step size.
    'max_depth': 4,
    'learning_rate': 0.05,
    # Row / column sampling ratios.
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    # Learning task and the metric reported on the eval sets.
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    # GPU training / prediction backends.
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    # Reproducibility.
    'random_state': SEED,
}

In [6]:
class IterLoadForDMatrix(xgb.core.DataIter):
    """Batch iterator that hands consecutive row slices of a DataFrame to XGBoost.

    Each call to ``next`` takes the next ``batch_size`` rows of ``df`` (by position),
    wraps them in a ``cudf.DataFrame`` and passes the feature columns and the target
    column to the callback supplied by XGBoost.

    Parameters
    ----------
    df : DataFrame supporting ``len()`` and ``.iloc`` slicing.
    features : column selector used for the ``data`` argument of the callback.
    target : column name used for the ``label`` argument of the callback.
    batch_size : number of rows per batch (default ``256*1024``).
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        # Number of batches needed to cover every row (the last one may be short).
        n_rows = len(df)
        self.batches = int(np.ceil(n_rows / self.batch_size))
        # Index of the next batch to emit.
        self.it = 0
        super().__init__()

    def reset(self):
        """Rewind the iterator to the first batch."""
        self.it = 0

    def next(self, input_data):
        """Feed the next batch to ``input_data``.

        Returns 0 once every batch has been emitted, otherwise 1 after the
        callback has been invoked with the current batch.
        """
        exhausted = self.it == self.batches
        if exhausted:
            return 0

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        chunk = cudf.DataFrame(self.df.iloc[start:stop])
        input_data(data=chunk[self.features], label=chunk[self.target])

        self.it += 1
        return 1


In [7]:
def amex_metric_mod(y_true, y_pred):
    """Return the mean of a normalized weighted Gini term and a top-4% capture term.

    Rows whose true label is 0 get weight 20, all other rows weight 1.

    * Capture term: with rows sorted by ``y_pred`` descending, keep the rows whose
      cumulative weight does not exceed ``int(0.04 * total_weight)`` and divide the
      sum of their labels by the sum of all labels.
    * Gini term: the weighted Gini obtained when sorting by ``y_pred`` divided by
      the one obtained when sorting by ``y_true``.

    Parameters
    ----------
    y_true, y_pred : equal-length 1-D array-likes of labels and scores.

    Returns
    -------
    ``0.5 * (gini_pred / gini_true + capture)``.
    """
    # Column 0 holds the true label, column 1 the prediction.
    pairs = np.array([y_true, y_pred]).T

    def _sorted_desc(col):
        # Rows ordered by the given column, largest first.
        return pairs[pairs[:, col].argsort()[::-1]]

    def _weighted_gini(col):
        ranked = _sorted_desc(col)
        target = ranked[:, 0]
        w = np.where(target == 0, 20, 1)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_hits = target * w
        lorentz = np.cumsum(weighted_hits) / np.sum(weighted_hits)
        return np.sum((lorentz - random_curve) * w)

    # Share of positives captured inside the top 4% (by weight) of predictions.
    by_pred = _sorted_desc(1)
    w_pred = np.where(by_pred[:, 0] == 0, 20, 1)
    inside_cut = np.cumsum(w_pred) <= int(0.04 * np.sum(w_pred))
    top_four = np.sum(by_pred[inside_cut][:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the model ranking, normalized by the Gini of the perfect ranking.
    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)

    return 0.5 * (gini_pred / gini_true + top_four)


In [8]:
# Cross-validated XGBoost training: fit one model per fold, save it to disk, and
# collect per-fold feature importances and out-of-fold (OOF) predictions.
importances = []  # one feature-importance frame per fold
oof = []          # one OOF prediction frame per fold (concatenated after the loop)

# Convert to pandas only while `train` still exposes `to_pandas`; after the first run
# it is already a pandas DataFrame, so the guard keeps this cell re-runnable.
if hasattr(train, 'to_pandas'):
    train = train.to_pandas()
TRAIN_SUBSAMPLE = 1.0
gc.collect()

FOLDS = 5
SEED = 42
# NOTE(review): this is a plain KFold although the variable is named `skf`;
# confirm whether StratifiedKFold was intended.
skf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):
    # `train_idx` / `valid_idx` are positional row numbers, so rows are selected with
    # `.iloc` below; label-based `.loc` is only equivalent for a default 0..n-1 index.

    if TRAIN_SUBSAMPLE < 1.0:
        # Optionally train on a random subset of this fold's training rows.
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace = False)
        np.random.seed(SEED)

    print('#'*25)
    print('### Fold', fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    # Training rows are streamed in batches through the iterator; validation rows
    # are materialized once and reused for features, labels and the OOF frame.
    Xy_train = IterLoadForDMatrix(train.iloc[train_idx], FEATURES, 'target')
    valid_rows = train.iloc[valid_idx]
    X_valid = valid_rows[FEATURES]
    y_valid = valid_rows['target']

    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256, enable_categorical=True)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

    model = xgb.train(xgb_parms,
                      dtrain = dtrain,
                      evals = [(dtrain, 'train'), (dvalid, 'valid')],
                      num_boost_round = 9999,
                      early_stopping_rounds = 100,
                      verbose_eval = 100)
    # NOTE(review): the 'XBG_' prefix looks like a typo for 'XGB_'; it is left unchanged
    # because other code may load these files by name.
    model.save_model(f'XBG_918_features_wo_imputation_fold{fold}.xgb')

    # Per-fold feature importances (importance_type='weight').
    dd = model.get_score(importance_type = 'weight')
    df = pd.DataFrame({'feature': dd.keys(), f'importance_{fold}': dd.values()})
    importances.append(df)

    # NOTE(review): predict() is called without `iteration_range`; depending on the
    # XGBoost version this may score with all trained rounds rather than only those up
    # to `model.best_iteration` — confirm this is intended.
    oof_preds = model.predict(dvalid)
    acc = amex_metric_mod(y_valid.values, oof_preds)
    print('Kaggle Metric = ', acc, '\n')

    # Keep ids, labels and predictions of the held-out rows for the overall CV score.
    df = valid_rows[['customer_ID', 'target']].copy()
    df['oof_pred'] = oof_preds
    oof.append(df)

    # Release per-fold objects before the next fold is built.
    del dtrain, Xy_train, dd, df
    del X_valid, y_valid, valid_rows, dvalid, model
    _ = gc.collect()

print('#'*25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric = ', acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
[0]	train-logloss:0.66266	valid-logloss:0.66266
[100]	train-logloss:0.23641	valid-logloss:0.23936
[200]	train-logloss:0.22186	valid-logloss:0.22752
[300]	train-logloss:0.21573	valid-logloss:0.22382
[400]	train-logloss:0.21146	valid-logloss:0.22191
[500]	train-logloss:0.20802	valid-logloss:0.22088
[600]	train-logloss:0.20498	valid-logloss:0.22018
[700]	train-logloss:0.20224	valid-logloss:0.21979
[800]	train-logloss:0.19956	valid-logloss:0.21949
[900]	train-logloss:0.19709	valid-logloss:0.21934
[1000]	train-logloss:0.19469	valid-logloss:0.21915
[1100]	train-logloss:0.19241	valid-logloss:0.21902
[1200]	train-logloss:0.19017	valid-logloss:0.21897
[1300]	train-logloss:0.18795	valid-logloss:0.21894
[1400]	train-logloss:0.18585	valid-logloss:0.21888
[1500]	train-logloss:0.18380	valid-logloss:0.21885
[1600]	train-logloss:0.18179	valid-logloss:0.21882
[1659]	