In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-credit/918_features_wo_imputation_2.parquet
/kaggle/input/amex-credit/test_918_features_wo_imputation_2.csv


In [2]:
import cudf
import joblib
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc
import optuna
import time

In [3]:
train=cudf.read_parquet("/kaggle/input/amex-credit/918_features_wo_imputation_2.parquet")
FEATURES = train.columns[1:-1]

In [4]:
class IterLoadForDMatrix(xgb.core.DataIter):
    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.features = features
        self.target = target
        self.df = df
        self.it = 0 # set iterator to 0
        self.batch_size = batch_size
        self.batches = int( np.ceil( len(df) / self.batch_size ) )
        super().__init__()

    def reset(self):
        '''Reset the iterator'''
        self.it = 0

    def next(self, input_data):
        '''Yield next batch of data.'''
        if self.it == self.batches:
            return 0 # Return 0 when there's no more batch.
        
        a = self.it * self.batch_size
        b = min( (self.it + 1) * self.batch_size, len(self.df) )
        dt = cudf.DataFrame(self.df.iloc[a:b])
        input_data(data=dt[self.features], label=dt[self.target]) #, weight=dt['weight'])
        self.it += 1
        return 1


In [5]:
def amex_metric_mod(y_true, y_pred):

    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)


In [6]:
train = train.to_pandas()

#### KFold Optuna

In [7]:
def objective(trial):
    
    importances = []
    oof = []
    
    TRAIN_SUBSAMPLE = 1.0
    gc.collect()

    FOLDS = 5
    SEED = 42
    skf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)


    for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

        if TRAIN_SUBSAMPLE < 1.0:
            np.random.seed(SEED)
            train_idx = np.random.choice(train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace = False)
            np.random.seed(SEED)

        print('#'*25)
        print('### Fold', fold+1)
        print('### Train size',len(train_idx),'Valid size',len(valid_idx))
        print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
        print('#'*25)

        Xy_train = IterLoadForDMatrix(train.loc[train_idx], FEATURES, 'target')
        X_valid = train.loc[valid_idx, FEATURES]
        y_valid = train.loc[valid_idx, 'target']

        dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256, enable_categorical=True)
        dvalid = xgb.DMatrix(data=X_valid, label=y_valid)
        '''
        xgb_parms = {'max_depth': 4,
             'learning_rate':0.05,
             'subsample': 0.8,
             'colsample_bytree': 0.6,
             'eval_metric':'logloss',
             'objective': 'binary:logistic',
             'tree_method': 'gpu_hist',
             'predictor': 'gpu_predictor',
             'random_state': SEED}
        '''
        xgb_params = {"verbosity": 0,
                      "objective": "binary:logistic",
                      "eval_metric": "logloss",
                      "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
                      "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
                      "tree_method": "gpu_hist",
                      "predictor": "gpu_predictor",
                      "gpu_id": 0,
                      "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
                      "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
                      "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
                      "random_state": SEED}

        if xgb_params["booster"] == "gbtree" or xgb_params["booster"] == "dart":
            xgb_params["max_depth"] = trial.suggest_int("max_depth", 1, 5)
            xgb_params["eta"] = trial.suggest_loguniform("eta", 1e-8, 1.0)
            xgb_params["gamma"] = trial.suggest_loguniform("gamma", 1e-8, 1.0)
            xgb_params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        if xgb_params["booster"] == "dart":
            xgb_params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            xgb_params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            xgb_params["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
            xgb_params["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)
            
#         pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "eval-logloss")


        model = xgb.train(xgb_params, 
                          dtrain = dtrain,
                          evals = [(dtrain, 'train'), (dvalid, 'valid')],
                          num_boost_round = 9999,
                          early_stopping_rounds = 100,
                          verbose_eval = 100)

        model.save_model(f'XBG_918_features_wo_imputation_fold{fold}.xgb')

        dd = model.get_score(importance_type = 'weight')
        df = pd.DataFrame({'feature': dd.keys(), f'importance_{fold}': dd.values()})
        importances.append(df)

        oof_preds = model.predict(dvalid)
        acc = amex_metric_mod(y_valid.values, oof_preds)
        print('Kaggle Metric = ', acc, '\n')

        df = train.loc[valid_idx, ['customer_ID', 'target']].copy()
        df['oof_pred'] = oof_preds
        oof.append(df)

        del dtrain, Xy_train, dd, df
        del X_valid, y_valid, dvalid, model
        _ = gc.collect()
        

    print('#'*25)
    oof = pd.concat(oof, axis=0, ignore_index=True).set_index('customer_ID')
    acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
    print('OVERALL CV Kaggle Metric = ', acc)
    
    return acc
    

#### Simple split Optuna 

In [10]:
def objective2(trial):
    
    gc.collect()

    SEED = 42
    
    X_train, X_valid, y_train, y_valid = train_test_split(train, train.target, test_size = 0.33, random_state = SEED)

    Xy_train = IterLoadForDMatrix(X_train, FEATURES, 'target')
    X_valid = X_valid.loc[:, FEATURES]

    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256, enable_categorical=True)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)
    '''
    xgb_parms = {'max_depth': 4,
         'learning_rate':0.05,
         'subsample': 0.8,
         'colsample_bytree': 0.6,
         'eval_metric':'logloss',
         'objective': 'binary:logistic',
         'tree_method': 'gpu_hist',
         'predictor': 'gpu_predictor',
         'random_state': SEED}
    '''
    xgb_params = {"verbosity": 0,
                  "objective": "binary:logistic",
                  "eval_metric": "logloss",
                  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
                  "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
                  "tree_method": "gpu_hist",
                  "predictor": "gpu_predictor",
                  "gpu_id": 0,
                  "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
                  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
                  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
                  "random_state": SEED}

    if xgb_params["booster"] == "gbtree" or xgb_params["booster"] == "dart":
        xgb_params["max_depth"] = trial.suggest_int("max_depth", 1, 5)
        xgb_params["eta"] = trial.suggest_loguniform("eta", 1e-8, 1.0)
        xgb_params["gamma"] = trial.suggest_loguniform("gamma", 1e-8, 1.0)
        xgb_params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if xgb_params["booster"] == "dart":
        xgb_params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        xgb_params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        xgb_params["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
        xgb_params["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)

#         pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "eval-logloss")

    start = time.time()
    model = xgb.train(xgb_params, 
                      dtrain = dtrain,
                      evals = [(dtrain, 'train'), (dvalid, 'valid')],
                      num_boost_round = 9999,
                      early_stopping_rounds = 100,
                      verbose_eval = 100)
    print(f"Training time: {time.time() - start}")

    model.save_model(f'XBG_918_features_wo_imputation.xgb')


    oof_preds = model.predict(dvalid)
    acc = amex_metric_mod(y_valid.values, oof_preds)
    print('Kaggle Metric = ', acc, '\n')

    del dtrain, Xy_train#, dd, df
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()

    print('OVERALL CV Kaggle Metric = ', acc)
    
    return acc
    

In [11]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective2, n_trials = 50)
joblib.dump(study, 'study.pkl')

[32m[I 2022-07-31 05:37:25,261][0m A new study created in memory with name: no-name-d6d15bde-ff55-42aa-bdaa-ee430620f878[0m


[0]	train-logloss:0.68605	valid-logloss:0.68605


[100]	train-logloss:0.36340	valid-logloss:0.36374


[200]	train-logloss:0.28535	valid-logloss:0.28615


[300]	train-logloss:0.25859	valid-logloss:0.25997


[400]	train-logloss:0.24608	valid-logloss:0.24795


[500]	train-logloss:0.23899	valid-logloss:0.24131


[600]	train-logloss:0.23448	valid-logloss:0.23724


[700]	train-logloss:0.23128	valid-logloss:0.23443


[800]	train-logloss:0.22892	valid-logloss:0.23246


[900]	train-logloss:0.22702	valid-logloss:0.23090


[1000]	train-logloss:0.22545	valid-logloss:0.22966


[1100]	train-logloss:0.22407	valid-logloss:0.22861


[1200]	train-logloss:0.22285	valid-logloss:0.22771


[1300]	train-logloss:0.22176	valid-logloss:0.22691


[1400]	train-logloss:0.22078	valid-logloss:0.22621


[1500]	train-logloss:0.21990	valid-logloss:0.22561


[1600]	train-logloss:0.21910	valid-logloss:0.22511


[1700]	train-logloss:0.21835	valid-logloss:0.22465


[1800]	train-logloss:0.21766	valid-logloss:0.22423


[1900]	train-logloss:0.21702	valid-logloss:0.22387


[2000]	train-logloss:0.21640	valid-logloss:0.22354


[2100]	train-logloss:0.21584	valid-logloss:0.22326


[2200]	train-logloss:0.21530	valid-logloss:0.22299


[2300]	train-logloss:0.21480	valid-logloss:0.22277


[2400]	train-logloss:0.21428	valid-logloss:0.22254


[2500]	train-logloss:0.21381	valid-logloss:0.22235


[2600]	train-logloss:0.21333	valid-logloss:0.22216


[2700]	train-logloss:0.21289	valid-logloss:0.22201


[2800]	train-logloss:0.21244	valid-logloss:0.22186


[2900]	train-logloss:0.21201	valid-logloss:0.22171


[3000]	train-logloss:0.21160	valid-logloss:0.22159


[3100]	train-logloss:0.21120	valid-logloss:0.22148


[3200]	train-logloss:0.21081	valid-logloss:0.22136


[3300]	train-logloss:0.21044	valid-logloss:0.22127


[3400]	train-logloss:0.21007	valid-logloss:0.22118


[3500]	train-logloss:0.20971	valid-logloss:0.22108


[3600]	train-logloss:0.20934	valid-logloss:0.22100


[3700]	train-logloss:0.20899	valid-logloss:0.22091


[3800]	train-logloss:0.20863	valid-logloss:0.22082


[3900]	train-logloss:0.20828	valid-logloss:0.22076


[4000]	train-logloss:0.20794	valid-logloss:0.22069


[4100]	train-logloss:0.20759	valid-logloss:0.22062


[4200]	train-logloss:0.20725	valid-logloss:0.22056


[4300]	train-logloss:0.20692	valid-logloss:0.22051


[4400]	train-logloss:0.20660	valid-logloss:0.22048


[4500]	train-logloss:0.20627	valid-logloss:0.22041


[4600]	train-logloss:0.20595	valid-logloss:0.22036


[4700]	train-logloss:0.20564	valid-logloss:0.22031


[4800]	train-logloss:0.20532	valid-logloss:0.22026


[4900]	train-logloss:0.20500	valid-logloss:0.22022


[5000]	train-logloss:0.20469	valid-logloss:0.22018


[5100]	train-logloss:0.20439	valid-logloss:0.22015


[5200]	train-logloss:0.20409	valid-logloss:0.22011


[5300]	train-logloss:0.20379	valid-logloss:0.22008


[5400]	train-logloss:0.20348	valid-logloss:0.22004


[5500]	train-logloss:0.20318	valid-logloss:0.22000


[5600]	train-logloss:0.20287	valid-logloss:0.21996


[5700]	train-logloss:0.20258	valid-logloss:0.21993


[5800]	train-logloss:0.20229	valid-logloss:0.21992


[5900]	train-logloss:0.20200	valid-logloss:0.21989


[6000]	train-logloss:0.20172	valid-logloss:0.21987


[6100]	train-logloss:0.20144	valid-logloss:0.21985


[6200]	train-logloss:0.20116	valid-logloss:0.21983


[6300]	train-logloss:0.20088	valid-logloss:0.21982


[6400]	train-logloss:0.20060	valid-logloss:0.21980


[6500]	train-logloss:0.20031	valid-logloss:0.21978


[6600]	train-logloss:0.20003	valid-logloss:0.21975


[6700]	train-logloss:0.19975	valid-logloss:0.21975


[6800]	train-logloss:0.19948	valid-logloss:0.21974


[6900]	train-logloss:0.19921	valid-logloss:0.21972


[7000]	train-logloss:0.19894	valid-logloss:0.21970


[7100]	train-logloss:0.19866	valid-logloss:0.21969


[7200]	train-logloss:0.19840	valid-logloss:0.21967


[7300]	train-logloss:0.19812	valid-logloss:0.21964


[7400]	train-logloss:0.19786	valid-logloss:0.21962


[7500]	train-logloss:0.19758	valid-logloss:0.21960


[7600]	train-logloss:0.19732	valid-logloss:0.21957


[7700]	train-logloss:0.19706	valid-logloss:0.21955


[7800]	train-logloss:0.19680	valid-logloss:0.21954


[7900]	train-logloss:0.19654	valid-logloss:0.21953


[8000]	train-logloss:0.19629	valid-logloss:0.21951


[8100]	train-logloss:0.19603	valid-logloss:0.21951


[8200]	train-logloss:0.19577	valid-logloss:0.21950


[8300]	train-logloss:0.19551	valid-logloss:0.21949


[8400]	train-logloss:0.19524	valid-logloss:0.21948


[8500]	train-logloss:0.19499	valid-logloss:0.21947


[8600]	train-logloss:0.19474	valid-logloss:0.21946


[8700]	train-logloss:0.19448	valid-logloss:0.21945


[8800]	train-logloss:0.19423	valid-logloss:0.21944


[8900]	train-logloss:0.19398	valid-logloss:0.21943


[9000]	train-logloss:0.19373	valid-logloss:0.21942


[9100]	train-logloss:0.19347	valid-logloss:0.21941


[9200]	train-logloss:0.19323	valid-logloss:0.21941


[9300]	train-logloss:0.19297	valid-logloss:0.21940


[9400]	train-logloss:0.19273	valid-logloss:0.21939


[9500]	train-logloss:0.19248	valid-logloss:0.21940


[9510]	train-logloss:0.19245	valid-logloss:0.21940


Training time: 508.3883295059204


[32m[I 2022-07-31 05:46:18,465][0m Trial 0 finished with value: 0.7906320540062097 and parameters: {'learning_rate': 0.011899155967261734, 'colsample_bytree': 0.2820238130125764, 'booster': 'gbtree', 'lambda': 0.08569031073095795, 'alpha': 2.5536747402042386e-08, 'max_depth': 3, 'eta': 0.029452038248850763, 'gamma': 0.0004146024307745867, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.7906320540062097.[0m


Kaggle Metric =  0.7906320540062097 

OVERALL CV Kaggle Metric =  0.7906320540062097


[0]	train-logloss:0.67117	valid-logloss:0.67119


[100]	train-logloss:0.24170	valid-logloss:0.24605


[200]	train-logloss:0.22003	valid-logloss:0.22889


[300]	train-logloss:0.21158	valid-logloss:0.22451


[400]	train-logloss:0.20552	valid-logloss:0.22227


[500]	train-logloss:0.20068	valid-logloss:0.22118


[600]	train-logloss:0.19644	valid-logloss:0.22058


[700]	train-logloss:0.19255	valid-logloss:0.22020


[800]	train-logloss:0.18875	valid-logloss:0.21997


[900]	train-logloss:0.18518	valid-logloss:0.21979


In [None]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))