In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats
import warnings
from colorama import Fore, Back, Style
import gc
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay
from lightgbm import LGBMClassifier, log_evaluation

# Notebook-level switch for an inference stage.
# NOTE(review): no cell visible in this section reads this flag — confirm it is
# consumed further down before relying on it.
INFERENCE = False # set to False if you only want to cross-validate


In [2]:
# @yunchonggan's fast metric implementation
# From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric(y_true: np.array, y_pred: np.array) -> float:
    """Compute the AMEX competition metric: 0.5 * (G + D).

    G is the normalized weighted Gini coefficient and D is the share of all
    positives found among the highest-scored rows that make up the first 4%
    of cumulative row weight. Rows with target 0 carry weight 20 and rows
    with target 1 carry weight 1.

    Parameters
    ----------
    y_true : array-like of 0/1 labels, one per row.
    y_pred : array-like of scores, same length; only their ordering matters.

    Returns
    -------
    float
        The metric value (higher is better).

    Notes
    -----
    Inputs are converted with ``np.asarray`` so that all indexing below is
    positional. Without this, a pandas Series would be indexed by label and
    any non-default index would silently scramble the targets.
    No guard is applied for the degenerate case of zero positives or zero
    negatives; the divisions below then yield nan/inf.
    """
    # force positional (ndarray) semantics regardless of the caller's container
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sort targets by descending prediction value
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
    """Adapter that exposes the competition metric in lightgbm's calling convention.

    Returns a 3-tuple: the metric name 'amex', the value computed by
    `amex_metric(y_true, y_pred)`, and the constant True (the slot lightgbm's
    custom-metric convention uses for "higher is better").
    """
    metric_value = amex_metric(y_true, y_pred)
    higher_is_better = True
    return 'amex', metric_value, higher_is_better

In [3]:
# Load the training table from a Feather file. Later cells read its
# `customer_ID` and `target` columns and treat every other column as a feature.
train = pd.read_feather('../input/amexfeatureengineering/770_FE_train.feather')

In [4]:
# Labels as a plain array, aligned row-for-row with `train`.
target = train['target'].values

In [5]:
%%time
# Cross-validation of the classifier

# NOTE(review): this flag is not consulted by the training loop visible in this cell.
ONLY_FIRST_FOLD = False

# Model inputs: every column except the row identifier and the label.
features = [col for col in train.columns if col not in ('customer_ID', 'target')]

# Accumulator for the seed-averaged out-of-fold predictions (one slot per training row).
oof = np.zeros(len(train))

def my_booster(random_state=42, n_estimators=1500):
    """Create an unfitted LGBMClassifier with this notebook's fixed hyperparameters.

    Parameters
    ----------
    random_state : seed forwarded to the classifier.
    n_estimators : number of boosting rounds forwarded to the classifier.

    Returns
    -------
    LGBMClassifier
        A new, untrained estimator.
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        learning_rate=0.03,
        reg_lambda=10,
        min_child_samples=1000,
        num_leaves=95,
        colsample_bytree=0.2,
        max_bins=511,
        random_state=random_state,
    )
    return LGBMClassifier(**hyperparams)
      
# Seed-averaged, stratified 5-fold cross-validation.
# For every seed, each fold trains a fresh booster, scores its validation rows,
# saves the fitted model to disk and stores the validation predictions in a
# per-seed out-of-fold vector. The per-seed vectors are averaged into `oof`.
print(f"{len(features)} features")
score_list = []
y_pred_list = []
kf = StratifiedKFold(n_splits=5)  # no shuffling: every seed sees the same folds
SEED = [43,42]
for seed in SEED:
    oof_ = np.zeros((len(train)))  # out-of-fold predictions for this seed only
    for fold, (idx_tr, idx_va) in enumerate(kf.split(train, target)):
        # drop references from the previous fold before allocating new slices
        X_tr, X_va, y_tr, y_va, model = None, None, None, None, None
        start_time = datetime.datetime.now()
        X_tr = train.iloc[idx_tr][features]
        X_va = train.iloc[idx_va][features]
        y_tr = target[idx_tr]
        y_va = target[idx_va]

        model = my_booster(random_state=seed)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=UserWarning)
            # fit must run inside the context manager: the filter is undone
            # as soon as the `with` block exits
            model.fit(X_tr, y_tr,
                      eval_set = [(X_va, y_va)],
                      eval_metric = 'AUC',
                      callbacks=[log_evaluation(100)])
        X_tr, y_tr = None, None
        # raw_score=True is passed, so these are raw scores rather than
        # probabilities; the metric sorts by prediction, so only order matters
        y_va_pred = model.predict_proba(X_va, raw_score=True)
        score = amex_metric(y_va, y_va_pred)

        joblib.dump(model, f'LGBM_Simple_fold{fold}_SEED{seed}.pkl')

        # SAVE OOF
        oof_[idx_va] = y_va_pred

        # best_iteration_ may be None (no early stopping is configured here);
        # fall back to the configured number of trees for the log line
        n_trees = model.best_iteration_
        if n_trees is None: n_trees = model.n_estimators
        print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | {str(datetime.datetime.now() - start_time)[-12:-7]} |"
              f" {n_trees:5} trees |"
              f"                Score = {score:.5f}{Style.RESET_ALL}")
        score_list.append(score)
    # average over however many seeds are configured (was a hard-coded 2)
    oof += oof_ / len(SEED)

# score the blended out-of-fold predictions against the label array
acc = amex_metric(target, oof)
print('OVERALL CV Kaggle Metric =',acc)

770 features
[100]	valid_0's auc: 0.956922	valid_0's binary_logloss: 0.239017
[200]	valid_0's auc: 0.959936	valid_0's binary_logloss: 0.222873
[300]	valid_0's auc: 0.961167	valid_0's binary_logloss: 0.219046
[400]	valid_0's auc: 0.961744	valid_0's binary_logloss: 0.217372
[500]	valid_0's auc: 0.962059	valid_0's binary_logloss: 0.216476
[600]	valid_0's auc: 0.962244	valid_0's binary_logloss: 0.215959
[700]	valid_0's auc: 0.962348	valid_0's binary_logloss: 0.215677
[800]	valid_0's auc: 0.962407	valid_0's binary_logloss: 0.215514
[900]	valid_0's auc: 0.962455	valid_0's binary_logloss: 0.215406
[1000]	valid_0's auc: 0.962473	valid_0's binary_logloss: 0.21538
[1100]	valid_0's auc: 0.962487	valid_0's binary_logloss: 0.215368
[1200]	valid_0's auc: 0.962495	valid_0's binary_logloss: 0.21538
[1300]	valid_0's auc: 0.962498	valid_0's binary_logloss: 0.215411
[1400]	valid_0's auc: 0.962493	valid_0's binary_logloss: 0.215467
[1500]	valid_0's auc: 0.962485	valid_0's binary_logloss: 0.215552
[32m[1

In [6]:
# Bundle the out-of-fold predictions with their labels, keyed by customer_ID,
# and persist them as CSV.
# NOTE: `oof` is rebound here from the prediction array to a DataFrame, so this
# cell should only be run once after the cross-validation cell.
oof = pd.DataFrame({'customer_ID':train.customer_ID,'target':train.target,'oof_pred':oof})
oof = oof.set_index('customer_ID')
# customer_ID now lives in the index; it must be written out, otherwise the
# CSV contains only `target` and `oof_pred` and rows cannot be matched to customers.
oof.to_csv('oof_lgbmquick.csv',index=True)