# 1. SETTINGS

In [22]:
# libraries
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [23]:
# pandas options: do not truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [24]:
# garbage collection: make sure the automatic collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

In [25]:
# ignore warnings
warnings.filterwarnings("ignore")

In [26]:
# random settings: single seed shared by the fold splitter and the LightGBM model
seed: int = 42

# 2. PREPARATIONS

In [None]:
# import data: train set, test set and target table from the prepared data folder
train, test, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/prepared/train_red.csv",
        "../data/prepared/test_red.csv",
        "../data/prepared/y_red.csv",
    )
)

In [None]:
# check train: preview the first five rows
train.head(n = 5)

In [None]:
# check test: preview the first five rows
test.head(n = 5)

In [None]:
# extract target: keep only the TARGET column of the loaded table.
# The isinstance guard makes this cell safe to re-run: once y has been reduced
# to a Series, indexing it with "TARGET" a second time would raise a KeyError.
if isinstance(y, pd.DataFrame):
    y = y["TARGET"]

In [None]:
# exclude features: SK_ID_CURR is kept out of the model inputs
# (it is used later as the key column of the submission file)
excluded_feats = ["SK_ID_CURR"]
features = list(filter(lambda col: col not in excluded_feats, train.columns))

In [None]:
# check dimensions: train and test restricted to the model features, one shape per line
print(train[features].shape, test[features].shape, sep = "\n")

In [None]:
### PARAMETERS

# lightGBM classifier; n_estimators is only an upper bound because the CV loop
# fits with early stopping
# NOTE(review): LightGBM applies row subsampling only when subsample_freq > 0;
# it is left at its default here, so subsample = 0.9 may have no effect — confirm
gbm = lgb.LGBMClassifier(
    n_estimators = 10000,
    learning_rate = 0.005,
    num_leaves = 70,
    colsample_bytree = 0.8,
    subsample = 0.9,
    max_depth = 7,
    reg_alpha = 0.1,
    reg_lambda = 0.1,
    min_split_gain = 0.01,
    min_child_weight = 2,
    random_state = seed,
)

# learner settings (forwarded to gbm.fit inside the CV loop)
metric = "auc"    # passed as eval_metric
verbose = 250     # passed as verbose
stopping = 300    # passed as early_stopping_rounds

# CV settings (forwarded to StratifiedKFold)
num_folds, shuffle = 5, True

# 3. CROSS-VALIDATION

In [None]:
# data partitioning: stratified K-fold splitter built from the CV settings.
# random_state is forwarded only when shuffling is enabled, because scikit-learn
# rejects a non-None random_state combined with shuffle = False.
folds = StratifiedKFold(n_splits     = num_folds,
                        shuffle      = shuffle,
                        random_state = seed if shuffle else None)

# placeholders: one validation AUC per fold and the fold-averaged test prediction
valid_aucs_cv = np.zeros(num_folds)
test_preds_cv = np.zeros(len(test))

In [None]:
### CROSS-VALIDATION LOOP

# Reset the accumulators in this cell: test_preds_cv is built up with "+=", so
# re-running the loop on its own would otherwise stack a second set of fold
# predictions on top of the first and inflate the submitted values.
valid_aucs_cv = np.zeros(folds.n_splits)
test_preds_cv = np.zeros(test.shape[0])

# NOTE(review): passing verbose / early_stopping_rounds directly to fit() depends on
# the installed LightGBM version (newer releases expect callbacks instead) — confirm
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):

    # data partitioning: positional row indices of the current fold
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]

    # train lightGBM, monitoring both the training and the validation part
    gbm = gbm.fit(trn_x, trn_y,
                  eval_set = [(trn_x, trn_y), (val_x, val_y)],
                  eval_metric = metric,
                  verbose = verbose,
                  early_stopping_rounds = stopping)

    # save number of iterations selected by early stopping
    num_iter_cv = gbm.best_iteration_

    # predictions: positive-class probability at the best iteration
    valid_preds_cv = gbm.predict_proba(val_x, num_iteration = num_iter_cv)[:, 1]
    valid_aucs_cv[n_fold] = roc_auc_score(val_y, valid_preds_cv)
    # every fold contributes an equal share to the averaged test prediction
    test_preds_cv += gbm.predict_proba(test[features], num_iteration = num_iter_cv)[:, 1] / folds.n_splits

    # print performance
    print("----------------------")
    print("Fold%2d AUC: %.6f" % (n_fold + 1, valid_aucs_cv[n_fold]))
    print("----------------------")
    print("")

    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()

# print overall performance (mean of the per-fold validation AUCs)
print("Cross-Validation AUC score %.6f" % np.mean(valid_aucs_cv))

# RESULTS
# 1500: 0.788733
# 426:  0.778094

# 4. SUBMISSION

In [28]:
# create submission: attach the fold-averaged predictions to the test table
# and keep only the ID and prediction columns
test["TARGET"] = test_preds_cv
subm = test.loc[:, ["SK_ID_CURR", "TARGET"]]

# export CSV: no index column, floats written with eight decimals
subm.to_csv("../submissions/bagged_lgb_5k_426.csv",
            float_format = "%.8f",
            index = False)