# 1. SETTINGS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import scipy.stats

import os
import time

from functions import prediction_reward
import functions 

In [None]:
# pandas options: lift the cap on how many columns are shown when a
# DataFrame is displayed (None = no limit)
pd.options.display.max_columns = None

In [None]:
# white axis labels: ticks, axis labels and axis edges are all drawn in white
params = dict.fromkeys(
    ["ytick.color", "xtick.color", "axes.labelcolor", "axes.edgecolor"],
    "w",
)
plt.rcParams.update(params)

In [None]:
# ignore warnings: install a catch-all "ignore" filter for every category
import warnings
warnings.simplefilter("ignore")

In [None]:
# garbage collection: make sure the automatic collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

# 2. DATA PARTITIONING

In [None]:
# import CSV: load the full data set and report its (rows, columns)
data_path = '../data/data_v2.csv'
df = pd.read_csv(data_path)
print(df.shape)

In [None]:
# target variable: name of the label column
target: str = 'fraud'

In [None]:
# partitioning: rows with a known target form the training set,
# rows whose target is missing form the test set
target_missing = df[target].isnull()
train = df[~target_missing]
test  = df[target_missing]
print(train.shape)
print(test.shape)

In [None]:
# extract target: keep the training labels in y and remove the target
# column from both partitions
y = train.pop(target)
del test[target]

# 3. MODELING

In [None]:
# drop bad features: every training column except the excluded ones is a
# model input (column order is preserved)
excluded_feats = ['id']
features = [col for col in train.columns if col not in excluded_feats]
print(train[features].shape)

In [None]:
### PARAMETERS

# settings
cores = 10  # number of threads handed to LightGBM
seed = 23   # shared random seed for the learner and the fold split

# cross-validation
num_folds = 5
shuffle = True

# learner
# NOTE(review): `metric` is not referenced by any other visible cell — confirm
# whether it is still needed.
metric = "auc"
verbose = 200     # forwarded to fit() as `verbose`
stopping = 1000   # forwarded to fit() as `early_stopping_rounds`

# lightGBM
lgb_params = {
    "n_estimators": 10000,
    "learning_rate": 0.005,
    "num_leaves": 70,
    "colsample_bytree": 0.8,
    "subsample": 0.9,
    "max_depth": 7,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "min_split_gain": 0.01,
    "min_child_weight": 2,
    "random_state": seed,
    "num_threads": cores,
}
gbm = lgb.LGBMClassifier(**lgb_params)

# data partitioning
folds = StratifiedKFold(n_splits=num_folds, shuffle=shuffle, random_state=seed)

In [None]:
# placeholders filled in by the cross-validation loop
valid_aucs_cv = np.zeros(num_folds)       # validation AUC per fold
valid_costs_cv = np.zeros(num_folds)      # validation reward per fold
test_preds_cv = np.zeros(len(test))       # test predictions averaged over folds
oof_preds_cv = np.zeros(len(train))       # out-of-fold predictions for train rows
feature_importance_df = pd.DataFrame()    # per-fold feature importances, stacked

In [None]:
### CROSS-VALIDATION LOOP
# For every stratified fold: fit the LightGBM classifier on the training part,
# store out-of-fold predictions, per-fold AUC and reward, accumulate the
# fold-averaged test predictions and collect the feature importances.
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):

    # data partitioning
    trn_x, val_x = train[features].iloc[trn_idx], train[features].iloc[val_idx]
    trn_y, val_y = y.iloc[trn_idx], y.iloc[val_idx]

    # train lightGBM
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
    # only accepted by older LightGBM releases — confirm the pinned version.
    # NOTE(review): the custom `prediction_reward` metric is passed as
    # eval_metric; check which metric(s) early stopping actually monitors.
    gbm = gbm.fit(
        trn_x,
        trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=prediction_reward,
        verbose=verbose,
        early_stopping_rounds=stopping,
    )

    # save number of iterations
    num_iter_cv = gbm.best_iteration_

    # predictions: positive-class probability at the best iteration
    fold_val_preds = gbm.predict_proba(val_x, num_iteration=num_iter_cv)[:, 1]
    oof_preds_cv[val_idx] = fold_val_preds
    valid_aucs_cv[n_fold] = roc_auc_score(val_y, fold_val_preds)
    # element [1] of the prediction_reward result is stored as the fold reward
    valid_costs_cv[n_fold] = prediction_reward(val_y, fold_val_preds)[1]
    # each fold contributes an equal share to the final test prediction
    fold_test_preds = gbm.predict_proba(test[features], num_iteration=num_iter_cv)[:, 1]
    test_preds_cv += fold_test_preds / folds.n_splits

    # importance
    fold_importance_df = pd.DataFrame({
        "Feature": features,
        "Importance": gbm.feature_importances_,
        "Fold": n_fold + 1,
    })
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # print performance (AUC first, then reward)
    for fmt, score in (("Fold%2d AUC: %.6f", valid_aucs_cv[n_fold]),
                       ("Fold%2d reward: %.6f", valid_costs_cv[n_fold])):
        print("----------------------")
        print(fmt % (n_fold + 1, score))
        print("----------------------")
        print("")

    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()

# print overall performance: mean validation AUC across folds
cv_perf = valid_aucs_cv.mean()
print("CV AUC score: %.6f" % cv_perf)

# print overall performance: mean validation reward across folds
cv_perf_rew = valid_costs_cv.mean()
print("CV reward score: %.6f" % cv_perf_rew)
###### TRACKING RESULTS



In [None]:
##### VARIABLE IMPORTANCE

# load importance: rank features by their importance averaged over the CV
# folds and keep the `top_feats` highest-ranked ones
top_feats = 100
mean_importance = (
    feature_importance_df[["Feature", "Importance"]]
    .groupby("Feature")
    .mean()
    .sort_values(by="Importance", ascending=False)
)
cols = mean_importance[0:top_feats].index
importance = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

# plot variable importance
# barplot aggregates the per-fold rows of each feature with its default
# estimator (the mean). The bar order is pinned to the mean-importance ranking
# in `cols`; sorting the per-fold rows instead would order the bars by each
# feature's largest single-fold value, which disagrees with the plot title.
plt.figure(figsize=(10, 10))
sns.barplot(x="Importance", y="Feature", data=importance, order=list(cols))
plt.title('LightGBM Variable Importance (mean over CV folds)')
plt.tight_layout()

# save plot as pdf
plt.savefig("../var_importance.pdf")

# 4. SUBMISSION

In [None]:
# file name: "<model>_<six decimal digits of the CV AUC>"
model = 'lgb_v1'
# Fixed-width formatting keeps trailing zeros (0.95 -> "950000") and never
# falls into scientific notation; str(round(...)) would produce "95" here and
# make file names of different runs incomparable.
perf  = ("%.6f" % cv_perf)[2:8]
name  = model + '_' + perf

In [None]:
# export submission: fold-averaged test predictions as a single 'fraud' column
sub = pd.DataFrame({'fraud': test_preds_cv})
submission_path = '../submissions/' + str(name) + '.csv'
sub.to_csv(submission_path, index=False)
sub.shape

In [None]:
# export OOF preds: out-of-fold predictions as a single 'fraud' column
# (note: this rebinds oof_preds_cv from an array to a DataFrame)
oof_preds_cv = pd.DataFrame(oof_preds_cv, columns=['fraud'])
oof_path = '../oof_preds/' + str(name) + '.csv'
oof_preds_cv.to_csv(oof_path, index=False)
oof_preds_cv.shape