# 1. SETTINGS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import scipy.stats

import os
import time

import functions
from functions import prediction_reward

In [None]:
# pandas options: never truncate the column list when displaying a frame
pd.options.display.max_columns = None

In [None]:
# white axis labels, ticks and edges
params = dict.fromkeys(
    ['ytick.color', 'xtick.color', 'axes.labelcolor', 'axes.edgecolor'],
    'w',
)
plt.rcParams.update(params)

In [None]:
# silence every warning category for the rest of the session
import warnings
warnings.simplefilter('ignore')

In [None]:
# garbage collection: make sure the automatic collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

# 2. DATA PARTITIONING

In [None]:
# import CSV (rows with and without a target value live in the same file;
# they are separated in the partitioning step below)
df = pd.read_csv(filepath_or_buffer = '../data/data_v1.csv')
print(df.shape)

In [None]:
# name of the column holding the target variable
target = 'fraud'

In [None]:
# partitioning: rows with a known target form the training set,
# rows with a missing target form the test set
has_target = df[target].notnull()
train = df[has_target]
test  = df[~has_target]
print(train.shape)
print(test.shape)

In [None]:
# separate the target: keep the training labels in y and remove the
# target column from both partitions
y = train.pop(target)
del test[target]

# 3. MODELING

In [None]:
# drop bad features: every training column except the excluded ones is a predictor
excluded_feats = ['id']
features = []
for col in train.columns:
    if col in excluded_feats:
        continue
    features.append(col)
print(train[features].shape)

In [None]:
### PARAMETERS

# settings: number of threads and random seed
cores, seed = 10, 23

# cross-validation
num_folds, shuffle = 5, True

# number of rounds: upper bound on boosting rounds, early-stopping patience
# and logging period
max_rounds, stopping, verbose = 10000, 500, 100

# LGB parameters
# 'metric' is the string 'None' so that no built-in metric is registered.
# NOTE(review): per the LightGBM parameter docs, subsample (bagging_fraction)
# only takes effect when subsample_freq (bagging_freq) is > 0, which is not set
# here -- confirm whether row subsampling was intended.
lgb_params = dict(
    boosting_type    = 'gbdt',
    objective        = 'binary',
    metric           = 'None',
    subsample        = 0.9,
    feature_fraction = 0.8,
    lambda_l1        = 0.1,
    lambda_l2        = 0.1,
    min_split_gain   = 0.01,
    min_child_weight = 2,
    silent           = True,
    verbosity        = -1,
    learning_rate    = 0.05,
    max_depth        = 7,
    num_leaves       = 70,
    n_estimators     = max_rounds,
    nthread          = cores,
    random_state     = seed,
)

# data partitioning: stratified K-fold splitter used by the cross-validation loop
folds = StratifiedKFold(
    n_splits     = num_folds,
    shuffle      = shuffle,
    random_state = seed,
)

In [None]:
# placeholders filled by the cross-validation loop
clfs         = list()                 # fitted model of every fold
valid_profit = np.zeros(num_folds)    # validation profit of every fold
preds_oof    = np.zeros(len(train))   # out-of-fold predictions for the training rows
preds_test   = np.zeros(len(test))    # fold-averaged predictions for the test rows
importances  = pd.DataFrame()         # per-fold feature importances

In [None]:
### CROSS-VALIDATION LOOP
# For every fold: fit a LightGBM classifier with early stopping on the custom
# prediction_reward metric, store out-of-fold predictions, accumulate the
# fold-averaged test predictions and record the feature importances.
# NOTE(review): the `verbose` and `early_stopping_rounds` arguments of fit()
# were removed in LightGBM 4.0 in favour of callbacks -- this cell presumably
# targets an older release; confirm the installed version.
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):

    # data partitioning
    trn_x, val_x = train[features].iloc[trn_idx], train[features].iloc[val_idx]
    trn_y, val_y = y.iloc[trn_idx], y.iloc[val_idx]

    # train lightGBM (fit returns the fitted estimator itself)
    clf = lgb.LGBMClassifier(**lgb_params).fit(
        trn_x, trn_y,
        eval_set              = [(trn_x, trn_y), (val_x, val_y)],
        eval_metric           = prediction_reward,
        early_stopping_rounds = stopping,
        verbose               = verbose,
    )
    clfs.append(clf)

    # predictions (positive-class probability at the best iteration)
    best_iter  = clf.best_iteration_
    val_preds  = clf.predict_proba(val_x, num_iteration = best_iter)[:, 1]
    test_preds = clf.predict_proba(test[features], num_iteration = best_iter)[:, 1]
    preds_oof[val_idx]   = val_preds
    valid_profit[n_fold] = prediction_reward(val_y, val_preds)[1]
    preds_test          += test_preds / folds.n_splits

    # importance
    fold_importance_df = pd.DataFrame({
        'Feature':    features,
        'Importance': clf.feature_importances_,
        'Fold':       n_fold + 1,
    })
    importances = pd.concat([importances, fold_importance_df], axis = 0)

    # print performance
    print('----------------------')
    print('FOLD%2d: PROFIT = %.0f' % (n_fold + 1, valid_profit[n_fold]))
    print('----------------------')
    print('')

    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()
    
# print overall performance: total profit is the sum of the per-fold profits
cv_perf = valid_profit.sum()
print('----------------------',
      'TOTAL PROFIT = %.0f' % cv_perf,
      '----------------------',
      sep = '\n')

In [None]:
##### RECHECK PROFIT
# Evaluate prediction_reward once on the complete out-of-fold prediction vector
# instead of fold by fold. The call is left as a bare last expression so that
# the notebook displays whatever prediction_reward returns (the loop above reads
# element [1] of that return value as the profit).
prediction_reward(y, preds_oof)


###### TRACKING RESULTS

# lgb, 5 folds, default feats: 80

In [None]:
##### VARIABLE IMPORTANCE

# load importance: keep the features with the highest mean importance over folds
top_feats = 100
mean_importance = importances[['Feature', 'Importance']].groupby('Feature').mean()
ranked = mean_importance.sort_values(by = 'Importance', ascending = False)
cols = ranked.head(top_feats).index
importance = importances[importances['Feature'].isin(cols)]

# plot variable importance (per-fold rows, ordered by decreasing importance)
plot_data = importance.sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (8,5))
sns.barplot(data = plot_data, x = 'Importance', y = 'Feature')
plt.tight_layout()

# save plot as pdf
plt.savefig('../var_importance.pdf')

# 4. SUBMISSION

In [None]:
# file name: <model>_<cross-validated profit>
# cv_perf is a profit total, not a 0-1 score, so the tag is the profit rounded
# to an integer (the same '%.0f' format used when it is printed). Slicing
# str(round(cv_perf, 6))[2:8] cut off the leading digits and left only a
# fragment such as '.0' for a profit of 80.0.
model = 'lgb_v1'
perf  = '%.0f' % cv_perf
name  = model + '_' + perf

In [None]:
# export OOF preds: one row per training id with its out-of-fold prediction
oof = train[['id']].assign(fraud = preds_oof)
oof_file = '../oof_preds/' + str(name) + '.csv'
oof.to_csv(oof_file, index = False)
oof.head()

In [None]:
# check submission: round the averaged test predictions to integer labels
# (np.round rounds halves to the nearest even value, so exactly 0.5 becomes 0)
hard_labels = np.round(preds_test).astype('int')
sub = pd.DataFrame({'id': test['id'], 'fraud': hard_labels})
sub.head()

In [None]:
# export submission: only the label column is written, without the index
# NOTE(review): `sub` is rebound from a DataFrame to a Series here, so running
# this cell a second time would fail on the 'fraud' lookup -- re-run the
# previous cell first.
sub = sub.loc[:, 'fraud']
sub_file = '../submissions/' + str(name) + '.csv'
sub.to_csv(sub_file, index = False)
sub.shape