# SETTINGS

In [718]:
############ LIBRARIES

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('dark_background')
%matplotlib inline

import os
import time
import random
import multiprocessing
import pickle

import scipy.stats
from scipy.stats import rankdata

import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix
from sklearn.neighbors import NearestNeighbors

In [719]:
############ HELPER FUNCTIONS

import functions
import importlib
importlib.reload(functions)
from functions import *

In [720]:
############ RANDOMNESS

# seed function
def seed_everything(seed = 23):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    
# set seed
seed = 23
seed_everything(seed)

# DATA PREPARATION

In [721]:
# import CSV
df = pd.read_pickle('../input/data_v8.pkl')
print(df.shape)

(1097231, 498)


In [722]:
# target variable
target = 'isFraud'

In [723]:
# partitioning
train_df = df[df[target].isnull() == False]
test_df  = df[df[target].isnull() == True]
print(train_df.shape)
print(test_df.shape)
del df

(590540, 498)
(506691, 498)


In [724]:
# sort data
train_df = train_df.sort_values('TransactionID')
test_df  = test_df.sort_values('TransactionID')

train_df = train_df.reset_index(drop = True)
test_df  = test_df.reset_index(drop = True)

In [725]:
# target variable
y = train_df[target]
del train_df[target]
del test_df[target]
g = train_df['DT_M']
#del train, test

In [726]:
### IMPORT OOF PREDS

# which model to stack?
#model = 'lgb'

# threshold
#min_auc = 90

# list names
names = sorted(os.listdir('../oof_preds'))
#names = [n for n in names if int(n[n.rindex('_')+1:-7]) > min_auc]
#names = [s for s in names if model in s]

# preprocessing loop
for name in names:

    # load preds
    tmp_tr = pd.read_csv('../oof_preds/'   + str(name))
    tmp_te = pd.read_csv('../submissions/' + str(name))

    # sort preds by ID
    tmp_tr = tmp_tr.sort_values('TransactionID')
    tmp_te = tmp_te.sort_values('TransactionID')

    
    # cbind data
    if name == names[0]:  
        
        tmp_tr.columns = ['TransactionID', name]    
        tmp_te.columns = ['TransactionID', name]    
        train = tmp_tr 
        test  = tmp_te
        
    else:
        
        del tmp_tr['TransactionID'], tmp_te['TransactionID']
        tmp_tr.columns = [name]    
        tmp_te.columns = [name]    
        train = pd.concat([train, tmp_tr], axis = 1)
        test  = pd.concat([test,  tmp_te], axis = 1)
    
    '''
    del tmp_tr['TransactionID'], tmp_te['TransactionID']
    tmp_tr.columns = [name]    
    tmp_te.columns = [name]    
    train = pd.concat([train, tmp_tr], axis = 1)
    test  = pd.concat([test,  tmp_te], axis = 1)
    '''
        
# display information
print('- Train shape:', train.shape)
print('- Test shape:',  test.shape)

- Train shape: (590540, 48)
- Test shape: (506691, 48)


# MODELING

In [727]:
# drop bad features
excluded_feats = ['TransactionID', 'DT_M']
features = [f for f in train.columns if f not in excluded_feats]
print(train[features].shape)

(590540, 47)


In [728]:
### PARAMETERS

# settings
cores = 24
seed  = 999

# cross-validation
num_folds = 6
shuffle   = True

# muner of rounds
max_rounds = 10000
stopping   = 100
verbose    = 100

# LGB parameters
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'binary',
    'metric':            'auc',
    'bagging_fraction':  0.9,
    'feature_fraction':  0.9,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0,
    'min_child_weight':  0.1,
    'min_child_samples': 20,
    'silent':            True,
    'verbosity':         -1,
    'learning_rate':     0.001,
    'max_depth':         -1,
    'num_leaves':        64,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          cores,
    'random_state':      seed,
}

'''
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':cores,
                    'learning_rate':0.005,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.5,
                    'subsample_freq':1,
                    'subsample':0.8,
                    'n_estimators':10000,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': seed,
                } 
'''

# data partitinoing
#folds = StratifiedKFold(n_splits = num_folds, random_state = seed, shuffle = shuffle)
folds = GroupKFold(n_splits = num_folds)

In [729]:
# placeholders
clfs = []
preds_test   = np.zeros(test.shape[0])
preds_oof    = np.zeros(train.shape[0])
importances  = pd.DataFrame()

In [730]:
### CROSS-VALIDATION LOOP
cv_start  = time.time()
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y, groups = g)):
    
    # data partitioning
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]
    
    # label encoding
    trn_x, val_x, test_x = label_encoding(trn_x, val_x, test[features])
    
    # print data dimensions
    print('Data shape:', trn_x.shape, val_x.shape)
           
    # train lightGBM
    clf = lgb.LGBMClassifier(**lgb_params) 
    clf = clf.fit(trn_x, trn_y, 
                  eval_set              = [(trn_x, trn_y), (val_x, val_y)], 
                  eval_metric           = "auc", 
                  early_stopping_rounds = stopping,
                  verbose               = verbose)
    clfs.append(clf)
    
    # predict validation from the best iteration
    best_iter = clf.best_iteration_
       
    # predictions
    preds_oof[val_idx]    = clf.predict_proba(val_x,  num_iteration = best_iter)[:, 1]
    preds_test           += clf.predict_proba(test_x, num_iteration = best_iter)[:, 1] / folds.n_splits 

    ## importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df['Feature'] = features
    fold_importance_df['Importance'] = clf.feature_importances_
    fold_importance_df['Fold'] = n_fold + 1
    importances = pd.concat([importances, fold_importance_df], axis = 0)
    
    # print performance
    print('----------------------')
    print('FOLD%2d: AUC = %.6f' % (n_fold + 1, roc_auc_score(y[val_idx], preds_oof[val_idx])))
    print('----------------------')
    print('')
        
    # clear memory
    #del trn_x, trn_y, val_x, val_y
    #gc.collect()
    
    
# print overall performance    
cv_perf = roc_auc_score(y, preds_oof)
print('--------------------------------')
print('- OOF AUC = %.6f' % cv_perf)
print('- CV TIME = {:.2f} min'.format((time.time() - cv_start) / 60))
print('--------------------------------')

Data shape: (453219, 47) (137321, 47)
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.953215	valid_1's auc: 0.924427
Early stopping, best iteration is:
[75]	training's auc: 0.953117	valid_1's auc: 0.924596
----------------------
FOLD 1: AUC = 0.924596
----------------------

Data shape: (488908, 47) (101632, 47)
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.948574	valid_1's auc: 0.947596
[200]	training's auc: 0.950332	valid_1's auc: 0.949372
[300]	training's auc: 0.951156	valid_1's auc: 0.949724
[400]	training's auc: 0.951916	valid_1's auc: 0.949975
Early stopping, best iteration is:
[392]	training's auc: 0.951883	valid_1's auc: 0.949982
----------------------
FOLD 2: AUC = 0.949982
----------------------

Data shape: (497955, 47) (92585, 47)
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.948662	valid_1's auc: 0.946709
[200]	training's auc: 0.950077	valid_1's auc: 0.9

In [731]:
############ RECHECK PERFORMANCE  

# check performance
print(np.round(roc_auc_score(y, preds_oof), 5))

0.927


# SUBMISSION

In [732]:
# file name
model = 'stack_lgb'
perf  = str(round(cv_perf, 6))[2:7]
name  = model + str(len(features)) + '_' + perf
name

'stack_lgb47_927'

In [733]:
# export submission
sub = pd.DataFrame({'TransactionID': test['TransactionID'], 'isFraud': preds_test})
sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.023594
1,3663550,0.023705
2,3663551,0.023598
3,3663552,0.023594
4,3663553,0.023599


In [734]:
# check rank correlation with the best submission
from scipy.stats import spearmanr
sub  = sub.sort_values('TransactionID')
best = pd.read_csv("../submissions/stack_lgb46_94648.csv")
best = best.sort_values('TransactionID')
spearmanr(sub.isFraud, best.isFraud)

SpearmanrResult(correlation=0.9877730760217902, pvalue=0.0)