# 1. SETTINGS

In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
from sklearn.metrics import log_loss
import copy
import scipy.stats
import os
import time
import pandas as pd

In [2]:
# helper functions
import functions
from functions import *

In [3]:
# pandas options
pd.set_option('display.max_columns', None)

In [4]:
# dark background style
plt.style.use('dark_background')

In [5]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
# garbage collection
import gc
gc.enable()

# 2. DATA PREPARATION

In [7]:
# import CSV
df = pd.read_csv('../data/data_v3.csv')
print(df.shape)

(500000, 13)


In [8]:
# target variable
target = 'fraud'

In [9]:
# partitioning
train = df[df[target].isnull() == False]
test  = df[df[target].isnull() == True]
print(train.shape)
print(test.shape)

(1879, 13)
(498121, 13)


In [10]:
# target variable
y = train[target]
del train[target], test[target]

# 3. MODELING

### PARAMETERS

In [11]:
# drop bad features
excluded_feats = ['id']
features = [f for f in train.columns if f not in excluded_feats]
print(train[features].shape)

(1879, 11)


In [12]:
### PARAMETERS

# settings
cores = 12
seed  = 23

# cross-validation
num_folds = 5
shuffle   = True

# muner of rounds
max_rounds = 600
stopping   = 600
verbose    = 200

# LGB parameters
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'binary',
    'metrics':           'binary_logloss',
    'bagging_fraction':  0.9,
    'feature_fraction':  0.8,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0.01,
    'min_child_weight':  2,
    'min_child_samples': 20,
    'silent':            True,
    'verbosity':         -1,
    'learning_rate':     0.1,
    'max_depth':         7,
    'num_leaves':        70,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          cores,
    'random_state':      seed,
}
# data partitinoing
folds = StratifiedKFold(n_splits = num_folds, random_state = seed, shuffle = shuffle)

In [13]:
# placeholders
clfs = []
valid_profit = np.zeros(num_folds) 
preds_test   = np.zeros(test.shape[0])
preds_oof    = np.zeros(train.shape[0])
importances  = pd.DataFrame()

In [14]:
# SMOTE settings
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
sm = SMOTE(random_state = 23, n_jobs = 10, sampling_strategy = 0.1)
sm2 = SMOTEENN(smote=sm)

### CROSS-VALIDATION

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from imblearn.pipeline import make_pipeline
from collections.abc import Iterable
from sklearn.metrics import make_scorer

qs = [0,100]
scaler_params = {
    #'quantile_range': [(qs[0],i) for i in np.arange(0,101) if i > qs[0]],
    'quantile_range': [(i,qs[1]) for i in np.arange(0,101,5) if i < qs[1]],
    #'quantile_range': [(qs[0],i) for i in np.arange(qs[1]-5,qs[1]+5,0.1) if i > qs[0]],
    #'quantile_range': [(i,qs[1]) for i in np.arange(qs[0]-5,qs[0]+5,0.1) if i < qs[1]],
    #'with_centering': [True,False],
#    'with_scaling': [True,False]
}
params = dict()
params.update({'lgbmclassifier__'+k: v if isinstance(v,Iterable) and not isinstance(v,str) else [v] for k,v in lgb_params.items()})
params.update({'robustscaler__'+k:   v if isinstance(v,Iterable) else [v] for k,v in scaler_params.items()})

lgb.LGBMClassifier()
rs = RandomizedSearchCV(
    estimator = make_pipeline(RobustScaler(),sm2,lgb.LGBMClassifier(objective = custom_loss
                                                           )),
    n_iter = 100,
    param_distributions = params,
    #scoring=make_scorer(prediction_reward,greater_is_better=True), 
    cv=num_folds, 
    n_jobs=cores,
    verbose=2)
rs.fit(train[features],y,lgbmclassifier__eval_metric = prediction_reward)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   5 out of  10 | elapsed:    4.2s remaining:    4.2s
[Parallel(n_jobs=12)]: Done  10 out of  10 | elapsed:    5.0s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('smoteenn', SMOTEENN(enn=None, random_state=None, ratio=None, sampling_strategy='auto',
     smote=SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1...0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0))]),
          fit_params=None, iid='warn', n_iter=2, n_jobs=12,
          param_distributions={'lgbmclassifier__boosting_type': ['gbdt'], 'lgbmclassifier__objective': ['binary'], 'lgbmclassifier__metrics': ['binary_logloss'], 'lgbmclassifier__bagging_fraction': [0.9], 'lgbmclassifier__feature_fraction': [0.8], 'lgbmclassifier__lambda_l1': [0.1], 'lgbmclassifier__lambda_l2... (55, 100), (60, 100), (65, 100), (70, 100), (75, 100), (80, 100), (85, 100), (90, 100), (95, 100)]},
          pre_

In [19]:
lgb_params_star = {k.replace('lgbmclassifier__',''):v for k,v in rs.best_params_.items() if 'lgbmclassifier' in k}
print(rs.best_score_)
#lgb_params = lgb_params_star

0.9856306546035125


In [18]:
sorted(rs.estimator.get_params().keys())

['lgbmclassifier',
 'lgbmclassifier__boosting_type',
 'lgbmclassifier__class_weight',
 'lgbmclassifier__colsample_bytree',
 'lgbmclassifier__importance_type',
 'lgbmclassifier__learning_rate',
 'lgbmclassifier__max_depth',
 'lgbmclassifier__min_child_samples',
 'lgbmclassifier__min_child_weight',
 'lgbmclassifier__min_split_gain',
 'lgbmclassifier__n_estimators',
 'lgbmclassifier__n_jobs',
 'lgbmclassifier__num_leaves',
 'lgbmclassifier__objective',
 'lgbmclassifier__random_state',
 'lgbmclassifier__reg_alpha',
 'lgbmclassifier__reg_lambda',
 'lgbmclassifier__silent',
 'lgbmclassifier__subsample',
 'lgbmclassifier__subsample_for_bin',
 'lgbmclassifier__subsample_freq',
 'memory',
 'robustscaler',
 'robustscaler__copy',
 'robustscaler__quantile_range',
 'robustscaler__with_centering',
 'robustscaler__with_scaling',
 'smoteenn',
 'smoteenn__enn',
 'smoteenn__random_state',
 'smoteenn__ratio',
 'smoteenn__sampling_strategy',
 'smoteenn__smote',
 'smoteenn__smote__k_neighbors',
 'smoteenn_

In [149]:
[i for i in rs.estimator.get_params().keys() if 'eval' in i]
#{k:v for k,v in lgb_params.items() if k in list(rs.estimator.get_params().keys())}

[]

In [150]:
### CROSS-VALIDATION LOOP
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    
    # data partitioning
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]
    
    # scale data
    scaler   = RobustScaler()
    trn_x    = pd.DataFrame(scaler.fit_transform(trn_x),      columns = features)
    val_x    = pd.DataFrame(scaler.transform(val_x),          columns = features)
    tmp_test = pd.DataFrame(scaler.transform(test[features]), columns = features)

    # augment training data with SMOTE
    trn_x, trn_y = sm.fit_sample(trn_x, trn_y)
    trn_x = pd.DataFrame(trn_x, columns = features)
    trn_y = pd.Series(trn_y)
    
    # factor decomposition
    tmp_features = copy.deepcopy(features)
    
    # train lightGBM
    print('Custom early stopping: select the best out of %.0f iterations...' % max_rounds)
    clf = lgb.LGBMClassifier(**lgb_params)
    #clf.set_params(**{'objective': custom_loss})
    clf = clf.fit(trn_x, trn_y, 
                  eval_set              = [(trn_x, trn_y), (val_x, val_y)], 
                  eval_metric           = prediction_reward, 
                  #eval_metric           = "logloss", 
                  #early_stopping_rounds = stopping,
                  verbose               = verbose
                 )
    clfs.append(clf)
    
    # find the best iteration
    best_iter = np.argmax(clf.evals_result_['valid_1']['profit']) + 1

    # predictions
    preds_oof[val_idx]    = clf.predict_proba(val_x, num_iteration = best_iter)[:, 1]
    valid_profit[n_fold]  = prediction_reward(val_y, preds_oof[val_idx])[1]
    preds_test           += clf.predict_proba(tmp_test, num_iteration = best_iter)[:, 1] / folds.n_splits 

    ## importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df['Feature'] = tmp_features
    fold_importance_df['Importance'] = clf.feature_importances_
    fold_importance_df['Fold'] = n_fold + 1
    importances = pd.concat([importances, fold_importance_df], axis = 0)
    
    # print performance
    print('--------------------------------')
    print('FOLD%2d: PROFIT = %.0f' % (n_fold + 1, valid_profit[n_fold]))
    print('--------------------------------')
    print('')
    
    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()
    
    # uncomment for mean target encoding
    #features = [f for f in train.columns if f not in excluded_feats]
    
    
# print overall performance    
cv_perf = np.sum(valid_profit)
print('--------------------------------')
print('TOTAL PROFIT = %.0f' % cv_perf)
print('--------------------------------')

Custom early stopping: select the best out of 600 iterations...
[1]	training's binary_logloss: 0.231828	training's profit: -710	valid_1's binary_logloss: 0.181446	valid_1's profit: -105
[2]	training's binary_logloss: 0.212174	training's profit: -710	valid_1's binary_logloss: 0.17436	valid_1's profit: -105
[3]	training's binary_logloss: 0.196823	training's profit: -710	valid_1's binary_logloss: 0.169997	valid_1's profit: -105
[4]	training's binary_logloss: 0.172281	training's profit: -710	valid_1's binary_logloss: 0.152349	valid_1's profit: -105
[5]	training's binary_logloss: 0.156713	training's profit: -590	valid_1's binary_logloss: 0.140714	valid_1's profit: -105
[6]	training's binary_logloss: 0.146682	training's profit: -510	valid_1's binary_logloss: 0.136586	valid_1's profit: -85
[7]	training's binary_logloss: 0.13198	training's profit: -100	valid_1's binary_logloss: 0.124621	valid_1's profit: -85
[8]	training's binary_logloss: 0.119661	training's profit: 90	valid_1's binary_logloss

In [None]:
##### RECHECK PROFIT  
prediction_reward(y, preds_oof)


###### TRACKING RESULTS (5 folds, strat = True, seed = 23)

# V1: lgb, 5 folds, default features:   80
# V2: add feature:  no. total items:   250
# V3: use logloss for ES, not profit:  260
# V4: add feature: no. weird actions:  275
# V5: custom earlystop for profit:     320
# V6: add SMOTE for minority class:    335
# V7: add robust data scaling:         350 = 95 + 55 + 75 + 35 + 90
# V8: increase learning rate to 0.1:   375 = 95 + 65 + 75 + 50 + 90

In [None]:
##### VARIABLE IMPORTANCE

# load importance    
top_feats = 100
cols = importances[['Feature', 'Importance']].groupby('Feature').mean().sort_values(by = 'Importance', ascending = False)[0:top_feats].index
importance = importances.loc[importances.Feature.isin(cols)]
    
# plot variable importance
plt.figure(figsize = (10, 6))
sns.barplot(x = 'Importance', y = 'Feature', data = importance.sort_values(by = 'Importance', ascending = False))
plt.tight_layout()

# save plot as pdf
plt.savefig('../var_importance.pdf')

### CUTOFF OPTIMIZATION

In [None]:
##### OPTIMIZE CUTOFF

# set step
step = 100

# search
cutoffs = []
profits = []
for i in range(0, step):
    cutoffs.append(i / step)
    profits.append(recompute_reward(y, preds_oof, cutoff = cutoffs[i]))
        
# results
plt.figure(figsize = (10,4))
sns.lineplot(x = cutoffs[10:step], y = profits[10:step], color = 'red')
plt.tight_layout()
plt.axvline(x = cutoffs[np.argmax(profits)], color = 'white', linestyle = '--')
print('- optimal cutoff = %.4f' % cutoffs[np.argmax(profits)])
print('- optimal profit = %.4f' % profits[np.argmax(profits)])
plt.savefig('../cutoff_selection.pdf')

# 4. SUBMISSION

In [None]:
# file name
model = 'lgb_v8'
perf  = str(round(cv_perf, 0).astype('int'))
name  = model + '_' + perf

In [None]:
# export OOF preds
oof = pd.DataFrame({'id': train['id'], 'fraud': preds_oof})
oof.to_csv('../oof_preds/' + str(name) + '.csv', index = False)
oof.head()

In [None]:
# check submission
sub = pd.DataFrame({'id': test['id'], 'fraud': preds_test})
sub['fraud'] = np.round(sub['fraud']).astype('int')
sub.head()

In [None]:
# export submission
sub = sub[['fraud']]
sub.to_csv('../submissions/' + str(name) + '.csv', index = False)
sub.shape

In [None]:
# check correlation with previous submission
prev_sub = pd.read_csv('../submissions/lgb_v8_375.csv')
cor = np.sum(prev_sub[target] == sub.reset_index()[target]) / len(sub)
print("Share of the same predictions: " + str(np.round(cor, 6)))