### Load Libraries

In [11]:
# Load libraries
import datetime as dt
from datetime import datetime
import time 
import calendar
start_time = datetime.now()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import scipy.stats as sps
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import StackingClassifier

import keras

import math

print('LOAD DURATION:',datetime.now() - start_time)

LOAD DURATION: 0:00:00.005774


In [2]:
%%time
# Read both CSV inputs into DataFrames: `df` from train_test.csv and `oot`
# from oot.csv (presumably the out-of-time holdout -- confirm with the data owner).
df, oot = (pd.read_csv(csv_path) for csv_path in ('train_test.csv', 'oot.csv'))

CPU times: user 1.33 s, sys: 227 ms, total: 1.56 s
Wall time: 1.56 s


In [7]:
# Separate the 'fraud_label' target from the remaining (feature) columns,
# once for the train/test frame and once for the oot frame.
y, oot_y = df['fraud_label'], oot['fraud_label']

X = df.drop('fraud_label', axis=1)
oot_x = oot.drop('fraud_label', axis=1)

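**FDR Function** — share of all fraud-labelled records that fall within the top 3% of records when ranked by predicted fraud probability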

In [4]:
# create functions to calculate fdr
def fdr_cal(x_data, y_data, model_choice, cutoff=0.03):
    """Compute FDR at a population cutoff for a fitted binary classifier.

    Rows of ``x_data`` are ranked by the model's predicted probability for the
    second class (``predict_proba`` column 1). The result is the sum of the
    labels found in the top ``cutoff`` share of rows divided by the sum of all
    labels.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Feature rows, passed unchanged to ``model_choice.predict_proba``.
    y_data : array-like
        Labels, one per row of ``x_data`` and in the same row order. They are
        matched to the rows by position, so a Series with a different index
        (or a plain NumPy array) is handled correctly.
    model_choice : object
        Fitted estimator exposing ``predict_proba`` that returns two columns.
    cutoff : float, default 0.03
        Share of the population to keep after ranking.

    Returns
    -------
    float
        Captured share of positive labels; ``nan`` when ``y_data`` contains
        no positives.

    Raises
    ------
    ValueError
        If the label count does not match the row count, or if
        ``predict_proba`` does not return exactly two columns.
    """
    labels = np.asarray(y_data).ravel()
    if len(labels) != len(x_data):
        raise ValueError(
            f'x_data has {len(x_data)} rows but y_data has {len(labels)} labels'
        )

    proba = np.asarray(model_choice.predict_proba(x_data))
    if proba.ndim != 2 or proba.shape[1] != 2:
        raise ValueError('predict_proba must return exactly two columns')

    total_positive = labels.sum()
    if total_positive == 0:
        # No positives to capture: the ratio is undefined.
        return float('nan')

    # Only the label and the score are needed for ranking; the feature
    # columns are deliberately not copied.
    ranked = pd.DataFrame({'fraud_label': labels, 'prob_1': proba[:, 1]})
    ranked = ranked.sort_values('prob_1', ascending=False)

    pop = int(round(len(x_data) * cutoff))
    return float(ranked['fraud_label'].head(pop).sum() / total_positive)

**GradientBoosting Model Tuning**

In [None]:
# Build the results table for GBM tuning: one row per hyperparameter
# combination, plus three accumulator columns (Train / Test / OOT) that the
# tuning loop adds FDR values into.
num_tree = [1000,1500,1800]
max_depth = 5
learn_rate = [0.02,0.05]
max_feature = 5
min_samples_leaf = 30
min_samples_split = [500, 1500]

# Create the frame once from a list of records rather than enlarging an empty
# DataFrame cell by cell with .loc. Row order (learning rate -> trees ->
# min_samples_split) and column order are the same as before.
gbm_table = pd.DataFrame(
    [
        {
            'n_estimators': num,
            'learning_rate': lr,
            'max_depth': max_depth,
            'max_feature': max_feature,
            'min_samples_leaf': min_samples_leaf,
            'min_samples_split': mss,
        }
        for lr in learn_rate
        for num in num_tree
        for mss in min_samples_split
    ]
)

# Float accumulators, initialised to zero.
gbm_table['Train'] = 0.000
gbm_table['Test'] = 0.000
gbm_table['OOT'] = 0.000

In [None]:
# Display the tuning table (bare last expression -> rich notebook output).
gbm_table

In [None]:
# For every GBM hyperparameter combination, fit one model per K-fold split and
# accumulate FDR@3% on the fold's training rows, its held-out rows and the OOT
# set into gbm_table. The accumulated values are SUMS over the folds; divide
# by n_choice afterwards to obtain averages.

# using KFold
n_choice = 10
kf = KFold(n_splits=n_choice)


num_tree = [1000,1500,1800]
max_depth = 5
learn_rate = [0.02,0.05]
max_feature = 5
min_samples_leaf = 30
min_samples_split = [500, 1500]

i=0
for lr in learn_rate:
    for num in num_tree:
        for mss in min_samples_split:
            # The row for this combination does not change across folds, so
            # the boolean mask is computed once per combination.
            row_mask = ((gbm_table['n_estimators']==num)
                        &(gbm_table['learning_rate']==lr)
                        &(gbm_table['min_samples_split']==mss))

            for train_index, test_index in kf.split(X):
                X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
                # y is a 1-D Series, so .iloc takes a single positional
                # indexer; `y.iloc[idx, :]` raises "Too many indexers".
                y_train = y.iloc[train_index].values.ravel()
                y_test = y.iloc[test_index].values.ravel()

                gbm = GradientBoostingClassifier(learning_rate=lr, n_estimators =num,
                                                 max_depth=max_depth, max_features=max_feature,
                                                 min_samples_split=mss,
                                                 min_samples_leaf=min_samples_leaf)
                gbm.fit(X_train,y_train)

                gbm_table.loc[row_mask,'Train'] += fdr_cal(X_train, y_train, gbm)
                gbm_table.loc[row_mask,'Test'] += fdr_cal(X_test, y_test, gbm)
                gbm_table.loc[row_mask,'OOT'] += fdr_cal(oot_x, oot_y, gbm)
                i+=1
                print(f'Iteration {i} is completed.')

In [None]:
# Write the table to Excel as it stands at this point in the notebook
# (fold totals, before the division by n_choice in the following cell).
gbm_table.to_excel(excel_writer='gbm_table_results_1.xlsx')

In [None]:
# Turn the per-fold sums into per-fold averages.
# NOTE: this cell is not idempotent -- running it twice divides twice.
fdr_columns = ['Train', 'Test', 'OOT']
gbm_table[fdr_columns] = gbm_table[fdr_columns] / n_choice

In [None]:
# Write the table again, now holding the values divided by n_choice.
gbm_table.to_excel(excel_writer='gbm_table_results_2.xlsx')

In [None]:
# Keep the ten configurations with the highest OOT value.
ranked_by_oot = gbm_table.sort_values(by='OOT', ascending=False)
top10 = ranked_by_oot.head(10)

In [None]:
# Display the ten best rows by OOT value.
top10

In [None]:
# Wall-clock time elapsed since start_time was recorded in the import cell.
elapsed = datetime.now() - start_time
print('Total Time:', elapsed)

### Ensemble Stacking

In [8]:
# Stacked ensemble: six base classifiers whose out-of-fold predict_proba
# outputs feed a gradient-boosting meta learner. FDR is then reported on the
# training split, the held-out test split and the OOT set.

# A single seed for the split and every stochastic estimator, so the reported
# FDR values can be reproduced (42 matches the random forest's existing seed).
seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=seed)

# define base models
level0 = [
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('cart', DecisionTreeClassifier(random_state=seed)),
    ('bayes', GaussianNB()),
    ('mlp', MLPClassifier(hidden_layer_sizes=(10,),
                          max_iter=200,
                          activation='logistic',
                          solver='adam',
                          alpha=0.001,
                          learning_rate='adaptive',
                          learning_rate_init=0.001,
                          random_state=seed)),
    ('rfl', RandomForestClassifier(criterion='gini',
                                   max_features='sqrt',
                                   min_samples_leaf=1,
                                   min_samples_split=15,
                                   n_estimators=200,
                                   bootstrap=True,
                                   random_state=seed)),
]

# define meta learner model
level1 = GradientBoostingClassifier(learning_rate=0.05, n_estimators =1000,
                                    max_depth=5, max_features=0.2,
                                    min_samples_split=500,
                                    min_samples_leaf=30,
                                    random_state=seed)


# define the stacking ensemble
stack = StackingClassifier(estimators=level0, final_estimator=level1, cv=5,
                           stack_method='predict_proba',n_jobs=-1)
# fit on the training split only; X_test and the OOT set stay unseen
stack.fit(X_train, y_train)

train_result = fdr_cal(X_train, y_train, stack)
test_result = fdr_cal(X_test, y_test, stack)
oot_result = fdr_cal(oot_x, oot_y, stack)

In [9]:
# Report the stacked model's FDR for each evaluation set, one line per set.
stacked_fdr_report = (
    ('Stacked Train FDR:', train_result),
    ('Stacked Test FDR:', test_result),
    ('Stacked OOT FDR:', oot_result),
)
for report_label, report_value in stacked_fdr_report:
    print(report_label, report_value)

Stacked Train FDR: 0.5854423032273044
Stacked Test FDR: 0.5755571030640668
Stacked OOT FDR: 0.5536462699077954


In [10]:
# Wall-clock time elapsed since start_time was recorded in the import cell.
elapsed = datetime.now() - start_time
print('Total Time:', elapsed)

Total Time: 5:27:58.600756


### Grid Search CV

In [None]:
# Finalize hyperparameters
from sklearn.model_selection import GridSearchCV

# Parameter grid based on prior best results.
# The keys must be the estimator's own constructor argument names
# (n_estimators / learning_rate / max_features); GridSearchCV rejects any
# other key with an "Invalid parameter" ValueError when fit() is called.
parameters = {
    'n_estimators': [1000,1500,1800],
    'max_depth': [5],
    'learning_rate': [0.02,0.05],
    'max_features': [5],
    'min_samples_leaf': [30],
    'min_samples_split': [500, 1500]
}

# perform grid search
# NOTE: X_train / y_train are whatever an earlier cell last assigned to those
# names; run the cell that creates the intended split before this one.
gbm = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator=gbm, param_grid = parameters, cv=2, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)