### Load Libraries

In [8]:
# Load libraries
import datetime as dt
from datetime import datetime
import time 
import calendar
start_time = datetime.now()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import scipy.stats as sps
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

import keras


print('LOAD DURATION:',datetime.now() - start_time)

LOAD DURATION: 0:00:00.000889


### Load Data

In [9]:
%%time
# Read the feature table from '400_var.csv'; the first CSV column becomes the
# row index. The %%time magic reports how long the read takes.
data = pd.read_csv('400_var.csv', index_col=0)

CPU times: user 33 s, sys: 4.08 s, total: 37.1 s
Wall time: 37.2 s


### Selected Variables

In [None]:
# NOTE(review): `score` is not created in any cell shown here — it must already
# exist in the kernel, presumably as a per-variable table with 'Variable', 'ks'
# and 'FDR at 3%' columns; confirm where it is built.
# Rank each metric in ascending order, so a larger metric value gets a larger rank.
score['rank_ks'] = score['ks'].rank(ascending = True)
score['rank_FDR'] = score['FDR at 3%'].rank(ascending = True)

# Average the two ranks and sort in place so the highest average rank comes first.
score['average_rank'] = (score['rank_ks'] + score['rank_FDR']) / 2
score.sort_values(by=['average_rank'], ascending=False, inplace=True)

In [None]:
# Keep the 'Variable' names of the first 101 rows of `score`, then add 'date'.
# NOTE(review): 101 presumably covers 100 features plus a 'fraud_label' row,
# since a later cell drops 'fraud_label' from this selection — confirm.
selected = list(score[:101]['Variable'])
selected.append('date')

In [None]:
# Subset to the selected columns, parse 'date', and keep only rows dated
# strictly between 2016-01-14 and 2016-11-01.
df_s = data.loc[:, selected].copy()
df_s['date'] = pd.to_datetime(df_s['date'])
after_start = df_s['date'] > '2016-01-14'
before_oot = df_s['date'] < '2016-11-01'
selection = df_s[after_start & before_oot]

In [None]:
# The label column is the RFECV target; the features are every remaining
# column except the date.
select_y = selection['fraud_label']
select_x = selection.drop(['fraud_label', 'date'], axis=1)

In [None]:
# Recursive feature elimination with 2-fold CV, scored by ROC AUC, using a
# class-weighted L2 logistic regression as the estimator; one feature is
# removed per step (step=1).
# Original author's caution: do not run both RFECV cells in one go, because
# "n_jobs = -1 can only be used once".
# NOTE(review): that limitation is not demonstrated in this notebook — confirm.
model = LogisticRegression(penalty='l2', class_weight='balanced')
rfecv = RFECV(estimator=model, step=1, cv=2, verbose=2, n_jobs=-1, scoring="roc_auc")
rfecv.fit(select_x, select_y)

In [None]:
# Pair every feature with its RFECV ranking and list them in ascending
# (ranking, name) order.
rank_name_pairs = sorted(
    (round(rank, 3), name)
    for rank, name in zip(rfecv.ranking_, select_x.columns)
)
var_select_1 = pd.DataFrame(rank_name_pairs, columns=['ranking', 'variable'])
print(var_select_1)

In [None]:
# Plot the cross-validated score against the number of features kept.
# The RFECV above is configured with scoring="roc_auc", so the y-axis is ROC AUC
# (not a count of correct classifications).
# `grid_scores_` no longer exists in recent scikit-learn releases; they expose
# the per-subset mean scores as cv_results_['mean_test_score'] instead, so use
# that when it is available and fall back to the legacy attribute otherwise.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (ROC AUC)")
plt.plot(range(1, len(cv_scores)+1), cv_scores)
plt.show()

In [None]:
# Shortlist of 63 feature names used to subset `selection` for the second
# (decision-tree) RFECV pass.
# NOTE(review): the name suggests the list was curated by hand from the first
# RFECV ranking — confirm how it was derived.
var_select_manual = [
 'address_count_0',
 'address_count_0_by_30',
 'address_count_0_by_7',
 'address_count_1',
 'address_count_1_by_14',
 'address_count_1_by_7',
 'address_count_3',
 'address_count_30',
 'address_unique_count_for_name_dob_1',
 'address_unique_count_for_name_dob_14',
 'address_unique_count_for_name_dob_3',
 'address_unique_count_for_name_dob_30',
 'address_unique_count_for_name_dob_7',
 'address_unique_count_for_ssn_1',
 'address_unique_count_for_ssn_14',
 'address_unique_count_for_ssn_3',
 'address_unique_count_for_ssn_30',
 'address_unique_count_for_ssn_7',
 'fulladdress_count_0',
 'fulladdress_count_0_by_14',
 'fulladdress_count_0_by_3',
 'fulladdress_count_1',
 'fulladdress_count_14',
 'fulladdress_count_1_by_30',
 'fulladdress_count_7',
 'fulladdress_day_since',
 'fulladdress_homephone_count_0_by_14',
 'fulladdress_homephone_count_0_by_30',
 'fulladdress_homephone_count_3',
 'fulladdress_homephone_count_7',
 'fulladdress_homephone_day_since',
 'homephone_count_14',
 'homephone_count_3',
 'homephone_count_7',
 'homephone_unique_count_for_name_dob_3',
 'homephone_unique_count_for_name_dob_7',
 'homephone_unique_count_for_ssn_3',
 'homephone_unique_count_for_ssn_7',
 'name_count_14',
 'name_count_30',
 'name_count_7',
 'name_dob_count_0_by_14',
 'name_dob_count_0_by_30',
 'name_dob_count_7',
 'name_dob_day_since',
 'name_dob_unique_count_for_address_30',
 'name_dob_unique_count_for_homephone_30',
 'ssn_count_0_by_14',
 'ssn_count_0_by_30',
 'ssn_count_7',
 'ssn_day_since',
 'ssn_dob_count_0_by_14',
 'ssn_dob_count_0_by_30',
 'ssn_dob_count_7',
 'ssn_dob_day_since',
 'ssn_name_count_0_by_14',
 'ssn_name_count_0_by_30',
 'ssn_name_count_7',
 'ssn_name_day_since',
 'ssn_name_dob_count_14',
 'ssn_name_dob_count_30',
 'ssn_name_dob_count_7',
 'ssn_name_dob_day_since'
]

In [None]:
# Feature matrix for the second RFECV pass: only the shortlisted columns.
select_x_2 = selection.loc[:, var_select_manual]

In [None]:
# Second RFECV pass: same settings as the first (step=1, 2-fold CV, ROC AUC)
# but with a class-weighted decision tree as the estimator, fitted on the
# shortlisted columns in `select_x_2`.
# NOTE(review): no random_state is set on the tree, so rankings may differ
# between runs — consider fixing a seed.
model2 = DecisionTreeClassifier(class_weight='balanced')
rfecv2 = RFECV(estimator=model2, step=1, cv=2, verbose=2, n_jobs=-1, scoring="roc_auc")
rfecv2.fit(select_x_2, select_y)

In [None]:
# Pair every shortlisted feature with its ranking from the second RFECV pass
# and list them in ascending (ranking, name) order.
rank_name_pairs_2 = sorted(
    (round(rank, 3), name)
    for rank, name in zip(rfecv2.ranking_, select_x_2.columns)
)
var_select_2 = pd.DataFrame(rank_name_pairs_2, columns=['ranking', 'variable'])
print(var_select_2)

In [14]:
# Candidate feature list: 29 distinct feature names plus 'date' and 'fraud_label'.
# 'fulladdress_count_14' used to appear twice; the repeat is removed because a
# duplicated name yields duplicated columns when the list is used as a selector.
# NOTE(review): the repeated entry may have been a typo for a different
# feature — confirm against the RFECV output.
# NOTE(review): this list is not referenced by any other cell shown here.
var_list = [
 'address_count_0_by_14',
 'address_count_0_by_30',
 'address_count_14',
 'address_count_30',
 'address_count_7',
 'address_unique_count_for_name_dob_14',
 'address_unique_count_for_name_dob_30',
 'address_unique_count_for_ssn_14',
 'address_unique_count_for_ssn_30',
 'fulladdress_count_0_by_14',
 'fulladdress_count_14',
 'fulladdress_count_30',
 'fulladdress_day_since',
 'fulladdress_homephone_count_0_by_30',
 'fulladdress_homephone_count_14',
 'fulladdress_homephone_count_30',
 'fulladdress_homephone_day_since',
 'name_dob_count_0_by_14',
 'name_dob_count_0_by_30',
 'ssn_count_30',
 'ssn_dob_count_14',
 'ssn_dob_count_30',
 'ssn_dob_day_since',
 'ssn_lastname_count_30',
 'ssn_name_dob_day_since',
 'ssn_day_since',
 'address_day_since',
 'name_dob_day_since',
 'fulladdress_count_0_by_7',
 'date',
 'fraud_label'
]

# Alternative feature list: 30 feature names plus 'date' and 'fraud_label'.
# NOTE(review): this list is not referenced by any other cell shown here.
new_list = [
 'address_count_30',
 'fulladdress_day_since',
 'name_dob_unique_count_for_homephone_30',
 'homephone_unique_count_for_ssn_3',
 'ssn_dob_day_since',
 'address_unique_count_for_ssn_14',
 'ssn_day_since',
 'homephone_count_14',
 'fulladdress_homephone_day_since',
 'name_dob_day_since',
 'homephone_unique_count_for_ssn_7',
 'ssn_name_day_since',
 'name_count_30',
 'homephone_unique_count_for_name_dob_3',
 'ssn_name_dob_day_since',
 'homephone_count_7',
 'name_count_14',
 'homephone_count_3',
 'name_count_7',
 'homephone_unique_count_for_name_dob_7',
 'address_count_1_by_7',
 'address_count_1_by_14',
 'address_unique_count_for_ssn_30',
 'address_unique_count_for_name_dob_3',
 'address_count_0_by_30',
 'address_unique_count_for_name_dob_30',
 'address_count_0_by_7',
 'name_dob_unique_count_for_address_30',
 'address_unique_count_for_name_dob_7',
 'ssn_count_7',
 'date',
 'fraud_label'
]

# Column list used to build the modelling frame `df`: 25 feature names plus
# 'date' and 'fraud_label'. The feature order matters downstream, because the
# logistic-regression loop takes the first N feature columns.
combined_list = [
 'address_count_30',
 'fulladdress_day_since',
 'name_dob_unique_count_for_homephone_30',
 'homephone_unique_count_for_ssn_3',
 'ssn_dob_day_since',
 'address_unique_count_for_name_dob_7',
 'fulladdress_homephone_day_since',
 'ssn_name_dob_day_since',
 'ssn_count_30',
 'address_unique_count_for_name_dob_30',
 'address_unique_count_for_ssn_30',
 'address_count_1_by_14',
 'ssn_lastname_count_30',
 'address_count_0_by_7',
 'address_unique_count_for_name_dob_3',
 'name_dob_unique_count_for_address_30',
 'ssn_count_7',
 'address_count_0_by_30',
 'address_count_1_by_7',
 'ssn_dob_count_14',
 'name_dob_count_0_by_14',
 'fulladdress_homephone_count_14',
 'name_dob_count_0_by_30',
 'fulladdress_homephone_count_30',
 'ssn_dob_count_30',
 'date',
 'fraud_label'
]

In [26]:
# Build the modelling frame from `combined_list`, which holds 25 feature
# columns plus 'date' and 'fraud_label' (not 30 features).
df = data[combined_list].copy()

In [27]:
# Parse the dates, then split by time: modelling rows fall strictly between
# 2016-01-14 and 2016-11-01; everything from 2016-11-01 onward is out-of-time.
df['date'] = pd.to_datetime(df['date'])

# train_test
in_model_window = (df['date'] > '2016-01-14') & (df['date'] < '2016-11-01')
train_test = df[in_model_window]

oot_data = df[df['date'] >= '2016-11-01']

In [28]:
# Separate features from the label for both the modelling and out-of-time sets.
# `y` stays a one-column DataFrame (later cells index it with .iloc[rows, :]);
# `oot_y` is a Series.
non_feature_cols = ['fraud_label', 'date']

X = train_test.drop(non_feature_cols, axis=1)
y = train_test.loc[:, ['fraud_label']]

oot_x = oot_data.drop(non_feature_cols, axis=1)
oot_y = oot_data.loc[:, 'fraud_label']

In [30]:
X.describe()

Unnamed: 0,address_count_30,fulladdress_day_since,name_dob_unique_count_for_homephone_30,homephone_unique_count_for_ssn_3,ssn_dob_day_since,address_unique_count_for_name_dob_7,fulladdress_homephone_day_since,ssn_name_dob_day_since,ssn_count_30,address_unique_count_for_name_dob_30,...,name_dob_unique_count_for_address_30,ssn_count_7,address_count_0_by_30,address_count_1_by_7,ssn_dob_count_14,name_dob_count_0_by_14,fulladdress_homephone_count_14,name_dob_count_0_by_30,fulladdress_homephone_count_30,ssn_dob_count_30
count,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,...,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0,794996.0
mean,1.080105,326.637368,1.020345,1.447595,332.680089,1.036903,331.207869,332.822667,1.050981,1.05341,...,1.019843,1.025737,29.3424,6.960817,1.031863,13.892903,1.034473,29.573966,1.049368,1.046209
std,0.669916,98.866112,0.45604,0.857291,91.609103,0.588851,93.407625,91.423906,0.503758,0.646476,...,0.455393,0.442692,3.242724,0.376599,0.461563,0.897749,0.482971,2.573872,0.508366,0.48708
min,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.304348,0.583333,1.0,0.636364,1.0,1.363636,1.0,1.0
25%,1.0,365.0,1.0,1.0,365.0,1.0,365.0,365.0,1.0,1.0,...,1.0,1.0,30.0,7.0,1.0,14.0,1.0,30.0,1.0,1.0
50%,1.0,365.0,1.0,1.0,365.0,1.0,365.0,365.0,1.0,1.0,...,1.0,1.0,30.0,7.0,1.0,14.0,1.0,30.0,1.0,1.0
75%,1.0,365.0,1.0,2.0,365.0,1.0,365.0,365.0,1.0,1.0,...,1.0,1.0,30.0,7.0,1.0,14.0,1.0,30.0,1.0,1.0
max,30.0,365.0,34.0,30.0,365.0,30.0,365.0,365.0,34.0,30.0,...,34.0,34.0,30.0,7.0,34.0,14.0,29.0,30.0,29.0,34.0


In [33]:
oot_x.describe()

Unnamed: 0,address_count_30,fulladdress_day_since,name_dob_unique_count_for_homephone_30,homephone_unique_count_for_ssn_3,ssn_dob_day_since,address_unique_count_for_name_dob_7,fulladdress_homephone_day_since,ssn_name_dob_day_since,ssn_count_30,address_unique_count_for_name_dob_30,...,name_dob_unique_count_for_address_30,ssn_count_7,address_count_0_by_30,address_count_1_by_7,ssn_dob_count_14,name_dob_count_0_by_14,fulladdress_homephone_count_14,name_dob_count_0_by_30,fulladdress_homephone_count_30,ssn_dob_count_30
count,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,...,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0,166493.0
mean,1.078346,309.457359,1.024001,1.44567,317.560624,1.032602,315.568492,317.748776,1.055804,1.050999,...,1.023412,1.029815,29.342424,6.960674,1.035809,13.890628,1.034146,29.567146,1.049666,1.050417
std,0.658196,106.544062,0.513705,0.872399,100.255284,0.545742,101.713377,100.089049,0.566275,0.632015,...,0.513159,0.500642,3.244908,0.378508,0.518074,0.913591,0.498664,2.603736,0.530106,0.544683
min,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.578947,0.7,1.0,0.7,1.0,1.5,1.0,1.0
25%,1.0,324.0,1.0,1.0,365.0,1.0,365.0,365.0,1.0,1.0,...,1.0,1.0,30.0,7.0,1.0,14.0,1.0,30.0,1.0,1.0
50%,1.0,365.0,1.0,1.0,365.0,1.0,365.0,365.0,1.0,1.0,...,1.0,1.0,30.0,7.0,1.0,14.0,1.0,30.0,1.0,1.0
75%,1.0,365.0,1.0,2.0,365.0,1.0,365.0,365.0,1.0,1.0,...,1.0,1.0,30.0,7.0,1.0,14.0,1.0,30.0,1.0,1.0
max,30.0,365.0,29.0,32.0,365.0,30.0,365.0,365.0,29.0,30.0,...,29.0,29.0,30.0,7.0,29.0,14.0,30.0,30.0,30.0,29.0


### FDR@3% Calculation (LogReg)

In [18]:
# Helper to compute the fraud detection rate (FDR) at a score cut-off.
def fdr_cal(x_data, y_data, model_choice, top_frac=0.03):
    """Return the share of all fraud captured in the highest-scored records.

    Records are scored with ``model_choice.predict_proba`` (second column),
    the top ``top_frac`` of them are selected, and the number of positive
    labels among those is divided by the total number of positive labels.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Feature rows to score; it is not modified.
    y_data : array-like
        0/1 labels in the same row order as ``x_data``. A Series, a one-column
        DataFrame, or a NumPy array are all accepted; labels are matched to
        rows by position.
    model_choice : fitted classifier
        Any object exposing ``predict_proba`` that returns two columns.
    top_frac : float, default 0.03
        Fraction of rows to treat as the review population.

    Returns
    -------
    float
        Captured fraud / total fraud, or NaN when ``y_data`` has no positives.

    Raises
    ------
    ValueError
        If the label count differs from the row count, or ``predict_proba``
        does not return exactly two columns.
    """
    # Flatten so a one-column DataFrame gives a scalar result, not a Series.
    labels = np.asarray(y_data).ravel()
    n_rows = len(x_data)
    if len(labels) != n_rows:
        raise ValueError(
            'x_data has %d rows but y_data has %d labels' % (n_rows, len(labels)))

    total_fraud = labels.sum()
    if total_fraud == 0:
        return float('nan')

    proba = np.asarray(model_choice.predict_proba(x_data))
    if proba.ndim != 2 or proba.shape[1] != 2:
        raise ValueError('predict_proba must return exactly two columns')

    pop = int(round(n_rows * top_frac))
    # Stable sort on the negated score: descending order, ties keep row order,
    # so the result is deterministic.
    top_rows = np.argsort(-proba[:, 1], kind='stable')[:pop]
    fdr = float(labels[top_rows].sum() / total_fraud)

    return fdr

In [19]:
# Ordered list of the feature column names in X.
final_var = X.columns.tolist()

In [24]:
%%time
# create a for loop to calculate all logreg FDR@3%

# using KFold
kf = KFold(n_splits=5)

var_num = [5,10,15,20,25,30]
train_fdr = []
test_fdr = []
oot_fdr = []

fdr_table = pd.DataFrame(var_num, columns=['Number of Variables'])

# loop through each setting for variables

for num in var_num:
    train_fdr_mlr = []
    test_fdr_mlr = []
    oot_fdr_mlr = []

    cols = final_var[:num]
    X_1 = X[cols]
    X_oot = oot_x[cols]
    
    for train_index, test_index in kf.split(X_1):
        X_train, X_test = X_1.iloc[train_index,:], X_1.iloc[test_index,:]
        y_train, y_test = y.iloc[train_index, :], y.iloc[test_index, :]
        
        mlr = LogisticRegression()
        mlr.fit(X_train, y_train)
        
        fdr_train = fdr_cal(X_train, y_train, mlr)
        fdr_test = fdr_cal(X_test, y_test, mlr)
        fdr_oot = fdr_cal(X_oot, oot_y, mlr)
        
        train_fdr_mlr.append(fdr_train)
        test_fdr_mlr.append(fdr_test)
        oot_fdr_mlr.append(fdr_oot)
        
    train_fdr.append(np.mean(train_fdr_mlr))
    test_fdr.append(np.mean(test_fdr_mlr))
    oot_fdr.append(np.mean(oot_fdr_mlr))

fdr_table['Train'] = train_fdr
fdr_table['Test'] = test_fdr
fdr_table['OOT'] = oot_fdr

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (m

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pl

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

CPU times: user 17min 58s, sys: 2min 55s, total: 20min 53s
Wall time: 2min 48s


In [25]:
# Tag every row with the model name, then display the table.
fdr_table['Model'] = 'Logistic Regression'
fdr_table

Unnamed: 0,Number of Variables,Train,Test,OOT,Model
0,5,0.472897,0.469605,0.453646,Logistic Regression
1,10,0.493447,0.492659,0.480302,Logistic Regression
2,15,0.507995,0.502294,0.480553,Logistic Regression
3,20,0.484128,0.481684,0.465298,Logistic Regression
4,25,0.496845,0.492098,0.475524,Logistic Regression
5,30,0.496628,0.493988,0.474518,Logistic Regression


In [101]:
# Write the logistic-regression FDR table to 'fdr_table.xlsx' in the working directory.
fdr_table.to_excel('fdr_table.xlsx')

### FDR@3% Calculation (Boosted Tree - GradientBoostingModel)

In [111]:
# Result table for the gradient-boosting grid: one row per
# (number of trees, max depth, learning rate) combination, with Train/Test/OOT
# accumulators that start at zero.
num_tree = [100,200,500]
max_depth = [1,2]
learn_rate = [0.1, 0.01]

# Build every row up front instead of enlarging the frame one cell at a time.
# Row order: learning rate outermost, then depth, then tree count.
grid_rows = [(num, md, lr)
             for lr in learn_rate
             for md in max_depth
             for num in num_tree]
gbm_table = pd.DataFrame(grid_rows, columns=['# of Trees', 'Max Depth', 'Learning Rate'])

# Hyper-parameter columns are stored as object dtype.
for col in ['# of Trees', 'Max Depth', 'Learning Rate']:
    gbm_table[col] = gbm_table[col].astype('object')

for col in ['Train', 'Test', 'OOT']:
    gbm_table[col] = 0.000

In [112]:
%%time
# create a for loop to calculate all GBM FDR@3%

# using KFold
kf = KFold(n_splits=5)


num_tree = [100,200,500]
max_depth = [1,2]
learn_rate = [0.1, 0.01]

for lr in learn_rate:
    for md in max_depth:
        for num in num_tree:
            for train_index, test_index in kf.split(X):
                X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
                y_train, y_test = y.iloc[train_index, :].values.ravel(), y.iloc[test_index, :].values.ravel()
                
                gbm = GradientBoostingClassifier(learning_rate=lr, n_estimators =num, max_depth=md)
                gbm.fit(X_train,y_train)
                
                gbm_table.loc[(gbm_table['# of Trees']==num)\
                              &(gbm_table['Max Depth']==md)\
                              &(gbm_table['Learning Rate']==lr),'Train'] += fdr_cal(X_train, y_train, gbm)
                gbm_table.loc[(gbm_table['# of Trees']==num)\
                              &(gbm_table['Max Depth']==md)\
                              &(gbm_table['Learning Rate']==lr),'Test']+= fdr_cal(X_test, y_test, gbm)
                gbm_table.loc[(gbm_table['# of Trees']==num)\
                              &(gbm_table['Max Depth']==md)\
                              &(gbm_table['Learning Rate']==lr),'OOT']+= fdr_cal(oot_x, oot_y, gbm)

CPU times: user 49min 47s, sys: 9.24 s, total: 49min 56s
Wall time: 49min 58s


In [113]:
# Turn the per-configuration totals accumulated over the 5 folds into means.
fold_cols = ['Train','Test','OOT']
gbm_table[fold_cols] = gbm_table[fold_cols] / 5

In [114]:
# Write the gradient-boosting FDR table to 'gbm_table.xlsx' in the working directory.
gbm_table.to_excel('gbm_table.xlsx')

In [115]:
gbm_table

Unnamed: 0,# of Trees,Max Depth,Learning Rate,Train,Test,OOT
0,100,1,0.1,0.552788,0.551412,0.524895
1,200,1,0.1,0.552592,0.551775,0.524644
2,500,1,0.1,0.553315,0.552598,0.525901
3,100,2,0.1,0.557247,0.55642,0.53026
4,200,2,0.1,0.562827,0.559571,0.535624
5,500,2,0.1,0.570096,0.567215,0.546689
6,100,1,0.01,0.481384,0.48093,0.46228
7,200,1,0.01,0.496044,0.495115,0.475021
8,500,1,0.01,0.544409,0.542764,0.5171
9,100,2,0.01,0.497825,0.497089,0.480386


### FDR@3% Calculation (Random Forest)

In [None]:
# Result table for the random-forest grid: one row per
# (number of trees, max depth) combination at a fixed minimum leaf size, with
# Train/Test/OOT accumulators that start at zero.
num_tree = [50,100,200]
max_depth = [5,10]
min_leaf = 3

# Build every row up front instead of enlarging the frame one cell at a time.
# Row order: depth outermost, then tree count.
grid_rows = [(num, md, min_leaf)
             for md in max_depth
             for num in num_tree]
rfc_table = pd.DataFrame(grid_rows, columns=['# of Trees', 'Max Depth', 'Min Samples Leaf'])

for col in ['Train', 'Test', 'OOT']:
    rfc_table[col] = 0.000
rfc_table

In [None]:
%%time
# create a for loop to calculate all GBM FDR@3%

# using KFold
kf = KFold(n_splits=5)


num_tree = [50,100,200]
max_depth = [5,10]
min_leaf = 3

for md in max_depth:
    for num in num_tree:
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y.iloc[train_index, :].values.ravel(), y.iloc[test_index, :].values.ravel()

            rfc = RandomForestClassifier(n_estimators =num, max_depth=md, min_samples_leaf=min_leaf)
            rfc.fit(X_train,y_train)

            rfc_table.loc[(rfc_table['# of Trees']==num)\
                          &(rfc_table['Max Depth']==md),\
                          'Train'] += fdr_cal(X_train, y_train, rfc)
            rfc_table.loc[(rfc_table['# of Trees']==num)\
                          &(rfc_table['Max Depth']==md),\
                          'Test']+= fdr_cal(X_test, y_test, rfc)
            rfc_table.loc[(rfc_table['# of Trees']==num)\
                          &(rfc_table['Max Depth']==md),\
                          'OOT']+= fdr_cal(oot_x, oot_y, rfc)

In [None]:
# Turn the per-configuration totals accumulated over the 5 folds into means.
fold_cols = ['Train','Test','OOT']
rfc_table[fold_cols] = rfc_table[fold_cols] / 5

In [None]:
# The Excel export for the random-forest table is disabled (left commented
# out below); the table is only displayed.
#rfc_table.to_excel('rfc_table.xlsx')
rfc_table

In [None]:
# Result table for the neural-net (MLPClassifier) grid: one row per
# (hidden nodes, epochs) combination with a single hidden layer, with
# Train/Test/OOT accumulators that start at zero.
nodes = [10,30,40]
max_iter = [20, 50]
layer = 1

# Build every row up front instead of enlarging the frame one cell at a time.
# Row order: epochs outermost, then node count.
grid_rows = [(node, mi, layer)
             for mi in max_iter
             for node in nodes]
mlp_table = pd.DataFrame(grid_rows, columns=['Node', 'Epoch', 'Layer'])

for col in ['Train', 'Test', 'OOT']:
    mlp_table[col] = 0.000
mlp_table

In [None]:
%%time
# create a for loop to calculate all GBM FDR@3%

# using KFold
kf = KFold(n_splits=5)


nodes = [10,30,40]
max_iter = [20, 50]
layer = 1

for mi in max_iter:
    for node in nodes:
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y.iloc[train_index, :].values.ravel(), y.iloc[test_index, :].values.ravel()

            mlp = MLPClassifier(max_iter=mi, hidden_layer_sizes=(node,))
            mlp.fit(X_train,y_train)

            mlp_table.loc[(mlp_table['Node']==node)\
                          &(mlp_table['Epoch']==mi),\
                          'Train'] += fdr_cal(X_train, y_train, mlp)
            mlp_table.loc[(mlp_table['Node']==node)\
                          &(mlp_table['Epoch']==mi),\
                          'Test']+= fdr_cal(X_test, y_test, mlp)
            mlp_table.loc[(mlp_table['Node']==node)\
                          &(mlp_table['Epoch']==mi),\
                          'OOT']+= fdr_cal(oot_x, oot_y, mlp)

LogReg: 3 min
Random Forest: 14 min
Neural Net: 30 min
Gradient Boosting: 50 min