### Load Libraries

In [1]:
# Load libraries
import datetime as dt
from datetime import datetime
import time 
import calendar
start_time = datetime.now()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import scipy.stats as sps
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import StackingClassifier

import keras
from imblearn.over_sampling import SMOTE

import math


print('LOAD DURATION:',datetime.now() - start_time)

LOAD DURATION: 0:00:04.371661


### Load Data

In [2]:
%%time
# load the data
df = pd.read_csv('train_test.csv')
oot = pd.read_csv('oot.csv')

CPU times: user 1.25 s, sys: 229 ms, total: 1.47 s
Wall time: 1.48 s


In [3]:
X = df.drop(columns=['fraud_label'])
y = df['fraud_label']

oot_x = oot.drop(columns=['fraud_label'])
oot_y = oot['fraud_label']

In [4]:
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

**FDR Function**

In [5]:
# create functions to calculate fdr
def fdr_cal(x_data, y_data, model_choice):
    model = model_choice
    pop = int(round(len(x_data)*0.03))
    result = pd.DataFrame(model.predict_proba(x_data),columns=['prob_0', 'prob_1'])
    temp = x_data.copy()
    temp['fraud_label'] = y_data
    temp['prob_1']= list(result.prob_1)
    temp0 = temp.sort_values('prob_1', ascending=False)
    temp1 = temp0.head(pop)
    fdr = temp1.fraud_label.sum() / y_data.sum()
    
    return fdr

In [6]:
# create functions to calculate fdr
def final_results(x_data, y_data, model_choice):
    model = model_choice
    result = pd.DataFrame(model.predict_proba(x_data),columns=['prob_0', 'prob_1'])
    temp = x_data.copy()
    temp['fraud_label'] = y_data
    temp['prob_1']= list(result.prob_1)
    temp0 = temp.sort_values('prob_1', ascending=False)
    
    return temp0

In [7]:
# Final Gradient Boosting
num_tree = 1000
max_depth = 5
learn_rate = 0.05
max_feature = 5
min_samples_leaf = 30
min_samples_split = 500


gbm = GradientBoostingClassifier(learning_rate=learn_rate, n_estimators =num_tree,
                                 max_depth=max_depth, max_features=max_feature,
                                 min_samples_split=min_samples_split, 
                                 min_samples_leaf=min_samples_leaf)
gbm.fit(X_train,y_train)

train_table = final_results(X_train, y_train, gbm)
test_table = final_results(X_test, y_test, gbm)
oot_table = final_results(oot_x, oot_y, gbm)

In [16]:
# create a function to generate final output
def report_table(df):
    # split the table into 100 groups
    df = df.reset_index()
    group_size = math.ceil(len(df)/100)
    lst = [df.iloc[i:i+group_size] for i in range(0,len(df)-group_size+1,group_size)]
    n = group_size*99
    lst.append(df.iloc[n:])
    
    # calculate summary statistics
    output = pd.DataFrame()
    for i in range(0,100):
        data = lst[i]
        output.loc[i,'Population Bin %'] = int(i+1)
        output.loc[i, '# Records'] = len(data)
        output.loc[i, '# Goods'] = len(data[data.fraud_label==0])
        output.loc[i, '# Bads'] = len(data[data.fraud_label==1])
        
    output['% Goods'] = round(output['# Goods']/output['# Records'],4)
    output['% Bads'] = round(output['# Bads']/output['# Records'],4)
    
    for i in range(0,100):
        if i == 0:
            output.loc[i,'Total # Records'] = output.loc[i,'# Records']
            output.loc[i,'Cumulative Goods'] = output.loc[i,'# Goods']
            output.loc[i,'Cumulative Bads'] = output.loc[i,'# Bads']
        else:
            output.loc[i,'Total # Records'] = output.loc[i-1,'Total # Records'] + output.loc[i,'# Records']
            output.loc[i,'Cumulative Goods'] = output.loc[i-1,'Cumulative Goods'] + output.loc[i,'# Goods']
            output.loc[i,'Cumulative Bads'] = output.loc[i-1,'Cumulative Bads'] + output.loc[i,'# Bads']
    
    total_goods = output['# Goods'].sum()
    total_bads = output['# Bads'].sum()
    
    output['Cumulative % Goods'] = round(output['Cumulative Goods']/total_goods,4)
    output['Cumulative % Bads (FDR)'] = round(output['Cumulative Bads']/total_bads,4)
    
    output['KS Score'] = round((output['Cumulative % Bads (FDR)']-output['Cumulative % Goods'])*100,2)
    output['FPR'] = round(output['Cumulative Goods']/output['Cumulative Bads'],2)
    
    return output

In [17]:
# get all 3 reports
train_report = report_table(train_table)
test_report =report_table(test_table)
oot_report = report_table(oot_table)

In [18]:
train_report.head(3)

Unnamed: 0,Population Bin %,# Records,# Goods,# Bads,% Goods,% Bads,Total # Records,Cumulative Goods,Cumulative Bads,Cumulative % Goods,Cumulative % Bads (FDR),KS Score,FPR
0,1.0,5963.0,1274.0,4689.0,0.2137,0.7863,5963.0,1274.0,4689.0,0.0022,0.5445,54.23,0.27
1,2.0,5963.0,5713.0,250.0,0.9581,0.0419,11926.0,6987.0,4939.0,0.0119,0.5735,56.16,1.41
2,3.0,5963.0,5886.0,77.0,0.9871,0.0129,17889.0,12873.0,5016.0,0.0219,0.5824,56.05,2.57


In [19]:
test_report.head(3)

Unnamed: 0,Population Bin %,# Records,# Goods,# Bads,% Goods,% Bads,Total # Records,Cumulative Goods,Cumulative Bads,Cumulative % Goods,Cumulative % Bads (FDR),KS Score,FPR
0,1.0,1988.0,462.0,1526.0,0.2324,0.7676,1988.0,462.0,1526.0,0.0024,0.531,52.86,0.3
1,2.0,1988.0,1900.0,88.0,0.9557,0.0443,3976.0,2362.0,1614.0,0.0121,0.5616,54.95,1.46
2,3.0,1988.0,1967.0,21.0,0.9894,0.0106,5964.0,4329.0,1635.0,0.0221,0.5689,54.68,2.65


In [20]:
oot_report.head(3)

Unnamed: 0,Population Bin %,# Records,# Goods,# Bads,% Goods,% Bads,Total # Records,Cumulative Goods,Cumulative Bads,Cumulative % Goods,Cumulative % Bads (FDR),KS Score,FPR
0,1.0,1665.0,434.0,1231.0,0.2607,0.7393,1665.0,434.0,1231.0,0.0026,0.5159,51.33,0.35
1,2.0,1665.0,1589.0,76.0,0.9544,0.0456,3330.0,2023.0,1307.0,0.0123,0.5478,53.55,1.55
2,3.0,1665.0,1646.0,19.0,0.9886,0.0114,4995.0,3669.0,1326.0,0.0224,0.5557,53.33,2.77


In [21]:
# overall summary table
summary_table = pd.DataFrame(columns=['Data','# Records', '# Goods', '# Bads', 'Fraud Rate'])
summary_table = summary_table.append({'Data':'Training',
                      '# Records':len(train_table), 
                      '# Goods':len(train_table[train_table.fraud_label==0]), 
                      '# Bads':len(train_table[train_table.fraud_label==1]), 
                      'Fraud Rate':len(train_table[train_table.fraud_label==1])/len(train_table)},
                     {'Data':'Testing',
                      '# Records':len(test_table), 
                      '# Goods':len(test_table[test_table.fraud_label==0]), 
                      '# Bads':len(test_table[test_table.fraud_label==1]), 
                      'Fraud Rate':len(test_table[test_table.fraud_label==1])/len(test_table)},
                     {'Data':'OOT',
                      '# Records':len(oot_table), 
                      '# Goods':len(oot_table[oot_table.fraud_label==0]), 
                      '# Bads':len(oot_table[oot_table.fraud_label==1]), 
                      'Fraud Rate':len(oot_table[oot_table.fraud_label==1])/len(oot_table)})
summary_table = summary_table.append({'Data':'Testing',
                      '# Records':len(test_table), 
                      '# Goods':len(test_table[test_table.fraud_label==0]), 
                      '# Bads':len(test_table[test_table.fraud_label==1]), 
                      'Fraud Rate':len(test_table[test_table.fraud_label==1])/len(test_table)},
                    ignore_index=True)
summary_table = summary_table.append({'Data':'OOT',
                      '# Records':len(oot_table), 
                      '# Goods':len(oot_table[oot_table.fraud_label==0]), 
                      '# Bads':len(oot_table[oot_table.fraud_label==1]), 
                      'Fraud Rate':len(oot_table[oot_table.fraud_label==1])/len(oot_table)},
                    ignore_index=True)
summary_table

Unnamed: 0,Data,# Records,# Goods,# Bads,Fraud Rate
0,Training,596247,587635,8612,0.014444
1,Testing,198749,195875,2874,0.01446
2,OOT,166493,164107,2386,0.014331


In [23]:
# export to excel
train_report.to_excel('train_report.xlsx')
test_report.to_excel('test_report.xlsx')
oot_report.to_excel('oot_report.xlsx')
summary_table.to_excel('high_level_summary.xlsx')

In [15]:
print('Total Time:', datetime.now()-start_time)

Total Time: 0:03:46.984868


### SMOTE (Not Using)

In [None]:
# Graident Boosting Using SMOTE
num_tree = 1000
max_depth = 5
learn_rate = 0.05
max_feature = 5
min_samples_leaf = 30
min_samples_split = 500


#SMOTE
os = SMOTE()
columns = X.columns
os_data_X,os_data_y=os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns)


train_result = []
test_result = []
oot_result = []

gbm = GradientBoostingClassifier(learning_rate=learn_rate, n_estimators =num_tree,
                                 max_depth=max_depth, max_features=max_feature,
                                 min_samples_split=min_samples_split, 
                                 min_samples_leaf=min_samples_leaf)
gbm.fit(os_data_X,os_data_y)

train_result.append(fdr_cal(X_train, y_train, gbm))
test_result.append(fdr_cal(X_test, y_test, gbm))
oot_result.append(fdr_cal(oot_x, oot_y, gbm))

train_avg = np.mean(train_result)
test_avg = np.mean(test_result)
oot_avg = np.mean(oot_result)

In [None]:
print('SMOTE Train Average FDR@3%:', train_avg)
print('SMOTE Test Average FDR@3%:', test_avg)
print('SMOTE OOT Average FDR@3%:', oot_avg)

In [None]:
print('Total Time:', datetime.now()-start_time)