In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as sps
import datetime as dt
import calendar

# model data (train+test) csv file path
data = pd.read_csv('train_test.csv')
oot = pd.read_csv('oot.csv')

In [47]:
data.shape

(794996, 31)

In [48]:
oot.shape

(166493, 31)

# PyTorch

## Define PyTorch NN Model

In [32]:
# load packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

def PyTorch_nn(data, oot, n_kfold=10, lr=0.0001, epochs = 50, batch_size=64, layer_1=64, layer_2=64):
    
    #Adding (multi) GPU support with DataParallel
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
    class trainData(Dataset):
    
        def __init__(self, X_data, y_data):
            self.X_data = X_data
            self.y_data = y_data

        def __getitem__(self, index):
            return self.X_data[index], self.y_data[index]

        def __len__ (self):
            return len(self.X_data)


    ## test data    
    class testData(Dataset):

        def __init__(self, X_data):
            self.X_data = X_data

        def __getitem__(self, index):
            return self.X_data[index]

        def __len__ (self):
            return len(self.X_data)

    class binaryClassification(nn.Module):
        def __init__(self):
            super(binaryClassification, self).__init__()
            # Number of input features is 30.
            self.layer_1 = nn.Linear(30, layer_1) 
            self.layer_2 = nn.Linear(layer_1, layer_2)
            self.layer_out = nn.Linear(layer_2, 1) 

            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=0.1)
            self.batchnorm1 = nn.BatchNorm1d(layer_1)
            self.batchnorm2 = nn.BatchNorm1d(layer_2)

        def forward(self, inputs):
            x = self.relu(self.layer_1(inputs))
            x = self.batchnorm1(x)
            x = self.relu(self.layer_2(x))
            x = self.batchnorm2(x)
            x = self.dropout(x)
            x = self.layer_out(x)

            return x

    # initialize optimizer and decide loss function
    model = binaryClassification()
    model.to(device)

    criterion = nn.BCEWithLogitsLoss() # loss function
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def binary_acc(y_pred, y_test):
        y_pred_tag = torch.round(torch.sigmoid(y_pred))

        correct_results_sum = (y_pred_tag == y_test).sum().float()
        acc = correct_results_sum/y_test.shape[0]
        acc = torch.round(acc * 100)

        return acc
    
    ########################################### MODELLING STARTS HERE ##########################################
    
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    oot_X = oot.iloc[:,:-1]
    oot_y = oot.iloc[:,-1]

    kf = KFold(n_splits=n_kfold)
    kf.get_n_splits(X)

    for train_index, test_index in kf.split(X):

        X_train, X_test = X.iloc[train_index,], X.iloc[test_index,]
        y_train, y_test = y.iloc[train_index,], y.iloc[test_index,]

        # standardize train_test_oot data
        scaler = StandardScaler() 
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        oot_X = scaler.transform(oot_X)

        # oversampling
        os = SMOTE()
        columns = X.columns
        os_data_X,os_data_y=os.fit_resample(X_train, y_train)
        os_data_X = pd.DataFrame(data=os_data_X,columns=columns)
        os_data_y= pd.DataFrame(data=os_data_y,columns=['fraud_label'])
        os_data_y = os_data_y['fraud_label']

        # set up train_test data and load dataset into pytorch
        train_data = trainData(torch.FloatTensor(os_data_X.to_numpy()), \
                               torch.FloatTensor(os_data_y.to_numpy()))

        #train_data = trainData(torch.FloatTensor(X.to_numpy()), \
                          #     torch.FloatTensor(y.to_numpy()))

        test_data = testData(torch.FloatTensor(X_test))

        train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
        #test_loader = DataLoader(dataset=test_data, batch_size=1)
        #oot_loader = DataLoader(dataset=oot_X, batch_size=1)

        model.train()
        for e in range(1, epochs+1):
            epoch_loss = 0
            epoch_acc = 0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()

                y_pred = model(X_batch)

                loss = criterion(y_pred, y_batch.unsqueeze(1))
                acc = binary_acc(y_pred, y_batch.unsqueeze(1))

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()


            #print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
            print(f'finish {e+0:03}', end=' ')
            ################################## MODELLING WITH TRAIN DATA ENDS HERE ##############################
    
    ############## EVALUATE TEST DATA STARTS HERE ##################
    
    y_test_pred_list = []
    y_test_pred_prob_list = []
    model.eval()

    scaler = StandardScaler() 
    X_test = scaler.fit_transform(X_test)
    test_data = testData(torch.FloatTensor(X_test))
    test_loader = DataLoader(dataset=test_data, batch_size=1)

    with torch.no_grad():
        for X_batch in test_loader:
            X_batch = X_batch.to(device)
            y_test_pred = model(X_batch)
            y_test_pred = torch.sigmoid(y_test_pred)
            y_test_pred_prob_list.append(y_test_pred)
            y_test_pred_tag = torch.round(y_test_pred)
            y_test_pred_list.append(y_test_pred_tag.cpu().numpy())


    y_test_pred_list = [a.squeeze().tolist() for a in y_test_pred_list]
    y_test_pred_prob_list = [a.squeeze().tolist() for a in y_test_pred_prob_list]

    result = pd.DataFrame(y_test_pred_prob_list,columns=['prob_1'])
    result.loc[:,'fraud_label'] = y_test.values.reshape(-1,)
    result.loc[:,'pred_class'] = pd.Series(y_test_pred_list).values.reshape(-1,)
    temp = result.sort_values('prob_1', ascending=False)
    pop = int(round(temp.shape[0]*0.03))
    temp1 = temp.head(pop)
    fdr = temp1.fraud_label.sum() / y_test.sum()
    
    print(f'\nTEST DATA - FDR @3%  {round(fdr,4)}')
    
    
    ################### TEST DATA EVALUATION ENDS HERE #######################
    ##########################################################################
    ################### EVALUATE OOT DATA STARTS HERE ########################
    
    y_oot_pred_list = []
    y_oot_pred_prob_list = []
    model.eval()

    oot_X = oot.iloc[:,:-1]
    oot_y = oot.iloc[:,-1]
    scaler = StandardScaler() 
    oot_X = scaler.fit_transform(oot_X)
    oot_X = testData(torch.FloatTensor(oot_X))
    oot_loader = DataLoader(dataset=oot_X, batch_size=1)

    with torch.no_grad():
        for X_batch in oot_loader:
            X_batch = X_batch.to(device)
            y_oot_pred = model(X_batch)
            y_oot_pred = torch.sigmoid(y_oot_pred)
            y_oot_pred_prob_list.append(y_oot_pred)
            y_oot_pred_tag = torch.round(y_oot_pred)
            y_oot_pred_list.append(y_oot_pred_tag.cpu().numpy())


    y_oot_pred_list = [a.squeeze().tolist() for a in y_oot_pred_list]
    y_oot_pred_prob_list = [a.squeeze().tolist() for a in y_oot_pred_prob_list]

    result1 = pd.DataFrame(y_oot_pred_prob_list,columns=['prob_1'])
    result1.loc[:,'fraud_label'] = oot_y.values.reshape(-1,1)
    result1.loc[:,'pred_class'] = pd.Series(y_oot_pred_list).values.reshape(-1,1)
    temp11 = result1.sort_values('prob_1', ascending=False)
    pop = int(round(temp11.shape[0]*0.03))
    temp12 = temp11.head(pop)
    fdr_oot = temp12.fraud_label.sum() / oot_y.sum()
    print(f'OOT DATA - FDR @3%  {round(fdr_oot,4)}')
    return fdr, fdr_oot

## Define Model w/o oversampling

In [39]:
y = data.iloc[:,-1]
y

0         0
1         0
2         0
3         0
4         0
         ..
794991    0
794992    0
794993    0
794994    0
794995    0
Name: fraud_label, Length: 794996, dtype: int64

In [40]:
# load packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

def PyTorch_nn(data, oot, n_kfold=10, lr=0.0001, epochs = 50, batch_size=64, layer_1=64, layer_2=64):
    
    #Adding (multi) GPU support with DataParallel
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
    class trainData(Dataset):
    
        def __init__(self, X_data, y_data):
            self.X_data = X_data
            self.y_data = y_data

        def __getitem__(self, index):
            return self.X_data[index], self.y_data[index]

        def __len__ (self):
            return len(self.X_data)


    ## test data    
    class testData(Dataset):

        def __init__(self, X_data):
            self.X_data = X_data

        def __getitem__(self, index):
            return self.X_data[index]

        def __len__ (self):
            return len(self.X_data)

    class binaryClassification(nn.Module):
        def __init__(self):
            super(binaryClassification, self).__init__()
            # Number of input features is 30.
            self.layer_1 = nn.Linear(30, layer_1) 
            self.layer_2 = nn.Linear(layer_1, layer_2)
            self.layer_out = nn.Linear(layer_2, 1) 

            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=0.1)
            self.batchnorm1 = nn.BatchNorm1d(layer_1)
            self.batchnorm2 = nn.BatchNorm1d(layer_2)

        def forward(self, inputs):
            x = self.relu(self.layer_1(inputs))
            x = self.batchnorm1(x)
            x = self.relu(self.layer_2(x))
            x = self.batchnorm2(x)
            x = self.dropout(x)
            x = self.layer_out(x)

            return x

    # initialize optimizer and decide loss function
    model = binaryClassification()
    model.to(device)

    criterion = nn.BCEWithLogitsLoss() # loss function
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def binary_acc(y_pred, y_test):
        y_pred_tag = torch.round(torch.sigmoid(y_pred))

        correct_results_sum = (y_pred_tag == y_test).sum().float()
        acc = correct_results_sum/y_test.shape[0]
        acc = torch.round(acc * 100)

        return acc
    
    ########################################### MODELLING STARTS HERE ##########################################
    
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    oot_X = oot.iloc[:,:-1]
    oot_y = oot.iloc[:,-1]

    kf = KFold(n_splits=n_kfold)
    kf.get_n_splits(X)

    for train_index, test_index in kf.split(X):

        X_train, X_test = X.iloc[train_index,], X.iloc[test_index,]
        y_train, y_test = y.iloc[train_index,], y.iloc[test_index,]

        # standardize train_test_oot data
        scaler = StandardScaler() 
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        oot_X = scaler.transform(oot_X)

        # oversampling
        #os = SMOTE()
        #columns = X.columns
        #os_data_X,os_data_y=os.fit_resample(X_train, y_train)
        #os_data_X = pd.DataFrame(data=os_data_X,columns=columns)
        #os_data_y= pd.DataFrame(data=os_data_y,columns=['fraud_label'])
        #os_data_y = os_data_y['fraud_label']

        # set up train_test data and load dataset into pytorch
        train_data = trainData(torch.FloatTensor(X_train), \
                               torch.FloatTensor(y_train.to_numpy()))

        #train_data = trainData(torch.FloatTensor(X.to_numpy()), \
                          #     torch.FloatTensor(y.to_numpy()))

        test_data = testData(torch.FloatTensor(X_test))

        train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
        #test_loader = DataLoader(dataset=test_data, batch_size=1)
        #oot_loader = DataLoader(dataset=oot_X, batch_size=1)

        model.train()
        for e in range(1, epochs+1):
            epoch_loss = 0
            epoch_acc = 0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()

                y_pred = model(X_batch)

                loss = criterion(y_pred, y_batch.unsqueeze(1))
                acc = binary_acc(y_pred, y_batch.unsqueeze(1))

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()


            #print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
            print(f'finish {e+0:03}', end=' ')
            ################################## MODELLING WITH TRAIN DATA ENDS HERE ##############################
    
    ############## EVALUATE TEST DATA STARTS HERE ##################
    
    y_test_pred_list = []
    y_test_pred_prob_list = []
    model.eval()

    scaler = StandardScaler() 
    X_test = scaler.fit_transform(X_test)
    test_data = testData(torch.FloatTensor(X_test))
    test_loader = DataLoader(dataset=test_data, batch_size=1)

    with torch.no_grad():
        for X_batch in test_loader:
            X_batch = X_batch.to(device)
            y_test_pred = model(X_batch)
            y_test_pred = torch.sigmoid(y_test_pred)
            y_test_pred_prob_list.append(y_test_pred)
            y_test_pred_tag = torch.round(y_test_pred)
            y_test_pred_list.append(y_test_pred_tag.cpu().numpy())


    y_test_pred_list = [a.squeeze().tolist() for a in y_test_pred_list]
    y_test_pred_prob_list = [a.squeeze().tolist() for a in y_test_pred_prob_list]

    result = pd.DataFrame(y_test_pred_prob_list,columns=['prob_1'])
    result.loc[:,'fraud_label'] = y_test.values.reshape(-1,)
    result.loc[:,'pred_class'] = pd.Series(y_test_pred_list).values.reshape(-1,)
    temp = result.sort_values('prob_1', ascending=False)
    pop = int(round(temp.shape[0]*0.03))
    temp1 = temp.head(pop)
    fdr = temp1.fraud_label.sum() / y_test.sum()
    
    print(f'\nTEST DATA - FDR @3%  {round(fdr,4)}')
    
    
    ################### TEST DATA EVALUATION ENDS HERE #######################
    ##########################################################################
    ################### EVALUATE OOT DATA STARTS HERE ########################
    
    y_oot_pred_list = []
    y_oot_pred_prob_list = []
    model.eval()

    oot_X = oot.iloc[:,:-1]
    oot_y = oot.iloc[:,-1]
    scaler = StandardScaler() 
    oot_X = scaler.fit_transform(oot_X)
    oot_X = testData(torch.FloatTensor(oot_X))
    oot_loader = DataLoader(dataset=oot_X, batch_size=1)

    with torch.no_grad():
        for X_batch in oot_loader:
            X_batch = X_batch.to(device)
            y_oot_pred = model(X_batch)
            y_oot_pred = torch.sigmoid(y_oot_pred)
            y_oot_pred_prob_list.append(y_oot_pred)
            y_oot_pred_tag = torch.round(y_oot_pred)
            y_oot_pred_list.append(y_oot_pred_tag.cpu().numpy())


    y_oot_pred_list = [a.squeeze().tolist() for a in y_oot_pred_list]
    y_oot_pred_prob_list = [a.squeeze().tolist() for a in y_oot_pred_prob_list]

    result1 = pd.DataFrame(y_oot_pred_prob_list,columns=['prob_1'])
    result1.loc[:,'fraud_label'] = oot_y.values.reshape(-1,1)
    result1.loc[:,'pred_class'] = pd.Series(y_oot_pred_list).values.reshape(-1,1)
    temp11 = result1.sort_values('prob_1', ascending=False)
    pop = int(round(temp11.shape[0]*0.03))
    temp12 = temp11.head(pop)
    fdr_oot = temp12.fraud_label.sum() / oot_y.sum()
    print(f'OOT DATA - FDR @3%  {round(fdr_oot,4)}')
    
    return fdr, fdr_oot

In [43]:
# Trial 
PyTorch_nn(data, oot, n_kfold=10, lr=0.0001, epochs = 10, batch_size=64, layer_1=64, layer_2=64)

finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 001

(0.5852225020990764, 0.5519698239731768)

In [55]:
# Best model

# customize hyperparameters
n_kfold = [5]
lr = [0.0001]
epochs = [15]
batch_size = [64]
layer_1 = [128] 
layer_2 = [128] 
df = pd.DataFrame(0, columns =['n_kfold','learning rate','epochs','batch_size','layer_1','layer_2','TEST fdr@3%', 'OOT fdr@3%'], index=range(1))
j=0

data = pd.read_csv('train_test.csv')
oot = pd.read_csv('oot.csv')
print(f'|   lr   | epochs | batch_size | layer_1 | layer_2 |')

#################### for loops to experiment hyperparameters ###########################
for n in n_kfold:
    for l in lr:
        for epoch in epochs:
            for b in batch_size:
                for l1 in layer_1:
                    for l2 in layer_2:
                        df.loc[j,'n_kfold'] = n
                        df.loc[j,'learning rate']=l
                        df.loc[j,'epochs']=epoch
                        df.loc[j,'batch_size'] = b
                        df.loc[j,'layer_1']=l1
                        df.loc[j,'layer_2']=l2
                        
                        print(f'| {l:.4f} |  {epoch+0:03}   |     {b+0:03}    |   {l1+0:03}   |   {l2+0:03}   |')
                        fdr, fdr_oot = PyTorch_nn(data=data, oot=oot, n_kfold=n, lr=l, epochs = epoch, batch_size=b, layer_1=l1, layer_2=l2)
                        df.loc[j, 'TEST fdr@3%'] = fdr
                        df.loc[j, 'OOT fdr@3%'] = fdr_oot
                        j+=1

|   lr   | epochs | batch_size | layer_1 | layer_2 |
| 0.0001 |  015   |     064    |   128   |   128   |
finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 011 finish 012 finish 013 finish 014 finish 015 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 011 finish 012 finish 013 finish 014 finish 015 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 011 finish 012 finish 013 finish 014 finish 015 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 011 finish 012 finish 013 finish 014 finish 015 finish 001 finish 002 finish 003 finish 004 finish 005 finish 006 finish 007 finish 008 finish 009 finish 010 finish 011 finish 012 finish 013 finish 014 finish 015 
TEST DATA - FDR @3%  0.6019
OOT DATA - FDR @3%  0.5541


In [56]:
# best model hyperparameter table
df

Unnamed: 0,n_kfold,learning rate,epochs,batch_size,layer_1,layer_2,TEST fdr@3%,OOT fdr@3%
0,5,0.0001,15,64,128,128,0.601934,0.554065


In [51]:
# trials 
df1 = df.copy()
df1

Unnamed: 0,n_kfold,learning rate,epochs,batch_size,layer_1,layer_2,TEST fdr@3%,OOT fdr@3%
0,3.0,0.0001,15.0,64.0,32.0,32.0,0.577059,0.547779
1,3.0,0.0001,15.0,64.0,32.0,64.0,0.575038,0.545683
2,3.0,0.0001,15.0,64.0,64.0,32.0,0.574533,0.546102
3,3.0,0.0001,15.0,64.0,64.0,64.0,0.581607,0.55197


# CODES (BACKUP)

In [10]:
# Set up model structure
# can adjust layer_1, layer_2,
## train data
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.0001
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


## test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        # Number of input features is 30.
        self.layer_1 = nn.Linear(30, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

def fdr_cal(x_data, y_data, model):
    model = model
    pop = int(round(len(x_data)*0.03))
    result = pd.DataFrame(model.predict_proba(x_data),columns=['prob_0', 'prob_1'])
    temp = x_data.copy()
    temp['fraud_label'] = y_data
    temp['prob_1']= list(result.prob_1)
    temp0 = temp.sort_values('prob_1', ascending=False)
    temp1 = temp0.head(pop)
    fdr = temp1.fraud_label.sum() / y_data.sum()

    return fdr

# initialize optimizer and decide loss function
model = binaryClassification()
model.to(device)

criterion = nn.BCEWithLogitsLoss() # loss function
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [27]:
# model preparation

X = data.iloc[:,:-1]
y = data.iloc[:,-1]
oot_X = oot.iloc[:,:-1]
oot_y = oot.iloc[:,-1]

kf = KFold(n_splits=n_kfold)
kf.get_n_splits(X)

5

In [12]:
os_data_X.shape

(1567020, 30)

In [13]:
model = binaryClassification()
model.to(device)

criterion = nn.BCEWithLogitsLoss() # loss function
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for train_index, test_index in kf.split(X):

    X_train, X_test = X.iloc[train_index,], X.iloc[test_index,]
    y_train, y_test = y.iloc[train_index,], y.iloc[test_index,]

    # standardize train_test_oot data
    scaler = StandardScaler() 
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    oot_X = scaler.transform(oot_X)
    
    # oversampling
    os = SMOTE()
    columns = X.columns
    os_data_X,os_data_y=os.fit_resample(X_train, y_train)
    os_data_X = pd.DataFrame(data=os_data_X,columns=columns)
    os_data_y= pd.DataFrame(data=os_data_y,columns=['fraud_label'])
    os_data_y = os_data_y['fraud_label']

    # set up train_test data and load dataset into pytorch
    train_data = trainData(torch.FloatTensor(os_data_X.to_numpy()), \
                           torch.FloatTensor(os_data_y.to_numpy()))
    
    #train_data = trainData(torch.FloatTensor(X.to_numpy()), \
                      #     torch.FloatTensor(y.to_numpy()))

    test_data = testData(torch.FloatTensor(X_test))

    train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=1)
    #oot_loader = DataLoader(dataset=oot_X, batch_size=1)

    model.train()
    for e in range(1, EPOCHS+1):
        epoch_loss = 0
        epoch_acc = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()

            y_pred = model(X_batch)

            loss = criterion(y_pred, y_batch.unsqueeze(1))
            acc = binary_acc(y_pred, y_batch.unsqueeze(1))

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()


        print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.45789 | Acc: 77.836
Epoch 002: | Loss: 0.44917 | Acc: 78.222
Epoch 003: | Loss: 0.44701 | Acc: 78.300
Epoch 004: | Loss: 0.44582 | Acc: 78.344
Epoch 005: | Loss: 0.44501 | Acc: 78.375
Epoch 006: | Loss: 0.44422 | Acc: 78.404
Epoch 007: | Loss: 0.44366 | Acc: 78.436
Epoch 008: | Loss: 0.44316 | Acc: 78.455
Epoch 009: | Loss: 0.44275 | Acc: 78.456
Epoch 010: | Loss: 0.44224 | Acc: 78.474
Epoch 011: | Loss: 0.44171 | Acc: 78.501
Epoch 012: | Loss: 0.44132 | Acc: 78.526
Epoch 013: | Loss: 0.44121 | Acc: 78.522
Epoch 014: | Loss: 0.44079 | Acc: 78.537
Epoch 015: | Loss: 0.44041 | Acc: 78.555
Epoch 016: | Loss: 0.44016 | Acc: 78.556
Epoch 017: | Loss: 0.44006 | Acc: 78.551
Epoch 018: | Loss: 0.43995 | Acc: 78.573
Epoch 019: | Loss: 0.43960 | Acc: 78.568
Epoch 020: | Loss: 0.43937 | Acc: 78.594
Epoch 021: | Loss: 0.43928 | Acc: 78.591
Epoch 022: | Loss: 0.43921 | Acc: 78.583
Epoch 023: | Loss: 0.43921 | Acc: 78.608
Epoch 024: | Loss: 0.43893 | Acc: 78.606
Epoch 025: | Los

Epoch 001: | Loss: 0.44525 | Acc: 77.993
Epoch 002: | Loss: 0.44434 | Acc: 78.019
Epoch 003: | Loss: 0.44398 | Acc: 78.029
Epoch 004: | Loss: 0.44376 | Acc: 78.043
Epoch 005: | Loss: 0.44352 | Acc: 78.053
Epoch 006: | Loss: 0.44338 | Acc: 78.063
Epoch 007: | Loss: 0.44339 | Acc: 78.064
Epoch 008: | Loss: 0.44317 | Acc: 78.084
Epoch 009: | Loss: 0.44289 | Acc: 78.085
Epoch 010: | Loss: 0.44295 | Acc: 78.094
Epoch 011: | Loss: 0.44294 | Acc: 78.074
Epoch 012: | Loss: 0.44354 | Acc: 78.052
Epoch 013: | Loss: 0.44303 | Acc: 78.088
Epoch 014: | Loss: 0.44274 | Acc: 78.097
Epoch 015: | Loss: 0.44265 | Acc: 78.103
Epoch 016: | Loss: 0.44276 | Acc: 78.082
Epoch 017: | Loss: 0.44284 | Acc: 78.093
Epoch 018: | Loss: 0.44250 | Acc: 78.101
Epoch 019: | Loss: 0.44253 | Acc: 78.085
Epoch 020: | Loss: 0.44260 | Acc: 78.090
Epoch 021: | Loss: 0.44256 | Acc: 78.104
Epoch 022: | Loss: 0.44253 | Acc: 78.101
Epoch 023: | Loss: 0.44221 | Acc: 78.126
Epoch 024: | Loss: 0.44232 | Acc: 78.097
Epoch 025: | Los

In [35]:
y_test_pred_list = []
y_test_pred_prob_list = []
model.eval()

scaler = StandardScaler() 
X_test = scaler.fit_transform(X_test)
test_data = testData(torch.FloatTensor(X_test))
test_loader = DataLoader(dataset=test_data, batch_size=1)

with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_test_pred_prob_list.append(y_test_pred)
        y_test_pred_tag = torch.round(y_test_pred)
        y_test_pred_list.append(y_test_pred_tag.cpu().numpy())


y_test_pred_list = [a.squeeze().tolist() for a in y_test_pred_list]
y_test_pred_prob_list = [a.squeeze().tolist() for a in y_test_pred_prob_list]

result = pd.DataFrame(y_test_pred_prob_list,columns=['prob_1'])
result.loc[:,'fraud_label'] = y_test.values.reshape(-1,)
result.loc[:,'pred_class'] = pd.Series(y_test_pred_list).values.reshape(-1,)
temp = result.sort_values('prob_1', ascending=False)
pop = int(round(temp.shape[0]*0.03))
temp1 = temp.head(pop)
fdr = temp1.fraud_label.sum() / y_test.sum()

In [36]:
fdr

0.5922656578394283

In [39]:
y_oot_pred_list = []
y_oot_pred_prob_list = []
model.eval()

oot_X = oot.iloc[:,:-1]
oot_y = oot.iloc[:,-1]
scaler = StandardScaler() 
oot_X = scaler.fit_transform(oot_X)
oot_X = testData(torch.FloatTensor(oot_X))
oot_loader = DataLoader(dataset=oot_X, batch_size=1)

with torch.no_grad():
    for X_batch in oot_loader:
        X_batch = X_batch.to(device)
        y_oot_pred = model(X_batch)
        y_oot_pred = torch.sigmoid(y_oot_pred)
        y_oot_pred_prob_list.append(y_oot_pred)
        y_oot_pred_tag = torch.round(y_oot_pred)
        y_oot_pred_list.append(y_oot_pred_tag.cpu().numpy())


y_oot_pred_list = [a.squeeze().tolist() for a in y_oot_pred_list]
y_oot_pred_prob_list = [a.squeeze().tolist() for a in y_oot_pred_prob_list]

result1 = pd.DataFrame(y_oot_pred_prob_list,columns=['prob_1'])
result1.loc[:,'fraud_label'] = oot_y.values.reshape(-1,1)
result1.loc[:,'pred_class'] = pd.Series(y_oot_pred_list).values.reshape(-1,1)
temp11 = result1.sort_values('prob_1', ascending=False)
pop = int(round(temp11.shape[0]*0.03))
temp12 = temp11.head(pop)
fdr_oot = temp12.fraud_label.sum() / oot_y.sum()

In [40]:
fdr_oot

0.5452640402347024

715497    0
715498    0
715499    0
715500    0
715501    0
         ..
794991    0
794992    0
794993    0
794994    0
794995    0
Name: fraud_label, Length: 79499, dtype: int64

In [121]:
fdr

0.2779177162048699

In [93]:
# y_oot_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in oot_loader:
        X_batch = X_batch.to(device)
        y_oot_pred = model(X_batch)
        y_oot_pred = torch.sigmoid(y_oot_pred)
        y_oot_pred_tag = torch.round(y_oot_pred)
        y_oot_pred_list.append(y_oot_pred_tag.cpu().numpy())

y_oot_pred_list = [a.squeeze().tolist() for a in y_oot_pred_list]

result = pd.DataFrame([y_oot_pred_list, oot_y.tolist()],columns=['prob_1','fraud_label'])
temp = result.sort_values('prob_1', ascending=False)
pop = int(round(temp.shape[0]*0.03))
temp1 = temp.head(pop)
fdr = temp1.fraud_label.sum() / oot_y.tolist().sum()

RuntimeError: expected scalar type Float but found Double

In [None]:
# define fdr in pytorch

def fdr_cal(x_data, y_data, model_choice):
    model = model_choice
    pop = int(round(len(x_data)*0.03))
    result = pd.DataFrame(model.predict_proba(x_data),columns=['prob_0', 'prob_1'])
    temp = x_data.copy()
    temp['fraud_label'] = y_data
    temp['prob_1']= list(result.prob_1)
    temp0 = temp.sort_values('prob_1', ascending=False)
    temp1 = temp0.head(pop)
    fdr = temp1.fraud_label.sum() / y_data.sum()
    
    return fdr