In [1]:
# Import libraries
import pandas as pd
import numpy as np
import keras
import lightgbm as lgbm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, accuracy_score, recall_score, confusion_matrix, precision_score, f1_score

In [2]:
# Load the raw transactions, then build the modelling frame: keep the selected
# feature columns plus the label, order rows chronologically, and discard the
# date column once it has served its purpose as the sort key.
df = pd.read_csv('data/input.csv')
df2 = (
    df[[
        'transaction_risk_score',
        'cc_amount',
        'ledger_balance',
        'cardholder_presence',
        'card_presence',
        'partial_approval_capable',
        'channel',
        'processing_type',
        'date',
        'cc_acceptor_state',
        'cc_acceptor_country',
        'is_fraud',
    ]]
    .sort_values(by='date')
    .drop(columns='date')
)

In [3]:
# Show the raw frame as read from 'data/input.csv' (last expression of the cell is rendered).
df

Unnamed: 0.1,Unnamed: 0,channel,transaction_risk_score,processing_type,cc_amount,ledger_balance,cc_acceptor_state,cc_acceptor_country,cardholder_presence,card_presence,partial_approval_capable,pin_present,date,is_fraud,hour,month,dayofweek,year
0,0,2,54.0,5,52.79,0.00,52,32,0,0,0,0,2018-12-25,1,0,12,1,2018
1,1,2,27.0,1,7.75,51.80,20,33,1,1,1,0,2018-12-25,0,19,12,1,2018
2,2,2,54.0,5,52.79,0.00,52,32,0,0,0,0,2018-12-25,1,0,12,1,2018
3,3,0,86.0,4,195.00,2100.96,36,33,0,0,0,0,2018-12-25,1,12,12,1,2018
4,4,0,43.0,4,220.00,1490.96,36,33,0,0,0,0,2018-12-25,1,11,12,1,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35702,35702,0,1.0,0,10.00,171.56,13,33,0,0,0,0,2022-03-01,0,17,3,1,2022
35703,35703,2,4.0,1,99.10,237.64,36,33,1,1,1,0,2022-03-01,0,18,3,1,2022
35704,35704,2,3.0,1,130.49,0.00,40,33,1,1,1,0,2022-03-01,0,18,3,1,2022
35705,35705,2,1.0,1,29.67,206.69,15,33,1,1,1,0,2022-03-01,0,16,3,1,2022


In [109]:
# General Split data
def GeneralSplit(data, ratio, method):
    """Fit a classifier on the leading rows of ``data`` and evaluate it on the rest.

    The frame is cut positionally, without shuffling: the first
    ``int(ratio * len(data))`` rows are the training set and the remaining rows
    are the test set. Features are standardised with a scaler fitted on the
    training rows only. The function prints the set sizes, precision, recall,
    accuracy, F1, ROC-AUC and the confusion matrix, and shows a ROC plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus an ``is_fraud`` label column.
        NOTE(review): labels are assumed to be binary 0/1 (``pos_label=1`` is
        used for the ROC curve) and all feature columns numeric - confirm.
    ratio : float
        Fraction of rows (from the top of ``data``) used for training.
    method : str
        ``"RandomForest"`` or ``"LightGBM"``.

    Returns
    -------
    list
        ``[precision, recall, accuracy, f1, auc]``, each rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported model names.
    """
    # Fail fast on an unknown model name; otherwise `clf` would stay unbound and
    # the error would only surface as an UnboundLocalError at fit time.
    if method not in ("RandomForest", "LightGBM"):
        raise ValueError(
            "Unsupported method %r; expected 'RandomForest' or 'LightGBM'" % (method,)
        )

    # Positional split. iloc slicing gives the same rows as np.split without
    # going through the deprecated DataFrame.swapaxes path.
    cut = int(ratio * len(data))
    training_set = data.iloc[:cut]
    test_set = data.iloc[cut:]

    X_train = training_set.drop("is_fraud", axis=1)
    y_train = training_set["is_fraud"].to_numpy()

    X_test = test_set.drop("is_fraud", axis=1)
    y_test = test_set["is_fraud"].to_numpy()

    # Fit the scaler on the training rows only so no test statistics leak in.
    scalar = StandardScaler()
    x_train_scale = scalar.fit_transform(X_train)
    x_test_scale = scalar.transform(X_test)

    if method == "RandomForest":
        clf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=0)
    else:
        clf = lgbm.LGBMClassifier(objective="binary", n_estimators=10000)

    print('-----' + method + '-----')
    print('Size of train set: ', X_train.shape)
    print('Size of test set: ', X_test.shape)
    print()

    clf.fit(x_train_scale, y_train)
    pred = clf.predict(x_test_scale)

    precisions = round(precision_score(y_test, pred), 3)
    recalls = round(recall_score(y_test, pred), 3)
    accuracies = round(accuracy_score(y_test, pred), 3)
    f1_scores = round(f1_score(y_test, pred), 3)

    # ROC / AUC need a continuous score. Hard 0/1 predictions collapse the curve
    # to a single operating point and give a misleading AUC, so use the
    # predicted probability of the positive class (label 1) instead.
    positive_col = list(clf.classes_).index(1)
    scores = clf.predict_proba(x_test_scale)[:, positive_col]
    fpr, tpr, thresh = roc_curve(y_test, scores, pos_label=1)
    auc_scores = round(roc_auc_score(y_test, scores), 3)

    # 'seaborn' was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the old name
    # was removed in 3.8; pick whichever this installation provides.
    style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
    plt.style.use(style_name)
    plt.plot(fpr, tpr, linestyle='--', color='orange', label=method)
    # Chance-level reference line (what a constant score would produce).
    plt.plot([0, 1], [0, 1], linestyle='--', color='blue')
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.show()

    print('Precision: ', precisions)
    print('Recall: ', recalls)
    print('Accuracy: ', accuracies)
    print('F1_score: ', f1_scores)
    print('AUC: ' + str(auc_scores))
    print('Confusion-matrix: \n' + str(confusion_matrix(y_test, pred)) + '\n')
    return [precisions, recalls, accuracies, f1_scores, auc_scores]

In [98]:
# Time series split

def TimeSeriesKFold(X_train, y_train, number_folds, method):
    """Evaluate a classifier with an expanding-window, time-ordered split.

    The data is divided into ``number_folds`` consecutive chunks of
    ``len(X_train) // number_folds`` rows (trailing remainder rows are unused).
    For ``i = 2 .. number_folds`` the model is trained on the first ``i - 1``
    chunks and tested on chunk ``i``. Per-round metrics, the confusion matrix
    and a ROC plot are printed/shown, followed by the mean of each metric.

    Parameters
    ----------
    X_train : array-like with ``.shape`` and positional row slicing
        Feature matrix. NOTE(review): rows are presumably already in
        chronological order - confirm with the caller.
    y_train : array-like
        Labels aligned with ``X_train``.
        NOTE(review): assumed binary 0/1 (``pos_label=1`` is used) - confirm.
    number_folds : int
        Number of chunks; must be at least 2.
    method : str
        ``"RandomForest"`` or ``"LightGBM"``.

    Returns
    -------
    list of numpy.ndarray
        ``[precisions, recalls, accuracies, f1_scores, auc_scores]``, each of
        length ``number_folds - 1`` with values rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``method`` is unsupported or ``number_folds`` is smaller than 2.
    """
    # Validate up front: an unknown method would otherwise leave `clf` unbound,
    # and fewer than two folds yields no train/test round at all.
    if method not in ("RandomForest", "LightGBM"):
        raise ValueError(
            "Unsupported method %r; expected 'RandomForest' or 'LightGBM'" % (method,)
        )
    if number_folds < 2:
        raise ValueError("number_folds must be at least 2, got %r" % (number_folds,))

    print("-----" + method + "-----")
    print('Size of data set: ', X_train.shape)
    k = X_train.shape[0] // number_folds
    print('Size of each fold: ', k)

    n_rounds = number_folds - 1
    accuracies = np.zeros(n_rounds)
    recalls = np.zeros(n_rounds)
    precisions = np.zeros(n_rounds)
    f1_scores = np.zeros(n_rounds)
    auc_scores = np.zeros(n_rounds)

    if method == "RandomForest":
        clf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=0)
    else:
        clf = lgbm.LGBMClassifier(objective="binary", n_estimators=10000)

    # 'seaborn' was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the old name
    # was removed in 3.8; pick whichever this installation provides.
    style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'

    for i in range(2, number_folds + 1):
        print()
        print('Splitting the first ' + str(i) + ' chunks with ratio ' + str(i-1) + ':1')
        X = X_train[:(k*i)]
        y = y_train[:(k*i)]
        print('Size of train + test: ', X.shape)

        # Integer arithmetic avoids the float rounding of floor(n * (i-1)/i),
        # which can land one row short of the chunk boundary.
        index = X.shape[0] * (i - 1) // i
        X_trainFolds = X[:index]
        y_trainFolds = y[:index]

        # Fold used to test the model. It starts exactly at `index`; starting at
        # `index + 1` would silently drop one sample from every round.
        X_testFold = X[index:]
        y_testFold = y[index:]

        slot = i - 2  # position of this round in the metric arrays
        clf.fit(X_trainFolds, y_trainFolds)
        pred = clf.predict(X_testFold)
        precisions[slot] = round(precision_score(y_testFold, pred), 3)
        recalls[slot] = round(recall_score(y_testFold, pred), 3)
        accuracies[slot] = round(accuracy_score(y_testFold, pred), 3)
        f1_scores[slot] = round(f1_score(y_testFold, pred), 3)

        # ROC / AUC need a continuous score; hard 0/1 predictions collapse the
        # curve to one operating point. Use the probability of class 1.
        positive_col = list(clf.classes_).index(1)
        scores = clf.predict_proba(X_testFold)[:, positive_col]
        fpr, tpr, thresh = roc_curve(y_testFold, scores, pos_label=1)
        auc_scores[slot] = round(roc_auc_score(y_testFold, scores), 3)

        print('Precision on fold ' + str(i) + ': ', precisions[slot])
        print('Recall on fold ' + str(i) + ': ', recalls[slot])
        print('Accuracy on fold ' + str(i) + ': ', accuracies[slot])
        print('F1_score on fold ' + str(i) + ': ', f1_scores[slot])
        print('AUC on fold ' + str(i) + ': ', auc_scores[slot])
        print('Confusion-matrix: \n' + str(confusion_matrix(y_testFold, pred)))

        plt.style.use(style_name)
        plt.plot(fpr, tpr, linestyle='--', color='orange', label=method)
        # Chance-level reference line (what a constant score would produce).
        plt.plot([0, 1], [0, 1], linestyle='--', color='blue')
        plt.title('ROC curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive rate')
        plt.legend(loc='best')
        plt.show()

    print('Precision mean: ' + str(precisions.mean()))
    print('Recall mean: ' + str(recalls.mean()))
    print('Accuracy mean: ' + str(accuracies.mean()))
    print('F1-Score mean: ' + str(f1_scores.mean()))
    print('AUC mean: ' + str(auc_scores.mean()))
    return [precisions, recalls, accuracies, f1_scores, auc_scores]