In [33]:
# Import libraries
import pandas as pd
import numpy as np
import keras
import lightgbm as lgbm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, confusion_matrix, precision_score, f1_score

In [2]:
# Import dataset
df = pd.read_csv('data/input.csv')

# Model inputs plus the 'is_fraud' target. 'date' is carried along only so the
# rows can be ordered by it; it is dropped again right after sorting.
selected_columns = [
    'transaction_risk_score', 'cc_amount', 'ledger_balance',
    'cardholder_presence', 'card_presence', 'partial_approval_capable',
    'channel', 'processing_type', 'date',
    'cc_acceptor_state', 'cc_acceptor_country', 'is_fraud',
]
df2 = df[selected_columns].sort_values(by='date').drop("date", axis=1)

In [35]:
# Separate the feature columns from the 'is_fraud' target (as a flat 1-D array),
# then standardise the features.
# NOTE(review): the scaler is fitted on every row of df2, including rows that
# TimeSeriesKFold later uses as test folds when X_scale is passed to it.
X = df2.drop(columns="is_fraud")
y = df2[["is_fraud"]].to_numpy().flatten()
scalar = StandardScaler()
X_scale = scalar.fit_transform(X)

In [17]:
# Split data
# Positional 80/20 hold-out: the first 80% of the rows of df2 (sorted by date
# when it was built) form the training set, the remaining 20% the test set.
# Plain iloc slicing is used instead of np.split, which routes a DataFrame
# through the deprecated DataFrame.swapaxes.
split_at = int(.8 * len(df2))
training_set = df2.iloc[:split_at]
test_set = df2.iloc[split_at:]

X_train = training_set.drop("is_fraud",axis=1)
y_train = training_set[['is_fraud']].values.flatten()

X_test = test_set.drop("is_fraud",axis=1)
y_test = test_set[['is_fraud']].values.flatten()

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test period leaks into training.
scalar = StandardScaler()
x_train_scale = scalar.fit_transform(X_train)
x_test_scale = scalar.transform(X_test)

In [36]:
# Time series split
# Classifier factories keyed by the `method` name accepted by TimeSeriesKFold.
# Each entry is a zero-argument callable so every fold gets a fresh, unfitted model.
_CLASSIFIER_FACTORIES = {
    "RandomForest": lambda: RandomForestClassifier(n_estimators=500, max_depth=15, random_state=0),
    "LightGBM": lambda: lgbm.LGBMClassifier(objective="binary", n_estimators=10000),
}


def TimeSeriesKFold(X_train, y_train, number_folds, method):
    """Evaluate a classifier with forward-chaining validation over ordered rows.

    The rows are cut into ``number_folds`` consecutive chunks of equal size
    ``k = n_rows // number_folds`` (trailing remainder rows are never used).
    For ``i = 2 .. number_folds`` a fresh model is trained on the first
    ``i - 1`` chunks and tested on chunk ``i``, so a model is always tested on
    rows positioned after everything it was trained on.

    Parameters
    ----------
    X_train : array-like supporting ``.shape`` and row slicing
        Feature matrix. Assumed to be in chronological order -- the function
        itself does not sort.
    y_train : array-like supporting slicing
        Binary labels aligned with ``X_train``.
    number_folds : int
        Number of chunks; must be at least 2.
    method : str
        ``"RandomForest"`` or ``"LightGBM"``.

    Returns
    -------
    list of float
        ``[precision, recall, accuracy, f1]``, each the mean over the
        ``number_folds - 1`` evaluation rounds of the per-round score
        rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``method`` is not supported, ``number_folds < 2``, or there are
        fewer rows than folds.
    """
    # Fail fast with a clear message instead of an unbound `clf` inside the loop.
    if method not in _CLASSIFIER_FACTORIES:
        raise ValueError(
            "Unknown method %r; expected one of %s"
            % (method, sorted(_CLASSIFIER_FACTORIES))
        )
    if number_folds < 2:
        raise ValueError("number_folds must be at least 2, got %r" % (number_folds,))

    print('Size of train set: ', X_train.shape)
    k = X_train.shape[0] // number_folds
    if k < 1:
        raise ValueError(
            "Need at least number_folds rows: got %d rows for %d folds"
            % (X_train.shape[0], number_folds)
        )
    print('Size of each fold: ', k)

    accuracies = np.zeros(number_folds-1)
    recalls = np.zeros(number_folds-1)
    precisions = np.zeros(number_folds-1)
    f1_scores = np.zeros(number_folds-1)

    for i in range(2, number_folds + 1):
        print()
        print('Splitting the first ' + str(i) + ' chunks with ratio ' + str(i-1) + ':1')
        X = X_train[:(k*i)]
        y = y_train[:(k*i)]
        print('Size of train + test: ', X.shape)

        # Boundary between the first i-1 chunks and chunk i. Integer arithmetic
        # keeps it exactly on the chunk edge (no float-ratio rounding).
        index = k * (i - 1)
        X_trainFolds = X[:index]
        y_trainFolds = y[:index]

        # Fold used to test the model: starts at `index` so the row right
        # after the training data is not skipped.
        X_testFold = X[index:]
        y_testFold = y[index:]

        clf = _CLASSIFIER_FACTORIES[method]()
        clf.fit(X_trainFolds, y_trainFolds)
        pred = clf.predict(X_testFold)

        slot = i - 2  # rounds are numbered from 2, result arrays from 0
        precisions[slot] = round(precision_score(y_testFold, pred),3)
        recalls[slot] = round(recall_score(y_testFold, pred),3)
        accuracies[slot] = round(accuracy_score(y_testFold, pred),3)
        f1_scores[slot] = round(f1_score(y_testFold, pred),3)

        print('Precision on fold ' + str(i) + ': ', precisions[slot])
        print('Recall on fold ' + str(i) + ': ', recalls[slot])
        print('Accuracy on fold ' + str(i) + ': ', accuracies[slot])
        print('f1_score on fold ' + str(i) + ': ', f1_scores[slot])
    return [precisions.mean(),recalls.mean(),accuracies.mean(),f1_scores.mean()]

# Forward-chaining evaluation of LightGBM over the full scaled feature matrix,
# cut into 10 chunks; the cell displays the returned list of mean scores.
TimeSeriesKFold(X_train=X_scale, y_train=y, number_folds=10, method='LightGBM')

Size of train set:  (35707, 10)
Size of each fold:  3570

Splitting the first 2 chunks with ratio 1:1
Size of train + test:  (7140, 10)
Precision on fold 2:  0.972
Recall on fold 2:  0.86
Accuracy on fold 2:  0.991
f1_score on fold 2:  0.912

Splitting the first 3 chunks with ratio 2:1
Size of train + test:  (10710, 10)
Precision on fold 3:  0.985
Recall on fold 3:  0.84
Accuracy on fold 3:  0.989
f1_score on fold 3:  0.907

Splitting the first 4 chunks with ratio 3:1
Size of train + test:  (14280, 10)
Precision on fold 4:  0.977
Recall on fold 4:  0.809
Accuracy on fold 4:  0.991
f1_score on fold 4:  0.885

Splitting the first 5 chunks with ratio 4:1
Size of train + test:  (17850, 10)
Precision on fold 5:  0.97
Recall on fold 5:  0.757
Accuracy on fold 5:  0.987
f1_score on fold 5:  0.85

Splitting the first 6 chunks with ratio 5:1
Size of train + test:  (21420, 10)
Precision on fold 6:  0.948
Recall on fold 6:  0.92
Accuracy on fold 6:  0.996
f1_score on fold 6:  0.934

Splitting the

[0.9788888888888889, 0.8982222222222221, 0.994, 0.9348888888888889]

In [19]:
# Confusion matrix of the hold-out labels against the predictions in `pred`.
# NOTE(review): `pred` is not assigned at notebook level in any visible cell
# (it only exists as a local inside TimeSeriesKFold) -- presumably it is left
# over from a model fitted in a cell that is no longer shown; confirm this
# cell still works after Restart & Run All.
confusion_matrix(y_test, pred)

array([[17012,     7],
       [   22,   813]])