In [9]:
from numpy.random import seed
import pandas as pd
import numpy as np
from sklearn.preprocessing import minmax_scale
# from keras_diagram import ascii
from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import keras
import math
from sklearn.metrics import roc_curve, auc, roc_auc_score

data_raw = pd.read_csv("creditcard.csv")



In [10]:
# Assign variables x and y corresponding to row data and it's class value
X = data_raw.loc[:, data_raw.columns != 'Class']
y = data_raw.loc[:, data_raw.columns == 'Class']

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def generate_train_test_timeseries(data, test_ratio=0.3):
#     cutoff = math.floor(X.shape[0] *cuttoff_percent) 
#     print cutoff
    total_samples = data.shape[0]
    # splitting index
    cutoff = int(total_samples * (1 - test_ratio))

    
    data.sort_values('Time', inplace=True)
    
    X_train = data.loc[0:cutoff, data.columns != 'Class']
    y_train = data.loc[0:cutoff, data.columns == 'Class']
    X_test = data.loc[cutoff:, data.columns != 'Class']
    y_test = data.loc[cutoff:, data.columns == 'Class']
    
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
    ])
    preprocessor = pipeline.fit(X_train)
    X_train_prp = preprocessor.transform(X_train)
    

    X_test_prp = preprocessor.transform(X_test)
    
    return X_train_prp, y_train, X_test_prp, y_test
    
########################################################################

In [12]:
def reshape_to_batches(a, batch_size):
    # pad with zeros if the length is not divisible by the batch_size
    batch_num = np.ceil((float)(a.shape[0]) / batch_size)
    modulo = batch_num * batch_size - a.shape[0]
    if modulo != 0:
        pad = np.zeros((int(modulo), a.shape[1]))
        a = np.vstack((a, pad))
    return np.array(np.split(a, batch_num))

In [13]:
# a function to reshape batches into the original shape
def _3d_to_2d(arr):
    return arr.reshape(arr.shape[0] * arr.shape[1], arr.shape[2])

In [14]:
def create_model(batch_size=100, window_height=5):
    #import all dependencies
    from keras.layers import Input, Dense, Conv1D
    from keras.models import Model

    seed(2018)
    
    inputs = Input(shape=(batch_size, 30)) # This returns a tensor
    
    # a layer instance is callable on a tensor, and returns a tensor
    conv1 = Conv1D(32, (window_height), # 32 filters with a window of width 5
    strides=1, # think autoregression
    padding='causal', # forward in time
    )(inputs) # syntax to chain layers: Layer(...)(PreviousLayer)
    
    fc1 = Dense(64, activation='relu')(conv1)
    predictions = Dense(2, activation='softmax')(fc1)
    
    model = Model(inputs=inputs,
    outputs=predictions)
    model.compile(optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])
    
    return model

In [15]:
def time_series_cross_val(batch_size=100, window_height=5):
    
    precision = []
    recall = []
    f1score = []
    aucs = []
    
    cv = 0.3
    for i in range(3):
        print('Cross validating epoch ', i)
        X_train, y_train, X_test, y_test = generate_train_test_timeseries(data_raw, test_ratio = cv)

        y_test = keras.utils.to_categorical(y_test)
        y_train = keras.utils.to_categorical(y_train)

        X_train_batch = reshape_to_batches(X_train, batch_size)
        y_train_batch = reshape_to_batches(y_train, batch_size)
        y_test_batch = reshape_to_batches(y_test, batch_size)
        X_test_batch = reshape_to_batches(X_test, batch_size)

        print X_train_batch.shape
        print X_test_batch.shape
        print y_train_batch.shape
        print y_test_batch.shape

        model = create_model(batch_size, window_height )
        model.fit(X_train_batch, y_train_batch, epochs=30)
        y_pred = model.predict(X_test_batch)

        # store the raw predictions in a copy
        y_hat = np.copy(y_pred)

        # Classification reporting
        from sklearn.metrics import confusion_matrix, classification_report
        cutt_off_tr = 0.5
        y_pred[np.where(y_pred>=cutt_off_tr)] = 1
        y_pred[np.where(y_pred<cutt_off_tr)]  = 0

        y_pred_2d = _3d_to_2d(y_pred)[:,1]
        y_test_2d = _3d_to_2d(y_test_batch)[:,1]

        print(confusion_matrix(
        _3d_to_2d(y_test_batch)[:, 1],
        _3d_to_2d(y_pred)[:, 1]))

        prfs = precision_recall_fscore_support(y_test_2d, y_pred_2d, labels=[1])

        precision.append(prfs[0])
        recall.append(prfs[1])
        f1score.append(prfs[2])
        aucs.append(roc_auc_score(y_test_2d, _3d_to_2d(y_hat)[:, 1]))

        print prfs
        
        print(classification_report(
        y_test_2d,
        y_pred_2d,
        target_names = ["Genuine", "Fraud"],
        digits = 5))
        
        cv = cv - 0.1
        
    
    return [np.mean(f1score),np.mean(precision),np.mean(recall),np.mean(aucs)]

In [16]:
results_1 = time_series_cross_val(100, 5)
results_1

('Cross validating epoch ', 0)
(1994, 100, 30)
(855, 100, 30)
(1994, 100, 2)
(855, 100, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[85392     0]
 [   55    53]]
(array([ 1.]), array([ 0.49074074]), array([ 0.65838509]), array([108]))
             precision    recall  f1-score   support

    Genuine    0.99936   1.00000   0.99968     85392
      Fraud    1.00000   0.49074   0.65839       108

avg / total    0.99936   0.99936   0.99925     85500

('Cross validating epoch ', 1)
(2279, 100, 30)
(570, 100, 30)
(2279, 100, 2)
(570, 100, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
E

Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[28476     2]
 [   11    11]]
(array([ 0.84615385]), array([ 0.5]), array([ 0.62857143]), array([22]))
             precision    recall  f1-score   support

    Genuine    0.99961   0.99993   0.99977     28478
      Fraud    0.84615   0.50000   0.62857        22

avg / total    0.99950   0.99954   0.99949     28500



[0.6748871465906392,
 0.9345335515548282,
 0.53024691358024689,
 0.95273114292797467]

In [17]:
results_2 = time_series_cross_val(200, 5)
results_2

('Cross validating epoch ', 0)
(997, 200, 30)
(428, 200, 30)
(997, 200, 2)
(428, 200, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[85492     0]
 [   58    50]]
(array([ 1.]), array([ 0.46296296]), array([ 0.63291139]), array([108]))
             precision    recall  f1-score   support

    Genuine    0.99932   1.00000   0.99966     85492
      Fraud    1.00000   0.46296   0.63291       108

avg / total    0.99932   0.99932   0.99920     85600

('Cross validating epoch ', 1)
(1140, 200, 30)
(285, 200, 30)
(1140, 200, 2)
(285, 200, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epo

Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[28577     1]
 [   10    12]]
(array([ 0.92307692]), array([ 0.54545455]), array([ 0.68571429]), array([22]))
             precision    recall  f1-score   support

    Genuine    0.99965   0.99997   0.99981     28578
      Fraud    0.92308   0.54545   0.68571        22

avg / total    0.99959   0.99962   0.99957     28600



[0.68398633715089396,
 0.96695156695156703,
 0.53169472502805837,
 0.93828613074893796]

In [18]:
results_3 = time_series_cross_val(500, 5)
results_3

('Cross validating epoch ', 0)
(399, 500, 30)
(171, 500, 30)
(399, 500, 2)
(171, 500, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[85392     0]
 [   61    47]]
(array([ 1.]), array([ 0.43518519]), array([ 0.60645161]), array([108]))
             precision    recall  f1-score   support

    Genuine    0.99929   1.00000   0.99964     85392
      Fraud    1.00000   0.43519   0.60645       108

avg / total    0.99929   0.99929   0.99915     85500

('Cross validating epoch ', 1)
(456, 500, 30)
(114, 500, 30)
(456, 500, 2)
(114, 500, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch

Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[28478     0]
 [   14     8]]
(array([ 1.]), array([ 0.36363636]), array([ 0.53333333]), array([22]))
             precision    recall  f1-score   support

    Genuine    0.99951   1.00000   0.99975     28478
      Fraud    1.00000   0.36364   0.53333        22

avg / total    0.99951   0.99951   0.99939     28500



[0.60799849085078284, 1.0, 0.43960718294051632, 0.92482992133313457]

In [19]:
results_4 = time_series_cross_val(100, 10)
results_4

('Cross validating epoch ', 0)
(1994, 100, 30)
(855, 100, 30)
(1994, 100, 2)
(855, 100, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[85392     0]
 [   43    65]]
(array([ 1.]), array([ 0.60185185]), array([ 0.75144509]), array([108]))
             precision    recall  f1-score   support

    Genuine    0.99950   1.00000   0.99975     85392
      Fraud    1.00000   0.60185   0.75145       108

avg / total    0.99950   0.99950   0.99943     85500

('Cross validating epoch ', 1)
(2279, 100, 30)
(570, 100, 30)
(2279, 100, 2)
(570, 100, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
E

Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[28475     3]
 [    8    14]]
(array([ 0.82352941]), array([ 0.63636364]), array([ 0.71794872]), array([22]))
             precision    recall  f1-score   support

    Genuine    0.99972   0.99989   0.99981     28478
      Fraud    0.82353   0.63636   0.71795        22

avg / total    0.99958   0.99961   0.99959     28500



[0.73424237932908454,
 0.93376906318082786,
 0.60829405162738492,
 0.93075398609097126]

In [20]:
results_5 = time_series_cross_val(200, 10)
results_5

('Cross validating epoch ', 0)
(997, 200, 30)
(428, 200, 30)
(997, 200, 2)
(428, 200, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[85492     0]
 [   53    55]]
(array([ 1.]), array([ 0.50925926]), array([ 0.67484663]), array([108]))
             precision    recall  f1-score   support

    Genuine    0.99938   1.00000   0.99969     85492
      Fraud    1.00000   0.50926   0.67485       108

avg / total    0.99938   0.99938   0.99928     85600

('Cross validating epoch ', 1)
(1140, 200, 30)
(285, 200, 30)
(1140, 200, 2)
(285, 200, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epo

Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[28575     3]
 [    8    14]]
(array([ 0.82352941]), array([ 0.63636364]), array([ 0.71794872]), array([22]))
             precision    recall  f1-score   support

    Genuine    0.99972   0.99990   0.99981     28578
      Fraud    0.82353   0.63636   0.71795        22

avg / total    0.99958   0.99962   0.99959     28600



[0.71563123479044233,
 0.93408427200667499,
 0.58631874298540965,
 0.91178867309952005]

In [21]:
results_6 = time_series_cross_val(500, 10)
results_6

('Cross validating epoch ', 0)
(399, 500, 30)
(171, 500, 30)
(399, 500, 2)
(171, 500, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[85392     0]
 [   53    55]]
(array([ 1.]), array([ 0.50925926]), array([ 0.67484663]), array([108]))
             precision    recall  f1-score   support

    Genuine    0.99938   1.00000   0.99969     85392
      Fraud    1.00000   0.50926   0.67485       108

avg / total    0.99938   0.99938   0.99928     85500

('Cross validating epoch ', 1)
(456, 500, 30)
(114, 500, 30)
(456, 500, 2)
(114, 500, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch

Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[28477     1]
 [   14     8]]
(array([ 0.88888889]), array([ 0.36363636]), array([ 0.51612903]), array([22]))
             precision    recall  f1-score   support

    Genuine    0.99951   0.99996   0.99974     28478
      Fraud    0.88889   0.36364   0.51613        22

avg / total    0.99942   0.99947   0.99936     28500



[0.62118067656878384,
 0.96296296296296291,
 0.4598540965207632,
 0.9024895427458185]