In [6]:
from numpy.random import seed
import pandas as pd
import numpy as np
from sklearn.preprocessing import minmax_scale
# from keras_diagram import ascii
from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import keras
import math
from sklearn.metrics import roc_curve, auc, roc_auc_score

data_raw = pd.read_csv("creditcard.csv")



In [7]:
# Assign variables x and y corresponding to row data and it's class value
X = data_raw.loc[:, data_raw.columns != 'Class']
y = data_raw.loc[:, data_raw.columns == 'Class']

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def generate_train_test_timeseries(data, test_ratio=0.3):
#     cutoff = math.floor(X.shape[0] *cuttoff_percent) 
#     print cutoff
    total_samples = data.shape[0]
    # splitting index
    cutoff = int(total_samples * (1 - test_ratio))

    
    data.sort_values('Time', inplace=True)
    
    X_train = data.loc[0:cutoff, data.columns != 'Class']
    y_train = data.loc[0:cutoff, data.columns == 'Class']
    X_test = data.loc[cutoff:, data.columns != 'Class']
    y_test = data.loc[cutoff:, data.columns == 'Class']
    
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
    ])
    preprocessor = pipeline.fit(X_train)
    X_train_prp = preprocessor.transform(X_train)
    

    X_test_prp = preprocessor.transform(X_test)
    
    return X_train_prp, y_train, X_test_prp, y_test
    
########################################################################

In [9]:
def reshape_to_batches(a, batch_size):
    # pad with zeros if the length is not divisible by the batch_size
    batch_num = np.ceil((float)(a.shape[0]) / batch_size)
    modulo = batch_num * batch_size - a.shape[0]
    if modulo != 0:
        pad = np.zeros((int(modulo), a.shape[1]))
        a = np.vstack((a, pad))
    return np.array(np.split(a, batch_num))

In [10]:
# a function to reshape batches into the original shape
def _3d_to_2d(arr):
    return arr.reshape(arr.shape[0] * arr.shape[1], arr.shape[2])

In [11]:
def create_model():
    #import all dependencies
    from keras.layers import Input, Dense, Conv1D
    from keras.models import Model

    seed(2018)
    
    inputs = Input(shape=(100, 30)) # This returns a tensor
    
    # a layer instance is callable on a tensor, and returns a tensor
    conv1 = Conv1D(32, (5), # 32 filters with a window of width 5
    strides=1, # think autoregression
    padding='causal', # forward in time
    )(inputs) # syntax to chain layers: Layer(...)(PreviousLayer)
    
    fc1 = Dense(64, activation='relu')(conv1)
    predictions = Dense(2, activation='softmax')(fc1)
    
    model = Model(inputs=inputs,
    outputs=predictions)
    model.compile(optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])
    
    return model

In [15]:
def time_series_cross_val():
    
    precision = []
    recall = []
    f1score = []
    aucs = []
    
    cv = 0.3
    for i in range(3):
        print('Cross validating epoch ', i)
        X_train, y_train, X_test, y_test = generate_train_test_timeseries(data_raw, test_ratio = cv)

        y_test = keras.utils.to_categorical(y_test)
        y_train = keras.utils.to_categorical(y_train)

        X_train_batch = reshape_to_batches(X_train, 100)
        y_train_batch = reshape_to_batches(y_train, 100)
        y_test_batch = reshape_to_batches(y_test, 100)
        X_test_batch = reshape_to_batches(X_test, 100)

        print X_train_batch.shape
        print X_test_batch.shape
        print y_train_batch.shape
        print y_test_batch.shape

        model = create_model()
        model.fit(X_train_batch, y_train_batch, epochs=30)
        y_pred = model.predict(X_test_batch)

        # store the raw predictions in a copy
        y_hat = np.copy(y_pred)

        # Classification reporting
        from sklearn.metrics import confusion_matrix, classification_report
        cutt_off_tr = 0.5
        y_pred[np.where(y_pred>=cutt_off_tr)] = 1
        y_pred[np.where(y_pred<cutt_off_tr)]  = 0

        y_pred_2d = _3d_to_2d(y_pred)[:,1]
        y_test_2d = _3d_to_2d(y_test_batch)[:,1]

        print(confusion_matrix(
        _3d_to_2d(y_test_batch)[:, 1],
        _3d_to_2d(y_pred)[:, 1]))

        prfs = precision_recall_fscore_support(y_test_2d, y_pred_2d, labels=[1])

        precision.append(prfs[0])
        recall.append(prfs[1])
        f1score.append(prfs[2])
        aucs.append(roc_auc_score(y_test_2d, _3d_to_2d(y_hat)[:, 1]))

        print prfs
        
        print(classification_report(
        y_test_2d,
        y_pred_2d,
        target_names = ["Genuine", "Fraud"],
        digits = 5))
        
        cv = cv - 0.1
        
    
    return [np.mean(f1score),np.mean(precision),np.mean(recall),np.mean(aucs)]

In [16]:
results = time_series_cross_val()

('Cross validating epoch ', 0)
(1994, 100, 30)
(855, 100, 30)
(1994, 100, 2)
(855, 100, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[85392     0]
 [   56    52]]
(array([1.]), array([0.48148148]), array([0.65]), array([108]))
             precision    recall  f1-score   support

    Genuine    0.99934   1.00000   0.99967     85392
      Fraud    1.00000   0.48148   0.65000       108

avg / total    0.99935   0.99935   0.99923     85500

('Cross validating epoch ', 1)
(2279, 100, 30)
(570, 100, 30)
(2279, 100, 2)
(570, 100, 2)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/3

Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[28476     2]
 [   11    11]]
(array([0.84615385]), array([0.5]), array([0.62857143]), array([22]))
             precision    recall  f1-score   support

    Genuine    0.99961   0.99993   0.99977     28478
      Fraud    0.84615   0.50000   0.62857        22

avg / total    0.99950   0.99954   0.99949     28500



In [17]:
results

[0.6706349206349206, 0.9413105413105414, 0.522716049382716, 0.9524019821436042]