In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import tensorflow as tf
from keras_tqdm import TQDMNotebookCallback

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
def get_class_weights(y):
    counter = Counter(y)
    majority = max(counter.values())
    return  {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()}

In [4]:
train_transactions = pd.read_csv("./data/train_transaction.csv")

In [5]:
filtered_transactions = train_transactions.iloc[:, 0:17]
encoded_dataset = pd.get_dummies(filtered_transactions, columns=[
                                 'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain'], drop_first=True)
encoded_dataset.fillna(0, inplace=True)

In [28]:
y = encoded_dataset.isFraud.values
X = encoded_dataset.drop(columns='isFraud').values

In [45]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [104]:
input_dim = X.shape[1]

138

In [149]:
model = Sequential()
model.add(Dense(10000, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [150]:
model.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_116 (Dense)            (None, 10000)             1390000   
_________________________________________________________________
dropout_56 (Dropout)         (None, 10000)             0         
_________________________________________________________________
dense_117 (Dense)            (None, 1)                 10001     
Total params: 1,400,001
Trainable params: 1,400,001
Non-trainable params: 0
_________________________________________________________________
None


In [151]:
es = [EarlyStopping(monitor='acc', mode='auto', verbose=1, patience=2, restore_best_weights=True),
     TQDMNotebookCallback(leave_inner=True,leave_outer=True)]

In [152]:
class_weights = get_class_weights(y_train)
history = model.fit(X_train,
                 y_train,
                 epochs=100,
                 batch_size=500,
                 verbose=0,
                 class_weight=class_weights,
                 validation_split=0.2,
                 callbacks = es
                 )

HBox(children=(IntProgress(value=0, description='Training', style=ProgressStyle(description_width='initial')),…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=425188, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=425188, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=425188, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 3', max=425188, style=ProgressStyle(description_width='…

Restoring model weights from the end of the best epoch
Epoch 00004: early stopping


In [155]:
y_pred  = model.predict_classes(X_test)

In [156]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[39462 17587]
 [  512  1493]]
              precision    recall  f1-score   support

           0       0.99      0.69      0.81     57049
           1       0.08      0.74      0.14      2005

    accuracy                           0.69     59054
   macro avg       0.53      0.72      0.48     59054
weighted avg       0.96      0.69      0.79     59054

