In [1]:

import tensorflow as tf

import keras
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding, LSTM
from tensorflow.keras.optimizers import RMSprop, Adam, Nadam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import TensorBoard

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

%matplotlib inline

import sys

print("Python: ", sys.version)
print("pandas: ", pd.__version__)
print("numpy: ", np.__version__)
print("seaborn: ", sns.__version__)
print("matplotlib: ", matplotlib.__version__)
print("sklearn: ", sklearn.__version__)
print("keras: ", keras.__version__)
print("tensorflow: ", tf.__version__)

# https://www.kaggle.com/mlg-ulb/creditcardfraud
filepath= './data-sample/creditcard.csv'
df = pd.read_csv(filepath_or_buffer=filepath, header=0, sep=",")
print(df.shape)

# You will collect 20k normal and 400 abnormal records. 
# You can pick different ratios to try, but in general more normal data examples are better because you want to teach your autoencoder what normal data looks like. 
# Too much abnormal data in training will train the autoencoder to learn that the anomalies are actually normal, which goes against your goal.

df["Amount"] = StandardScaler().fit_transform(df["Amount"].values.reshape(-1, 1))

df0 = df.query('Class == 0').sample(20000)
df1 = df.query('Class == 1').sample(400)

df = pd.concat([df0, df1])

x_train, x_test, y_train, y_test = train_test_split(df.drop(labels=['Time', 'Class'], axis=1), df['Class'], test_size=0.2, random_state=42)

print(x_train.shape, 'train samples')
print(x_test.shape, 'test samples')

Using TensorFlow backend.


Python:  3.6.8 (default, Apr 25 2019, 21:02:35) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-36)]
pandas:  0.25.1
numpy:  1.17.2
seaborn:  0.9.0
matplotlib:  3.1.1
sklearn:  0.20.4
keras:  2.2.4
tensorflow:  2.0.0
(284807, 31)
(16320, 29) train samples
(4080, 29) test samples


In [3]:
log_file_name = "deppautoencoder"

encoding_dim = 16
input_dim = x_train.shape[1]

inputArray = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(inputArray)
encoded = Dense(8, activation='relu')(encoded)
encoded = Dense(4, activation='relu')(encoded)

decoded = Dense(8, activation='relu')(encoded)
decoded = Dense(encoding_dim, activation='relu')(decoded)
decoded = Dense(input_dim, activation='softmax')(decoded)

autoencoder = Model(inputArray, decoded)

autoencoder.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 29)]              0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                480       
_________________________________________________________________
dense_7 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_8 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_9 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_10 (Dense)             (None, 16)                144       
_________________________________________________________________
dense_11 (Dense)             (None, 29)                493 

In [5]:
autoencoder.compile(optimizer=RMSprop(), loss='mean_squared_error', metrics=['mae', 'accuracy'])

batch_size = 32
epochs = 30

history = autoencoder.fit(x_train, x_train, 
                          batch_size=batch_size, 
                          epochs=epochs, 
                          verbose=1, 
                          shuffle=True, 
                          validation_data=(x_test, x_test),
                          callbacks=[TensorBoard(log_dir='../logs/' + log_file_name)])

score = autoencoder.evaluate(x_test, x_test, verbose=1)

print("Test loss: ", score[0])
print("Test accuracy: ", score[1])


Train on 16320 samples, validate on 4080 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test loss:  1.7066149784069435
Test accuracy:  0.67825013
