In [57]:
#import sys
#!{sys.executable} -m pip install keras

In [58]:
#!{sys.executable} -m pip install tensorflow

In [59]:

'''Main'''
import numpy as np
import pandas as pd
import os, time, re
import pickle, gzip

'''Data Viz'''
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import matplotlib as mpl

%matplotlib inline

'''Data Prep and Model Evaluation'''
from sklearn import preprocessing as pp
from sklearn import model_selection 
#from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

'''TensorFlow and Keras'''
# import tensorflow as tf
# TensorFlow is already configured as the Keras backend
import keras
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout
from keras.layers import BatchNormalization, Input, Lambda
from keras import regularizers
from keras.losses import mse, binary_crossentropy

In [60]:

# Preprocess the data (credit card transaction - detect fraud)

def get_train_test_data(csv_path='C:/dev/research/temp/handson-unsupervised-learning/datasets/credit_card_data/credit_card.csv',
                        test_size=0.33, random_state=2018):
    """Load the credit card CSV, standardize the features and split train/test.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file. It is read with the first row as header and
        the first column as index, and must contain a 'Class' column holding
        the labels. The default is the original author's local path; pass your
        own path when running elsewhere.
    test_size : float
        Fraction of rows placed in the test split.
    random_state : int
        Seed passed to the splitter so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized feature frames (independent copies).
    y_train, y_test : pandas.Series
        The matching 'Class' labels.
    """
    df = pd.read_csv(csv_path, header=0, index_col=0)
    dataX = df.drop(['Class'], axis=1)
    dataY = df['Class'].copy()

    # Standardize every feature to mean 0 and standard deviation 1.
    # A new frame is built instead of writing back through .loc so that the
    # result is always float, whatever dtype the raw columns had.
    # NOTE(review): the scaler is fitted on the full data set before the split,
    # so test rows influence the scaling statistics. Kept as-is so the numbers
    # recorded in this notebook stay reproducible.
    ss = pp.StandardScaler()
    dataX = pd.DataFrame(ss.fit_transform(dataX),
                         index=dataX.index, columns=dataX.columns)

    # Stratify on the label so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        dataX, dataY, test_size=test_size, random_state=random_state,
        stratify=dataY)
    return X_train.copy(), X_test.copy(), y_train, y_test

In [61]:
## Reference https://github.com/aapatel09/handson-unsupervised-learning/blob/master/08_autoencoders.ipynb
def anomalyScores(originalDF, reducedDF):
    """Score each row by its squared reconstruction error, scaled to [0, 1].

    Parameters
    ----------
    originalDF : pandas.DataFrame
        The original rows; its index becomes the index of the result.
    reducedDF : array-like
        The reconstructed rows, convertible to an array that can be
        subtracted element-wise from ``originalDF``.

    Returns
    -------
    pandas.Series
        Per-row sum of squared differences, min-max normalized so the smallest
        error maps to 0 and the largest to 1. When every row has the same
        error the normalization is undefined (0/0); a series of zeros is
        returned in that case instead of NaNs.
    """
    squared_error = (np.asarray(originalDF) - np.asarray(reducedDF)) ** 2
    loss = pd.Series(data=np.sum(squared_error, axis=1), index=originalDF.index)

    lowest = loss.min()
    spread = loss.max() - lowest
    if spread == 0:
        # All rows reconstruct equally well: no row is more anomalous than another.
        return pd.Series(0.0, index=loss.index)
    return (loss - lowest) / spread

In [62]:
## Reference https://github.com/aapatel09/handson-unsupervised-learning/blob/master/08_autoencoders.ipynb
def plotResults(trueLabels, anomalyScores, plot=False):
    """Compute average precision and ROC AUC for a set of anomaly scores.

    Parameters
    ----------
    trueLabels : pandas.Series
        Ground-truth labels.
    anomalyScores : pandas.Series
        Anomaly score per row. (Inside this function the parameter shadows
        the module-level function of the same name.)
    plot : bool, default False
        When True, draw the precision-recall curve and the ROC curve. The
        default keeps the previous behaviour of this function, in which the
        plotting code was disabled.

    Returns
    -------
    preds : pandas.DataFrame
        The two inputs joined column-wise (aligned on index) with columns
        'trueLabel' and 'anomalyScore'.
    average_precision : float
        Result of ``average_precision_score`` on those columns.
    areaUnderROC : float
        Area under the ROC curve computed from ``roc_curve``.
    """
    preds = pd.concat([trueLabels, anomalyScores], axis=1)
    preds.columns = ['trueLabel', 'anomalyScore']
    labels = preds['trueLabel']
    scores = preds['anomalyScore']

    average_precision = average_precision_score(labels, scores)
    fpr, tpr, _ = roc_curve(labels, scores)
    areaUnderROC = auc(fpr, tpr)

    if plot:
        # The precision-recall curve is only needed for drawing.
        precision, recall, _ = precision_recall_curve(labels, scores)
        fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(12, 5))

        ax_pr.step(recall, precision, color='k', alpha=0.7, where='post')
        ax_pr.fill_between(recall, precision, step='post', alpha=0.3, color='k')
        ax_pr.set(
            xlabel='Recall', ylabel='Precision',
            xlim=(0.0, 1.0), ylim=(0.0, 1.05),
            title='Precision-Recall curve: Average Precision = '
                  '{0:0.2f}'.format(average_precision))

        ax_roc.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
        ax_roc.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
        ax_roc.set(
            xlabel='False Positive Rate', ylabel='True Positive Rate',
            xlim=(0.0, 1.0), ylim=(0.0, 1.05),
            title='Receiver operating characteristic: Area under the '
                  'curve = {0:0.2f}'.format(areaUnderROC))
        ax_roc.legend(loc="lower right")

        fig.tight_layout()
        plt.show()

    return preds, average_precision, areaUnderROC

In [63]:
# Model building function

def getAModel(activation='linear', input_hidden_units=29, output_hidden_units=29, input_dim=29, metrics=['accuracy']):
    """Build and compile a two-layer dense autoencoder.

    Parameters
    ----------
    activation : str
        Activation used by BOTH dense layers, including the final one. Note
        that with 'relu' the final layer can only emit non-negative values.
    input_hidden_units : int
        Number of units in the first (hidden) dense layer.
    output_hidden_units : int
        Number of units in the second dense layer, which is the model's
        output layer.
    input_dim : int
        Number of input features.
    metrics : list of str, str or None
        Metrics reported during training/evaluation. None falls back to
        ['accuracy']; a single string is wrapped in a list.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer and mean squared error loss.
    """
    # Normalize `metrics` into a fresh list so the shared default list is never
    # handed to Keras and a bare string is not split into characters.
    if metrics is None:
        metrics = ['accuracy']
    elif isinstance(metrics, str):
        metrics = [metrics]
    else:
        metrics = list(metrics)

    model = Sequential()
    model.add(Dense(units=input_hidden_units, activation=activation, input_dim=input_dim))
    # Final layer: its output is what gets compared against the training target.
    model.add(Dense(units=output_hidden_units, activation=activation))

    # Previously the `metrics` argument was ignored and ['accuracy'] was hard-coded here.
    model.compile(optimizer='Adam', loss='mean_squared_error', metrics=metrics)
    return model

In [64]:
## Try with the simplest model

# input_hidden_units, output_hidden_units, input_dim
model = getAModel(
    activation='linear',
    input_hidden_units=29,
    output_hidden_units=29,
    input_dim=29,
    metrics=['accuracy'],
)

In [65]:
# Load, standardize and split the data once; the cells below reuse these splits.
X_train, X_test, y_train, y_test = get_train_test_data()

In [66]:
# Sanity check: (number of rows, number of feature columns) of the training split.
X_train.shape

(190820, 29)

In [67]:
## Fit a model with parameters like epochs and batch size

def fitAModel(model, X, y, validation_data, epochs=10, batch_size = 32, verbose=1):
    """Train ``model`` on (X, y) and return whatever ``model.fit`` returns.

    The rows are shuffled on every epoch (``shuffle=True``); the remaining
    settings are forwarded unchanged to ``model.fit``.
    """
    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_data=validation_data,
        verbose=verbose,
    )
    return model.fit(x=X, y=y, **fit_options)

In [68]:
# Training hyper-parameters. The training set has 190820 rows, so with a batch
# size of 32 the weights are adjusted once per 32 rows.
epochs = 10
batch_size = 32

# The autoencoder learns to reproduce its input, so X_train is both features and target.
# NOTE: the validation data passed here is the training data itself.
history = fitAModel(
    model, X_train, X_train,
    validation_data=(X_train, X_train),
    epochs=epochs, batch_size=batch_size, verbose=0,
)


In [69]:
def evaluation(model, test_X, test_y):
    """Score ``model`` on a held-out set.

    The model's predictions for ``test_X`` are turned into per-row anomaly
    scores with ``anomalyScores`` and compared against ``test_y`` with
    ``plotResults``.

    Returns
    -------
    tuple
        (average precision, per-row anomaly scores, area under the ROC curve)
    """
    reconstructed = model.predict(test_X, verbose=1)
    scores = anomalyScores(test_X, reconstructed)
    _, avg_precision, roc_auc = plotResults(test_y, scores)
    # NOTE(review): reset_states is normally used to clear the state of stateful
    # recurrent layers; presumably a no-op for this Dense-only model -- confirm
    # before removing.
    model.reset_states()
    return avg_precision, scores, roc_auc

In [72]:
def calculateAvgResults(model, test_x, test_y, num = 10):
    """Evaluate ``model`` ``num`` times and print mean / coefficient of variation.

    For each of average precision, anomaly scores and ROC area, the mean over
    all runs and the coefficient of variation (std / mean) are printed.

    Note that the model is NOT re-trained between runs: each iteration only
    calls ``evaluation`` again on the same trained model and the same data.

    Parameters
    ----------
    model : trained model accepted by ``evaluation``
    test_x, test_y : test features and labels, forwarded to ``evaluation``
    num : int
        Number of evaluation runs; must be at least 1.

    Raises
    ------
    ValueError
        If ``num`` is smaller than 1 (there would be nothing to average).
    """
    if num < 1:
        raise ValueError("num must be at least 1, got {0}".format(num))

    test_precision = []
    test_scores = []
    test_roc_area = []
    for _ in range(num):
        precision, score, roc_area = evaluation(model, test_x, test_y)
        test_precision.append(precision)
        test_scores.append(score)
        test_roc_area.append(roc_area)

    def _report(label, values):
        # Print the mean and the coefficient of variation of one metric.
        # The run count comes from `num` instead of a hard-coded "10".
        mean = np.mean(values)
        print("Mean average {0} over {1} runs: ".format(label, num), mean)
        print("Coefficient of variation over {0} runs: ".format(num),
              np.std(values) / mean)

    print('==========')
    _report("precision", test_precision)
    print('---------')
    # test_scores holds one score series per run; mean/std are taken over all values.
    _report("anomaly scores", test_scores)
    _report("roc area", test_roc_area)

In [73]:
## Main flow for executions

# Training hyper-parameters. The training set has 190820 rows, so with a batch
# size of 32 the weights are adjusted once per 32 rows.
epochs = 10
batch_size = 32

# Baseline: 29 inputs -> 29 hidden units -> 29 outputs, linear activation.
model = getAModel(
    activation='linear',
    input_hidden_units=29,
    output_hidden_units=29,
    input_dim=29,
    metrics=['accuracy'],
)

# Train to reconstruct the input, then report the metrics on the test split.
fitAModel(
    model, X_train, X_train,
    validation_data=(X_train, X_train),
    epochs=epochs, batch_size=batch_size, verbose=0,
)
calculateAvgResults(model, X_test, y_test, num=10)

Mean average precision over 10 runs:  0.5488220757674558
Coefficient of variation over 10 runs:  0.0
---------
Mean average anomaly scores over 10 runs:  0.001061009928544794
Coefficient of variation over 10 runs:  18.77428325760857
Mean average roc area over 10 runs:  0.9523418631350064
Coefficient of variation over 10 runs:  1.1657820238736776e-16


### Undercomplete Autoencoders - linear

In [74]:
# Change input hidden units from 29 to 20
# input_hidden_units, output_hidden_units, input_dim
model = getAModel(
    activation='linear',
    input_hidden_units=20,
    output_hidden_units=29,
    input_dim=29,
    metrics=['accuracy'],
)

# Train to reconstruct the input, then report the metrics on the test split.
fitAModel(
    model, X_train, X_train,
    validation_data=(X_train, X_train),
    epochs=epochs, batch_size=batch_size, verbose=0,
)
calculateAvgResults(model, X_test, y_test, num=10)

Mean average precision over 10 runs:  0.31260688752814675
Coefficient of variation over 10 runs:  0.0
---------
Mean average anomaly scores over 10 runs:  0.0017948164701140131
Coefficient of variation over 10 runs:  7.185072533749804
Mean average roc area over 10 runs:  0.9505417558956951
Coefficient of variation over 10 runs:  1.1679897466250643e-16


In [75]:
##
# Above results are not good ...

In [76]:
# input_hidden_units, output_hidden_units, input_dim
model = getAModel(
    activation='linear',
    input_hidden_units=27,
    output_hidden_units=29,
    input_dim=29,
    metrics=['accuracy'],
)

# Train to reconstruct the input, then report the metrics on the test split.
fitAModel(
    model, X_train, X_train,
    validation_data=(X_train, X_train),
    epochs=epochs, batch_size=batch_size, verbose=0,
)
calculateAvgResults(model, X_test, y_test, num=10)

Mean average precision over 10 runs:  0.6977205730384144
Coefficient of variation over 10 runs:  0.0
---------
Mean average anomaly scores over 10 runs:  0.0005464932881718011
Coefficient of variation over 10 runs:  30.61849531986307
Mean average roc area over 10 runs:  0.9353208133082014
Coefficient of variation over 10 runs:  2.3739940538654996e-16


In [77]:
###
# by changing to 27 (number of nodes in the first layer) instead of 20 there's quite a lot of improvement

### Overcomplete Autoencoders - linear

In [78]:
# Change input hidden units from 29 to 32 (more hidden units than input features)
# input_hidden_units, output_hidden_units, input_dim
model = getAModel(
    activation='linear',
    input_hidden_units=32,
    output_hidden_units=29,
    input_dim=29,
    metrics=['accuracy'],
)

# Train to reconstruct the input, then report the metrics on the test split.
fitAModel(
    model, X_train, X_train,
    validation_data=(X_train, X_train),
    epochs=epochs, batch_size=batch_size, verbose=0,
)
calculateAvgResults(model, X_test, y_test, num=10)

Mean average precision over 10 runs:  0.06061943464887889
Coefficient of variation over 10 runs:  1.1446648989880604e-16
---------
Mean average anomaly scores over 10 runs:  0.00010980700174002143
Coefficient of variation over 10 runs:  44.4914522024267
Mean average roc area over 10 runs:  0.9441595694637706
Coefficient of variation over 10 runs:  1.1758849462868875e-16


In [None]:
###
# this is even worse 
#
###

### Undercomplete Autoencoders - ReLu

In [79]:
## 
# Using 27 hidden units (which gave about 0.70 average precision with linear activation)
# to see whether ReLU can beat it
##

# input_hidden_units, output_hidden_units, input_dim
model = getAModel(
    activation='relu',
    input_hidden_units=27,
    output_hidden_units=29,
    input_dim=29,
    metrics=['accuracy'],
)

# Train to reconstruct the input, then report the metrics on the test split.
fitAModel(
    model, X_train, X_train,
    validation_data=(X_train, X_train),
    epochs=epochs, batch_size=batch_size, verbose=0,
)
calculateAvgResults(model, X_test, y_test, num=10)

Mean average precision over 10 runs:  0.24286990977225026
Coefficient of variation over 10 runs:  1.1428165655291152e-16
---------
Mean average anomaly scores over 10 runs:  0.0027946525229334135
Coefficient of variation over 10 runs:  4.810012783594438
Mean average roc area over 10 runs:  0.9462006690943541
Coefficient of variation over 10 runs:  0.0


In [None]:
### No good at all!!!