# Code for Jupyter Notebook demo (July-2018): 

## Building a deep learning model for fraud detection

## Prerequisites:
Spin up a Data Science Virtual Machine via the Azure portal
https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/provision-vm

- Preconfigured Virtual Machines: https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/ 

- Download the credit card dataset and upload it to Azure Blob storage (https://azure.microsoft.com/en-us/services/storage/blobs/). Note down the blob credentials; the code below uses them for authentication.

## References: 

- Data Source: https://www.kaggle.com/mlg-ulb/creditcardfraud

- Blog Post by Venelin Valkov: https://medium.com/@curiousily/credit-card-fraud-detection-using-autoencoders-in-keras-tensorflow-for-hackers-part-vii-20e0c85301bd

- Deep Learning Book by Ian Goodfellow, Yoshua Bengio, Aaron Courville: http://www.deeplearningbook.org/ 

## Environment setup

In [None]:
# Import necessary components
import os
import keras
import shutil
import json

In [None]:
import re
import pandas as pd
import numpy as np
import datetime

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from math import ceil

In [None]:
import pickle
from scipy import stats
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

In [None]:
# Record the notebook's current working directory; it is reused later under the name aml_dir.
import os
cwd = os.getcwd()
# Bare expression so the notebook shows the path as the cell output.
cwd

In [None]:
# Bind plt to the pyplot interface: the plotting calls in this notebook
# (plt.title, plt.plot, plt.xlabel, ...) are pyplot functions and do not exist
# on the top-level matplotlib package.
from matplotlib import pyplot as plt

In [None]:
%matplotlib inline

In [None]:
import glob
import os

from azure.storage.blob import BlockBlobService
from azure.storage.blob import PublicAccess

Enter the credentials to access the data from the cloud and then download the file for analysis.

In [None]:
# Azure blob credentials to read data.
# Prefer environment variables so real keys never have to be typed into (and
# saved with) the notebook; the '****' placeholders stay as the fallback, so
# editing the values by hand still works as before.
storage_account = os.environ.get('AZURE_STORAGE_ACCOUNT', '****')
storage_key = os.environ.get('AZURE_STORAGE_KEY', '****')

# Names of the blob containers.
input_container = 'stratalondon'
output_container = 'modeldeploy'

az_blob_service = BlockBlobService(account_name=storage_account, account_key=storage_key)

In [None]:
# Look through the blobs of the input container for the credit card data file.
blob_service = BlockBlobService(account_name=storage_account, account_key=storage_key)
input_container_folder = 'stratalondon/'
generator = blob_service.list_blobs(input_container_folder)

# Reset first so a stale value from an earlier run of this cell cannot be
# picked up by the download step when nothing matches this time.
fname = None
for blob in generator:
    if "creditcard" in blob.name:
        print(blob.name)
        # If several blobs match, the last one listed is kept.
        fname = blob.name

# Fail here with a clear message instead of later with an undefined/None name.
if fname is None:
    raise FileNotFoundError(
        "No blob with 'creditcard' in its name was found in " + input_container_folder)

In [None]:
# Keep the working directory available under the name aml_dir.
aml_dir = cwd

# Download the selected blob to a fixed local path in the notebooks folder.
my_service = BlockBlobService(
    account_name=storage_account,
    account_key=storage_key,
)
my_service.get_blob_to_path(
    container_name='stratalondon',
    blob_name=fname,
    file_path='C://dsvm//notebooks/creditcard.csv',
)

## Import the Credit card data set

In [None]:
# Show the working directory held in aml_dir. Note that the data file itself
# is downloaded to a fixed path, not to this directory.
aml_dir

In [None]:
# Ingest the dataset from the local CSV copy into a DataFrame
cc = pd.read_csv('C://dsvm//notebooks/creditcard.csv')

After data ingestion from Blob, check to see the various columns and number of rows/columns of the dataset.

In [None]:
# Check sample data: display the first row together with the column headers
cc.head(1)

In [None]:
# Check the number of rows/columns; shape is a (rows, columns) tuple
cc.shape

Now that the data is properly imported, check the descriptive statistics of the columns in the dataset.

In [None]:
# Check data statistics: print the per-column summary table built by describe()
print(cc.describe())

Here we visualize and assess the distribution of the variable 'Class'. This variable indicates whether a transaction was fraudulent or normal.

In [None]:
from matplotlib import pyplot as plt 

In [None]:
# The 'Class' column labels each entry as fraud/non-fraud; plot how many rows
# fall into each class.
# Series.value_counts is used instead of the top-level pd.value_counts, which
# newer pandas releases deprecate.
class_freq = cc['Class'].value_counts(sort=True)
class_freq.plot(kind='bar', rot=0)
plt.title("Class Frequency")
plt.xlabel("Class")
plt.ylabel("Frequency");

In [None]:
# Split the transactions by label: Class == 1 is fraud, Class == 0 is normal.
fraud_mask = cc.Class == 1
normal_mask = cc.Class == 0
fraud = cc[fraud_mask]
normal = cc[normal_mask]

n_fraud = len(fraud)
n_normal = len(normal)

print("Number of Fraud transactions:")
print(fraud.shape)
print("Number of Non-Fraud transactions:")
print(normal.shape)

# Share of fraud rows among the fraud + normal rows, as a percentage.
print("% of Fraud transactions:")
prop = (n_fraud / (n_fraud + n_normal)) * 100
print(prop)

Check to see how the fraud/normal transactions vary in terms of variable 'Amount'.

In [None]:
# Summary statistics of 'Amount' for the fraud transactions only
fraud.Amount.describe()

In [None]:
# The same summary of 'Amount' for the normal transactions, to compare against the fraud figures
normal.Amount.describe()

## Modeling 

First exclude the variable 'Time'. Since the spread of the variable 'Amount' is large, this variable is standardized. 

In [None]:
# Build the modeling frame: everything except 'Time', with 'Amount' rescaled
# to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler

data = cc.drop(columns=['Time'])
# The scaler expects a 2-D input, hence the reshape to a single column.
amount_column = data['Amount'].values.reshape(-1, 1)
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split that follows — confirm this is acceptable.
data['Amount'] = StandardScaler().fit_transform(amount_column)

The next step is to split the data into train and test sets.

In [None]:
# 70/30 train/test split with a fixed seed; the 'Class' label is removed from
# both feature matrices and kept only for the test rows (y_test).
train_df, test_df = train_test_split(data, test_size=0.3, random_state=123)
y_test = test_df['Class']

# NOTE(review): rows of both classes remain in X_train — confirm that
# training on the fraud rows as well is intended.
X_train = train_df.drop(columns=['Class']).values
X_test = test_df.drop(columns=['Class']).values

for label, matrix in (("X_train:", X_train), ("X_test:", X_test)):
    print(label)
    print(matrix.shape)

In [None]:
# Inspect one feature vector of the test set (the row at index 1).
X_test[1]

Define the framework for the autoencoder and then compile and fit using the training data.

In [None]:
# Define the encoder/decoder framework
input_dim = X_train.shape[1]  # one input unit per feature column
encoding_dim = 14

input_layer = Input(shape=(input_dim,))

# First hidden layer: encoding_dim tanh units with an L1 activity penalty
# (10e-5, i.e. 1e-4).
activity_penalty = regularizers.l1(10e-5)
encoder = Dense(
    encoding_dim,
    activation="tanh",
    activity_regularizer=activity_penalty,
)(input_layer)

# Half-width tanh layer, followed by a ReLU layer back at the input width.
# NOTE(review): a ReLU output cannot emit negative values, while the
# standardized 'Amount' feature contains them — confirm this output
# activation is intended.
half_dim = encoding_dim // 2
decoder = Dense(half_dim, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

In [None]:
# Compile and fit the autoencoder (input and target are the same matrix).
nb_epoch = 5
batch_size = 32
autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# Save the model to model.h5 whenever the monitored quantity improves
# (ModelCheckpoint monitors the validation loss by default).
checkpointer = ModelCheckpoint(filepath="model.h5", verbose=0, save_best_only=True)

# The checkpoint callback only takes effect when it is handed to fit();
# without it model.h5 is never written.
history = autoencoder.fit(
    X_train, X_train,
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_test, X_test),
    callbacks=[checkpointer],
    verbose=1,
).history

In [None]:
# Reconstruct the test rows and score each one by its mean squared
# reconstruction error across the feature axis.
predictions = autoencoder.predict(X_test)
squared_residuals = np.square(X_test - predictions)
mse = squared_residuals.mean(axis=1)

In [None]:
# Pair each test row's reconstruction error with its true label, then
# summarise both columns.
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})
error_df.describe()

The model can be assessed using various metrics (ROC curve, precision, recall), depending on the business requirement.

In [None]:
# Metrics
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

In [None]:
# ROC curve, using the reconstruction error as the score for the true class.
fpr, tpr, thresholds = roc_curve(
    y_true=error_df['true_class'],
    y_score=error_df['reconstruction_error'],
)
roc_auc = auc(fpr, tpr)

auc_label = 'AUC = %0.2f' % roc_auc
plt.title('ROC curve')
plt.plot(fpr, tpr, label=auc_label)
plt.legend(loc='lower right')
# Diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], 'o--')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.show()

In [None]:
# Precision-recall curve over the reconstruction-error thresholds.
precision, recall, th = precision_recall_curve(
    error_df['true_class'],
    error_df['reconstruction_error'],
)

plt.title('Recall vs Precision')
plt.plot(recall, precision, 'b', label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

## Saving the model

In [None]:
# Display the trained model object before it is saved.
autoencoder

In [None]:
# Save the model for operationalization: https://machinelearningmastery.com/save-load-keras-deep-learning-models/
from keras.models import model_from_json
import os
import h5py
from sklearn import datasets

# The architecture is written as JSON and the weights as HDF5, as two files.
architecture_path = "C://dsvm//notebooks/autoencoder.json"
weights_path = "C://dsvm//notebooks/autoencoder.h5"

model_json = autoencoder.to_json()
with open(architecture_path, "w") as json_file:
    json_file.write(model_json)

autoencoder.save_weights(weights_path)
print("Model saved")

In [None]:
# Rebuild the model from its saved JSON architecture.
# The context manager closes the file even if reading it fails.
with open('C://dsvm//notebooks/autoencoder.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the saved weights into the rebuilt model.
loaded_model.load_weights("C://dsvm//notebooks/autoencoder.h5")
print("Model loaded")

In [None]:
# Run the reloaded model on the test set and print the shape of its output.
score = loaded_model.predict(X_test)
print(score.shape)

In [None]:
# Output of the reloaded model for the test row at index 1.
score[1]

In [None]:
# The input row at index 1, for comparison with the model output for that row.
X_test[1]