In [None]:
import pandas as pd
import numpy as np
import pickle

from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)
# Set the theme of graph (style), color (palette), font scale 
sns.set(style='darkgrid', palette='muted', font_scale=1.5)
# figure size in inches
rcParams['figure.figsize'] = 14, 8

RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

# Seed every random number generator the notebook relies on, so that weight
# initialisation and minibatch shuffling are repeatable on a fresh
# "Restart & Run All" (previously only train_test_split used RANDOM_SEED).
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)



In [None]:
# Load the raw transactions table from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/creditcard.csv')

In [None]:
# Peek at the first five rows (five is the default for head()).
df.head()

In [None]:
# Print the (rows, columns) tuple followed by per-column summary statistics.
print(df.shape, df.describe(), sep='\n')

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

In [None]:
# Count transactions per class. The counts are ordered by class value (0, 1)
# rather than by frequency, so the positional tick labels in LABELS always
# line up with the right bar. (Series.value_counts replaces the deprecated
# top-level pd.value_counts.)
count_classes = df['Class'].value_counts().sort_index()
count_classes.plot(kind = 'bar', rot=0, color="g")
plt.title("Normal vs Fraudulent Transactions")
plt.xticks(range(2), LABELS)
plt.xlabel("Transaction Class")
plt.ylabel("Frequency")



In [None]:
# Partition the transactions by label: Class == 1 is fraud, Class == 0 is normal.
fraudsDF = df.loc[df['Class'] == 1]
normalDF = df.loc[df['Class'] == 0]

In [None]:
# (rows, columns) of the Class == 1 subset.
fraudsDF.shape

In [None]:
# (rows, columns) of the Class == 0 subset.
normalDF.shape

In [None]:
# Summary statistics of the transaction amount for the fraud subset.
fraudsDF['Amount'].describe()

In [None]:
# Summary statistics of the transaction amount for the normal subset.
normalDF['Amount'].describe()

Plot two histograms (fraud and normal) showing the number of transactions per amount bin, for amounts ranging from 0 to 20,000.

In [None]:
# Two stacked panels sharing the x-axis: fraud on top, normal below.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')

bins = 50

ax1.set_title('Fraud')
ax1.hist(fraudsDF['Amount'], bins=bins)

ax2.set_title('Normal')
ax2.hist(normalDF['Amount'], bins=bins)

# The bottom panel is the current axes after plt.subplots, so the axis
# labels, x-limits and log scale are applied to it explicitly (the x-limits
# propagate to the top panel through sharex).
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('# of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')
plt.show()

In [None]:

# Distribution of transaction times for each class (fraud on top, normal below).
f, (ax1, ax2) = plt.subplots(2,1, sharex=True)
f.suptitle('Time per transaction by class')

# A single bin collapses the whole distribution into one bar; use enough bins
# to actually show how transactions are spread over time.
bins = 50

ax1.hist(fraudsDF.Time, bins=bins)
ax1.set_title('Fraud')

ax2.hist(normalDF.Time, bins=bins)
ax2.set_title('Normal')

# Time is measured in seconds, not dollars. The x-range is left to autoscale
# so the full recorded time span is visible instead of only the first 1000 s.
plt.xlabel('Time (in Seconds)')
plt.ylabel('# of Transactions')
plt.yscale('log')
plt.show()

In [None]:
# Scatter of amount against time, one panel per class, sharing the x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

ax1.set_title('Fraud')
ax1.scatter(fraudsDF['Time'], fraudsDF['Amount'])

ax2.set_title('Normal')
ax2.scatter(normalDF['Time'], normalDF['Amount'])

# Axis labels go on the bottom panel (the current axes after plt.subplots).
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

In [None]:
# Work on a copy of the data without the Time column; df itself is untouched.
transactionData = df.drop(columns=['Time'])
print(transactionData)

From the scikit-learn source, `class StandardScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator)`:

> Standardize features by removing the mean and scaling to unit variance.
> The standard score of a sample `x` is calculated as:
>
> $$z = \frac{x - u}{s}$$
>
> where `u` is the mean of the training samples or zero if `with_mean=False`,
> and `s` is the standard deviation of the training samples or one if
> `with_std=False`.

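Why standardize? A feature measured on a much larger scale than the others dominates any variance- or squared-error-based objective. This is the same reason an analysis based on the covariance matrix differs from one based on the correlation matrix. Scaling `Amount` to zero mean and unit variance keeps it from dominating the reconstruction loss.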





In [None]:
# Standardize Amount to zero mean / unit variance. StandardScaler expects a
# 2-D input, hence the single-column selection.
amount_2d = transactionData[['Amount']].to_numpy()
transactionData['Amount'] = StandardScaler().fit_transform(amount_2d)
print(transactionData['Amount'].describe())

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
train_frame, test_frame = train_test_split(
    transactionData, test_size=0.2, random_state=RANDOM_SEED
)

# The training set keeps only Class == 0 rows; the label column is then dropped.
X_train = train_frame.loc[train_frame['Class'] == 0].drop(columns=['Class'])
print(type(X_train))

# The test set keeps both classes; labels are split off for later scoring.
y_test = test_frame['Class'].to_numpy()
X_test = test_frame.drop(columns=['Class']).to_numpy()

# Convert the training features to a plain ndarray as well.
X_train = X_train.to_numpy()
print(type(X_train))
print(y_test.size)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder that compresses and reconstructs a feature vector.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim``; the decoder maps
    ``latent_dim -> latent_dim -> input_dim``. With the default arguments this
    is the original 29 -> 14 -> 7 -> 7 -> 29 network, and the parameter names
    in ``state_dict()`` are unchanged.

    Args:
        input_dim: Number of features per sample (size of input and output).
        hidden_dim: Width of the intermediate encoder layer.
        latent_dim: Size of the bottleneck representation.
    """

    def __init__(self, input_dim=29, hidden_dim=14, latent_dim=7):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, latent_dim),
            nn.LeakyReLU(),
        )

        # NOTE(review): the final LeakyReLU scales negative outputs by its
        # negative slope (0.01 by default), which makes negative target values
        # harder to reconstruct. Consider a linear output layer if the inputs
        # are zero-centred -- confirm before changing, as it alters the model.
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, latent_dim),
            nn.Tanh(),
            nn.Linear(latent_dim, input_dim),
            nn.LeakyReLU(),
        )

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Reconstruction tensor with the same shape as ``x``.
        """
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [None]:
# Instantiate the network with double-precision parameters on the CPU.
model = Autoencoder().to(device='cpu', dtype=torch.float64)

In [None]:


# Training hyperparameters: epoch count, minibatch size, Adam learning rate.
num_epochs, minibatch_size, learning_rate = 100, 32, 1e-3



In [None]:
# Shuffled minibatches over the training feature array.
train_loader = DataLoader(dataset=X_train, shuffle=True, batch_size=minibatch_size)

In [None]:
# One sample per batch, in the original order, so results align with y_test.
test_loader = DataLoader(dataset=X_test, shuffle=False, batch_size=1)

In [None]:
# Reconstruction objective: mean squared error between output and input.
criterion = nn.MSELoss()
# Adam with a small L2 penalty (weight_decay of 1e-4, i.e. the original 10e-05).
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=1e-4,
)

In [None]:
# Per-epoch loss curves, appended to during training.
history = {'train_loss': [], 'test_loss': []}

In [None]:
# Train the autoencoder to reproduce its input: the loss for each minibatch is
# the MSE between the batch and its reconstruction.
model.train()
for epoch in range(num_epochs):
    # Minibatch losses for this epoch. A list append is O(1), whereas
    # np.append copies the whole array on every step.
    epoch_losses = []
    for batch in train_loader:
        # The DataLoader already yields tensors; the deprecated Variable
        # wrapper is not needed for autograd.
        # ===================forward=====================
        reconstruction = model(batch)
        loss = criterion(reconstruction, batch)
        epoch_losses.append(loss.item())

        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    mean_loss = np.mean(epoch_losses)
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, mean_loss))
    history['train_loss'].append(mean_loss)


# Persist only the learned parameters (state_dict), not the whole module object.
torch.save(model.state_dict(), './credit_card_model.pth')

In [None]:

# Training loss per epoch. The y-range is left to autoscale so the curve
# stays visible whatever values the loss takes; the x-range follows the number
# of recorded epochs instead of a hard-coded count.
plt.plot(history['train_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.xlim(0, len(history['train_loss']))

plt.show()

In [None]:


# Score every test sample by its reconstruction error (MSE between the sample
# and the autoencoder output).
pred_losses = {'pred_loss' : []}
model.eval()
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs)
        # Average the squared error over the feature axis only, which yields
        # one score per row. This matches the batch-mean MSE when
        # batch_size == 1 and stays correct if the loader's batch size is raised.
        per_sample_mse = F.mse_loss(outputs, inputs, reduction='none').mean(dim=1)
        pred_losses['pred_loss'].extend(per_sample_mse.tolist())
reconstructionErrorDF = pd.DataFrame(pred_losses)
# test_loader does not shuffle, so the rows are in the same order as y_test.
reconstructionErrorDF['Class'] = y_test



In [None]:
# Summary statistics for the columns of the reconstruction-error table
# (pred_loss and Class).
reconstructionErrorDF.describe()

In [None]:
# Histogram of reconstruction error for normal transactions, restricted to
# errors below 10.
fig, ax = plt.subplots()
normal_error_df = reconstructionErrorDF.query("Class == 0 and pred_loss < 10")
_ = ax.hist(normal_error_df['pred_loss'].to_numpy(), bins=10)

In [None]:
# Histogram of reconstruction error for fraudulent transactions.
fig, ax = plt.subplots()
fraud_error_df = reconstructionErrorDF.loc[reconstructionErrorDF['Class'] == 1]
_ = ax.hist(fraud_error_df['pred_loss'].to_numpy(), bins=10)

In [None]:
# Use the reconstruction error as the anomaly score: a higher error is taken
# as stronger evidence for the positive (Class == 1) label.
fpr, tpr, thresholds = roc_curve(reconstructionErrorDF.Class, reconstructionErrorDF.pred_loss)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = %0.4f'% roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()