# [ LG전자 H&A DX Intensive Course - Auto-Encoder for Anomaly Detection ]

Auto-Encoder를 활용한 tabular anomaly detection

In [None]:
# Install the gdown CLI, then use it to download a file from Google Drive by its file id.
# NOTE(review): presumably this fetches the creditcard.csv file read later in the notebook — confirm.
!pip install gdown
!gdown https://drive.google.com/uc?id=1e541AXa81DqeD-XpPhNWnWlewo8yjbOa

# Import modules

In [None]:
import numpy as np
import pandas as pd
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam, SGD


from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn theme for the plots below: "ticks" style with the
# right and top axes spines switched off.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

# Functions

In [None]:
def torch_seed(random_seed):
    """Seed every random number generator used in this notebook.

    Seeds the PyTorch CPU and CUDA generators, sets the cuDNN
    ``deterministic`` flag on and the ``benchmark`` flag off, seeds NumPy
    and the stdlib ``random`` module, and stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        random_seed: integer seed applied to all of the generators above.
    """
    # PyTorch generators; manual_seed_all covers the multi-GPU case
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(random_seed)

    # CUDA randomness
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NumPy / stdlib generators and the hash seed variable
    np.random.seed(random_seed)
    random.seed(random_seed)
    os.environ['PYTHONHASHSEED'] = str(random_seed)


def train(
    model, dataloader, criterion, optimizer, log_interval: int, device: str) -> float:
    """Train `model` for one pass over `dataloader` on a reconstruction loss.

    Args:
        model: network called as ``model(inputs)``; put into train mode here.
        dataloader: iterable of ``(inputs, targets)`` batches that supports
            ``len()``; the targets are ignored.
        criterion: element-wise loss called as ``criterion(inputs, outputs)``;
            its result is averaged with ``.mean()``.
        optimizer: optimizer holding the parameters of `model`.
        log_interval: print the running mean loss every `log_interval`
            batches (and on the last batch).
        device: device the inputs are moved to.

    Returns:
        Mean of the per-batch losses over the whole pass.
    """
    total_loss = []

    model.train()
    for i, (inputs, _) in enumerate(dataloader):

        # convert device
        inputs = inputs.to(device)

        # clear gradients BEFORE the backward pass so that anything already
        # stored on the parameters (e.g. from an earlier backward call made
        # outside this function) cannot leak into this update
        optimizer.zero_grad()

        # model outputs
        outputs = model(inputs)

        # loss
        loss = criterion(inputs, outputs).mean()
        total_loss.append(loss.item())

        # calculate gradients
        loss.backward()

        # update model weights
        optimizer.step()

        # log learning history
        if i % log_interval == 0 or (i+1) == len(dataloader):
            print(f"{'TRAIN':5s} [{i+1:5d}/{len(dataloader):5d}] loss: {np.mean(total_loss):.4f}")

    # average loss
    avg_loss = np.mean(total_loss)

    return avg_loss

def test(
    model, dataloader, criterion, log_interval: int, device: str) -> tuple:
    """Score every sample of `dataloader` and compute the AUROC of the scores.

    The per-sample anomaly score is the maximum of the element-wise
    ``criterion(inputs, outputs)`` values along the last dimension.

    Args:
        model: network called as ``model(inputs)``; put into eval mode here.
        dataloader: iterable of ``(inputs, targets)`` batches that supports
            ``len()``.
        criterion: element-wise loss called as ``criterion(inputs, outputs)``.
        log_interval: print the running mean score every `log_interval`
            batches (and on the last batch).
        device: device the inputs are moved to.

    Returns:
        ``(auroc, total_inputs, total_outputs, total_loss)`` where the inputs
        and outputs are the per-batch arrays concatenated along axis 0 and
        ``total_loss`` is the flat array of per-sample scores. ``auroc`` is
        ``1.`` when the targets sum to zero.
    """
    # for auroc
    total_loss = []
    total_inputs = []
    total_targets = []
    total_outputs = []

    # NOTE(review): torch_seed reseeds the global torch / NumPy / random
    # generators, so every call resets the RNG state seen by whatever runs
    # next (presumably including the next epoch's shuffling) — confirm that
    # this fixed reseed is intended.
    torch_seed(223)
    model.eval()

    with torch.no_grad():
        for i, (inputs, targets) in enumerate(dataloader):
            # get inputs and targets; whole batches are appended so that the
            # concatenation below keeps the sample axis instead of flattening
            # every row into one long vector
            total_inputs.append(inputs.numpy())
            total_targets.extend(targets.numpy())

            # convert device
            inputs = inputs.to(device)

            # model outputs
            outputs = model(inputs)
            total_outputs.append(outputs.cpu().numpy())

            # loss: largest element-wise value per sample
            loss = criterion(inputs, outputs).max(dim=-1)[0]
            total_loss.extend(loss.cpu().numpy())

            # log learning history
            if i % log_interval == 0 or (i+1) == len(dataloader):
                print(f"{'TEST':5s} [{i+1:5d}/{len(dataloader):5d}] loss: {np.mean(total_loss):.4f}")

    # total inputs, outputs, targets and loss
    total_inputs = np.concatenate(total_inputs, axis=0)
    total_outputs = np.concatenate(total_outputs, axis=0)
    total_targets = np.array(total_targets).reshape(-1)
    total_loss = np.array(total_loss).reshape(-1)

    # auroc
    if total_targets.sum() == 0:
        auroc = 1.
    else:
        auroc = roc_auc_score(total_targets, total_loss)

    # return
    return auroc, total_inputs, total_outputs, total_loss


def fit(
    model, trainloader, testloader, criterion, optimizer,
    epochs: int, log_interval: int, device: str) -> tuple:
    """Alternate one training pass and one evaluation pass for `epochs` epochs.

    Args:
        model: network to optimise.
        trainloader: loader passed to ``train``.
        testloader: loader passed to ``test``.
        criterion: loss forwarded to both ``train`` and ``test``.
        optimizer: optimizer forwarded to ``train``.
        epochs: number of train/test rounds.
        log_interval: logging period forwarded to ``train`` and ``test``.
        device: device forwarded to ``train`` and ``test``.

    Returns:
        ``(train_history, test_history_auroc)``: two lists with one entry per
        epoch — the value returned by ``train`` and the AUROC returned by
        ``test``.
    """
    train_history = []
    test_history_auroc = []

    # fitting model
    for epoch in range(1, epochs + 1):
        print(f'\nEpoch: [{epoch}/{epochs}]')
        train_loss = train(
            model        = model,
            dataloader   = trainloader,
            criterion    = criterion,
            optimizer    = optimizer,
            log_interval = log_interval,
            device       = device
        )

        # test() also returns the inputs, outputs and per-sample scores;
        # only the AUROC is tracked per epoch
        test_auroc, *_ = test(
            model        = model,
            dataloader   = testloader,
            criterion    = criterion,
            log_interval = log_interval,
            device       = device
        )

        print(f'\nTest AUROC: {test_auroc:.4f}')

        train_history.append(train_loss)
        test_history_auroc.append(test_auroc)

    return train_history, test_history_auroc


def figure(
    all_train_history: list, all_test_history_auroc: list, all_exp_name: list) -> None:
    """Plot train-loss and test-AUROC curves for one or more experiments.

    Args:
        all_train_history: one sequence of per-epoch train losses per
            experiment; the sequences may have different lengths.
        all_test_history_auroc: one sequence of per-epoch AUROC values per
            experiment.
        all_exp_name: legend label of each experiment, in the same order.
    """
    fig, ax = plt.subplots(1, 2, figsize=(10,4))

    # train line plot
    for train_h, exp_name in zip(all_train_history, all_exp_name):
        sns.lineplot(
            x     = range(1, len(train_h)+1),
            y     = train_h,
            label = exp_name,
            ax    = ax[0]
        )

    # test AUROC lineplot
    for test_h, exp_name in zip(all_test_history_auroc, all_exp_name):
        sns.lineplot(
            x     = range(1, len(test_h)+1),
            y     = test_h,
            label = exp_name,
            ax    = ax[1]
        )

    # set y axis label
    ax[0].set_ylabel('MSE Loss')
    ax[1].set_ylabel('AUROC')

    # set x axis label
    ax[0].set_xlabel('Epochs')
    ax[1].set_xlabel('Epochs')

    # set title
    ax[0].set_title('Train loss history')
    ax[1].set_title('Test AUROC history')

    # set y value limit; take the maximum history by history so experiments
    # trained for different numbers of epochs (a ragged list) still work
    max_train = max(np.max(train_h) for train_h in all_train_history)

    ax[0].set_ylim(0, max_train+0.01)
    ax[1].set_ylim(0, 1)

    # set legend
    ax[0].legend(loc='upper left')
    ax[1].legend(loc='upper left')
    plt.tight_layout()
    plt.show()

# Configuration for experiments

In [None]:
class Config:
    """Experiment settings shared by the cells of this notebook."""

    # dataset parameters
    datapath = './creditcard.csv'

    # training parameters
    epochs = 15
    batch_size = 512
    test_batch_size = 128
    learning_rate = 0.001
    num_workers = 2
    log_interval = 2000

    # device: use the GPU when one is available, otherwise fall back to the
    # CPU so the notebook also runs on machines without CUDA
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # seed
    seed = 223

cfg = Config()

# Load dataset and dataloader

**Feature Description**
- **Time**: Number of seconds elapsed between this transaction and the first transaction in the dataset
- **V{ID}**: PCA results
- **Amount**: Transaction amount
- **Class**: 1 for fraudulent transactions, 0 otherwise

In [None]:
# Read the CSV at cfg.datapath into a DataFrame, print its shape and
# show the first rows as the cell output.
df = pd.read_csv(cfg.datapath)
print('df.shape: ',df.shape)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum(axis=0)

In [None]:
# Drop every row that contains at least one missing value, then print the new shape.
df = df.dropna()
print('df.shape: ',df.shape)

In [None]:
# Target distribution: absolute counts of each Class value side by side
# with their relative frequencies.
pd.concat([df['Class'].value_counts(), df['Class'].value_counts(normalize=True)], axis=1)

## Split dataset into train and test dataset

In [None]:
# Split the index labels of the Class == 0 rows 90/10 and keep the 90% part
# as the training indices; the held-out 10% part is discarded here.
train_idx, _ = train_test_split(df[df['Class']==0].index.values, test_size=0.1, random_state=cfg.seed)

In [None]:
# train_idx holds index *labels* (it was taken from df.index), so the rows
# must be selected by label with .loc. Positional .iloc only gives the same
# rows while the index is still 0..n-1, which no longer holds once dropna()
# has removed rows. The test set is every row whose label is not in train_idx.
df_train = df.loc[train_idx, :]
df_test = df.drop(train_idx, axis=0)

# features are all columns except 'Class'; 'Class' is the label
X_train = df_train.drop('Class', axis=1).values
y_train = df_train['Class'].values

X_test = df_test.drop('Class', axis=1).values
y_test = df_test['Class'].values

print('X_train.shape: ',X_train.shape)
print('y_train.shape: ',y_train.shape)
print('X_test.shape: ',X_test.shape)
print('y_test.shape: ',y_test.shape)

## Scaling

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted transform to the test features.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap the scaled features and the labels as tensors in (features, label) datasets.
trainset = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train))
testset = TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test))

# Only the training loader shuffles; the test loader is created with shuffle=False.
trainloader = DataLoader(trainset, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers)
testloader = DataLoader(testset, batch_size=cfg.test_batch_size, shuffle=False, num_workers=cfg.num_workers)

# Auto-Encoder

In [None]:
class AutoEncoder(nn.Module):
    """Fully connected auto-encoder with a sigmoid reconstruction output.

    The encoder stacks Linear+ReLU layers through the sizes
    ``[input_dim, *dims]``; the decoder mirrors them in reverse order and is
    followed by a final ``input_dim -> input_dim`` linear layer and a sigmoid.

    Args:
        input_dim: number of input features.
        dims: hidden layer sizes of the encoder, from first to last
            (any sequence, e.g. a list or a tuple).
    """

    def __init__(self, input_dim: int, dims: list):
        super().__init__()

        # unpacking instead of list concatenation also accepts tuples
        dims = [input_dim, *dims]

        self.enc = nn.Sequential(*self.build_layer(dims=dims))
        self.dec = nn.Sequential(*self.build_layer(dims=dims[::-1], up=True))
        self.output = nn.Linear(in_features=input_dim, out_features=input_dim)

    def build_layer(self, dims, up=False):
        """Return a flat list of Linear+ReLU modules for consecutive sizes in `dims`.

        `up` is accepted for backward compatibility; the encoder and decoder
        directions are built from the same Linear+ReLU pair.
        """
        layer = []

        for in_features, out_features in zip(dims[:-1], dims[1:]):
            layer.extend([
                nn.Linear(
                    in_features  = in_features,
                    out_features = out_features,
                ),
                nn.ReLU(),
            ])

        return layer

    def encoder(self, x):
        """Map inputs to the latent representation."""
        out = self.enc(x)

        return out

    def decoder(self, out):
        """Map latent codes back to input space; sigmoid keeps values in [0, 1]."""
        out = self.dec(out)
        out = self.output(out)
        out = F.sigmoid(out)

        return out

    def forward(self, x):
        """Encode then decode `x` and return the reconstruction."""
        out = self.encoder(x)
        out = self.decoder(out)

        return out

In [None]:
# Seed first, then build the auto-encoder (hidden sizes 64 -> 32 -> 16)
# and move it to the configured device.
torch_seed(cfg.seed)
ae = AutoEncoder(input_dim=X_train.shape[1], dims=[64, 32, 16]).to(cfg.device)
print('load Auto-Encoder')
print(
    'The number of model parameters: ',
    sum(p.numel() for p in ae.parameters()),
)

# Element-wise MSE (reduction='none'): callers reduce it themselves.
criterion = nn.MSELoss(reduction='none')
optimizer = Adam(ae.parameters(), lr=cfg.learning_rate)

In [None]:
# Show the model's module structure as the cell output.
ae

In [None]:
# Sanity check: take one batch from the training loader, run it through the
# model and print the input and output shapes.
inputs, targets = next(iter(trainloader))
inputs = inputs.to(cfg.device)
print('inputs.shape: ',inputs.shape)

outputs = ae(inputs)
print('outputs.shape: ',outputs.shape)

In [None]:
# Reseed, then train and evaluate for cfg.epochs epochs; keeps the per-epoch
# train loss and test AUROC histories.
torch_seed(cfg.seed)
train_history, test_history_auroc = fit(
    model        = ae,
    trainloader  = trainloader,
    testloader   = testloader,
    criterion    = criterion,
    optimizer    = optimizer,
    epochs       = cfg.epochs,
    log_interval = cfg.log_interval,
    device       = cfg.device
)

In [None]:
# figure() takes one history per experiment, so wrap this single run in
# one-element lists and label it 'AE'.
all_train_history = [train_history]
all_test_history_auroc = [test_history_auroc]
all_exp_name = ['AE']

figure(
    all_train_history      = all_train_history,
    all_test_history_auroc = all_test_history_auroc,
    all_exp_name           = all_exp_name
)

In [None]:
# Evaluate the model on the test loader once more to get the AUROC together
# with the inputs, reconstructions and per-sample scores (total_loss).
test_auroc, total_inputs, total_outputs, total_loss = test(
    model        = ae,
    dataloader   = testloader,
    criterion    = criterion,
    log_interval = cfg.log_interval,
    device       = cfg.device
)

In [None]:
# Report the AUROC as a percentage with two decimals.
print('TEST AUROC: {:.2%}'.format(test_auroc))

In [None]:
def minmax(x):
    """Rescale `x` linearly so its minimum maps to 0 and its maximum to 1.

    When all values are equal the range is zero; in that case the shifted
    values (all zeros) are returned instead of dividing by zero.
    """
    lowest = x.min()
    span = x.max() - lowest

    if span == 0:
        return x - lowest

    return (x - lowest) / span

# Attach the min-max scaled per-sample scores to the test rows.
df_test['pred'] = minmax(total_loss)

# Box plot of the scaled anomaly score for each Class value
# (tick label typo 'Abnomral' corrected to 'Abnormal').
sns.boxplot(x='Class', y='pred', hue='Class', data=df_test)
plt.title('Anomaly Score Distribution')
plt.xticks([0,1], ['Normal','Abnormal'])
plt.ylabel('Anomaly Score')
plt.show()