## Import & Data Load

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import seaborn as sns

from sklearn.preprocessing import Normalizer
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import torch.utils.data as data_utils

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(41)

In [4]:
train_df = pd.read_csv('open/train.csv')
train_df = train_df.drop(columns=['ID'])
val_df = pd.read_csv('open/val.csv')
val_df = val_df.drop(columns=['ID'])

new_train_df = train_df[['V2','V3','V4','V7','V9','V10','V11','V12','V14','V16','V17','V18']]
new_val_df = val_df[['V2','V3','V4','V7','V9','V10','V11','V12','V14','V16','V17','V18','Class']]

## Method 1. GANomaly와 EllipticEnvelope를 이용한 Ensemble

## GANomaly

*   GANomaly는 semi-supervised 이상치 탐지에 사용되는 모델입니다

*   기존의 AnoGAN에 비해 Encoder 부분이 추가 됨으로써 latent vector를 두 번 생성하여 loss를 한 번 더 계산해주어 정확도를 높인다는 점이 장점이라 할 수 있습니다다



In [5]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [6]:
LR = 1e-2
batch_size = 16384

In [7]:
class MyDataset(Dataset):
    def __init__(self, df, eval_mode):
        self.df = df
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = self.df['Class'].values
            self.df = self.df.drop(columns=['Class']).values
        else:
            self.df = self.df.values
        
    def __getitem__(self, index):
        if self.eval_mode:
            self.x = self.df[index]
            self.y = self.labels[index]
            return torch.Tensor(self.x), self.y
        else:
            self.x = self.df[index]
            return torch.Tensor(self.x)
        
    def __len__(self):
        return len(self.df)

In [8]:
train_dataset = MyDataset(df=train_df, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = MyDataset(df = val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [9]:
class GANnomaly(object):

    def __init__(self):
        self.encoder = Encoder().to(device)
        self.decoder = Decoder().to(device)
        self.discriminator = Discriminator().to(device)
        
        self.models = [self.encoder, self.decoder, self.discriminator]   
                
        self.params = None
        for idx_m, model in enumerate(self.models):
            if (self.params is None):
                self.params = list(model.parameters())
            else:
                self.params = self.params + list(model.parameters())
                
        self.optimizer = optim.Adam(self.params, lr=0.0001)
        
class Encoder(nn.Module):

    def __init__(self):
        super(Encoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(30,64),
            nn.BatchNorm1d(64),
            nn.ELU(),
            nn.Linear(64,128),
            nn.BatchNorm1d(128),
            nn.ELU(),
            nn.Linear(128,256),
            nn.BatchNorm1d(256),
            nn.ELU(),
        )

    def forward(self, x):

        z_code = self.encoder(x)

        return z_code

class Decoder(nn.Module):

    def __init__(self):
        super(Decoder, self).__init__()

        self.decoder = nn.Sequential(
            nn.Linear(256,128),
            nn.BatchNorm1d(128),
            nn.ELU(),
            nn.Linear(128,64),
            nn.BatchNorm1d(64),
            nn.ELU(),
            nn.Linear(64,30),
        )

    def forward(self, x):

        x_hat = self.decoder(x)

        return x_hat
    
    

class Discriminator(nn.Module):

    def __init__(self):
        super(Discriminator, self).__init__()

        self.dis_dense = nn.ModuleList([
            nn.Linear(30, 64),
            nn.BatchNorm1d(64),
            nn.ELU(),
            nn.Linear(64, 1),
            nn.BatchNorm1d(1),
            nn.Sigmoid(),
        ])
        
    def forward(self, x):

        featurebank = []
        
        for idx, layer in enumerate(self.dis_dense):
            x = layer(x)
            if("torch.nn.modules.activation" in str(type(layer))):
                featurebank.append(x)
        disc_score = x

        return disc_score, featurebank

In [10]:

def loss_enc(z_code, z_code_hat):

    l_enc = torch.sum((z_code - z_code_hat)**2, dim=(1))

    return l_enc

def loss_rec(x, x_hat):
    l_con = torch.sum(torch.abs(x - x_hat), dim=(1))

    return l_con

def loss_adv(dis_x, dis_x_hat, features_real, features_fake):

    l_adv = torch.sum((dis_x - dis_x_hat)**2, dim=(1))


    for fidx, _ in enumerate(features_real):
        
        l_adv += torch.sum((features_real[fidx] - features_fake[fidx])**2, dim=(1))
            
    return l_adv


def loss_gan(z_code, z_code_hat, x, x_hat,
    dis_x, dis_x_hat, features_real, features_fake,
    w_enc=1, w_con=50, w_adv=1):
    

    z_code, z_code_hat, x, x_hat = z_code, z_code_hat, x, x_hat

    for fidx, _ in enumerate(features_real):
        features_real[fidx] = features_real[fidx]
        features_fake[fidx] = features_fake[fidx]
        
    l_enc = loss_enc(z_code, z_code_hat)
    l_con = loss_rec(x, x_hat)
    l_adv = loss_adv(dis_x, dis_x_hat, features_real, features_fake)


    l_tot = torch.mean((w_enc * l_enc) + (w_con * l_con) + (w_adv * l_adv))
    
    l_enc = torch.mean(l_enc)
    l_con = torch.mean(l_con)
    l_adv = torch.mean(l_adv)


    return l_tot, l_enc, l_con, l_adv

In [11]:
epochs = 200
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

model = GANnomaly()
model.encoder.train()
model.decoder.train()
model.discriminator.train()


scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model.optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

for i in range(1,epochs+1) :
        
    train_loss = []
    best_score = 0

    for idx, data in enumerate(train_loader):
        
        x = data.to(device)
        
        z_code = model.encoder(x)
        x_hat = model.decoder(z_code)
        z_code_hat = model.encoder(x_hat)
        
        dis_x, features_real = model.discriminator(x)
        dis_x_hat, features_fake = model.discriminator(x_hat)
        

        l_tot, l_enc, l_con, l_adv = loss_gan(z_code, z_code_hat, 
                                       x, x_hat, 
                                       dis_x, dis_x_hat, 
                                       features_real, features_fake)

        model.optimizer.zero_grad()
        l_tot.backward()
        model.optimizer.step()
        
        train_loss.append(l_tot.item())
    
    #validation
    model.encoder.eval()
    model.decoder.eval()
    
    pred = []
    true = []
    
    
    with torch.no_grad():
        for x, y in iter(val_loader):
            x = x.float().to(device)

            _x = model.decoder(model.encoder(x))
            diff = cos(x, _x).cpu().tolist()
            batch_pred = np.where(np.array(diff) < 0.95, 1,0).tolist()
            pred += batch_pred
            true += y.tolist()
        
        score = f1_score(true, pred, average='macro')
    
    
    print(f'Epoch : [{i}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')


    scheduler.step(score)

    if best_score < score:
        best_score = score
        torch.save(model.encoder.state_dict(), 'best_encoder.pth', _use_new_zipfile_serialization=False)
        torch.save(model.decoder.state_dict(), 'best_decoder.pth', _use_new_zipfile_serialization=False)
    
        

Epoch : [1] Train loss : [1463.4795968191963] Val Score : [0.0010529271374420891])
Epoch : [2] Train loss : [1081.2173723493304] Val Score : [0.0010529271374420891])
Epoch : [3] Train loss : [1047.4806431361608] Val Score : [0.0010529271374420891])
Epoch : [4] Train loss : [1020.5997924804688] Val Score : [0.0010529271374420891])
Epoch : [5] Train loss : [997.6226545061384] Val Score : [0.0010529271374420891])
Epoch : [6] Train loss : [976.8873291015625] Val Score : [0.0010529271374420891])
Epoch : [7] Train loss : [957.3501935686384] Val Score : [0.0010529271374420891])
Epoch : [8] Train loss : [938.3730119977679] Val Score : [0.0010529271374420891])
Epoch : [9] Train loss : [919.4818115234375] Val Score : [0.0010529271374420891])
Epoch : [10] Train loss : [900.511492047991] Val Score : [0.0010529271374420891])
Epoch : [11] Train loss : [881.2325352260044] Val Score : [0.0010529271374420891])
Epoch : [12] Train loss : [861.3802228655134] Val Score : [0.0010529271374420891])
Epoch 0001

Epoch : [102] Train loss : [188.1869179861886] Val Score : [0.5065714272906093])
Epoch : [103] Train loss : [186.20270211356026] Val Score : [0.5070295408296489])
Epoch : [104] Train loss : [184.2627716064453] Val Score : [0.5075664862314123])
Epoch : [105] Train loss : [182.34217834472656] Val Score : [0.5080666648195433])
Epoch : [106] Train loss : [180.48748561314173] Val Score : [0.508625929326738])
Epoch : [107] Train loss : [178.6505584716797] Val Score : [0.5091475667752355])
Epoch : [108] Train loss : [176.80838666643416] Val Score : [0.509577923511901])
Epoch : [109] Train loss : [175.01354108537947] Val Score : [0.5099121032616577])
Epoch : [110] Train loss : [173.25343540736608] Val Score : [0.510303376299578])
Epoch : [111] Train loss : [171.55914306640625] Val Score : [0.5107009582598172])
Epoch : [112] Train loss : [169.83494785853796] Val Score : [0.5109966687836232])
Epoch : [113] Train loss : [168.1860089983259] Val Score : [0.5114884236319622])
Epoch : [114] Train los

In [14]:
model = GANnomaly()
model.encoder.load_state_dict(torch.load('best_encoder.pth'))
model.decoder.load_state_dict(torch.load('best_decoder.pth'))
model.encoder.eval()
model.decoder.eval()

Decoder(
  (decoder): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ELU(alpha=1.0)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ELU(alpha=1.0)
    (6): Linear(in_features=64, out_features=30, bias=True)
  )
)

In [15]:
val_dataset = MyDataset(df = val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)

In [16]:
model.encoder.eval()
model.decoder.eval()
val_sim = np.zeros(0)

with torch.no_grad():
    for data, label in val_loader:
        data, target = data.to(device), data.cpu()
        output = model.decoder(model.encoder(data))
        output = output.reshape(-1,30).cpu()
        target = target.reshape(-1,30)
        val_sim = np.append(val_sim, cos(output,target))

In [17]:
thr = 0.814

pred_GAN = np.where(val_sim < thr,1,0)

print(f1_score(pred_GAN,val_df['Class'],average='macro'))
print(confusion_matrix(val_df['Class'],pred_GAN))

0.9165787375726882
[[28427     5]
 [    5    25]]


In [18]:
print(classification_report(val_df['Class'],pred_GAN))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.83      0.83      0.83        30

    accuracy                           1.00     28462
   macro avg       0.92      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462



## EllipticEnvelope

*   사이킷런에서 제공하는 공분산 추정을 할 수 있는 객체입니다

*   데이터의 모양을 정의하는데, 중심 데이터에 타원을 맞추고 그 바깥의 점을 무시하는 방식입니다

*   인라이어 데이터가 정규 분포를 이룬다고 가정하면 인라이어 위치와 분산을 추정할 수 있고 이를 토대로 Mahalanobis 거리를 얻어 아웃라이어를 추정합니다


In [19]:
ratio = val_df['Class'].sum()/len(val_df)
ratio

0.0010540369615627855

In [20]:
EE = EllipticEnvelope(support_fraction = 0.994, contamination = ratio, random_state = 2022)
EE.fit(train_df)

In [21]:
def get_pred_label_EE(model, x, k):
    prob = model.score_samples(x)
    prob = torch.tensor(prob, dtype = torch.float)
    topk_indices = torch.topk(prob, k = k, largest = False).indices

    pred = torch.zeros(len(x), dtype = torch.long)
    pred[topk_indices] = 1
    return pred.tolist(), prob.tolist()

val_x = val_df.drop(columns= 'Class') # Input Data
val_y = val_df['Class'] # Label

pred_EE, prob = get_pred_label_EE(EE, val_x, 29)
val_score = f1_score(val_y, pred_EE, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, pred_EE))
print(confusion_matrix(val_y, pred_EE))

Validation F1 Score : [0.9236496787663914]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462

[[28428     4]
 [    5    25]]


## Ensemble

In [22]:
from collections import Counter

def mode (x) :
    cnt = Counter(x)
    mode = cnt.most_common(1)
    return mode[0][0]

In [23]:
def get_ensemble_pred (x,) :
    # pred GAN
    test_dataset = MyDataset(x, False)
    test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
    test_sim = np.zeros(0)
    print('start')
    with torch.no_grad():
        for data in test_loader:
            data, target = data.to(device), data.cpu()
            output = model.decoder(model.encoder(data))
            output = output.reshape(-1,30).cpu()
            target = target.reshape(-1,30)
            test_sim = np.append(test_sim, cos(output,target))
    print('GAN_done')
    
    pred_GAN = np.where(test_sim < 0.95,1,0)
    # pred EE
    k = 0.002231531967748047
    pred_EE, prob = get_pred_label_EE(EE, x, int(len(x)*k))
    print('EE_done')
    preds = pd.DataFrame(zip(pred_GAN, pred_EE))
    preds.columns = ['pred_GAN', 'pred_EE']
    return preds, preds.apply(mode,axis = 1)
    

## Method 2. Auto Encoder + GAN

In [26]:
train_df = pd.read_csv('open/train.csv')
train_df = train_df.drop(columns=['ID'])
train_df["ABS Amount"] = train_df.abs().sum(1)
train_df["Amount"] = train_df.sum(1)
val_df = pd.read_csv('open/val.csv')
val_df = val_df.drop(columns=['ID'])
val_df["ABS Amount"] = val_df.abs().sum(1)
val_df["Amount"] = val_df.sum(1)
test_df = pd.read_csv('open/test.csv')
test_df = test_df.drop(columns=['ID'])
test_df["ABS Amount"] = test_df.abs().sum(1)
test_df["Amount"] = test_df.sum(1)

In [27]:
scaler = Normalizer()
train = scaler.fit_transform(train_df)
train = pd.DataFrame(train, columns = train_df.columns)
val = scaler.transform(val_df.drop(columns = ["Class"]))
val = pd.DataFrame(val, columns = [col for col in val_df.columns if col != "Class"])
val["Class"] = val_df["Class"]
test = scaler.transform(test_df)
test = pd.DataFrame(test, columns = test_df.columns)

In [28]:
class Encoder(nn.Module):
  def __init__(self, input_size, z_size):
    super().__init__()
    self.linear1 = nn.Linear(input_size, 128)
    self.linear2 = nn.Linear(128, 512)
    self.linear3 = nn.Linear(512, z_size)
    self.relu = nn.ReLU(True)
        
  def forward(self, real):
    out = self.linear1(real)
    out = self.relu(out)
    out = self.linear2(out)
    out = self.relu(out)
    out = self.linear3(out)
    z = self.relu(out)
    return z
    
class Decoder(nn.Module):
  def __init__(self, z_size, output_size):
    super().__init__()
    self.linear1 = nn.Linear(z_size, 512)
    self.linear2 = nn.Linear(512, 128)
    self.linear3 = nn.Linear(128, output_size)
    self.relu = nn.ReLU(True)
        
  def forward(self, z):
    out = self.linear1(z)
    out = self.relu(out)
    out = self.linear2(out)
    out = self.relu(out)
    out = self.linear3(out)
    return out
    
class UsadModel(nn.Module):
  def __init__(self, input_size, z_size):
    super().__init__()
    self.encoder = Encoder(input_size, z_size)
    self.decoder1 = Decoder(z_size, input_size)
    self.decoder2 = Decoder(z_size, input_size)
  
  def training_step(self, batch, n):
    z = self.encoder(batch)
    w1 = self.decoder1(z)
    w2 = self.decoder2(z)
    w3 = self.decoder2(self.encoder(w1))
    loss1 = 1/n*torch.mean((batch-w1)**2)+(1-1/n)*torch.mean((batch-w3)**2)
    loss2 = 1/n*torch.mean((batch-w2)**2)-(1-1/n)*torch.mean((batch-w3)**2)
    return loss1,loss2

def training(epochs, model, train_loader, val_loader, opt_func=torch.optim.Adam):
    history = []
    optimizer1 = opt_func(list(model.encoder.parameters())+list(model.decoder1.parameters()))
    optimizer2 = opt_func(list(model.encoder.parameters())+list(model.decoder2.parameters()))
    for epoch in range(epochs):
        for [batch] in train_loader:
            batch=to_device(batch,device)
            
            #Train AE1
            loss1,loss2 = model.training_step(batch,epoch+1)
            loss1.backward()
            optimizer1.step()
            optimizer1.zero_grad()
            
            
            #Train AE2
            loss1,loss2 = model.training_step(batch,epoch+1)
            loss2.backward()
            optimizer2.step()
            optimizer2.zero_grad()

        print(f"Epoch : {epoch+1}")
        f1_score, threshold = calc_f1(model, val_loader)
        torch.save(model.state_dict(), './usad_normal.pth', _use_new_zipfile_serialization=False)
    return  threshold


In [29]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

In [30]:
def calc_f1(model, val_loader):
    model.eval()
    pred = []
    true = []
    diffs = []
    with torch.no_grad():
        for x, y in iter(val_loader):
            x = x.float().to(device)

            _x = model.decoder1(model.encoder(x))
            l1loss = nn.L1Loss(reduction = "none")
            diff = l1loss(x, _x)
            diff = torch.sum(diff, 1).cpu().numpy().tolist()
            diffs.extend(diff)

            # f1 score
            true += y.tolist()
    thresholds = []
    f1_scores = []
    for thr in np.linspace(0,100, 1000):
      thresholds.append(thr)
      pred = np.where(np.array(diffs)>thr, 1,0).tolist()
      f1_scores.append(f1_score(true, pred, average='macro'))

    max_f1 = max(f1_scores)
    threshold = thresholds[np.argmax(f1_scores)]

    diffs = [[d] for d in diffs]
    return max_f1, threshold

In [31]:
device = get_default_device()
BATCH_SIZE =  256
N_EPOCHS = 100
input_size = 32
z_size = 1024

train_loader = torch.utils.data.DataLoader(
                                          data_utils.TensorDataset(torch.from_numpy(train.values).float()),
                                          batch_size=BATCH_SIZE,
                                          shuffle=True, 
                                          num_workers=6
                                          )

val_loader = torch.utils.data.DataLoader( 
                                          data_utils.TensorDataset(torch.from_numpy(val.drop(columns=["Class"]).values).float(), torch.from_numpy(val["Class"].values).float()), 
                                          batch_size=BATCH_SIZE, 
                                          shuffle=False, 
                                          num_workers=6
                                          )

test_loader = torch.utils.data.DataLoader(
                                          data_utils.TensorDataset(torch.from_numpy(test.values).float()),
                                          batch_size=BATCH_SIZE,
                                          shuffle=False, 
                                          num_workers=6
                                          )

model = UsadModel(input_size, z_size)
model = to_device(model,device)

In [32]:
threshold = training(N_EPOCHS,model,train_loader,val_loader)

Epoch : 1
Epoch : 2
Epoch : 3
Epoch : 4
Epoch : 5
Epoch : 6
Epoch : 7
Epoch : 8
Epoch : 9
Epoch : 10
Epoch : 11
Epoch : 12
Epoch : 13
Epoch : 14
Epoch : 15
Epoch : 16
Epoch : 17
Epoch : 18
Epoch : 19
Epoch : 20
Epoch : 21
Epoch : 22
Epoch : 23
Epoch : 24
Epoch : 25
Epoch : 26
Epoch : 27
Epoch : 28
Epoch : 29
Epoch : 30
Epoch : 31
Epoch : 32
Epoch : 33
Epoch : 34
Epoch : 35
Epoch : 36
Epoch : 37
Epoch : 38
Epoch : 39
Epoch : 40
Epoch : 41
Epoch : 42
Epoch : 43
Epoch : 44
Epoch : 45
Epoch : 46
Epoch : 47
Epoch : 48
Epoch : 49
Epoch : 50
Epoch : 51
Epoch : 52
Epoch : 53
Epoch : 54
Epoch : 55
Epoch : 56
Epoch : 57
Epoch : 58
Epoch : 59
Epoch : 60
Epoch : 61
Epoch : 62
Epoch : 63
Epoch : 64
Epoch : 65
Epoch : 66
Epoch : 67
Epoch : 68
Epoch : 69
Epoch : 70
Epoch : 71
Epoch : 72
Epoch : 73
Epoch : 74
Epoch : 75
Epoch : 76
Epoch : 77
Epoch : 78
Epoch : 79
Epoch : 80
Epoch : 81
Epoch : 82
Epoch : 83
Epoch : 84
Epoch : 85
Epoch : 86
Epoch : 87
Epoch : 88
Epoch : 89
Epoch : 90
Epoch : 91
Epoch : 

In [33]:
def prediction(model, thr, test_loader, device):
    model.to(device)
    model.eval()
    diffs = []
    with torch.no_grad():
        for [x] in iter(test_loader):

            x = x.float().to(device)
            _x = model.decoder1(model.encoder(x))
            l1loss = nn.L1Loss(reduction = "none")
            diff = l1loss(x, _x)
            diff = torch.sum(diff, 1).cpu().numpy().tolist()
            diffs.extend(diff)

    pred = np.where(np.array(diffs)>thr, 1,0).tolist()
    return pred

In [34]:
preds = prediction(model, 1.4, test_loader, device)