In [84]:
import numpy as np
import pandas as pd
import scipy.io

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support as prf

import torchvision.datasets as dset
import os

from types import SimpleNamespace

In [85]:
class Data_Loader:
    """Load the datasets used in this notebook and standardise them.

    The tabular ``*_train_valid_data`` methods return a
    ``(train, val_real, val_fake)`` triple of NumPy arrays whose columns were
    standardised with the mean / standard deviation of ``train`` only.
    ``load_data_CIFAR10`` instead returns ``(x_train, x_test, test_labels)``.
    """

    def __init__(self, n_trains=None):
        """Remember the optional train size and the KDD'99 download URLs.

        Args:
            n_trains: Stored as ``self.n_train``; none of the methods in this
                class read it.
        """
        self.n_train = n_trains
        self.urls = [
            "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
            "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names",
        ]

    def norm_kdd_data(self, train_real, val_real, val_fake, cont_indices):
        """Standardise only the continuous columns of the three KDD splits.

        Args:
            train_real: 2-D array used to compute the column statistics.
            val_real: 2-D array normalised with the training statistics.
            val_fake: 2-D array normalised with the training statistics.
            cont_indices: Column indices of the continuous features.

        Returns:
            ``(train_real, val_real, val_fake)`` where every array has the
            non-continuous columns first (unchanged) followed by the
            standardised continuous columns.
        """
        symb_indices = np.delete(np.arange(train_real.shape[1]), cont_indices)
        mus = train_real[:, cont_indices].mean(0)
        sds = train_real[:, cont_indices].std(0)
        sds[sds == 0] = 1  # constant columns: avoid dividing by zero

        def get_norm(xs, mu, sd):
            bin_cols = xs[:, symb_indices]
            # Broadcasting keeps the result 2-D even when ``xs`` has no rows,
            # so the concatenation below also works for an empty split.
            cont_cols = (xs[:, cont_indices] - mu) / sd
            return np.concatenate([bin_cols, cont_cols], 1)

        train_real = get_norm(train_real, mus, sds)
        val_real = get_norm(val_real, mus, sds)
        val_fake = get_norm(val_fake, mus, sds)
        return train_real, val_real, val_fake

    def norm_data(self, train_real, val_real, val_fake):
        """Standardise every column of the three splits.

        The mean and standard deviation are computed on ``train_real`` and
        applied to all three arrays; columns with zero deviation are left
        unscaled (divided by 1).

        Returns:
            ``(train_real, val_real, val_fake)`` after standardisation, each
            with the same shape as its input.
        """
        mus = train_real.mean(0)
        sds = train_real.std(0)
        sds[sds == 0] = 1  # constant columns: avoid dividing by zero

        def get_norm(xs, mu, sd):
            # Vectorised; preserves the (n_rows, n_cols) shape for empty input.
            return (xs - mu) / sd

        train_real = get_norm(train_real, mus, sds)
        val_real = get_norm(val_real, mus, sds)
        val_fake = get_norm(val_fake, mus, sds)
        return train_real, val_real, val_fake

    def norm(self, data, mu=1):
        """Rescale 0..255 values to ``2 * (data / 255) - mu``."""
        return 2 * (data / 255.) - mu

    def get_dataset(self, dataset_name, c_percent=None, true_label=1):
        """Dispatch to the loader that matches ``dataset_name``.

        Args:
            dataset_name: One of ``'cifar10'``, ``'kdd'``, ``'kddrev'``,
                ``'thyroid'``, ``'arrhythmia'`` or ``'ckdd'``.
            c_percent: Contamination percentage, used by ``'ckdd'`` only.
            true_label: Class kept for training, used by ``'cifar10'`` only.

        Returns:
            Whatever the selected loader returns (a 3-tuple of arrays).

        Raises:
            ValueError: If ``dataset_name`` is not one of the names above.
        """
        if dataset_name == 'cifar10':
            return self.load_data_CIFAR10(true_label)
        if dataset_name == 'kdd':
            return self.KDD99_train_valid_data()
        if dataset_name == 'kddrev':
            return self.KDD99Rev_train_valid_data()
        if dataset_name == 'thyroid':
            return self.Thyroid_train_valid_data()
        if dataset_name == 'arrhythmia':
            return self.Arrhythmia_train_valid_data()
        if dataset_name == 'ckdd':
            return self.contaminatedKDD99_train_valid_data(c_percent)
        # Previously fell through and returned None, which only surfaced later
        # as an opaque unpacking error in the caller.
        raise ValueError("Unknown dataset name: {!r}".format(dataset_name))

    def load_data_CIFAR10(self, true_label):
        """Download CIFAR-10 into ``./data`` and return rescaled arrays.

        Args:
            true_label: Only training images with this label are kept.

        Returns:
            ``(x_train, x_test, test_labels)``: the filtered training images
            and all test images (both float32, rescaled by ``norm``) plus the
            test labels.
        """
        root = './data'
        os.makedirs(root, exist_ok=True)

        trainset = dset.CIFAR10(root, train=True, download=True)
        train_data = np.array(trainset.data)
        train_labels = np.array(trainset.targets)

        testset = dset.CIFAR10(root, train=False, download=True)
        test_data = np.array(testset.data)
        test_labels = np.array(testset.targets)

        train_data = train_data[np.where(train_labels == true_label)]
        x_train = self.norm(np.asarray(train_data, dtype='float32'))
        x_test = self.norm(np.asarray(test_data, dtype='float32'))
        return x_train, x_test, test_labels

    def _mat_train_valid_data(self, path):
        """Load a ``.mat`` file with ``X`` / ``y`` entries and split it.

        Rows labelled 0 are split in file order: the first half becomes the
        training set, the second half ``val_real``. Rows labelled 1 become
        ``val_fake``. All three are standardised with ``norm_data``.
        """
        data = scipy.io.loadmat(path)
        samples = data['X']
        labels = ((data['y']).astype(np.int32)).reshape(-1)

        norm_samples = samples[labels == 0]
        anom_samples = samples[labels == 1]

        n_train = len(norm_samples) // 2
        x_train = norm_samples[:n_train]
        val_real = norm_samples[n_train:]
        val_fake = anom_samples
        return self.norm_data(x_train, val_real, val_fake)

    def Thyroid_train_valid_data(self):
        """Return the standardised split of ``./thyroid.mat``.

        Per the original notes: 3772 rows, 3679 labelled 0 and 93 labelled 1,
        giving 1839 training rows.
        """
        return self._mat_train_valid_data("./thyroid.mat")

    def Arrhythmia_train_valid_data(self):
        """Return the standardised split of ``./arrhythmia.mat``."""
        return self._mat_train_valid_data("./arrhythmia.mat")

    def KDD99_preprocessing(self):
        """Download KDD'99 (10 % subset) and one-hot encode symbolic columns.

        Returns:
            ``(samples, labels, cont_indices)``: the encoded feature matrix,
            a label vector that is 1 where the status is ``'normal.'`` and 0
            otherwise, and the column positions of the continuous features
            inside ``samples``.
        """
        df_colnames = pd.read_csv(self.urls[1], skiprows=1, sep=':', names=['f_names', 'f_types'])
        # The names file does not list the final status column; append it.
        df_colnames.loc[df_colnames.shape[0]] = ['status', ' symbolic.']
        df = pd.read_csv(self.urls[0], header=None, names=df_colnames['f_names'].values)
        df_symbolic = df_colnames[df_colnames['f_types'].str.contains('symbolic.')]
        df_continuous = df_colnames[df_colnames['f_types'].str.contains('continuous.')]
        # dtype is pinned to uint8 (the historical default): newer pandas
        # emits bool dummies, which turn np.array(samples) into an object
        # array when mixed with the numeric columns.
        samples = pd.get_dummies(
            df.iloc[:, :-1],
            columns=df_symbolic['f_names'][:-1],
            dtype=np.uint8,
        )

        smp_keys = samples.keys()
        cont_indices = [smp_keys.get_loc(cont) for cont in df_continuous['f_names']]

        labels = np.where(df['status'] == 'normal.', 1, 0)
        return np.array(samples), np.array(labels), cont_indices

    def _random_half_split(self, rows):
        """Shuffle ``rows`` with NumPy's global RNG and cut them in half.

        Returns:
            ``(first_half, second_half)``; the first half has
            ``len(rows) // 2`` rows.
        """
        n_rows = rows.shape[0]
        ranidx = np.random.permutation(n_rows)
        n_train = n_rows // 2
        return rows[ranidx[:n_train]], rows[ranidx[n_train:]]

    def KDD99_train_valid_data(self):
        """KDD split that trains on attack rows.

        Rows whose status is ``'normal.'`` (label 1) are used as the
        anomalies (``val_fake``); the remaining rows (label 0) are randomly
        split in half into the training set and ``val_real``.
        """
        samples, labels, cont_indices = self.KDD99_preprocessing()
        anom_samples = samples[labels == 1]  # norm: 97278
        norm_samples = samples[labels == 0]  # attack: 396743

        x_train, val_real = self._random_half_split(norm_samples)
        val_fake = anom_samples
        return self.norm_kdd_data(x_train, val_real, val_fake, cont_indices)

    def KDD99Rev_train_valid_data(self):
        """Reversed KDD split that trains on ``'normal.'`` rows.

        A random subset of at most 24319 attack rows is kept as ``val_fake``
        so that the ratio between ``'normal.'`` and attack rows is about 4:1.
        """
        samples, labels, cont_indices = self.KDD99_preprocessing()

        norm_samples = samples[labels == 1]  # norm: 97278

        # Randomly draw samples labeled as 'attack'
        # so that the ratio btw norm:attack will be 4:1
        anom_samples = samples[labels == 0]  # attack: 396743
        rp = np.random.permutation(len(anom_samples))
        anom_samples = anom_samples[rp[:24319]]  # attack: 24319

        x_train, val_real = self._random_half_split(norm_samples)
        val_fake = anom_samples
        return self.norm_kdd_data(x_train, val_real, val_fake, cont_indices)

    def contaminatedKDD99_train_valid_data(self, c_percent):
        """KDD split whose training set contains a share of anomalies.

        The data is randomly split in half into a test and a train part. The
        training set is made of all label-0 rows of the train part plus
        ``c_percent`` percent of its label-1 rows.

        Args:
            c_percent: Percentage (0-100) of the train part's label-1 rows to
                mix into the training set.

        Raises:
            ValueError: If ``c_percent`` is ``None``.
        """
        if c_percent is None:
            raise ValueError("c_percent is required for the contaminated KDD dataset")

        samples, labels, cont_indices = self.KDD99_preprocessing()

        ranidx = np.random.permutation(len(samples))
        n_test = len(samples) // 2
        x_test = samples[ranidx[:n_test]]
        y_test = labels[ranidx[:n_test]]

        x_train = samples[ranidx[n_test:]]
        y_train = labels[ranidx[n_test:]]

        norm_samples = x_train[y_train == 0]  # attack rows
        anom_samples = x_train[y_train == 1]  # 'normal.' rows
        n_contaminated = int((c_percent / 100) * len(anom_samples))

        # Picks the first n_contaminated anomaly rows (in shuffled order);
        # x_train itself was already drawn through a random permutation.
        rpc = np.random.permutation(n_contaminated)
        x_train = np.concatenate([norm_samples, anom_samples[rpc]])

        val_real = x_test[y_test == 0]
        val_fake = x_test[y_test == 1]
        return self.norm_kdd_data(x_train, val_real, val_fake, cont_indices)

In [86]:
def tc_loss(zs, m):
    """Margin loss pulling each slice towards its own batch mean.

    ``zs`` is indexed as a 3-D tensor ``(batch, k, features)``. For every
    sample and every slice ``i`` along dim 1 the squared distance to the
    batch mean of slice ``i`` (``pos``) is compared with the smallest squared
    distance to the batch mean of any *other* slice (``neg``); the hinge
    ``max(pos + m - neg, 0)`` is averaged over all samples and slices.

    Args:
        zs: 3-D tensor ``(batch, k, features)``, on any device.
        m: Margin added inside the hinge.

    Returns:
        Scalar tensor with the mean hinge loss.
    """
    means = zs.mean(0).unsqueeze(0)
    res = ((zs.unsqueeze(2) - means.unsqueeze(1)) ** 2).sum(-1)
    pos = torch.diagonal(res, dim1=1, dim2=2)
    # Large value on the diagonal so that a slice's own mean can never be
    # selected as its nearest "other" mean. Built on the input's device
    # instead of a hard-coded .cuda() so CPU tensors work as well.
    offset = torch.diagflat(torch.ones(zs.size(1), device=zs.device)).unsqueeze(0) * 1e6
    neg = (res + offset).min(-1)[0]
    loss = torch.clamp(pos + m - neg, min=0).mean()
    return loss

def f_score(scores, labels, ratio):
    """Binary F-score obtained by thresholding ``scores`` at a percentile.

    Args:
        scores: NumPy array of anomaly scores (higher is more anomalous).
        labels: NumPy array of ground-truth labels, cast to ``int``.
        ratio: Percentile (0-100) of ``scores`` used as the decision
            threshold; scores at or above it are predicted as class 1.

    Returns:
        The F-score reported by ``precision_recall_fscore_support`` with
        ``average='binary'``.
    """
    cutoff = np.percentile(scores, ratio)
    predicted = (scores >= cutoff).astype(int)
    actual = labels.astype(int)
    _, _, f1, _ = prf(actual, predicted, average='binary')
    return f1


class TransClassifierTabular():
    """Transformation classifier for tabular anomaly detection.

    Trains ``netC1`` (thyroid / arrhythmia) or ``netC5`` (every other
    dataset) with a cross-entropy loss on the transformation index plus
    ``lmbda`` times ``tc_loss``, and after each epoch scores the test set.
    """

    def __init__(self, args):
        """Build the network and optimiser from the fields of ``args``.

        Args:
            args: Namespace providing ``dataset``, ``m``, ``lmbda``,
                ``batch_size``, ``ndf``, ``n_rots``, ``d_out``, ``eps``,
                ``n_epoch`` and ``lr``.
        """
        self.ds = args.dataset
        self.m = args.m
        self.lmbda = args.lmbda
        self.batch_size = args.batch_size
        self.ndf = args.ndf
        self.n_rots = args.n_rots
        self.d_out = args.d_out
        self.eps = args.eps
        # Use the GPU when there is one, otherwise fall back to the CPU
        # instead of failing on a hard-coded .cuda() call.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.n_epoch = args.n_epoch
        if args.dataset == "thyroid" or args.dataset == "arrhythmia":
            self.netC = netC1(self.d_out, self.ndf, self.n_rots).to(self.device)
        else:
            self.netC = netC5(self.d_out, self.ndf, self.n_rots).to(self.device)
        # NOTE(review): weights_init only inspects the module it is handed,
        # and the container class names netC1 / netC5 match none of its
        # branches, so this call leaves PyTorch's default initialisation in
        # place. `self.netC.apply(weights_init)` would reach the layers but
        # would change the trained results, so it is left as is.
        weights_init(self.netC)
        self.optimizerC = optim.Adam(self.netC.parameters(), lr=args.lr, betas=(0.5, 0.999))

    def fit_trans_classifier(self, train_xs, x_test, y_test, ratio):
        """Train for ``n_epoch`` epochs and evaluate after every epoch.

        Args:
            train_xs: NumPy array of transformed training samples, indexed
                along axis 0 and fed to the network batch by batch.
            x_test: NumPy array of transformed test samples.
            y_test: NumPy array with one ground-truth label per test sample.
            ratio: Percentile passed to ``f_score`` as decision threshold.

        Returns:
            The F-score of the last epoch, or NaN when ``n_epoch`` is 0.
        """
        # One row of target indices 0..n_rots-1 per sample of a full batch.
        labels = (
            torch.arange(self.n_rots)
            .unsqueeze(0)
            .expand((self.batch_size, self.n_rots))
            .long()
            .contiguous()
            .to(self.device)
        )
        celoss = nn.CrossEntropyLoss()
        f1_score = float('nan')  # defined even if the epoch loop never runs
        print('Training')
        for epoch in range(self.n_epoch):
            self.netC.train()
            rp = np.random.permutation(len(train_xs))
            n_batch = 0
            sum_zs = torch.zeros((self.ndf, self.n_rots)).to(self.device)

            for i in range(0, len(train_xs), self.batch_size):
                self.netC.zero_grad()
                batch_range = min(self.batch_size, len(train_xs) - i)
                # The final batch may be short; trim the targets to match.
                train_labels = labels[:batch_range]
                idx = np.arange(batch_range) + i
                xs = torch.from_numpy(train_xs[rp[idx]]).float().to(self.device)
                tc_zs, ce_zs = self.netC(xs)
                # Detached: the running mean is only used for evaluation and
                # must not keep the autograd history of every batch alive.
                sum_zs = sum_zs + tc_zs.detach().mean(0)
                tc_zs = tc_zs.permute(0, 2, 1)

                loss_ce = celoss(ce_zs, train_labels)
                er = self.lmbda * tc_loss(tc_zs, self.m) + loss_ce
                er.backward()
                self.optimizerC.step()
                n_batch += 1

            # Average of the per-batch means, reshaped to (1, n_rots, ndf).
            means = sum_zs.t() / n_batch
            means = means.unsqueeze(0)
            self.netC.eval()

            with torch.no_grad():
                val_probs_rots = np.zeros((len(y_test), self.n_rots))
                for i in range(0, len(x_test), self.batch_size):
                    batch_range = min(self.batch_size, len(x_test) - i)
                    idx = np.arange(batch_range) + i
                    xs = torch.from_numpy(x_test[idx]).float().to(self.device)
                    zs, fs = self.netC(xs)
                    zs = zs.permute(0, 2, 1)
                    # Squared distance of every slice to every stored mean.
                    diffs = ((zs.unsqueeze(2) - means) ** 2).sum(-1)

                    diffs_eps = self.eps * torch.ones_like(diffs)
                    diffs = torch.max(diffs, diffs_eps)
                    logp_sz = torch.nn.functional.log_softmax(-diffs, dim=2)

                    # Negative log-probability of the matching mean.
                    val_probs_rots[idx] = -torch.diagonal(logp_sz, 0, 1, 2).cpu().numpy()

                val_probs_rots = val_probs_rots.sum(1)
                f1_score = f_score(val_probs_rots, y_test, ratio)
                print("Epoch:", epoch, ", fscore: ", f1_score)
        return f1_score


In [87]:
def weights_init(m):
    """Initialise the ``weight`` of a single module according to its type.

    Only ``m`` itself is inspected, never its children; use
    ``module.apply(weights_init)`` to reach the layers of a container.

    * ``nn.Linear`` instances and classes whose name contains ``'Conv'``:
      Xavier-normal with gain ``sqrt(2)``.
    * Other classes whose name contains ``'Linear'``: identity matrix.
    * Classes whose name contains ``'Emb'``: normal with mean 0, std 0.01.

    Modules that match none of these are left untouched.
    """
    classname = m.__class__.__name__
    if isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight, gain=np.sqrt(2.0))
    elif 'Conv' in classname:
        init.xavier_normal_(m.weight, gain=np.sqrt(2.0))
    elif 'Linear' in classname:
        init.eye_(m.weight)
    elif 'Emb' in classname:
        # In-place variant; the un-suffixed init.normal is deprecated.
        init.normal_(m.weight, mean=0, std=0.01)

class netC5(nn.Module):
    """Five-layer 1x1 ``Conv1d`` trunk followed by a classification head.

    The trunk alternates bias-free ``Conv1d(kernel_size=1)`` layers with
    ``LeakyReLU(0.2)``; the head applies one more ``LeakyReLU(0.2)`` and a
    ``Conv1d`` with bias that maps ``ndf`` channels to ``nc`` channels.

    Args:
        d: Number of input channels.
        ndf: Number of channels inside the trunk.
        nc: Number of output channels of the head.
    """

    def __init__(self, d, ndf, nc):
        super().__init__()
        # First convolution maps d -> ndf, followed by four
        # (activation, ndf -> ndf convolution) pairs.
        trunk_layers = [nn.Conv1d(d, ndf, kernel_size=1, bias=False)]
        for _ in range(4):
            trunk_layers.append(nn.LeakyReLU(0.2, inplace=True))
            trunk_layers.append(nn.Conv1d(ndf, ndf, kernel_size=1, bias=False))
        self.trunk = nn.Sequential(*trunk_layers)

        head_layers = [
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv1d(ndf, nc, kernel_size=1, bias=True),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, input):
        """Return ``(trunk_output, head_output)`` for ``input``.

        The head starts with an in-place ``LeakyReLU``, so the trunk tensor
        that is returned has that activation already applied to it.
        """
        tc = self.trunk(input)
        return tc, self.head(tc)


class netC1(nn.Module):
    """Single 1x1 ``Conv1d`` trunk followed by a classification head.

    The trunk is one bias-free ``Conv1d(kernel_size=1)`` from ``d`` to
    ``ndf`` channels; the head applies ``LeakyReLU(0.2)`` and a ``Conv1d``
    with bias from ``ndf`` to ``nc`` channels.

    Args:
        d: Number of input channels.
        ndf: Number of channels produced by the trunk.
        nc: Number of output channels of the head.
    """

    def __init__(self, d, ndf, nc):
        super().__init__()
        projection = nn.Conv1d(d, ndf, kernel_size=1, bias=False)
        self.trunk = nn.Sequential(projection)

        activation = nn.LeakyReLU(0.2, inplace=True)
        classifier = nn.Conv1d(ndf, nc, kernel_size=1, bias=True)
        self.head = nn.Sequential(activation, classifier)

    def forward(self, input):
        """Return ``(trunk_output, head_output)`` for ``input``.

        The head starts with an in-place ``LeakyReLU``, so the trunk tensor
        that is returned has that activation already applied to it.
        """
        tc = self.trunk(input)
        return tc, self.head(tc)

In [88]:
# Scratch check of the shape of the random transform tensor for 6 input
# features. NOTE(review): `args` is only created in a later cell, so on a
# fresh kernel this cell raises NameError unless that cell has run first; the
# call also draws from NumPy's global RNG just to display a shape.
np.random.randn(args.n_rots, 6, args.d_out).shape

(256, 6, 32)

In [91]:
def load_trans_data(args):
    """Load a dataset and apply ``args.n_rots`` random linear transforms.

    Args:
        args: Namespace providing ``dataset``, ``c_pr``, ``n_rots`` and
            ``d_out``.

    Returns:
        ``(x_train, x_test, y_test_fscore, ratio, rots)`` where

        * ``x_train`` / ``x_test`` hold, for every sample, its product with
          each transform matrix stacked along axis 2 (``x_test`` is
          ``val_real`` followed by ``val_fake``),
        * ``y_test_fscore`` is 0 for ``val_real`` rows and 1 for ``val_fake``,
        * ``ratio`` is the percentage of ``val_real`` rows in the test set,
        * ``rots`` is the ``(n_rots, n_dims, d_out)`` array of transforms.
    """
    loader = Data_Loader()
    train_real, val_real, val_fake = loader.get_dataset(args.dataset, args.c_pr)

    n_real, n_fake = len(val_real), len(val_fake)
    y_test_fscore = np.concatenate([np.zeros(n_real), np.ones(n_fake)])
    ratio = 100.0 * n_real / (n_real + n_fake)

    _, n_dims = train_real.shape
    rots = np.random.randn(args.n_rots, n_dims, args.d_out)

    print('data trafo', train_real.shape, n_dims, rots[0].shape)
    print('Calculating transforms')

    def transform(xs):
        # One (n_samples, d_out) product per matrix, stacked on a new axis 2.
        return np.stack([xs.dot(rot) for rot in rots], 2)

    x_train = transform(train_real)
    val_real_xs = transform(val_real)
    val_fake_xs = transform(val_fake)
    x_test = np.concatenate([val_real_xs, val_fake_xs])
    return x_train, x_test, y_test_fscore, ratio, rots


def train_anomaly_detector(args):
    """Load the transformed data, train a classifier and return its F-score.

    Args:
        args: Namespace forwarded to ``load_trans_data`` and
            ``TransClassifierTabular``.

    Returns:
        The value returned by ``fit_trans_classifier``.
    """
    x_train, x_test, y_test, ratio, _ = load_trans_data(args)
    classifier = TransClassifierTabular(args)
    return classifier.fit_trans_classifier(x_train, x_test, y_test, ratio)

In [92]:
# Seed the NumPy and PyTorch RNGs so that re-running this cell repeats the
# same random transforms, data shuffles and weight initialisation.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Default hyper-parameters; they mirror the argparse defaults of the script
# entry point at the end of the notebook.
args_dict = {
    'lr': 0.001,
    'n_rots': 32,
    'batch_size': 64,
    'n_epoch': 25,
    'd_out': 4,
    'dataset': 'thyroid',
    'exp': 'affine',
    'c_pr': 0,
    'true_label': 1,
    'ndf': 8,
    'm': 1,
    'lmbda': 0.1,
    'eps': 0,
    'n_iters': 500,
}

# Overrides for this run, equivalent to:
# python train_ad_tabular.py --n_rots=256 --n_epoch=1 --d_out=32 --ndf=8 --dataset=thyroid
args = SimpleNamespace(**args_dict)
args.n_rots = 256
args.n_epoch = 1
args.d_out = 32
args.ndf = 8
args.n_iters = 2  # number of repeated runs averaged for the small datasets

print("Dataset: ", args.dataset)

if args.dataset in ('thyroid', 'arrhythmia'):
    # Average the F-score over several runs (each run draws new transforms).
    n_iters = args.n_iters
    f_scores = np.zeros(n_iters)
    for i in range(n_iters):
        f_scores[i] = train_anomaly_detector(args)
    print("AVG f1_score", f_scores.mean())
else:
    train_anomaly_detector(args)

Dataset:  thyroid
data trafo (1839, 6) 6 (6, 32)
Calculating transforms
Training
Epoch: 0 , fscore:  0.7419354838709677
data trafo (1839, 6) 6 (6, 32)
Calculating transforms
Training
Epoch: 0 , fscore:  0.7311827956989246
AVG f1_score 0.7365591397849462


In [93]:
# Run one training pass at top level so that the data, the transforms and the
# trained classifier stay available for the inspection cells below.
x_train, x_test, y_test, ratio, rots = load_trans_data(args)
tc_obj = TransClassifierTabular(args)
# Bound to `final_f1`, not `f_score`: the latter is the module-level function
# that fit_trans_classifier calls, and rebinding it to a float would make
# every later training call fail with "'float' object is not callable".
final_f1 = tc_obj.fit_trans_classifier(x_train, x_test, y_test, ratio)

data trafo (1839, 6) 6 (6, 32)
Calculating transforms
Training
Epoch: 0 , fscore:  0.7419354838709677


In [34]:
# Shape of the transformed training array returned by load_trans_data.
x_train.shape

(1839, 32, 256)

In [35]:
# Show the layer structure of the classifier network held by tc_obj.
tc_obj.netC

netC1(
  (trunk): Sequential(
    (0): Conv1d(32, 8, kernel_size=(1,), stride=(1,), bias=False)
  )
  (head): Sequential(
    (0): LeakyReLU(negative_slope=0.2, inplace=True)
    (1): Conv1d(8, 256, kernel_size=(1,), stride=(1,))
  )
)

In [53]:
# Shape of the random transform matrices returned by load_trans_data.
rots.shape

(256, 6, 32)

In [70]:
# Weight shape of the first (bias-free, kernel_size=1) trunk convolution.
tc_obj.netC.trunk[0].weight.shape

torch.Size([8, 32, 1])

In [75]:
# Apply every matrix in `rots` to a single 6-feature row whose last entry is 1
# and stack the per-transform results along a new axis 2, the same layout
# load_trans_data builds for the real data.
data_rot = np.stack([np.array([[0,0,0,0,0,1]]).dot(rot) for rot in rots], 2)

In [79]:
# Convert the transformed row to a float32 tensor.
data = torch.tensor(data_rot, dtype=torch.float)

# Push it through the trunk only; the tensor is moved to the GPU explicitly,
# so this cell requires a CUDA device.
tc_obj.netC.trunk(data.to('cuda'))#.shape

tensor([[[-0.5375, -0.1371,  0.6404,  ..., -0.2383, -0.6767, -0.3132],
         [-0.3886, -1.3001, -0.3662,  ...,  0.7672,  0.3513,  0.2654],
         [-0.1008, -0.4975, -0.9123,  ...,  0.7854, -0.3531, -0.8367],
         ...,
         [-0.0561,  0.6632,  0.1491,  ..., -0.8441, -0.7028,  0.9083],
         [ 0.1384,  0.1224, -0.7818,  ..., -0.4075, -0.5954, -0.1950],
         [ 0.8121, -0.1285, -0.3154,  ..., -0.3568, -0.0724,  0.1526]]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

In [82]:
# Feed the trunk output through the head and show the shape of the result.
# The head's first layer is an in-place LeakyReLU, so `t1` itself is modified
# by the second line.
t1 = tc_obj.netC.trunk(data.to('cuda'))
tc_obj.netC.head(t1).shape

torch.Size([1, 256, 256])

In [25]:
# Same check as the earlier cell: weight shape of the first trunk convolution.
tc_obj.netC.trunk[0].weight.shape

torch.Size([8, 32, 1])

In [None]:
if __name__ == '__main__':
    # Script entry point: parse the hyper-parameters from the command line
    # and run the same experiment as the configuration cell above.
    # argparse is not imported at the top of the notebook, so it is brought
    # into scope here; without it the next statement raises NameError.
    import argparse

    # NOTE(review): inside a Jupyter kernel sys.argv holds the kernel's own
    # arguments, which parse_args() will presumably reject; this block looks
    # intended for use as a standalone script only.
    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--n_rots', default=32, type=int)
    parser.add_argument('--batch_size', default=64, type=int)
    parser.add_argument('--n_epoch', default=25, type=int)
    parser.add_argument('--d_out', default=4, type=int)
    parser.add_argument('--dataset', default='thyroid', type=str)
    parser.add_argument('--exp', default='affine', type=str)
    parser.add_argument('--c_pr', default=0, type=int)
    parser.add_argument('--true_label', default=1, type=int)
    parser.add_argument('--ndf', default=8, type=int)
    parser.add_argument('--m', default=1, type=float)
    parser.add_argument('--lmbda', default=0.1, type=float)
    parser.add_argument('--eps', default=0, type=float)
    parser.add_argument('--n_iters', default=500, type=int)

    args = parser.parse_args()
    print("Dataset: ", args.dataset)

    if args.dataset == 'thyroid' or args.dataset == 'arrhythmia':
        # Small datasets: average the F-score over n_iters independent runs.
        n_iters = args.n_iters
        f_scores = np.zeros(n_iters)
        for i in range(n_iters):
            f_scores[i] = train_anomaly_detector(args)
        print("AVG f1_score", f_scores.mean())
    else:
        train_anomaly_detector(args)
