In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, re, datetime, random, gzip, json, copy
import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
from time import time
from math import ceil
from collections import Counter
import socket,struct
import timeit
import math

import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, roc_auc_score
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

from dgl import from_networkx
import dgl.function as fn
import networkx as nx

PROJ_PATH = Path(os.path.join(re.sub("/TS-IDS.*$", '', os.getcwd()), 'TS-IDS'))
print(f'PROJ_PATH={PROJ_PATH}')
sys.path.insert(1, str(PROJ_PATH))
sys.path.insert(1, str(PROJ_PATH/'src'))
import utils
from utils import *
from dataset import build_datamodule
from trainer import build_trainer
from model import TSIDS
from pipeline import TSIDSPipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from pandas import MultiIndex, Int64Index


PROJ_PATH=/home/hoang/github/TS-IDS


In [2]:
# https://github.com/waimorris/Anomal-E/blob/main/Anomal_E_cicids2017.ipynb
class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
        self.activation = F.relu
        self.W_edge = nn.Linear(128 * 2, 128)
        self.reset_parameters()

    def reset_parameters(self):
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
        return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

            # Compute edge embeddings
            u, v = g.edges()
            edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge


class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
        if corrupt:
            e_perm = torch.randperm(g.number_of_edges())
            #n_perm = torch.randperm(g.number_of_nodes())
            efeats = efeats[e_perm]
            #nfeats = nfeats[n_perm]
        for i, layer in enumerate(self.layers):
            #nfeats = layer(g, nfeats, efeats)
            nfeats, e_feats = layer(g, nfeats, efeats)
            #return nfeats.sum(1)
        return nfeats.sum(1), e_feats.sum(1)
    
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
        super(Discriminator, self).__init__()
        self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        self.reset_parameters()

    def uniform(self, size, tensor):
        bound = 1.0 / math.sqrt(size)
        if tensor is not None:
            tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
        size = self.weight.size(0)
        self.uniform(size, self.weight)

    def forward(self, features, summary):
        features = torch.matmul(features, torch.matmul(self.weight, summary))
        return features

class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(DGI, self).__init__()
        self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
        self.discriminator = Discriminator(128)
        # self.discriminator = Discriminator(256)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
        positive = self.encoder(g, n_features, e_features, corrupt=False)
        negative = self.encoder(g, n_features, e_features, corrupt=True)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
        positive = self.encoder(g, n_features, e_features, corrupt=False)
        negative = self.encoder(g, n_features, e_features, corrupt=True)

        positive = positive[1]
        negative = negative[1]

        summary = torch.sigmoid(positive.mean(dim=0))

        positive = self.discriminator(positive, summary)
        negative = self.discriminator(negative, summary)

        l1 = self.loss(positive, torch.ones_like(positive))
        l2 = self.loss(negative, torch.zeros_like(negative))

        return l1 + l2
    
class Model(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super().__init__()
        self.dgi = DGI(ndim_in, ndim_out, edim, activation)
    def forward(self, g, nfeats, efeats):
        loss = self.dgi(g, nfeats, efeats)
        return loss

In [3]:
def build_graph(scaler, encoder, X, y, cols_to_norm, cname_label):
    X = encoder.transform(X)
    print('Number of samples:', X.shape, y.shape)
    print(cols_to_norm)
    X[cols_to_norm] = scaler.transform(X[cols_to_norm])
    X['h'] = X[cols_to_norm].values.tolist()
    X['h'] = X['h'].apply(lambda x: torch.tensor(x))
    G_nx = nx.from_pandas_edgelist(
        X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.MultiDiGraph())
#     G_nx = nx.from_pandas_edgelist(
#         X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.MultiGraph())
#     G_nx = G_nx.to_directed()
    print('Convert NX graph to DGL')
    G = from_networkx(G_nx, edge_attrs=['h', cname_label])
    # Eq1
    G.ndata['h'] = torch.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = torch.ones(len(G.edata['h']), dtype=torch.bool)
    
    G.ndata['h'] = torch.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
    G.edata['h'] = torch.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))
    
    return G

def create_embd_df(
    tvt_str,
    embs,
    cname_label,
    actual
):
    df = pd.DataFrame(embs, columns=[str(i) for i in range(embs.shape[1])])
    df[cname_label] = actual
    df[f"{cname_label}_tvt"] = tvt_str
    return df

def create_prob_df(
    tvt_str,
    model,
    G,
    node_features, 
    edge_features,
    actual
):
#     probs_0,probs_1,probs_2,probs_3,probs_4,gts,tvt
    pred_prop = model(G, node_features, edge_features)
    norm_pred_prop = torch.softmax(pred_prop, dim=1)
    data_array = [pred_prop_ + [actual_, tvt_str] for pred_prop_, actual_ in zip(norm_pred_prop.tolist(), actual.tolist())]
    cnames = [f'probs_{i}' for i in range(norm_pred_prop.shape[1])]
    prob_df = pd.DataFrame(data_array, columns=cnames+['gts', 'tvt'])
    return prob_df



In [4]:
def run_baseline(
    ds_name,
    g_name,
    cname_label,
    cname_tvt,
    n_epochs
):
    
    data = pd.read_csv(f'../datasets/{ds_name}.csv')
    label2idx = pd.read_pickle(f'../datasets/{g_name}.pkl')['label2idx']
    if cname_label == 'Attack':
        data['Attack'] = data['Attack'].map(label2idx)
        
    ####
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(
        lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
    data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
    data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
    data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

    data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
    data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

    data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)
    
    ####
    X_cnames = [c for c in data.columns if not c.startswith('Label_tvt') and not c.startswith('Attack_tvt')]
    X_train, X_test, y_train, y_test = (data[data[cname_tvt]!='test'][X_cnames], 
                                        data[data[cname_tvt]=='test'][X_cnames], 
                                        data[data[cname_tvt]!='test'][cname_label], 
                                        data[data[cname_tvt]=='test'][cname_label])
    
    ####
    cols_to_norm = list(set(X_train.columns) - set(['Label', 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']))
    scaler = StandardScaler()
    scaler.fit(X_train[cols_to_norm])
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
    encoder.fit(X_train, y_train)

    G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm, cname_label)
    G_test = build_graph(scaler, encoder, X_test, y_test, cols_to_norm, cname_label)
    
    print('To device')
    G_train = G_train.to(torch.device(device))
    G_test = G_test.to(torch.device(device))
    
    node_features = G_train.ndata['h']
    edge_features = G_train.edata['h']

    node_features_test = G_test.ndata['h']
    edge_features_test = G_test.edata['h']
    
    assert edge_features.shape[0] == X_train.shape[0], "Incorrect number of edges"
    assert edge_features_test.shape[0] == X_test.shape[0], "Incorrect number of edges"
    
    ####
    ndim_in = G_train.ndata['h'].shape[2]
    ndim_out = 128 
    edim = G_train.ndata['h'].shape[2]
    activation = F.relu
    dropout = 0.2
    n_classes = data[cname_label].nunique()
    
    ####
    model = Model(ndim_in, ndim_out, edim, activation, dropout).to(device)
    opt = torch.optim.Adam(model.parameters())
    
    t_G_train = copy.deepcopy(G_train)
    ####
    print('Start training')
    for epoch in range(1, n_epochs+1):
        loss = model(t_G_train, node_features, edge_features).to(device)
        opt.zero_grad()
        loss.backward()
        opt.step()
        if epoch % 10 == 0:
            print(f'{epoch:04d} - Loss:', loss.item())

    #### create embs df
    training_emb = model.dgi.encoder(G_train, G_train.ndata['h'], G_train.edata['h'])[1]
    training_emb = training_emb.detach().cpu().numpy()

    testing_emb = model.dgi.encoder(G_test, G_test.ndata['h'], G_test.edata['h'])[1]
    testing_emb = testing_emb.detach().cpu().numpy()
    
    train_actual = G_train.edata[cname_label].detach().cpu().numpy()
    test_actual = G_test.edata[cname_label].detach().cpu().numpy()
    
    train_tvt_str = data[data[cname_tvt]!='test'][cname_tvt].values
    
    train_embd_df = create_embd_df(
        train_tvt_str,
        training_emb,
        cname_label,
        train_actual
    )
    test_embd_df = create_embd_df(
        'test',
        testing_emb,
        cname_label,
        test_actual
    )
    pdXY = pd.concat([train_embd_df, test_embd_df], axis=0).reset_index(drop=True)
    return pdXY

In [5]:
def compute_evaluation_metrics(model, best_ntree, x_train, y_train, x_val, y_val, x_test, y_test, is_binary):
    train_score = model.predict_proba(x_train, ntree_limit=best_ntree)
    train_pred = model.predict(x_train, ntree_limit=best_ntree)
    val_score = model.predict_proba(x_val, ntree_limit=best_ntree)
    val_pred = model.predict(x_val, ntree_limit=best_ntree)
    test_score = model.predict_proba(x_test, ntree_limit=best_ntree)
    test_pred = model.predict(x_test, ntree_limit=best_ntree)
    
    if is_binary:
        train_auc = roc_auc_score(y_true=y_train, y_score=train_score[:, 1])
        val_auc = roc_auc_score(y_true=y_val, y_score=val_score[:, 1])
        test_auc = roc_auc_score(y_true=y_test, y_score=test_score[:, 1])
    else:
        train_auc = roc_auc_score(y_true=y_train, y_score=train_score, multi_class='ovo')
        val_auc = roc_auc_score(y_true=y_val, y_score=val_score, multi_class='ovo')
        test_auc = roc_auc_score(y_true=y_test, y_score=test_score, multi_class='ovo')
    
    train_acc = accuracy_score(y_true=y_train, y_pred=train_pred)
    val_acc = accuracy_score(y_true=y_val, y_pred=val_pred)
    test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
    return train_auc, train_acc, val_auc, val_acc, test_auc, test_acc

def train_xgb(dfXY, cname_feats, cname_target='Label', cname_tvt='Label_tvt', option_init={}, option_fit={}):
    default_option_fit = {
        'eval_metric': 'auc',
        'verbose': False,
        'early_stopping_rounds': 20,
    }
    default_option_init = {
        'use_label_encoder': False,
        'objective': 'binary:logistic',
        'random_state': 0,
        'n_jobs': 32
    }
    default_option_fit.update(option_fit)
    default_option_init.update(option_init)
    option_fit = default_option_fit
    option_init = default_option_init
    
    if dfXY[cname_target].nunique() == 2:
        is_binary = True
    else:
        is_binary = False
        
    # train/test
    x_train = dfXY[dfXY[cname_tvt]=='train'][cname_feats].values
    y_train = dfXY[dfXY[cname_tvt]=='train'][cname_target].values.astype("i4")
    x_val = dfXY[dfXY[cname_tvt]=='val'][cname_feats].values
    y_val = dfXY[dfXY[cname_tvt]=='val'][cname_target].values.astype("i4")
    x_test = dfXY[dfXY[cname_tvt]=='test'][cname_feats].values
    y_test = dfXY[dfXY[cname_tvt]=='test'][cname_target].values.astype("i4")
    
    # classify
    eval_set = [
        (x_train, y_train),
        (x_val, y_val),
    ]
    
    model = xgb.XGBClassifier(**option_init)
    model.fit(x_train, y_train, eval_set=eval_set, **option_fit)
    best_ntree = model.get_booster().best_ntree_limit  
    
    train_auc, train_acc, val_auc, val_acc, test_auc, test_acc = compute_evaluation_metrics(
        model, best_ntree, x_train, y_train, x_val, y_val, x_test, y_test, is_binary)
    
    pd_res = pd.DataFrame({
        'n_features': [len(cname_feats)],
        'n_train': [x_train.shape[0]],
        'n_val': [x_val.shape[0]],
        'n_test': [x_test.shape[0]],
        'n_tree': [best_ntree],
        'train_auc': [train_auc],
        'train_acc': [train_acc],
        'val_auc': [val_auc],
        'val_acc': [val_acc],
        'test_auc': [test_auc],
        'test_acc': [test_acc],  
    })
    
    display(pd_res)
    
    # track
    fmodel = {
        'model': model,
        'cname_target': cname_target,
        'cname_tvt': cname_tvt,
        'cname_feats': cname_feats,  
    }
    return fmodel

def predict(f_model, dfXY):
    probs = f_model['model'].predict_proba(dfXY[f_model['cname_feats']])
    df = pd.DataFrame(probs)
    df.columns = [f'probs_{i}' for i in range(df.shape[1])]
    df['gts'] = dfXY[f_model['cname_target']]
    df['tvt'] = dfXY[f_model['cname_tvt']]
    return df

# CV

In [6]:
# from pyod.models.cblof import CBLOF

# benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label"])
# normal_train_samples = df_train.drop(columns=["Label"])

# train_labels = df_train["Label"]
# test_labels = df_test["Label"]

# test_samples = df_test.drop(columns=["Label"])
# from sklearn.metrics import classification_report, f1_score
# n_est = 2
# con = 0.04
# score = -1
# bs = None
# clf_if = CBLOF(n_clusters=n_est, contamination=con)
# clf_if.fit(benign_train_samples)
# y_pred = clf_if.predict(test_samples)
# test_pred = y_pred

# f1 = f1_score(test_labels, test_pred, average='macro')
# print(f1)
# print(classification_report(test_labels, test_pred, digits=4))

## BoT

In [7]:
device = 'cuda:0'
n_folds = 5
flag_save = True

### Binary

In [7]:
n_epochs = 2000
cname_label = 'Label'
ds_name = 'NF-BoT-IoT_cv'
g_name = 'NF-BoT-IoT_cv0_graph_binary'
for fold in range(1, n_folds):
    print('Fold:', fold)
    cname_tvt = f'{cname_label}_tvt_fold_{fold}'
    df_result = run_baseline(
        ds_name,
        g_name,
        cname_label,
        cname_tvt,
        n_epochs
    )
    display(df_result.head())
    if flag_save:
        out_path = f'../output_emb/AnomalE_nf_bot_binary_cv{fold}.pkl'
        print('Save:', out_path)
        df_result.to_pickle(out_path)

Fold: 1




Number of samples: (480080, 12) (480080,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.6431936025619507
0020 - Loss: 1.3319742679595947
0030 - Loss: 1.278975486755371
0040 - Loss: 1.125908613204956
0050 - Loss: 0.892997145652771
0060 - Loss: 0.6590012907981873
0070 - Loss: 0.5716143846511841
0080 - Loss: 0.550341010093689
0090 - Loss: 0.5414172410964966
0100 - Loss: 0.5403742790222168
0110 - Loss: 0.529935896396637
0120 - Loss: 0.5306460857391357
0130 - Loss: 0.5148089528083801
0140 - Loss: 0.5103889107704163
0150 - Loss: 0.5072812438011169
0160 - Loss: 0.5001993179321289
0170 - Loss: 0.49119848012924194
0180 - Loss: 0.4974181652069092
0190 - Loss: 0.4888855516910553
0200 - 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Label,Label_tvt
0,-1.92734,0.025692,10.381227,10.819204,-2.646116,10.050588,2.068347,1.36188,3.793991,7.03836,...,3.07203,2.181427,13.270711,-2.382159,7.8302,16.638126,2.985399,16.608747,1,train
1,-0.747934,-1.707039,2.313488,2.28582,-2.911323,3.171084,0.940423,1.615129,-1.178686,0.808631,...,4.006696,2.036296,4.449018,1.640095,2.654618,6.597771,3.565621,4.684122,0,val
2,-0.632139,-0.860037,1.891119,1.313892,-0.241508,1.345683,0.611589,-0.546503,0.458764,0.295978,...,0.977973,0.483716,1.752334,0.001217,1.088345,2.714722,0.647017,2.893874,0,val
3,-1.039994,-1.228239,2.487728,2.177692,-1.462019,2.827553,0.334752,0.315581,0.128015,1.125941,...,2.449733,1.551589,3.805914,-0.038243,1.797948,5.328384,2.165499,4.384638,0,train
4,0.06273,-0.186216,0.10477,0.238044,0.020658,0.178592,0.041246,0.010073,-0.043814,-0.0545,...,-0.021537,0.136695,0.131291,0.018041,0.205625,0.194706,0.037719,0.132037,0,val


Save: ../output_emb/AnomalE_nf_bot_binary_cv1.pkl
Fold: 2




Number of samples: (480080, 12) (480080,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.7004342079162598
0020 - Loss: 1.3201208114624023
0030 - Loss: 1.2696839570999146
0040 - Loss: 1.1343052387237549
0050 - Loss: 0.9337615966796875
0060 - Loss: 0.7000356912612915
0070 - Loss: 0.5724088549613953
0080 - Loss: 0.5476571321487427
0090 - Loss: 0.5333722829818726
0100 - Loss: 0.5273695588111877
0110 - Loss: 0.513565182685852
0120 - Loss: 0.5073208808898926
0130 - Loss: 0.5003672242164612
0140 - Loss: 0.4980427622795105
0150 - Loss: 0.4877215623855591
0160 - Loss: 0.4822026491165161
0170 - Loss: 0.4804658889770508
0180 - Loss: 0.4905295968055725
0190 - Loss: 0.4627456068992615
0200

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Label,Label_tvt
0,0.063792,0.007712,0.022217,0.022801,0.006228,0.01871,0.031559,0.037245,0.068138,0.213498,...,0.165189,0.032222,0.020239,0.12305,0.123098,0.075037,0.096202,0.103357,0,train
1,15.948688,4.275068,-2.250632,3.175306,13.463216,-5.05549,12.772231,8.006457,-7.266319,19.052296,...,-0.565771,-5.166986,3.802348,7.912642,-9.257119,1.993399,-9.402494,5.726709,1,train
2,1.454095,-0.453376,-1.316782,0.235356,3.80362,0.569729,1.210547,2.025203,0.323354,5.421437,...,0.699955,-0.012781,2.78126,1.785366,-0.241204,1.845124,-0.774581,0.642625,0,train
3,2.805194,0.476372,-1.578392,0.065712,4.736784,0.898649,1.407388,2.71979,-0.438241,7.21885,...,0.770349,-1.443472,4.98832,3.583281,-0.978328,1.701169,-1.778465,2.113579,0,train
4,1.454095,-0.453376,-1.316782,0.235356,3.80362,0.569729,1.210547,2.025203,0.323354,5.421437,...,0.699955,-0.012781,2.78126,1.785366,-0.241204,1.845124,-0.774581,0.642625,0,train


Save: ../output_emb/AnomalE_nf_bot_binary_cv2.pkl
Fold: 3




Number of samples: (480080, 12) (480080,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.58842933177948
0020 - Loss: 1.3294333219528198
0030 - Loss: 1.2921520471572876
0040 - Loss: 1.148543119430542
0050 - Loss: 0.9244838953018188
0060 - Loss: 0.6839194297790527
0070 - Loss: 0.5707769989967346
0080 - Loss: 0.5503889322280884
0090 - Loss: 0.542603611946106
0100 - Loss: 0.5281205177307129
0110 - Loss: 0.5292831659317017
0120 - Loss: 0.5190469622612
0130 - Loss: 0.5143947005271912
0140 - Loss: 0.5125987529754639
0150 - Loss: 0.5054950714111328
0160 - Loss: 0.49992746114730835
0170 - Loss: 0.49067962169647217
0180 - Loss: 0.48999154567718506
0190 - Loss: 0.48442015051841736
0200 -

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Label,Label_tvt
0,-0.070751,0.059966,0.117903,0.092193,0.103957,0.031937,0.042294,-0.009799,0.137479,0.037918,...,0.02044,0.01525,0.118184,0.086401,0.158853,-0.072365,0.033541,0.145198,0,train
1,2.288239,2.74111,17.763975,11.258263,12.583649,14.251354,-9.991842,7.53412,19.482954,-6.228168,...,-5.467793,-0.990781,5.942996,5.320913,-0.375661,-1.611554,-0.498233,11.223248,1,train
2,0.464679,1.025936,4.18982,2.832623,2.644827,1.555814,-1.594677,0.803284,3.757948,-1.412666,...,1.800846,0.095253,2.071327,-0.00328,0.012625,-0.5577,-0.537777,3.511494,0,train
3,0.464679,1.025936,4.18982,2.832623,2.644827,1.555814,-1.594677,0.803284,3.757948,-1.412666,...,1.800846,0.095253,2.071327,-0.00328,0.012625,-0.5577,-0.537777,3.511494,0,val
4,0.164935,0.31253,0.827401,0.517086,0.430917,0.350457,-0.162918,0.104214,0.824957,-0.267235,...,0.250668,0.066855,0.334417,0.086087,0.296122,-0.124058,-0.037529,0.608901,0,train


Save: ../output_emb/AnomalE_nf_bot_binary_cv3.pkl
Fold: 4




Number of samples: (480080, 12) (480080,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['IN_PKTS', 'OUT_PKTS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.7416284084320068
0020 - Loss: 1.29060697555542
0030 - Loss: 1.240566611289978
0040 - Loss: 1.0975866317749023
0050 - Loss: 0.8897351026535034
0060 - Loss: 0.6774301528930664
0070 - Loss: 0.5737987756729126
0080 - Loss: 0.549710750579834
0090 - Loss: 0.5388028621673584
0100 - Loss: 0.5341207385063171
0110 - Loss: 0.5322606563568115
0120 - Loss: 0.5294433236122131
0130 - Loss: 0.5198702812194824
0140 - Loss: 0.5119867324829102
0150 - Loss: 0.5067940950393677
0160 - Loss: 0.5036683082580566
0170 - Loss: 0.49765485525131226
0180 - Loss: 0.49693334102630615
0190 - Loss: 0.49330925941467285
0200

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Label,Label_tvt
0,9.278213,2.145815,9.485803,9.526907,2.578449,-5.064191,1.922569,9.912421,2.245402,2.089078,...,7.780299,2.782974,2.810694,-7.082822,15.051437,11.517078,-6.343623,7.581882,1,train
1,1.313287,3.654418,0.955213,1.034643,0.563504,-2.005035,-0.168802,3.94741,0.440415,1.390412,...,1.952508,-0.414132,0.235292,-0.156026,6.459072,2.273144,-1.584096,2.570495,0,val
2,1.379795,5.364019,0.794791,1.534493,0.011945,-3.167106,0.26706,4.953243,1.240804,2.543075,...,1.579578,-0.000748,0.495656,-0.386586,8.804118,2.888632,-2.481511,3.651071,0,train
3,1.313287,3.654418,0.955213,1.034643,0.563504,-2.005035,-0.168802,3.94741,0.440415,1.390412,...,1.952508,-0.414132,0.235292,-0.156026,6.459072,2.273144,-1.584096,2.570495,0,train
4,0.238033,0.740559,0.355658,0.146589,0.138142,-0.47202,-0.110256,0.862269,-0.057378,0.304482,...,0.449086,-0.053351,0.01339,-0.083122,1.29217,0.406874,-0.280358,0.379724,0,val


Save: ../output_emb/AnomalE_nf_bot_binary_cv4.pkl


In [8]:
cname_target = 'Label'
cname_tvt = f'{cname_target}_tvt'
flag_save = True
for fold in range(n_folds):
    print('Fold:', fold)
    cname_feats = [str(i) for i in range(128)]
    dfXY = pd.read_pickle(f'../output_emb/AnomalE_nf_bot_binary_cv{fold}.pkl')
    
    f_model = train_xgb(
        dfXY, cname_feats, cname_target, cname_tvt, option_init={}, option_fit={})
    df = predict(f_model, dfXY)
    # df.to_csv(f'../output_cv/xgb_nf_bot_binary_cv{fold}.csv', index=False)
    if flag_save:
        out_path = f'../output_cv/AnomalE_nf_bot_binary_cv{fold}.csv'
        print('Save:', out_path)
        df.to_csv(out_path, index=False)

Fold: 0




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,76,0.983966,0.987385,0.982286,0.987394,0.979043,0.979687


Save: ../output_cv/AnomalE_nf_bot_binary_cv0.csv
Fold: 1




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,36,0.983895,0.987474,0.982386,0.987144,0.972799,0.98192


Save: ../output_cv/AnomalE_nf_bot_binary_cv1.csv
Fold: 2




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,46,0.983542,0.987233,0.982364,0.987577,0.979148,0.985477


Save: ../output_cv/AnomalE_nf_bot_binary_cv2.csv
Fold: 3




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,88,0.983889,0.987363,0.982752,0.987452,0.974097,0.985869


Save: ../output_cv/AnomalE_nf_bot_binary_cv3.csv
Fold: 4




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,32,0.98407,0.987488,0.981528,0.987019,0.974635,0.984353


Save: ../output_cv/AnomalE_nf_bot_binary_cv4.csv


### Multi

In [8]:
n_epochs = 2000
cname_label = 'Attack'
ds_name = 'NF-BoT-IoT_cv'
g_name = 'NF-BoT-IoT_cv0_graph_multi'
for fold in range(n_folds):
    print('Fold:', fold)
    cname_tvt = f'{cname_label}_tvt_fold_{fold}'
    df_result = run_baseline(
        ds_name,
        g_name,
        cname_label,
        cname_tvt,
        n_epochs
    )
    display(df_result.head())
    if flag_save:
        out_path = f'../output_emb/AnomalE_nf_bot_multi_cv{fold}.pkl'
        print('Save:', out_path)
        df_result.to_pickle(out_path)

Fold: 0




Number of samples: (480080, 12) (480080,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.608283281326294
0020 - Loss: 1.2825309038162231
0030 - Loss: 1.2088931798934937
0040 - Loss: 1.0182335376739502
0050 - Loss: 0.7642197608947754
0060 - Loss: 0.5966204404830933
0070 - Loss: 0.5455110669136047
0080 - Loss: 0.5268533229827881
0090 - Loss: 0.5131720304489136
0100 - Loss: 0.5008375644683838
0110 - Loss: 0.48518484830856323
0120 - Loss: 0.47545698285102844
0130 - Loss: 0.4578186869621277
0140 - Loss: 0.44783729314804077
0150 - Loss: 0.43593689799308777
0160 - Loss: 0.4510277807712555
0170 - Loss: 0.4213714897632599
0180 - Loss: 0.4194221794605255
0190 - Loss: 0.4128879904747009


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Attack,Attack_tvt
0,0.049649,0.016933,0.111899,0.03126,9.4e-05,0.195302,0.451293,0.24227,0.389275,0.063263,...,0.402145,-0.011338,0.031974,0.075711,0.157718,0.224275,0.186393,-0.059726,0,train
1,2.417537,0.602608,1.205747,4.688486,0.74904,1.270898,7.161662,5.174139,2.682396,-2.646943,...,3.497228,-1.169678,-0.457631,-0.082949,1.436263,2.66897,3.934399,1.270787,0,train
2,3.209163,2.034337,1.231138,5.421963,2.021488,1.537319,9.004616,5.985013,2.162567,-3.181685,...,3.239772,-0.921208,0.260931,0.116396,2.275891,2.755877,4.180099,1.7657,0,train
3,2.417537,0.602608,1.205747,4.688486,0.74904,1.270898,7.161662,5.174139,2.682396,-2.646943,...,3.497228,-1.169678,-0.457631,-0.082949,1.436263,2.66897,3.934399,1.270787,0,train
4,0.249439,0.159942,0.304622,0.779711,0.235869,0.268761,1.507465,0.838555,0.612913,-0.417979,...,0.791147,-0.205647,0.110764,0.138706,0.232681,0.451751,0.814445,0.008145,0,train


Save: ../output_emb/AnomalE_nf_bot_multi_cv0.pkl
Fold: 1




Number of samples: (480080, 12) (480080,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.4116002321243286
0020 - Loss: 1.2473639249801636
0030 - Loss: 0.9968037605285645
0040 - Loss: 0.714645504951477
0050 - Loss: 0.5675256252288818
0060 - Loss: 0.5289593935012817
0070 - Loss: 0.5169270634651184
0080 - Loss: 0.4995276927947998
0090 - Loss: 0.4957360625267029
0100 - Loss: 0.4730433523654938
0110 - Loss: 0.4599887728691101
0120 - Loss: 0.45892781019210815
0130 - Loss: 0.43304604291915894
0140 - Loss: 0.42770272493362427
0150 - Loss: 0.4302581548690796
0160 - Loss: 0.4046557545661926
0170 - Loss: 0.392721563577652
0180 - Loss: 0.40443044900894165
0190 - Loss: 0.38461023569107056


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Attack,Attack_tvt
0,28.988852,17.490372,4.483373,10.509126,5.397744,-0.926823,3.355568,8.757118,4.470564,-2.814009,...,3.334058,-6.393062,0.072684,4.18645,-1.49437,3.81122,-0.573833,8.586548,4,train
1,7.884548,5.748482,0.403125,1.40404,0.65322,1.257143,-0.593541,2.184529,-0.704112,-1.652778,...,2.461198,-1.745446,2.36289,1.442992,-0.611457,0.822156,-1.776823,2.040227,0,val
2,4.121422,2.480523,0.404007,0.250735,0.482196,0.335659,0.472907,0.459064,0.495564,0.292008,...,1.581233,-0.229335,0.57402,0.871957,0.245003,0.645052,0.121356,0.293409,0,val
3,6.781711,4.50003,0.547421,1.054502,0.525633,0.789035,0.103779,1.189162,0.177748,-0.602985,...,2.223367,-1.218611,1.466509,1.397878,-0.115091,0.659171,-0.984125,1.209394,0,train
4,0.524147,0.209694,-0.005608,-0.06317,-0.004228,-0.078369,-0.054491,0.191618,0.049581,-0.006723,...,0.093082,-0.005747,0.015671,0.100194,-0.135161,0.177163,0.031842,0.076852,0,val


Save: ../output_emb/AnomalE_nf_bot_multi_cv1.pkl
Fold: 2




Number of samples: (480080, 12) (480080,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.6085435152053833
0020 - Loss: 1.3136672973632812
0030 - Loss: 1.211733102798462
0040 - Loss: 1.0541478395462036
0050 - Loss: 0.8296260237693787
0060 - Loss: 0.6504720449447632
0070 - Loss: 0.5811723470687866
0080 - Loss: 0.561342716217041
0090 - Loss: 0.5544930696487427
0100 - Loss: 0.5374630689620972
0110 - Loss: 0.526496171951294
0120 - Loss: 0.5204838514328003
0130 - Loss: 0.5086454153060913
0140 - Loss: 0.49657952785491943
0150 - Loss: 0.5006409287452698
0160 - Loss: 0.4846552312374115
0170 - Loss: 0.4653604030609131
0180 - Loss: 0.45228976011276245
0190 - Loss: 0.4486609101295471
0200

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Attack,Attack_tvt
0,-0.053724,0.055671,0.122328,0.020789,0.152847,0.014215,0.129281,0.011974,0.102001,-0.006122,...,0.008323,0.020187,0.154942,-0.01318,0.029683,0.091296,0.119536,0.04517,0,train
1,9.445944,0.404989,10.664376,-10.314894,2.830842,22.498606,1.334156,7.196707,15.806669,0.311824,...,-1.407197,-2.896572,13.56117,9.72353,2.532262,16.235571,0.511161,11.12277,4,train
2,2.475341,0.668582,2.650001,-1.005129,0.413942,2.353671,-0.34503,0.386677,2.09181,-0.656011,...,0.962305,0.444945,2.555262,0.655033,1.52587,2.91528,2.858174,1.486351,0,train
3,4.669859,1.572931,3.542829,-2.425849,0.757295,3.078577,-1.363463,0.788575,2.855751,-1.53523,...,0.884767,-0.068048,4.749258,0.978374,1.629967,4.266332,4.550956,2.493099,0,train
4,2.475341,0.668582,2.650001,-1.005129,0.413942,2.353671,-0.34503,0.386677,2.09181,-0.656011,...,0.962305,0.444945,2.555262,0.655033,1.52587,2.91528,2.858174,1.486351,0,train


Save: ../output_emb/AnomalE_nf_bot_multi_cv2.pkl
Fold: 3




Number of samples: (480080, 12) (480080,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.7135640382766724
0020 - Loss: 1.310429573059082
0030 - Loss: 1.2382841110229492
0040 - Loss: 1.0735101699829102
0050 - Loss: 0.834894061088562
0060 - Loss: 0.6210454106330872
0070 - Loss: 0.5485843420028687
0080 - Loss: 0.525503933429718
0090 - Loss: 0.5069828033447266
0100 - Loss: 0.49468228220939636
0110 - Loss: 0.4764484763145447
0120 - Loss: 0.4664996862411499
0130 - Loss: 0.4534280300140381
0140 - Loss: 0.4401567280292511
0150 - Loss: 0.43233269453048706
0160 - Loss: 0.4311213493347168
0170 - Loss: 0.4391110837459564
0180 - Loss: 0.41409701108932495
0190 - Loss: 0.40797221660614014
02

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Attack,Attack_tvt
0,-0.005459,0.056402,0.160437,0.092969,0.03735,0.06087,0.08779,-0.011695,0.017384,0.132252,...,0.125594,0.158441,0.056155,0.110055,0.04309,0.04786,0.198315,-0.010529,0,train
1,-10.78902,-0.684547,0.736268,11.548258,3.68082,-0.346116,6.007852,6.319572,-6.894775,12.862212,...,2.619545,13.565704,-2.580909,14.010063,1.347686,3.276218,15.448701,0.709198,4,train
2,-1.771369,1.680122,0.003228,2.023825,-0.334287,1.091513,0.871815,0.936393,-0.435344,2.929645,...,-0.071037,3.605663,2.477155,2.321013,1.45211,-1.043931,2.95644,1.294322,0,train
3,-1.771369,1.680122,0.003228,2.023825,-0.334287,1.091513,0.871815,0.936393,-0.435344,2.929645,...,-0.071037,3.605663,2.477155,2.321013,1.45211,-1.043931,2.95644,1.294322,0,val
4,-0.22812,0.159319,0.0273,0.158878,-0.02193,0.093691,0.218024,0.218714,0.043308,0.672588,...,-0.020125,0.693883,0.646654,0.309379,0.271068,-0.208817,0.69555,0.175448,0,train


Save: ../output_emb/AnomalE_nf_bot_multi_cv3.pkl
Fold: 4




Number of samples: (480080, 12) (480080,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
Number of samples: (120020, 12) (120020,)
['OUT_PKTS', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'IN_BYTES', 'OUT_BYTES', 'PROTOCOL', 'TCP_FLAGS']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.5647516250610352
0020 - Loss: 1.261464238166809
0030 - Loss: 1.1617000102996826
0040 - Loss: 0.953518271446228
0050 - Loss: 0.6998889446258545
0060 - Loss: 0.5567739605903625
0070 - Loss: 0.5264344215393066
0080 - Loss: 0.5071387887001038
0090 - Loss: 0.4941825866699219
0100 - Loss: 0.47535306215286255
0110 - Loss: 0.45869332551956177
0120 - Loss: 0.45292240381240845
0130 - Loss: 0.4481305778026581
0140 - Loss: 0.43823832273483276
0150 - Loss: 0.42938223481178284
0160 - Loss: 0.43346092104911804
0170 - Loss: 0.4212878346443176
0180 - Loss: 0.4199526309967041
0190 - Loss: 0.4098898768424988

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Attack,Attack_tvt
0,-1.238457,-1.841131,3.461085,6.890592,12.349005,6.761558,10.531001,2.586161,14.825981,-6.510933,...,17.769934,-4.022481,1.120273,-3.257091,-10.214943,1.668622,-5.462571,6.27813,4,train
1,-0.875972,-0.238168,1.012785,0.610468,2.556398,1.391983,2.583323,1.207111,1.733117,-1.618011,...,1.710366,-1.294729,0.297484,1.015489,-1.686576,0.37187,-0.0226,0.361506,0,val
2,-0.888017,-0.433161,1.915892,0.356167,2.92127,2.506774,3.180284,1.861716,2.166063,-2.29145,...,2.166724,-2.729713,0.758631,1.828482,-2.891664,0.240087,-0.129191,0.8518,0,train
3,-0.875972,-0.238168,1.012785,0.610468,2.556398,1.391983,2.583323,1.207111,1.733117,-1.618011,...,1.710366,-1.294729,0.297484,1.015489,-1.686576,0.37187,-0.0226,0.361506,0,train
4,-0.250624,-0.038113,0.137072,0.295339,0.49312,0.033824,0.464758,0.166967,0.208423,-0.224813,...,0.403563,-0.094993,0.041968,0.149544,-0.171112,-0.032183,0.034245,0.081795,0,val


Save: ../output_emb/AnomalE_nf_bot_multi_cv4.pkl


In [9]:
cname_target = 'Attack'
cname_tvt = f'{cname_target}_tvt'
flag_save = True
for fold in range(n_folds):
    print('Fold:', fold)
    cname_feats = [str(i) for i in range(128)]
    dfXY = pd.read_pickle(f'../output_emb/AnomalE_nf_bot_multi_cv{fold}.pkl')
    
    f_model = train_xgb(
        dfXY, cname_feats, cname_target, cname_tvt, option_init={}, option_fit={})
    df = predict(f_model, dfXY)
    if flag_save:
        out_path = f'../output_cv/AnomalE_nf_bot_multi_cv{fold}.csv'
        print('Save:', out_path)
        df.to_csv(out_path, index=False)

Fold: 0




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,28,0.93855,0.838571,0.937753,0.837985,0.838591,0.837394


Save: ../output_cv/AnomalE_nf_bot_multi_cv0.csv
Fold: 1




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,35,0.938578,0.839049,0.937196,0.838719,0.918899,0.829695


Save: ../output_cv/AnomalE_nf_bot_multi_cv1.csv
Fold: 2




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,48,0.938168,0.83843,0.936964,0.837794,0.899103,0.825554


Save: ../output_cv/AnomalE_nf_bot_multi_cv2.csv
Fold: 3




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,43,0.938579,0.839177,0.938335,0.837927,0.8264,0.825112


Save: ../output_cv/AnomalE_nf_bot_multi_cv3.csv
Fold: 4




Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,128,360060,120020,120020,20,0.937925,0.838546,0.937825,0.838427,0.776469,0.790135


Save: ../output_cv/AnomalE_nf_bot_multi_cv4.csv


## ToN

In [6]:
device = 'cuda:1'
n_folds = 5
flag_save = True

### Binary

In [None]:
n_epochs = 2000
cname_label = 'Label'
ds_name = 'NF-ToN-IoT_cv'
g_name = 'NF-ToN-IoT_cv0_graph_binary'
for fold in range(1, n_folds):
    print('Fold:', fold)
    cname_tvt = f'{cname_label}_tvt_fold_{fold}'
    df_result = run_baseline(
        ds_name,
        g_name,
        cname_label,
        cname_tvt,
        n_epochs
    )
    display(df_result.head())
    if flag_save:
        out_path = f'../output_emb/AnomalE_nf_ton_binary_cv{fold}.pkl'
        print('Save:', out_path)
        df_result.to_pickle(out_path)

Fold: 1




Number of samples: (1103420, 12) (1103420,)
['PROTOCOL', 'L7_PROTO', 'OUT_PKTS', 'OUT_BYTES', 'IN_PKTS', 'IN_BYTES', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
Convert NX graph to DGL
Number of samples: (275854, 12) (275854,)
['PROTOCOL', 'L7_PROTO', 'OUT_PKTS', 'OUT_BYTES', 'IN_PKTS', 'IN_BYTES', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.571653127670288
0020 - Loss: 1.3802754878997803
0030 - Loss: 1.385054111480713
0040 - Loss: 1.3775782585144043
0050 - Loss: 1.3665988445281982
0060 - Loss: 1.3559730052947998
0070 - Loss: 1.3369929790496826
0080 - Loss: 1.3023241758346558
0090 - Loss: 1.2468059062957764
0100 - Loss: 1.1504735946655273
0110 - Loss: 1.106756329536438
0120 - Loss: 0.9629880785942078
0130 - Loss: 0.844119668006897
0140 - Loss: 0.8694419860839844
0150 - Loss: 0.7110531330108643
0160 - Loss: 0.7023247480392456
0170 - Loss: 0.6558488011360168
0180 - Loss: 0.5979750156402588
0190 - Loss: 0.5808843374252319
0200 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Label,Label_tvt
0,-0.098757,0.061815,-0.072648,0.046304,-0.061086,0.133402,-0.0021,0.078948,-0.004074,0.083123,...,0.095319,0.026804,-0.024102,0.010573,0.057527,-0.010087,0.062209,0.098277,0,train
1,-0.173724,0.640377,-0.344587,0.394448,-0.036219,0.67536,0.280551,-0.089965,0.076154,0.28485,...,0.05083,0.578202,0.295151,0.333243,0.341715,-0.063575,0.254221,0.312628,0,train
2,-0.107301,0.055413,-0.067121,0.059845,-0.059322,0.144517,0.014243,0.071828,-0.013638,0.072646,...,0.089099,0.015234,-0.018859,0.003706,0.045622,-0.004219,0.066143,0.103366,0,val
3,-0.13004,0.19289,-0.119239,0.214464,-0.091033,0.319849,0.091386,-0.014514,-0.045745,0.07741,...,0.105167,0.111527,0.047029,0.08236,0.076323,-0.015642,0.122852,0.181686,0,val
4,-0.164236,0.511135,-0.270675,0.354221,-0.061598,0.580225,0.227954,-0.06959,0.03069,0.215867,...,0.052132,0.435273,0.22192,0.251372,0.258832,-0.040793,0.209196,0.266315,0,val


Save: ../output_emb/AnomalE_nf_ton_binary_cv1.pkl
Fold: 2




Number of samples: (1103420, 12) (1103420,)
['PROTOCOL', 'L7_PROTO', 'OUT_PKTS', 'OUT_BYTES', 'IN_PKTS', 'IN_BYTES', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
Convert NX graph to DGL
Number of samples: (275854, 12) (275854,)
['PROTOCOL', 'L7_PROTO', 'OUT_PKTS', 'OUT_BYTES', 'IN_PKTS', 'IN_BYTES', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS']
Convert NX graph to DGL
To device
Start training
0010 - Loss: 1.698265790939331
0020 - Loss: 1.3967959880828857
0030 - Loss: 1.4075596332550049
0040 - Loss: 1.3775558471679688
0050 - Loss: 1.3695569038391113
0060 - Loss: 1.3590104579925537
0070 - Loss: 1.3442622423171997
0080 - Loss: 1.319408655166626
0090 - Loss: 1.2817859649658203
0100 - Loss: 1.211519479751587
0110 - Loss: 1.1222591400146484
0120 - Loss: 1.01223886013031
0130 - Loss: 0.9164758920669556
0140 - Loss: 0.8886610865592957
0150 - Loss: 0.724753201007843
0160 - Loss: 0.6845396757125854
0170 - Loss: 0.632996678352356
0180 - Loss: 0.6502498388290405
0190 - Loss: 0.5975292921066284
0200 - L

In [None]:
cname_target = 'Label'
cname_tvt = f'{cname_target}_tvt'
flag_save = True
for fold in range(n_folds):
    print('Fold:', fold)
    cname_feats = [str(i) for i in range(128)]
    dfXY = pd.read_pickle(f'../output_emb/AnomalE_nf_ton_binary_cv{fold}.pkl')
    
    f_model = train_xgb(
        dfXY, cname_feats, cname_target, cname_tvt, option_init={}, option_fit={})
    df = predict(f_model, dfXY)
    if flag_save:
        out_path = f'../output_cv/AnomalE_nf_ton_binary_cv{fold}.csv'
        print('Save:', out_path)
        df.to_csv(out_path, index=False)

### Multi

In [None]:
n_epochs = 2000
cname_label = 'Attack'
ds_name = 'NF-ToN-IoT_cv'
g_name = 'NF-ToN-IoT_cv0_graph_multi'
for fold in range(n_folds):
    print('Fold:', fold)
    cname_tvt = f'{cname_label}_tvt_fold_{fold}'
    df_result = run_baseline(
        ds_name,
        g_name,
        cname_label,
        cname_tvt,
        n_epochs
    )
    display(df_result.head())
    if flag_save:
        out_path = f'../output_emb/AnomalE_nf_ton_multi_cv{fold}.pkl'
        print('Save:', out_path)
        df_result.to_pickle(out_path)

In [None]:
cname_target = 'Attack'
cname_tvt = f'{cname_target}_tvt'
flag_save = True
for fold in range(n_folds):
    print('Fold:', fold)
    cname_feats = [str(i) for i in range(128)]
    dfXY = pd.read_pickle(f'../output_emb/AnomalE_nf_ton_multi_cv{fold}.pkl')
    
    f_model = train_xgb(
        dfXY, cname_feats, cname_target, cname_tvt, option_init={}, option_fit={})
    df = predict(f_model, dfXY)
    # df.to_csv(f'../output_cv/xgb_nf_bot_binary_cv{fold}.csv', index=False)
    if flag_save:
        out_path = f'../output_cv/AnomalE_nf_ton_multi_cv{fold}.csv'
        print('Save:', out_path)
        df.to_csv(out_path, index=False)