In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, re, datetime, random, gzip, json, copy
import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
from time import time
from math import ceil
from collections import Counter

import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, roc_auc_score
from sklearn.utils import class_weight

# Locate the repository root: strip everything from the first "/TS-IDS" in the
# working directory onward, then append the directory name again.
_cwd_prefix = re.sub("/TS-IDS.*$", '', os.getcwd())
PROJ_PATH = Path(os.path.join(_cwd_prefix, 'TS-IDS'))
print(f'PROJ_PATH={PROJ_PATH}')
# Both entries are inserted at position 1, so the one inserted last ('src')
# ends up ahead of the repository root on the import path.
for _import_root in (PROJ_PATH, PROJ_PATH/'src'):
    sys.path.insert(1, str(_import_root))
import utils
from utils import *
from dataset import build_datamodule
from trainer import build_trainer
from model import TSIDS
from pipeline import TSIDSPipeline

  from pandas import MultiIndex, Int64Index


PROJ_PATH=/home/hoang/github/TS-IDS


In [2]:
def compute_evaluation_metrics(model, best_ntree, x_train, y_train, x_val, y_val, x_test, y_test, is_binary):
    """Score a fitted classifier on the train, validation and test splits.

    For each split the model is asked for class probabilities and hard
    predictions, both with ``ntree_limit=best_ntree``.

    Args:
        model: fitted classifier exposing ``predict_proba`` and ``predict``
            that accept an ``ntree_limit`` keyword.
        best_ntree: tree limit forwarded to every prediction call.
        x_train, y_train, x_val, y_val, x_test, y_test: features and labels
            of the three splits.
        is_binary: when true, AUC is computed from the probability in
            column 1; otherwise from the full probability matrix with
            ``multi_class='ovo'``.

    Returns:
        Tuple ``(train_auc, train_acc, val_auc, val_acc, test_auc, test_acc)``.
    """
    def _auc(labels, proba):
        # Binary AUC needs a 1-D score; multi-class uses the whole matrix.
        if is_binary:
            return roc_auc_score(y_true=labels, y_score=proba[:, 1])
        return roc_auc_score(y_true=labels, y_score=proba, multi_class='ovo')

    results = []
    for features, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
        proba = model.predict_proba(features, ntree_limit=best_ntree)
        hard = model.predict(features, ntree_limit=best_ntree)
        results.append(_auc(labels, proba))
        results.append(accuracy_score(y_true=labels, y_pred=hard))
    return tuple(results)

def train_xgb(dfXY, cname_feats, cname_target='Label', cname_tvt='Label_tvt', option_init=None, option_fit=None):
    """Fit an XGBoost classifier on the 'train' rows of ``dfXY`` and report metrics.

    Rows are assigned to splits by the value of ``dfXY[cname_tvt]``
    ('train', 'val' or 'test'). The classifier is fitted on the train split
    with the train and val splits passed as ``eval_set``. AUC and accuracy
    for all three splits are then shown as a one-row DataFrame.

    Args:
        dfXY: DataFrame holding the feature, target and split columns.
        cname_feats: list of feature column names.
        cname_target: name of the target column; cast to int32 for training.
        cname_tvt: name of the column holding the split assignment.
        option_init: optional overrides for the ``XGBClassifier`` constructor
            arguments (defaults: objective chosen from the number of target
            classes, ``random_state=0``, ``n_jobs=32``).
        option_fit: optional overrides for the ``fit`` keyword arguments
            (defaults: ``eval_metric='auc'``, ``verbose=False``,
            ``early_stopping_rounds=20``).

    Returns:
        dict with keys ``'model'``, ``'cname_target'``, ``'cname_feats'`` and
        ``'cname_tvt'``.

    Raises:
        ValueError: if any of the three splits has no rows.
    """
    # The target is binary exactly when it has two distinct values in dfXY.
    is_binary = dfXY[cname_target].nunique() == 2

    fit_kwargs = {
        'eval_metric': 'auc',
        'verbose': False,
        'early_stopping_rounds': 20,
    }
    init_kwargs = {
        # Pick an objective that matches the task instead of always
        # requesting the binary one; callers can still override it.
        'objective': 'binary:logistic' if is_binary else 'multi:softprob',
        'random_state': 0,
        'n_jobs': 32,
    }
    # None defaults avoid sharing one mutable dict across calls.
    fit_kwargs.update(option_fit or {})
    init_kwargs.update(option_init or {})

    # train/val/test
    splits = {}
    for split_name in ('train', 'val', 'test'):
        rows = dfXY[dfXY[cname_tvt] == split_name]
        if rows.empty:
            raise ValueError(f"no rows with {cname_tvt} == '{split_name}'")
        splits[split_name] = (rows[cname_feats].values, rows[cname_target].values.astype("i4"))
    x_train, y_train = splits['train']
    x_val, y_val = splits['val']
    x_test, y_test = splits['test']

    # classify
    eval_set = [
        (x_train, y_train),
        (x_val, y_val),
    ]

    model = xgb.XGBClassifier(**init_kwargs)
    model.fit(x_train, y_train, eval_set=eval_set, **fit_kwargs)
    best_ntree = model.get_booster().best_ntree_limit

    train_auc, train_acc, val_auc, val_acc, test_auc, test_acc = compute_evaluation_metrics(
        model, best_ntree, x_train, y_train, x_val, y_val, x_test, y_test, is_binary)

    pd_res = pd.DataFrame({
        'n_features': [len(cname_feats)],
        'n_train': [x_train.shape[0]],
        'n_val': [x_val.shape[0]],
        'n_test': [x_test.shape[0]],
        'n_tree': [best_ntree],
        'train_auc': [train_auc],
        'train_acc': [train_acc],
        'val_auc': [val_auc],
        'val_acc': [val_acc],
        'test_auc': [test_auc],
        'test_acc': [test_acc],
    })

    display(pd_res)

    # track: keep the split column too, so consumers need not guess its name
    fmodel = {
        'model': model,
        'cname_target': cname_target,
        'cname_feats': cname_feats,
        'cname_tvt': cname_tvt,
    }
    return fmodel

def predict(f_model, dfXY):
    """Return per-class probabilities for every row of ``dfXY``.

    Args:
        f_model: dict as returned by ``train_xgb``; must hold ``'model'``,
            ``'cname_feats'`` and ``'cname_target'``. An optional
            ``'cname_tvt'`` names the split column; when absent it defaults
            to ``f"{cname_target}_tvt"``.
        dfXY: DataFrame containing the feature, target and split columns.

    Returns:
        DataFrame with a fresh 0..n-1 index, one ``probs_<i>`` column per
        class, plus ``gts`` (target values) and ``tvt`` (split assignment),
        in the same row order as ``dfXY``.
    """
    cname_target = f_model['cname_target']
    cname_tvt = f_model.get('cname_tvt', f"{cname_target}_tvt")

    # Score the plain array, matching how the model was fitted in train_xgb.
    probs = f_model['model'].predict_proba(dfXY[f_model['cname_feats']].values)
    df = pd.DataFrame(probs)
    df.columns = [f'probs_{i}' for i in range(df.shape[1])]

    # Assign by position: df has a fresh RangeIndex, so assigning the Series
    # directly would align on index labels and yield NaN or shuffled rows
    # whenever dfXY carries a different index.
    df['gts'] = dfXY[cname_target].values
    df['tvt'] = dfXY[cname_tvt].values
    return df

# NF-BoT-IoT

In [3]:
# NF-BoT-IoT: read the flow table with its split columns, then the
# experiment configuration used by read_data.
dataset_name = 'NF-BoT-IoT'
name = 'nf_bot_multi'
data = pd.read_csv(str(PROJ_PATH / f'datasets/{dataset_name}_tvt.csv'))
config_path = str(PROJ_PATH / f'src/config/{name}.json')
config = read_json(config_path)

In [4]:
# Load graph data, node/edge arrays and the per-split edge lists and labels.
(
    g_data,
    x,
    edge_index,
    edge_attr,
    y,
    input_train_edges,
    input_val_edges,
    input_test_edges,
    input_train_labels,
    input_val_labels,
    input_test_labels,
) = read_data(config, returned_dtype='array')

x: (77177, 32)
edge_index: (2, 600100)
edge_attr: (600100, 8)
y: (77177,)
input_train_edges: (2, 300050)
input_val_edges: (2, 120020)
input_test_edges: (2, 180030)
input_train_labels: (300050,)
input_val_labels: (120020,)
input_test_labels: (180030,)


In [5]:
# One row per edge: the two endpoint node ids followed by the edge attributes.
df_features = pd.concat([pd.DataFrame(edge_index).T, pd.DataFrame(edge_attr)], axis=1)
df_features.columns = ['src', 'dst'] + [f'f_{i}' for i in range(edge_attr.shape[1])]

# concat(axis=1) aligns on the row index, so a length mismatch would silently
# pad the shorter side with NaN instead of failing.
assert len(data) == len(df_features), (
    f'row count mismatch: {len(data)} rows in data vs {len(df_features)} edges')
dfXY = pd.concat([data[['Label', 'Attack', 'Label_tvt', 'Attack_tvt']], df_features], axis=1)

# Encode attack names as integer class ids; map() yields NaN for any name
# missing from label2idx, which would corrupt the later int cast.
dfXY['Attack'] = dfXY['Attack'].map(g_data['label2idx'])
assert dfXY['Attack'].notna().all(), "'Attack' contains labels missing from g_data['label2idx']"

In [6]:
# Preview the first rows of the assembled label/feature table.
dfXY.head(n=5)

Unnamed: 0,Label,Attack,Label_tvt,Attack_tvt,src,dst,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7
0,0,0,train,train,73463,59694,0.496526,-0.020609,-0.045684,-16.056164,-0.008339,-0.023745,-4.102019,-3.220229
1,1,4,test,test,73400,38797,0.425257,503.528591,18.302417,0.040034,0.26393,19.552606,0.243128,-12.795608
2,0,0,train,train,71600,72572,0.425057,19.653852,36.833188,0.040034,12.195069,43.911817,-4.102019,-3.220229
3,0,0,train,train,71217,71622,0.425057,19.501499,36.833188,0.257827,12.32511,43.911817,0.243128,-3.220229
4,0,0,train,train,71601,72572,0.425057,19.345563,36.833188,0.257827,-0.008511,-0.028581,0.243128,-3.220229


In [7]:
# Binary task on NF-BoT-IoT: target is 'Label', features are the eight
# f_* columns; probabilities for every row are written to CSV.
cname_target = 'Label'
cname_tvt = f'{cname_target}_tvt'
cname_feats = [f'f_{i}' for i in range(8)]
f_model = train_xgb(
    dfXY,
    cname_feats,
    cname_target=cname_target,
    cname_tvt=cname_tvt,
    option_init={},
    option_fit={},
)
df = predict(f_model, dfXY)
df.to_csv('../output/xgb_nf_bot_binary.csv', index=False)



Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,8,300050,120020,180030,51,0.989243,0.988629,0.988979,0.988419,0.988954,0.98863


In [8]:
# Multi-class task on NF-BoT-IoT: target is the integer-encoded 'Attack'
# column, features are the eight f_* columns; probabilities go to CSV.
cname_target = 'Attack'
cname_tvt = f'{cname_target}_tvt'
cname_feats = [f'f_{i}' for i in range(8)]
f_model = train_xgb(
    dfXY,
    cname_feats,
    cname_target=cname_target,
    cname_tvt=cname_tvt,
    option_init={},
    option_fit={},
)
df = predict(f_model, dfXY)
df.to_csv('../output/xgb_nf_bot_multi.csv', index=False)



Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,8,300050,120020,180030,11,0.934877,0.839587,0.931595,0.835477,0.933073,0.836611


# NF-ToN-IoT

In [9]:
# NF-ToN-IoT: read the flow table with its split columns, then the
# experiment configuration used by read_data.
dataset_name = 'NF-ToN-IoT'
name = 'nf_ton_multi'
data = pd.read_csv(str(PROJ_PATH / f'datasets/{dataset_name}_tvt.csv'))
config_path = str(PROJ_PATH / f'src/config/{name}.json')
config = read_json(config_path)

In [10]:
# Load graph data, node/edge arrays and the per-split edge lists and labels.
(
    g_data,
    x,
    edge_index,
    edge_attr,
    y,
    input_train_edges,
    input_val_edges,
    input_test_edges,
    input_train_labels,
    input_val_labels,
    input_test_labels,
) = read_data(config, returned_dtype='array')

x: (169562, 72)
edge_index: (2, 1379274)
edge_attr: (1379274, 8)
y: (169562,)
input_train_edges: (2, 689638)
input_val_edges: (2, 275854)
input_test_edges: (2, 413782)
input_train_labels: (689638,)
input_val_labels: (275854,)
input_test_labels: (413782,)


In [11]:
# One row per edge: the two endpoint node ids followed by the edge attributes.
df_features = pd.concat([pd.DataFrame(edge_index).T, pd.DataFrame(edge_attr)], axis=1)
df_features.columns = ['src', 'dst'] + [f'f_{i}' for i in range(edge_attr.shape[1])]

# concat(axis=1) aligns on the row index, so a length mismatch would silently
# pad the shorter side with NaN instead of failing.
assert len(data) == len(df_features), (
    f'row count mismatch: {len(data)} rows in data vs {len(df_features)} edges')
dfXY = pd.concat([data[['Label', 'Attack', 'Label_tvt', 'Attack_tvt']], df_features], axis=1)

# Encode attack names as integer class ids; map() yields NaN for any name
# missing from label2idx, which would corrupt the later int cast.
dfXY['Attack'] = dfXY['Attack'].map(g_data['label2idx'])
assert dfXY['Attack'].notna().all(), "'Attack' contains labels missing from g_data['label2idx']"

In [12]:
# Binary task on NF-ToN-IoT: target is 'Label', features are the eight
# f_* columns; probabilities for every row are written to CSV.
cname_target = 'Label'
cname_tvt = f'{cname_target}_tvt'
cname_feats = [f'f_{i}' for i in range(8)]
f_model = train_xgb(
    dfXY,
    cname_feats,
    cname_target=cname_target,
    cname_tvt=cname_tvt,
    option_init={},
    option_fit={},
)
df = predict(f_model, dfXY)
df.to_csv('../output/xgb_nf_ton_binary.csv', index=False)



Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,8,689638,275854,413782,100,0.999884,0.99739,0.999847,0.997082,0.999849,0.997127


In [13]:
# Multi-class task on NF-ToN-IoT: target is the integer-encoded 'Attack'
# column, features are the eight f_* columns; probabilities go to CSV.
cname_target = 'Attack'
cname_tvt = f'{cname_target}_tvt'
cname_feats = [f'f_{i}' for i in range(8)]
f_model = train_xgb(
    dfXY,
    cname_feats,
    cname_target=cname_target,
    cname_tvt=cname_tvt,
    option_init={},
    option_fit={},
)
df = predict(f_model, dfXY)
df.to_csv('../output/xgb_nf_ton_multi.csv', index=False)



Unnamed: 0,n_features,n_train,n_val,n_test,n_tree,train_auc,train_acc,val_auc,val_acc,test_auc,test_acc
0,8,689638,275854,413782,88,0.944295,0.723016,0.939436,0.722534,0.939991,0.721873


In [14]:
# Show the prediction table produced by the most recent predict() call
# (at this point: the NF-ToN-IoT multi-class run).
df

Unnamed: 0,probs_0,probs_1,probs_2,probs_3,probs_4,probs_5,probs_6,probs_7,probs_8,probs_9,gts,tvt
0,0.997096,3.639415e-07,0.000512,2.951430e-06,0.002188,0.000121,0.000015,6.504833e-08,0.000008,0.000056,0,train
1,0.999591,2.028457e-05,0.000031,8.244563e-06,0.000219,0.000039,0.000041,5.308651e-06,0.000024,0.000022,0,train
2,0.999591,2.028457e-05,0.000031,8.244563e-06,0.000219,0.000039,0.000041,5.308651e-06,0.000024,0.000022,0,train
3,0.999869,1.432071e-05,0.000002,2.337731e-06,0.000071,0.000029,0.000002,2.046886e-06,0.000002,0.000006,0,train
4,0.999591,2.028457e-05,0.000031,8.244563e-06,0.000219,0.000039,0.000041,5.308651e-06,0.000024,0.000022,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...
1379269,0.000507,1.948609e-06,0.867949,3.151486e-07,0.001725,0.000036,0.000603,4.079656e-07,0.128983,0.000195,2,train
1379270,0.000504,1.465344e-06,0.862489,3.131664e-07,0.001714,0.000041,0.000599,4.053996e-07,0.134457,0.000194,2,train
1379271,0.000507,2.254400e-06,0.867949,3.151485e-07,0.001725,0.000036,0.000603,4.079655e-07,0.128983,0.000195,2,val
1379272,0.000501,2.227886e-06,0.869502,3.114423e-07,0.001705,0.000035,0.000596,4.031678e-07,0.127466,0.000193,2,train
