In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, re, datetime, random, gzip, json, copy
import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
from time import time
from math import ceil
from collections import Counter

import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, roc_auc_score
from sklearn.utils import class_weight

# Locate the repository root: cut the current working directory at the first
# "/TS-IDS" and re-append "TS-IDS". If "/TS-IDS" does not occur in the working
# directory, the substitution is a no-op and PROJ_PATH becomes <cwd>/TS-IDS.
PROJ_PATH = Path(os.path.join(re.sub("/TS-IDS.*$", '', os.getcwd()), 'TS-IDS'))
print(f'PROJ_PATH={PROJ_PATH}')
# Put the repository root and its src/ directory on the import path
# (presumably where the project-local modules imported next live -- confirm).
sys.path.insert(1, str(PROJ_PATH))
sys.path.insert(1, str(PROJ_PATH/'src'))
import utils
from utils import *
from dataset import build_datamodule
from trainer import build_trainer
from model import TSIDS
from pipeline import TSIDSPipeline

PROJ_PATH=/home/hoang/github/TS-IDS


In [2]:
def get_xgb_training_data(data, config):
    """Assemble the tabular frame XGBoost is trained on for one dataset/fold.

    Loads the arrays for ``config`` through ``read_data`` and builds one frame
    with ``src``/``dst`` columns (from the transposed ``edge_index``), one
    ``f_<i>`` column per column of ``edge_attr``, and every column of ``data``
    whose name starts with ``Label`` or ``Attack``.

    Args:
        data: DataFrame holding the label columns. Presumably its rows are in
            the same order as the edges returned by ``read_data`` -- the two
            are joined purely on index; verify against the data preparation.
        config: Configuration object forwarded unchanged to ``read_data``.

    Returns:
        A ``(dfXY, cname_feats)`` tuple: the combined frame, with ``Attack``
        mapped through ``g_data['label2idx']``, and the list of feature
        column names.
    """
    # Only the graph metadata, edge index and edge attributes are used here;
    # the remaining entries are unpacked solely to mirror the 14-element
    # tuple that read_data returns.
    (g_data, _x, edge_index, edge_attr, _y,
     _train_edges, _val_edges, _test_edges,
     _train_labels, _val_labels, _test_labels,
     _train_edges_attr, _val_edges_attr, _test_edges_attr) = read_data(
        config, returned_dtype='array')

    cname_feats = [f'f_{i}' for i in range(edge_attr.shape[1])]

    endpoints = pd.DataFrame(edge_index).T
    attributes = pd.DataFrame(edge_attr)
    df_features = pd.concat([endpoints, attributes], axis=1)
    df_features.columns = ['src', 'dst'] + cname_feats

    label_cols = [c for c in data.columns if c.startswith(('Label', 'Attack'))]
    dfXY = pd.concat([data[label_cols], df_features], axis=1)
    dfXY['Attack'] = dfXY['Attack'].map(g_data['label2idx'])
    return dfXY, cname_feats

def compute_evaluation_metrics(model, best_ntree, x_train, y_train, x_val, y_val, x_test, y_test, is_binary):
    """Score a fitted classifier on the train, validation and test splits.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``predict``,
            both accepting an ``ntree_limit`` keyword.
        best_ntree: Tree limit forwarded to every prediction call.
        x_train, y_train: Features and labels of the training split.
        x_val, y_val: Features and labels of the validation split.
        x_test, y_test: Features and labels of the test split.
        is_binary: When true, AUC is computed from the second probability
            column; otherwise from the full probability matrix with
            ``multi_class='ovo'``.

    Returns:
        ``(train_auc, train_acc, val_auc, val_acc, test_auc, test_acc)``.
    """
    # NOTE(review): `ntree_limit` is deprecated in recent XGBoost releases in
    # favour of `iteration_range` -- confirm against the installed version.
    splits = ((x_train, y_train), (x_val, y_val), (x_test, y_test))

    # Run inference for every split first (probabilities, then hard labels).
    scores, preds = [], []
    for feats, _ in splits:
        scores.append(model.predict_proba(feats, ntree_limit=best_ntree))
        preds.append(model.predict(feats, ntree_limit=best_ntree))

    if is_binary:
        aucs = [
            roc_auc_score(y_true=labels, y_score=score[:, 1])
            for (_, labels), score in zip(splits, scores)
        ]
    else:
        aucs = [
            roc_auc_score(y_true=labels, y_score=score, multi_class='ovo')
            for (_, labels), score in zip(splits, scores)
        ]

    accs = [
        accuracy_score(y_true=labels, y_pred=pred)
        for (_, labels), pred in zip(splits, preds)
    ]

    (train_auc, val_auc, test_auc), (train_acc, val_acc, test_acc) = aucs, accs
    return train_auc, train_acc, val_auc, val_acc, test_auc, test_acc

def train_xgb(dfXY, cname_feats, cname_target='Label', cname_tvt='Label_tvt', option_init=None, option_fit=None):
    """Fit an XGBoost classifier on the train split and display split metrics.

    Rows of ``dfXY`` are assigned to splits by the value found in the
    ``cname_tvt`` column (``'train'``, ``'val'`` or ``'test'``). The model is
    fitted on the train rows with the train and validation rows as evaluation
    sets, then AUC and accuracy on all three splits are computed through
    ``compute_evaluation_metrics`` and shown as a one-row table.

    Args:
        dfXY: DataFrame containing the feature, target and split columns.
        cname_feats: List of feature column names.
        cname_target: Name of the target column; cast to 32-bit integers.
        cname_tvt: Name of the column holding the split assignment.
        option_init: Optional dict of ``xgb.XGBClassifier`` constructor
            arguments; entries override the defaults below.
        option_fit: Optional dict of ``fit`` arguments; entries override the
            defaults below.

    Returns:
        Dict with the fitted ``'model'`` plus the ``'cname_target'``,
        ``'cname_tvt'`` and ``'cname_feats'`` it was trained with.

    Raises:
        ValueError: If any of the three splits has no rows.
    """
    # Defaults first, caller overrides last. `None` sentinels replace the
    # former mutable `{}` defaults, which are shared across calls.
    option_fit = {
        'eval_metric': 'auc',
        'verbose': False,
        'early_stopping_rounds': 20,
        **(option_fit or {}),
    }
    option_init = {
        'tree_method': 'gpu_hist',
        'use_label_encoder': False,
        'objective': 'binary:logistic',
        'random_state': 0,
        'n_jobs': 32,
        **(option_init or {}),
    }

    # Decided on the whole frame, not only on the training rows.
    is_binary = dfXY[cname_target].nunique() == 2

    # train/val/test: filter each split once and derive X and y from the
    # same filtered frame instead of rebuilding the mask for each.
    tvt = dfXY[cname_tvt]

    def _split(split_name):
        rows = dfXY.loc[tvt == split_name]
        if rows.shape[0] == 0:
            raise ValueError(
                f"no rows with {cname_tvt!r} == {split_name!r}; cannot train/evaluate")
        return rows[cname_feats].values, rows[cname_target].values.astype("i4")

    x_train, y_train = _split('train')
    x_val, y_val = _split('val')
    x_test, y_test = _split('test')

    # classify -- the validation set is listed last in eval_set.
    eval_set = [
        (x_train, y_train),
        (x_val, y_val),
    ]

    model = xgb.XGBClassifier(**option_init)
    model.fit(x_train, y_train, eval_set=eval_set, **option_fit)
    best_ntree = model.get_booster().best_ntree_limit

    train_auc, train_acc, val_auc, val_acc, test_auc, test_acc = compute_evaluation_metrics(
        model, best_ntree, x_train, y_train, x_val, y_val, x_test, y_test, is_binary)

    pd_res = pd.DataFrame({
        'n_features': [len(cname_feats)],
        'n_train': [x_train.shape[0]],
        'n_val': [x_val.shape[0]],
        'n_test': [x_test.shape[0]],
        'n_tree': [best_ntree],
        'train_auc': [train_auc],
        'train_acc': [train_acc],
        'val_auc': [val_auc],
        'val_acc': [val_acc],
        'test_auc': [test_auc],
        'test_acc': [test_acc],
    })

    display(pd_res)

    # track: everything `predict` needs to score a frame with this model.
    fmodel = {
        'model': model,
        'cname_target': cname_target,
        'cname_tvt': cname_tvt,
        'cname_feats': cname_feats,
    }
    return fmodel

def predict(f_model, dfXY):
    """Score every row of ``dfXY`` with a model bundle from ``train_xgb``.

    Args:
        f_model: Dict with keys ``'model'`` (object exposing
            ``predict_proba``), ``'cname_feats'``, ``'cname_target'`` and
            ``'cname_tvt'``.
        dfXY: DataFrame containing the feature, target and split columns
            named in ``f_model``.

    Returns:
        DataFrame with a fresh 0..n-1 index, one ``probs_<i>`` column per
        column returned by ``predict_proba``, the ground truth in ``gts`` and
        the split assignment in ``tvt``. Rows are in the same order as
        ``dfXY``.
    """
    # NOTE(review): no tree limit is passed here, unlike the metric
    # computation done during training -- confirm this is intended.
    probs = f_model['model'].predict_proba(dfXY[f_model['cname_feats']])
    df = pd.DataFrame(probs)
    df.columns = [f'probs_{i}' for i in range(df.shape[1])]
    # Assign by position: `df` has a fresh RangeIndex, so assigning the raw
    # Series would align on index and produce NaN / mismatched labels whenever
    # `dfXY` is not indexed 0..n-1 (e.g. a filtered subset).
    df['gts'] = dfXY[f_model['cname_target']].values
    df['tvt'] = dfXY[f_model['cname_tvt']].values
    return df

In [3]:
# Maps a config name ('<prefix>_<task>') to its dataset-name template; the
# '{}' placeholder is later filled with the cross-validation fold index.
cfname2dsname = {
    f'{cf_prefix}_{task}': f'{ds_prefix}_cv{{}}_graph_{task}'
    for cf_prefix, ds_prefix in (
        ('nf_bot', 'NF-BoT-IoT'),
        ('nf_ton', 'NF-ToN-IoT'),
        ('nf_cse', 'NF-CSE-CIC-IDS2018-v2'),
        ('nf_unsw', 'NF-UNSW-NB15-v2'),
    )
    for task in ('multi', 'binary')
}
# Number of cross-validation folds iterated over by the training cells.
n_folds = 5

In [None]:
# Binary classification: train and score one XGBoost model per CV fold.
cname_target = 'Label'

ds = 'nf_cse_binary'
f_ds = cfname2dsname[ds]
# The CSV is named after the part of the template before the fold placeholder.
ds_file = "../datasets/{}.csv".format(f_ds.split('{}')[0])
print(ds_file)
data = pd.read_csv(ds_file)

# Same config file for every fold, so resolve its path once.
config_path = str(PROJ_PATH / f'src/config/{ds}.json')

# Create the output directory up front so a fold cannot finish training and
# then fail when its predictions are written.
out_dir = '../output_cv'
os.makedirs(out_dir, exist_ok=True)

for fold in range(n_folds):
    # Re-read the config for each fold: it is modified below, and every fold
    # should start from the file's contents.
    config = read_json(config_path)
    config['ds_name'] = f_ds.format(str(fold))
    dfXY, cname_feats = get_xgb_training_data(data, config)
    cname_tvt = f'{cname_target}_tvt_fold_{fold}'
    f_model = train_xgb(
        dfXY, cname_feats, cname_target, cname_tvt, option_init={}, option_fit={})
    df = predict(f_model, dfXY)
    df.to_csv(f'{out_dir}/xgb_{ds}_cv{fold}.csv', index=False)

In [None]:
# Multi-class classification: train and score one XGBoost model per CV fold.
cname_target = 'Attack'

ds = 'nf_cse_multi'
f_ds = cfname2dsname[ds]
# The CSV is named after the part of the template before the fold placeholder.
ds_file = "../datasets/{}.csv".format(f_ds.split('{}')[0])
print(ds_file)
data = pd.read_csv(ds_file)

# Same config file for every fold, so resolve its path once.
config_path = str(PROJ_PATH / f'src/config/{ds}.json')

# Create the output directory up front so a fold cannot finish training and
# then fail when its predictions are written.
out_dir = '../output_cv'
os.makedirs(out_dir, exist_ok=True)

for fold in range(n_folds):
    # Re-read the config for each fold: it is modified below, and every fold
    # should start from the file's contents.
    config = read_json(config_path)
    config['ds_name'] = f_ds.format(str(fold))
    dfXY, cname_feats = get_xgb_training_data(data, config)
    cname_tvt = f'{cname_target}_tvt_fold_{fold}'
    f_model = train_xgb(
        dfXY, cname_feats, cname_target, cname_tvt, option_init={}, option_fit={})
    df = predict(f_model, dfXY)
    df.to_csv(f'{out_dir}/xgb_{ds}_cv{fold}.csv', index=False)