In [6]:
%load_ext autoreload
%autoreload 2

import os, sys, re, datetime, random, gzip, json, copy
import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
from time import time
from math import ceil
from collections import Counter
import socket,struct
import timeit

import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, roc_auc_score
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

from dgl import from_networkx
import dgl.function as fn
import networkx as nx

PROJ_PATH = Path(os.path.join(re.sub("/TS-IDS.*$", '', os.getcwd()), 'TS-IDS'))
print(f'PROJ_PATH={PROJ_PATH}')
sys.path.insert(1, str(PROJ_PATH))
sys.path.insert(1, str(PROJ_PATH/'src'))
import utils
from utils import *
from dataset import build_datamodule
from trainer import build_trainer
from model import TSIDS
from pipeline import TSIDSPipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from pandas import MultiIndex, Int64Index


PROJ_PATH=/home/hoang/github/TS-IDS


In [7]:
def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        ### force to outut fix dimensions
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        ### apply weight
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        self.activation = activation

    def message_func(self, edges):
        return {'m': self.W_msg(torch.cat([edges.src['h'], edges.data['h']], 2))}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5          
            g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))
            return g.ndata['h']


class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, activation))
        self.layers.append(SAGELayer(128, edim, ndim_out, activation))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, g, nfeats, efeats):
        for i, layer in enumerate(self.layers):
            if i != 0:
                nfeats = self.dropout(nfeats)
            nfeats = layer(g, nfeats, efeats)
        return nfeats.sum(1)
    
class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(torch.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']
        
class Model(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, n_classes, activation, dropout):
        super().__init__()
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, n_classes)
    def forward(self, g, nfeats, efeats):
        h = self.gnn(g, nfeats, efeats)
        return self.pred(g, h)

In [8]:
def build_graph(scaler, encoder, X, y, cols_to_norm):
    X = encoder.transform(X)
    print(cols_to_norm)
    X[cols_to_norm] = scaler.transform(X[cols_to_norm])
    X['h'] = X[cols_to_norm].values.tolist()
    X['h'] = X['h'].apply(lambda x: torch.tensor(x))
    
    G = nx.from_pandas_edgelist(
        X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.MultiGraph())
    G = G.to_directed()
    G = from_networkx(G, edge_attrs=['h', cname_label])
    
    # Eq1
    G.ndata['h'] = torch.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = torch.ones(len(G.edata['h']), dtype=torch.bool)
    
    G.ndata['h'] = torch.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
    G.edata['h'] = torch.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))
    G = G.to(device)
    
    return G

In [185]:
def run_baseline(
    ds_name,
    cname_label,
    n_epochs
):
    cname_tvt = f'{cname_label}_tvt'
    
    data = pd.read_csv(f'../datasets/{ds_name}_tvt.csv')
    label2idx = pd.read_pickle(f'../datasets/{ds_name}_graph_multi.pkl')['label2idx']
    if cname_label == 'Attack':
        data['Attack'] = data['Attack'].map(label2idx)
        
    ####
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(
        lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
    data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
    data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
    data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

    data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
    data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

    data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)
    
    ####
    X_cnames = [c for c in data.columns if c not in ['Label_tvt', 'Attack_tvt']]
    X_train, X_test, y_train, y_test = (data[data[cname_tvt]!='test'][X_cnames], 
                                        data[data[cname_tvt]=='test'][X_cnames], 
                                        data[data[cname_tvt]!='test'][cname_label], 
                                        data[data[cname_tvt]=='test'][cname_label])
    
    ####
    cols_to_norm = list(set(X_train.columns) - set(['Label', 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']))
    scaler = StandardScaler()
    scaler.fit(X_train[cols_to_norm])
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
    encoder.fit(X_train, y_train)

    G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm)
    G_test = build_graph(scaler, encoder, X_test, y_test, cols_to_norm)

    node_features = G_train.ndata['h']
    edge_features = G_train.edata['h']

    node_features_test = G_test.ndata['h']
    edge_features_test = G_test.edata['h']
    
    ####
    ndim_in = G_train.ndata['h'].shape[2]
    ndim_out = 128 
    edim = G_train.ndata['h'].shape[2]
    activation = F.relu
    dropout = 0.2
    n_classes = data[cname_label].nunique()

    sage = SAGE(ndim_in, ndim_out, edim, activation, dropout).to(device)
    mlp = MLPPredictor(ndim_out, 2).to(device)
    model = Model(ndim_in, ndim_out, edim, n_classes, activation, dropout).to(device)
    opt = torch.optim.Adam(model.parameters())

    class_weights = class_weight.compute_class_weight(
        'balanced', 
        classes=data[cname_label].unique(),
        y=data[cname_label].values.tolist(),
    )
    class_weights = torch.FloatTensor(class_weights).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    ####
    edge_label = G_train.edata[cname_label]
    train_mask = G_train.edata['train_mask']

    for epoch in range(1, n_epochs+1):
        pred = model(G_train, node_features, edge_features).to(device)
        loss = criterion(pred[train_mask], edge_label[train_mask])
        opt.zero_grad()
        loss.backward()
        opt.step()
        if epoch % 10 == 0:
            print(f'{epoch:04d} - Training acc:', compute_accuracy(pred[train_mask], edge_label[train_mask]))
            
    ####
    test_pred_prop = model(G_test, node_features_test, edge_features_test).to(device)
    norm_test_pred_prop = torch.softmax(test_pred_prop, dim=1)
    test_pred = test_pred_prop.argmax(1)
    test_pred = torch.Tensor.cpu(test_pred).detach().numpy()
    actual = G_test.edata.pop(cname_label)

    actual_1 = [i.item() for i in actual]
    test_pred_1 = [i for i in test_pred]
    
    # # evaluation metrics
    # - Acc
    print('accuracy_score:', accuracy_score(actual_1, test_pred_1))
    # - AUC
    pred_array = norm_test_pred_prop.detach().cpu()
    if cname_label == 'Label':
        pred_array = norm_test_pred_prop[:, 1].detach().cpu()
    print('roc_auc_score', roc_auc_score(actual_1, pred_array, multi_class='ovr', average='macro'))
    # - Precision
    print('precision_score', precision_score(actual_1, test_pred_1, average='micro'))
    # - Recall
    print('recall_score', recall_score(actual_1, test_pred_1, average='micro'))
    # - F1-score
    print('f1_score', f1_score(actual_1, test_pred_1, average='micro'))

# BoT

In [186]:
ds_name = 'NF-BoT-IoT'
n_epochs = 10
device = 'cuda:0'

In [187]:
cname_label = 'Label'
run_baseline(
    ds_name,
    cname_label,
    n_epochs
)



['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']
['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']
0010 - Training acc: 0.8097209930419922
accuracy_score: 0.8116230628228629
roc_auc_score 0.6192215129179939
precision_score 0.8116230628228629
recall_score 0.8116230628228629
f1_score 0.8116230628228629


In [188]:
cname_label = 'Attack'
run_baseline(
    ds_name,
    cname_label,
    n_epochs
)



['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']
['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']
0010 - Training acc: 0.09482943266630173
accuracy_score: 0.09455368549686163
roc_auc_score 0.5742949133688617
precision_score 0.09455368549686163
recall_score 0.09455368549686163
f1_score 0.09455368549686163


# ToN

In [189]:
ds_name = 'NF-ToN-IoT'
n_epochs = 10
device = 'cuda:0'

In [None]:
cname_label = 'Label'
run_baseline(
    ds_name,
    cname_label,
    n_epochs
)



['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']
['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']


In [None]:
cname_label = 'Attack'
run_baseline(
    ds_name,
    cname_label,
    n_epochs
)

# Exploration

In [166]:
ds_name = 'NF-BoT-IoT'
cname_label = 'Label'
cname_tvt = f'{cname_label}_tvt'
n_epochs = 10
device = 'cuda:0'

data = pd.read_csv(f'../datasets/{ds_name}_tvt.csv')
label2idx = pd.read_pickle(f'../datasets/{ds_name}_graph_multi.pkl')['label2idx']
data['Attack'] = data['Attack'].map(label2idx)

print(data.shape)
data.head()

(600100, 16)


Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack,Label_tvt,Attack_tvt
0,192.168.100.6,52670,192.168.100.1,53,17,5.212,71,126,1,1,0,4294966,0,0,train,train
1,192.168.100.6,49160,192.168.100.149,4444,6,0.0,217753000,199100,4521,4049,24,4176249,1,4,test,test
2,192.168.100.46,3456,192.168.100.5,80,17,0.0,8508021,8918372,9086,9086,0,4175916,0,0,train,train
3,192.168.100.3,80,192.168.100.55,8080,6,7.0,8442138,9013406,9086,9086,0,4175916,0,0,train,train
4,192.168.100.46,80,192.168.100.5,80,6,7.0,8374706,0,9086,0,0,4175916,0,0,train,train


In [167]:
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(
    lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)

In [168]:
X_cnames = [c for c in data.columns if c not in ['Label_tvt', 'Attack_tvt']]
X_train, X_test, y_train, y_test = (data[data[cname_tvt]!='test'][X_cnames], 
                                    data[data[cname_tvt]=='test'][X_cnames], 
                                    data[data[cname_tvt]!='test'][cname_label], 
                                    data[data[cname_tvt]=='test'][cname_label])

In [169]:
cols_to_norm = list(set(X_train.columns) - set(['Label', 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']))
scaler = StandardScaler()
scaler.fit(X_train[cols_to_norm])
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
encoder.fit(X_train, y_train)

G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm)
G_test = build_graph(scaler, encoder, X_test, y_test, cols_to_norm)

node_features = G_train.ndata['h']
edge_features = G_train.edata['h']

node_features_test = G_test.ndata['h']
edge_features_test = G_test.edata['h']



['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']
['TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS', 'PROTOCOL', 'IN_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'L7_PROTO']


In [170]:
ndim_in = G_train.ndata['h'].shape[2]
ndim_out = 128 
edim = G_train.ndata['h'].shape[2]
activation = F.relu
dropout = 0.2
n_classes = data[cname_label].nunique()

sage = SAGE(ndim_in, ndim_out, edim, activation, dropout).to(device)
mlp = MLPPredictor(ndim_out, 2).to(device)
model = Model(ndim_in, ndim_out, edim, n_classes, activation, dropout).to(device)
opt = torch.optim.Adam(model.parameters())

class_weights = class_weight.compute_class_weight(
    'balanced', 
    classes=data[cname_label].unique(),
    y=data[cname_label].values.tolist(),
)
class_weights = torch.FloatTensor(class_weights).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [171]:
edge_label = G_train.edata[cname_label]
train_mask = G_train.edata['train_mask']

for epoch in range(1, n_epochs+1):
    pred = model(G_train, node_features, edge_features).to(device)
    loss = criterion(pred[train_mask], edge_label[train_mask])
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 10 == 0:
        print(f'{epoch:04d} - Training acc:', compute_accuracy(pred[train_mask], edge_label[train_mask]))

0010 - Training acc: 0.8383662104606628


In [172]:
test_pred_prop = model(G_test, node_features_test, edge_features_test).to(device)
test_pred = test_pred_prop.argmax(1)
test_pred = torch.Tensor.cpu(test_pred).detach().numpy()
actual = G_test.edata.pop(cname_label)

# actual = ["Normal" if i == 0 else "Attack" for i in actual]
# test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

actual_1 = [i.item() for i in actual]
test_pred_1 = [i for i in test_pred]

In [173]:
norm_test_pred_prop = torch.softmax(test_pred_prop, dim=1)

In [176]:
test_pred_prop

tensor([[ 5.0102e+01, -2.0193e+01],
        [ 4.6777e+01, -2.5275e+01],
        [ 2.1044e+01, -1.0978e+01],
        ...,
        [ 1.3861e+00, -9.9944e-01],
        [ 2.3370e+00, -2.2723e+00],
        [-1.7419e-01,  9.2416e-03]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [175]:
norm_test_pred_prop

tensor([[1.0000e+00, 2.9613e-31],
        [1.0000e+00, 5.1087e-32],
        [1.0000e+00, 1.2398e-14],
        ...,
        [9.1572e-01, 8.4278e-02],
        [9.9014e-01, 9.8600e-03],
        [4.5427e-01, 5.4573e-01]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [182]:
roc_auc_score(actual_1, test_pred_prop[:, 1].detach().cpu(), multi_class='raise', average='macro')

0.5734023285218188

In [174]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

# # evaluation metrics
# - Acc
print('accuracy_score:', accuracy_score(actual_1, test_pred_1))

# - AUC
print('roc_auc_score', roc_auc_score(actual_1, norm_test_pred_prop.detach().cpu(), multi_class='ovr', average='macro'))

# - Precision
print('precision_score', precision_score(actual_1, test_pred_1, average='micro'))

# - Recall
print('recall_score', recall_score(actual_1, test_pred_1, average='micro'))

# - F1-score
print('f1_score', f1_score(actual_1, test_pred_1, average='micro'))

accuracy_score: 0.8009526190079431


ValueError: y should be a 1d array, got an array of shape (360060, 2) instead.

# ToN

In [58]:
ds_name = 'NF-ToN-IoT'
data = pd.read_csv(f'../datasets/{ds_name}_tvt.csv')

In [59]:
data['Attack'].value_counts()

injection     468539
ddos          326345
Benign        270279
password      156299
xss            99944
scanning       21467
dos            17717
backdoor       17247
mitm            1295
ransomware       142
Name: Attack, dtype: int64