In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, re, datetime, random, gzip, json, copy
import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
from time import time
from math import ceil
from collections import Counter
import socket,struct
import timeit

import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, roc_auc_score
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

from dgl import from_networkx
import dgl.function as fn
import networkx as nx

PROJ_PATH = Path(os.path.join(re.sub("/TS-IDS.*$", '', os.getcwd()), 'TS-IDS'))
print(f'PROJ_PATH={PROJ_PATH}')
sys.path.insert(1, str(PROJ_PATH))
sys.path.insert(1, str(PROJ_PATH/'src'))
import utils
from utils import *
from dataset import build_datamodule
from trainer import build_trainer
from model import TSIDS
from pipeline import TSIDSPipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from pandas import MultiIndex, Int64Index


PROJ_PATH=/home/hoang/github/TS-IDS


In [2]:
def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        ### force to outut fix dimensions
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        ### apply weight
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        self.activation = activation

    def message_func(self, edges):
        return {'m': self.W_msg(torch.cat([edges.src['h'], edges.data['h']], 2))}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5          
            g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))
            return g.ndata['h']


class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, activation))
        self.layers.append(SAGELayer(128, edim, ndim_out, activation))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, g, nfeats, efeats):
        for i, layer in enumerate(self.layers):
            if i != 0:
                nfeats = self.dropout(nfeats)
            nfeats = layer(g, nfeats, efeats)
        return nfeats.sum(1)
    
class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(torch.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']
        
class Model(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, n_classes, activation, dropout):
        super().__init__()
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, n_classes)
    def forward(self, g, nfeats, efeats):
        h = self.gnn(g, nfeats, efeats)
        return self.pred(g, h)

In [4]:
def build_graph(scaler, encoder, X, y, cols_to_norm, cname_label):
    X = encoder.transform(X)
    print('Number of samples:', X.shape, y.shape)
    print(cols_to_norm)
    X[cols_to_norm] = scaler.transform(X[cols_to_norm])
    X['h'] = X[cols_to_norm].values.tolist()
    X['h'] = X['h'].apply(lambda x: torch.tensor(x))
    
    G = nx.from_pandas_edgelist(
        X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.DiGraph())
#     G = nx.from_pandas_edgelist(
#         X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.MultiGraph())
#     G = G.to_directed()
    print('Convert NX graph to DGL')
    G = from_networkx(G, edge_attrs=['h', cname_label])
    
    # Eq1
    G.ndata['h'] = torch.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = torch.ones(len(G.edata['h']), dtype=torch.bool)
    
    G.ndata['h'] = torch.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
    G.edata['h'] = torch.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))
    
    return G

In [5]:
def create_prob_df(
    tvt_str,
    model,
    G,
    node_features, 
    edge_features,
    actual
):
    pred_prop = model(G, node_features, edge_features)
    norm_pred_prop = torch.softmax(pred_prop, dim=1)
    data_array = [pred_prop_ + [actual_, tvt_str] for pred_prop_, actual_ in zip(norm_pred_prop.tolist(), actual.tolist())]
    cnames = [f'probs_{i}' for i in range(norm_pred_prop.shape[1])]
    prob_df = pd.DataFrame(data_array, columns=cnames+['gts', 'tvt'])
    return prob_df

In [6]:
def run_baseline(
    ds_name,
    g_name,
    cname_label,
    cname_tvt,
    n_epochs
):
    
    data = pd.read_csv(f'../datasets/{ds_name}.csv')
    label2idx = pd.read_pickle(f'../datasets/{g_name}.pkl')['label2idx']
    if cname_label == 'Attack':
        data['Attack'] = data['Attack'].map(label2idx)
        
    ####
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(
        lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
    data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
    data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
    data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

    data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
    data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

    data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)
    
    ####
    X_cnames = [c for c in data.columns if not c.startswith('Label_tvt') and not c.startswith('Attack_tvt')]
    X_train, X_test, y_train, y_test = (data[data[cname_tvt]!='test'][X_cnames], 
                                        data[data[cname_tvt]=='test'][X_cnames], 
                                        data[data[cname_tvt]!='test'][cname_label], 
                                        data[data[cname_tvt]=='test'][cname_label])
    
    ####
    cols_to_norm = list(set(X_train.columns) - set(['Label', 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']))
    scaler = StandardScaler()
    scaler.fit(X_train[cols_to_norm])
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
    encoder.fit(X_train, y_train)

    G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm, cname_label)
    G_test = build_graph(scaler, encoder, X_test, y_test, cols_to_norm, cname_label)
    
    print('To device')
    G_train = G_train.to(torch.device(device))
    G_test = G_test.to(torch.device(device))
    
    node_features = G_train.ndata['h']
    edge_features = G_train.edata['h']

    node_features_test = G_test.ndata['h']
    edge_features_test = G_test.edata['h']
    
    ####
    ndim_in = G_train.ndata['h'].shape[2]
    ndim_out = 128 
    edim = G_train.ndata['h'].shape[2]
    activation = F.relu
    dropout = 0.2
    n_classes = data[cname_label].nunique()

    sage = SAGE(ndim_in, ndim_out, edim, activation, dropout).to(device)
    mlp = MLPPredictor(ndim_out, 2).to(device)
    model = Model(ndim_in, ndim_out, edim, n_classes, activation, dropout).to(device)
    opt = torch.optim.Adam(model.parameters())

    class_weights = class_weight.compute_class_weight(
        'balanced', 
        classes=data[cname_label].unique(),
        y=data[cname_label].values.tolist(),
    )
    class_weights = torch.FloatTensor(class_weights).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    ####
    edge_label = G_train.edata[cname_label]
    train_mask = G_train.edata['train_mask']
    
    print('Start training')
    for epoch in range(1, n_epochs+1):
        pred = model(G_train, node_features, edge_features).to(device)
        loss = criterion(pred[train_mask], edge_label[train_mask])
        opt.zero_grad()
        loss.backward()
        opt.step()
        if epoch % 10 == 0:
            print(f'{epoch:04d} - Training acc:', compute_accuracy(pred[train_mask], edge_label[train_mask]))
            
    #### test
    test_pred_prop = model(G_test, node_features_test, edge_features_test).to(device)
    norm_test_pred_prop = torch.softmax(test_pred_prop, dim=1)
    test_pred = test_pred_prop.argmax(1)
    test_pred = torch.Tensor.cpu(test_pred).detach().numpy()
    test_actual = G_test.edata.pop(cname_label)

    #### train
    train_pred_prop = model(G_train, node_features, edge_features).to(device)
    train_actual = G_train.edata.pop(cname_label)
    #### create probs df
    train_prob_df = create_prob_df(
        'train',
        model,
        G_train,
        node_features,
        edge_features,
        train_actual
    )
    test_prob_df = create_prob_df(
        'test',
        model,
        G_test,
        node_features_test,
        edge_features_test,
        test_actual
    )
    prob_df = pd.concat([test_prob_df, train_prob_df], axis=0)
    return prob_df

# CV

In [16]:
ds_name2shrtname = {
    'NF-BoT-IoT_cv': 'nf_bot',
    'NF-ToN-IoT_cv': 'nf_ton',
    'NF-CSE-CIC-IDS2018-v2_cv': 'nf_cse', 
    'NF-UNSW-NB15-v2_cv': 'nf_unsw',
}

In [17]:
device = 'cuda:2'
n_folds = 5
flag_save = True
ds_name = list(ds_name2shrtname.keys())[0]
shrt_name = ds_name2shrtname[ds_name]
print(ds_name)
print(shrt_name)

NF-BoT-IoT
nf_bot


In [None]:
n_epochs = 1000
cname_label = 'Label'
for fold in range(n_folds):
    print('Fold:', fold)
    g_name = f'{ds_name}{fold}_graph_binary'
    cname_tvt = f'{cname_label}_tvt_fold_{fold}'
    df_result = run_baseline(
        ds_name,
        g_name,
        cname_label,
        cname_tvt,
        n_epochs
    )
    display(df_result.head())
    if flag_save:
        
        out_path = f'../output_cv/EGraphSAGE_{shrt_name}_binary_cv{fold}.csv'
        print('Save:', out_path)
        df_result.to_csv(out_path, index=False)

In [None]:
n_epochs = 1000
cname_label = 'Attack'
for fold in range(n_folds):
    print('Fold:', fold)
    g_name = f'{ds_name}{fold}_graph_multi'
    cname_tvt = f'{cname_label}_tvt_fold_{fold}'
    df_result = run_baseline(
        ds_name,
        g_name,
        cname_label,
        cname_tvt,
        n_epochs
    )
    display(df_result.head())
    if flag_save:
        out_path = f'../output_cv/EGraphSAGE_{shrt_name}_multi_cv{fold}.csv'
        print('Save:', out_path)
        df_result.to_csv(out_path, index=False)

# Misc

In [18]:
ds_name = 'NF-BoT-IoT'
cname_label = 'Label'
cname_tvt = f'{cname_label}_tvt'
n_epochs = 10
device = 'cuda:0'

data = pd.read_csv(f'../datasets/{ds_name}_tvt.csv')
label2idx = pd.read_pickle(f'../datasets/{ds_name}_graph_multi.pkl')['label2idx']
data['Attack'] = data['Attack'].map(label2idx)

print(data.shape)
data.head()

(600100, 16)


Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack,Label_tvt,Attack_tvt
0,192.168.100.6,52670,192.168.100.1,53,17,5.212,71,126,1,1,0,4294966,0,0,train,train
1,192.168.100.6,49160,192.168.100.149,4444,6,0.0,217753000,199100,4521,4049,24,4176249,1,4,test,test
2,192.168.100.46,3456,192.168.100.5,80,17,0.0,8508021,8918372,9086,9086,0,4175916,0,0,train,train
3,192.168.100.3,80,192.168.100.55,8080,6,7.0,8442138,9013406,9086,9086,0,4175916,0,0,train,train
4,192.168.100.46,80,192.168.100.5,80,6,7.0,8374706,0,9086,0,0,4175916,0,0,train,train


In [19]:
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(
    lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)

In [20]:
X_cnames = [c for c in data.columns if c not in ['Label_tvt', 'Attack_tvt']]
X_train, X_test, y_train, y_test = (data[data[cname_tvt]!='test'][X_cnames], 
                                    data[data[cname_tvt]=='test'][X_cnames], 
                                    data[data[cname_tvt]!='test'][cname_label], 
                                    data[data[cname_tvt]=='test'][cname_label])

In [36]:
def build_graph(scaler, encoder, X, y, cols_to_norm):
    X = encoder.transform(X)
    print(cols_to_norm)
    X[cols_to_norm] = scaler.transform(X[cols_to_norm])
    X['h'] = X[cols_to_norm].values.tolist()
    X['h'] = X['h'].apply(lambda x: torch.tensor(x))
    print(X, X.shape)
    G = nx.from_pandas_edgelist(
        X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.MultiGraph())
    G = G.to_directed()
    print(G.number_of_nodes())
    print(G.number_of_edges())
    G = from_networkx(G, edge_attrs=['h', cname_label])
    print('G nodes', G.number_of_nodes())
    print('G edges', G.number_of_edges())
    # Eq1
    G.ndata['h'] = torch.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = torch.ones(len(G.edata['h']), dtype=torch.bool)
    print('G ndata', G.ndata['h'].shape)
    print('G edata', G.edata['train_mask'].shape)
    
    G.ndata['h'] = torch.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1, G.ndata['h'].shape[1]))
    G.edata['h'] = torch.reshape(G.edata['h'], (G.edata['h'].shape[0], 1, G.edata['h'].shape[1]))
    print('G ndata', G.ndata['h'].shape)
    print('G edata', G.edata['h'].shape)
    G = G.to(device)
    
    return G

In [37]:
G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm)

['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']
               IPV4_SRC_ADDR         IPV4_DST_ADDR  PROTOCOL  L7_PROTO  \
0       172.16.165.142:52670      192.168.100.1:53 -2.268430 -0.255669   
2        172.19.188.130:3456      192.168.100.5:80 -2.268430 -0.227547   
3          172.30.194.190:80   192.168.100.55:8080 -2.177751 -0.227166   
4            172.19.59.45:80      192.168.100.5:80 -2.177751 -0.227166   
5           172.29.116.152:0       192.168.100.3:0 -2.177751 -0.227547   
...                      ...                   ...       ...       ...   
600091    172.28.202.83:3456      192.168.100.5:80 -2.268430 -0.227547   
600092   172.29.191.120:8080      192.168.100.3:80 -2.177751 -0.227166   
600094      172.27.119.97:80      192.168.100.3:80 -2.177751 -0.227166   
600095     172.19.103.242:80      192.168.100.5:80 -2.177751 -0.227166   
600099  172.29.164.238:49160  192.168.100.149:4444 -2.177751 -0.227547

In [33]:
G_train.number_of_edges()

840140

In [47]:
G.number_of_edges()

420070

In [44]:
X.shape

(420070, 13)

In [48]:
X = encoder.transform(X_train)
print(cols_to_norm)
X[cols_to_norm] = scaler.transform(X[cols_to_norm])
X['h'] = X[cols_to_norm].values.tolist()
X['h'] = X['h'].apply(lambda x: torch.tensor(x))
G = nx.from_pandas_edgelist(
    X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.DiGraph())
# G = G.to_directed()

['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']


In [21]:
cols_to_norm = list(set(X_train.columns) - set(['Label', 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']))
scaler = StandardScaler()
scaler.fit(X_train[cols_to_norm])
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
encoder.fit(X_train, y_train)

G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm)
G_test = build_graph(scaler, encoder, X_test, y_test, cols_to_norm)

node_features = G_train.ndata['h']
edge_features = G_train.edata['h']

node_features_test = G_test.ndata['h']
edge_features_test = G_test.edata['h']



['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']
['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']


In [27]:
print(X_train.shape)
print(X_test.shape)
print(edge_features.shape)
print(edge_features_test.shape)

(420070, 12)
(180030, 12)
torch.Size([840140, 1, 8])
torch.Size([360060, 1, 8])


In [22]:
ndim_in = G_train.ndata['h'].shape[2]
ndim_out = 128 
edim = G_train.ndata['h'].shape[2]
activation = F.relu
dropout = 0.2
n_classes = data[cname_label].nunique()

sage = SAGE(ndim_in, ndim_out, edim, activation, dropout).to(device)
mlp = MLPPredictor(ndim_out, 2).to(device)
model = Model(ndim_in, ndim_out, edim, n_classes, activation, dropout).to(device)
opt = torch.optim.Adam(model.parameters())

class_weights = class_weight.compute_class_weight(
    'balanced', 
    classes=data[cname_label].unique(),
    y=data[cname_label].values.tolist(),
)
class_weights = torch.FloatTensor(class_weights).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [23]:
edge_label = G_train.edata[cname_label]
train_mask = G_train.edata['train_mask']

for epoch in range(1, n_epochs+1):
    pred = model(G_train, node_features, edge_features).to(device)
    loss = criterion(pred[train_mask], edge_label[train_mask])
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 10 == 0:
        print(f'{epoch:04d} - Training acc:', compute_accuracy(pred[train_mask], edge_label[train_mask]))

0010 - Training acc: 0.8192979693412781


In [24]:
train_pred_prop = model(G_train, node_features, edge_features).to(device)
train_pred = train_pred_prop.argmax(1)
train_pred = torch.Tensor.cpu(train_pred).detach().numpy()
train_actual = G_train.edata.pop(cname_label)

In [25]:
test_pred_prop = model(G_test, node_features_test, edge_features_test).to(device)
test_pred = test_pred_prop.argmax(1)
test_pred = torch.Tensor.cpu(test_pred).detach().numpy()
test_actual = G_test.edata.pop(cname_label)

# actual = ["Normal" if i == 0 else "Attack" for i in actual]
# test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

NameError: name 'actual' is not defined

In [None]:
norm_test_pred_prop = torch.softmax(test_pred_prop, dim=1)

In [13]:
test_pred_prop

tensor([[ 3.6942e+01, -4.6637e+01],
        [ 3.9542e+01, -4.8324e+01],
        [ 1.9192e+01, -1.9510e+01],
        ...,
        [ 1.3778e+00, -1.5827e+00],
        [ 2.9585e+00, -3.1632e+00],
        [-1.5411e-01, -1.5061e-02]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [14]:
norm_test_pred_prop

tensor([[1.0000e+00, 5.0369e-37],
        [1.0000e+00, 6.9235e-39],
        [1.0000e+00, 1.5552e-17],
        ...,
        [9.5076e-01, 4.9239e-02],
        [9.9781e-01, 2.1900e-03],
        [4.6529e-01, 5.3471e-01]], device='cuda:0', grad_fn=<SoftmaxBackward0>)