In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, re, datetime, random, gzip, json, copy
import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
from time import time
from math import ceil
from collections import Counter
import socket,struct
import timeit

import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, roc_auc_score
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

from dgl import from_networkx
import dgl.function as fn
import networkx as nx

PROJ_PATH = Path(os.path.join(re.sub("/TS-IDS.*$", '', os.getcwd()), 'TS-IDS'))
print(f'PROJ_PATH={PROJ_PATH}')
sys.path.insert(1, str(PROJ_PATH))
sys.path.insert(1, str(PROJ_PATH/'src'))
import utils
from utils import *
from dataset import build_datamodule
from trainer import build_trainer
from model import TSIDS
from pipeline import TSIDSPipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from pandas import MultiIndex, Int64Index


PROJ_PATH=/home/hoang/github/TS-IDS


In [2]:
def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        ### force to outut fix dimensions
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        ### apply weight
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        self.activation = activation

    def message_func(self, edges):
        return {'m': self.W_msg(torch.cat([edges.src['h'], edges.data['h']], 2))}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5          
            g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))
            return g.ndata['h']


class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, activation))
        self.layers.append(SAGELayer(128, edim, ndim_out, activation))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, g, nfeats, efeats):
        for i, layer in enumerate(self.layers):
            if i != 0:
                nfeats = self.dropout(nfeats)
            nfeats = layer(g, nfeats, efeats)
        return nfeats.sum(1)
    
class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(torch.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']
        
class Model(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, n_classes, activation, dropout):
        super().__init__()
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, n_classes)
    def forward(self, g, nfeats, efeats):
        h = self.gnn(g, nfeats, efeats)
        return self.pred(g, h)

In [3]:
def build_graph(scaler, encoder, X, y, cols_to_norm):
    X = encoder.transform(X)
    print(cols_to_norm)
    X[cols_to_norm] = scaler.transform(X[cols_to_norm])
    X['h'] = X[cols_to_norm].values.tolist()
    X['h'] = X['h'].apply(lambda x: torch.tensor(x))
    
    G = nx.from_pandas_edgelist(
    X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.DiGraph())
#     G = nx.from_pandas_edgelist(
#         X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.MultiGraph())
#     G = G.to_directed()
    G = from_networkx(G, edge_attrs=['h', cname_label])
    
    # Eq1
    G.ndata['h'] = torch.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = torch.ones(len(G.edata['h']), dtype=torch.bool)
    
    G.ndata['h'] = torch.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
    G.edata['h'] = torch.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))
    G = G.to(device)
    
    return G

In [4]:
def create_prob_df(
    tvt_str,
    model,
    G,
    node_features, 
    edge_features,
    actual
):
    pred_prop = model(G, node_features, edge_features)
    norm_pred_prop = torch.softmax(pred_prop, dim=1)
    data_array = [pred_prop_ + [actual_, tvt_str] for pred_prop_, actual_ in zip(norm_pred_prop.tolist(), actual.tolist())]
    cnames = [f'probs_{i}' for i in range(norm_pred_prop.shape[1])]
    prob_df = pd.DataFrame(data_array, columns=cnames+['gts', 'tvt'])
    return prob_df

In [5]:
def run_baseline(
    ds_name,
    cname_label,
    n_epochs
):
    cname_tvt = f'{cname_label}_tvt'
    
    data = pd.read_csv(f'../datasets/{ds_name}_tvt.csv')
    label2idx = pd.read_pickle(f'../datasets/{ds_name}_graph_multi.pkl')['label2idx']
    if cname_label == 'Attack':
        data['Attack'] = data['Attack'].map(label2idx)
        
    ####
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(
        lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
    data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
    data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
    data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
    data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

    data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
    data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

    data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)
    
    ####
    X_cnames = [c for c in data.columns if c not in ['Label_tvt', 'Attack_tvt']]
    X_train, X_test, y_train, y_test = (data[data[cname_tvt]!='test'][X_cnames], 
                                        data[data[cname_tvt]=='test'][X_cnames], 
                                        data[data[cname_tvt]!='test'][cname_label], 
                                        data[data[cname_tvt]=='test'][cname_label])
    
    ####
    cols_to_norm = list(set(X_train.columns) - set(['Label', 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']))
    scaler = StandardScaler()
    scaler.fit(X_train[cols_to_norm])
    encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
    encoder.fit(X_train, y_train)

    G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm)
    G_test = build_graph(scaler, encoder, X_test, y_test, cols_to_norm)

    node_features = G_train.ndata['h']
    edge_features = G_train.edata['h']

    node_features_test = G_test.ndata['h']
    edge_features_test = G_test.edata['h']
    
    ####
    ndim_in = G_train.ndata['h'].shape[2]
    ndim_out = 128 
    edim = G_train.ndata['h'].shape[2]
    activation = F.relu
    dropout = 0.2
    n_classes = data[cname_label].nunique()

    sage = SAGE(ndim_in, ndim_out, edim, activation, dropout).to(device)
    mlp = MLPPredictor(ndim_out, 2).to(device)
    model = Model(ndim_in, ndim_out, edim, n_classes, activation, dropout).to(device)
    opt = torch.optim.Adam(model.parameters())

    class_weights = class_weight.compute_class_weight(
        'balanced', 
        classes=data[cname_label].unique(),
        y=data[cname_label].values.tolist(),
    )
    class_weights = torch.FloatTensor(class_weights).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    ####
    edge_label = G_train.edata[cname_label]
    train_mask = G_train.edata['train_mask']

    for epoch in range(1, n_epochs+1):
        pred = model(G_train, node_features, edge_features).to(device)
        loss = criterion(pred[train_mask], edge_label[train_mask])
        opt.zero_grad()
        loss.backward()
        opt.step()
        if epoch % 10 == 0:
            print(f'{epoch:04d} - Training acc:', compute_accuracy(pred[train_mask], edge_label[train_mask]))
            
    #### test
    test_pred_prop = model(G_test, node_features_test, edge_features_test).to(device)
    norm_test_pred_prop = torch.softmax(test_pred_prop, dim=1)
    test_pred = test_pred_prop.argmax(1)
    test_pred = torch.Tensor.cpu(test_pred).detach().numpy()
    test_actual = G_test.edata.pop(cname_label)

    #### train
    train_pred_prop = model(G_train, node_features, edge_features).to(device)
    train_actual = G_train.edata.pop(cname_label)
    print(type(train_actual))
    print(len(train_actual))
    print(len(test_actual))
    #### create probs df
    test_prob_df = create_prob_df(
        'train',
        model,
        G_train,
        node_features,
        edge_features,
        train_actual
    )
    train_prob_df = create_prob_df(
        'test',
        model,
        G_test,
        node_features_test,
        edge_features_test,
        test_actual
    )
    prob_df = pd.concat([test_prob_df, train_prob_df], axis=0)
    return prob_df

# BoT

In [6]:
ds_name = 'NF-BoT-IoT'
n_epochs = 5000
device = 'cuda:0'

In [7]:
cname_label = 'Label'
bot_bin_prob_df = run_baseline(
    ds_name,
    cname_label,
    n_epochs
)
bot_bin_prob_df
bot_bin_prob_df.to_csv('../output/EGraphSAGE_nf_bot_binary.csv', index=False)



['FLOW_DURATION_MILLISECONDS', 'PROTOCOL', 'L7_PROTO', 'OUT_BYTES', 'IN_BYTES', 'TCP_FLAGS', 'IN_PKTS', 'OUT_PKTS']
['FLOW_DURATION_MILLISECONDS', 'PROTOCOL', 'L7_PROTO', 'OUT_BYTES', 'IN_BYTES', 'TCP_FLAGS', 'IN_PKTS', 'OUT_PKTS']
0010 - Training acc: 0.9569141864776611
0020 - Training acc: 0.7235525250434875
0030 - Training acc: 0.9064415097236633
0040 - Training acc: 0.8148398995399475
0050 - Training acc: 0.841542661190033
0060 - Training acc: 0.7265877723693848
0070 - Training acc: 0.687746524810791
0080 - Training acc: 0.794643223285675
0090 - Training acc: 0.7439563274383545
0100 - Training acc: 0.5243304967880249
0110 - Training acc: 0.7317202687263489
0120 - Training acc: 0.7791719436645508
0130 - Training acc: 0.7395641803741455
0140 - Training acc: 0.8919844031333923
0150 - Training acc: 0.8717424869537354
0160 - Training acc: 0.7458108067512512
0170 - Training acc: 0.7576683759689331
0180 - Training acc: 0.6392378211021423
0190 - Training acc: 0.7012990713119507
0200 - Trai

2010 - Training acc: 0.9419428706169128
2020 - Training acc: 0.9343345761299133
2030 - Training acc: 0.9177420735359192
2040 - Training acc: 0.8847332000732422
2050 - Training acc: 0.9381054043769836
2060 - Training acc: 0.9333323836326599
2070 - Training acc: 0.8790150880813599
2080 - Training acc: 0.9307804107666016
2090 - Training acc: 0.9335132837295532
2100 - Training acc: 0.9267786741256714
2110 - Training acc: 0.9322039484977722
2120 - Training acc: 0.9316278696060181
2130 - Training acc: 0.9207963347434998
2140 - Training acc: 0.9290211796760559
2150 - Training acc: 0.9229221343994141
2160 - Training acc: 0.9144212007522583
2170 - Training acc: 0.9295544028282166
2180 - Training acc: 0.9101195335388184
2190 - Training acc: 0.8983952403068542
2200 - Training acc: 0.9292734861373901
2210 - Training acc: 0.908705472946167
2220 - Training acc: 0.9147306680679321
2230 - Training acc: 0.9398146271705627
2240 - Training acc: 0.9146473407745361
2250 - Training acc: 0.9491678476333618
2

4070 - Training acc: 0.9294663071632385
4080 - Training acc: 0.9130499958992004
4090 - Training acc: 0.9080103039741516
4100 - Training acc: 0.8685644268989563
4110 - Training acc: 0.9384338855743408
4120 - Training acc: 0.9462540149688721
4130 - Training acc: 0.8806672096252441
4140 - Training acc: 0.9086911678314209
4150 - Training acc: 0.9364223480224609
4160 - Training acc: 0.8962075114250183
4170 - Training acc: 0.9198464751243591
4180 - Training acc: 0.9337013363838196
4190 - Training acc: 0.9454065561294556
4200 - Training acc: 0.9467611312866211
4210 - Training acc: 0.9444257616996765
4220 - Training acc: 0.9362009167671204
4230 - Training acc: 0.8982857465744019
4240 - Training acc: 0.927397608757019
4250 - Training acc: 0.9412263035774231
4260 - Training acc: 0.9201107025146484
4270 - Training acc: 0.9321396946907043
4280 - Training acc: 0.9375959634780884
4290 - Training acc: 0.9164375066757202
4300 - Training acc: 0.9028754830360413
4310 - Training acc: 0.9263620972633362
4

Unnamed: 0,probs_0,probs_1,gts,tvt
0,0.998853,1.146895e-03,0,train
1,0.994569,5.430795e-03,0,train
2,0.999999,7.975501e-07,0,train
3,0.994542,5.458216e-03,0,train
4,0.983893,1.610726e-02,0,train
...,...,...,...,...
180024,0.000000,1.000000e+00,1,test
180025,0.999744,2.565124e-04,0,test
180026,0.935530,6.447051e-02,0,test
180027,1.000000,6.924949e-08,0,test


In [9]:
cname_label = 'Attack'
bot_multi_prob_df = run_baseline(
    ds_name,
    cname_label,
    n_epochs
)
bot_multi_prob_df.to_csv('../output/EGraphSAGE_nf_bot_multi.csv', index=False)



['FLOW_DURATION_MILLISECONDS', 'PROTOCOL', 'L7_PROTO', 'OUT_BYTES', 'IN_BYTES', 'TCP_FLAGS', 'IN_PKTS', 'OUT_PKTS']
['FLOW_DURATION_MILLISECONDS', 'PROTOCOL', 'L7_PROTO', 'OUT_BYTES', 'IN_BYTES', 'TCP_FLAGS', 'IN_PKTS', 'OUT_PKTS']
0010 - Training acc: 0.09482274204492569
0020 - Training acc: 0.09626536816358566
0030 - Training acc: 0.09626536816358566
0040 - Training acc: 0.09650342166423798
0050 - Training acc: 0.6532894372940063
0060 - Training acc: 0.7228424549102783
0070 - Training acc: 0.7339621186256409
0080 - Training acc: 0.7535398602485657
0090 - Training acc: 0.75748211145401
0100 - Training acc: 0.7581415176391602
0110 - Training acc: 0.7686017155647278
0120 - Training acc: 0.7674209475517273
0130 - Training acc: 0.7795285582542419
0140 - Training acc: 0.7741317749023438
0150 - Training acc: 0.7720106840133667
0160 - Training acc: 0.7971494793891907
0170 - Training acc: 0.7939381003379822
0180 - Training acc: 0.8026081323623657
0190 - Training acc: 0.8043578267097473
0200 -

2010 - Training acc: 0.8291729092597961
2020 - Training acc: 0.8227667808532715
2030 - Training acc: 0.8209980130195618
2040 - Training acc: 0.8284658789634705
2050 - Training acc: 0.834931492805481
2060 - Training acc: 0.8285396695137024
2070 - Training acc: 0.8311582803726196
2080 - Training acc: 0.8258829116821289
2090 - Training acc: 0.8332055807113647
2100 - Training acc: 0.8274231553077698
2110 - Training acc: 0.8301417827606201
2120 - Training acc: 0.8340292572975159
2130 - Training acc: 0.8295037746429443
2140 - Training acc: 0.821500301361084
2150 - Training acc: 0.831432044506073
2160 - Training acc: 0.8289229273796082
2170 - Training acc: 0.8341435194015503
2180 - Training acc: 0.8296799659729004
2190 - Training acc: 0.821747899055481
2200 - Training acc: 0.8234618902206421
2210 - Training acc: 0.8283896446228027
2220 - Training acc: 0.8300489187240601
2230 - Training acc: 0.8353456854820251
2240 - Training acc: 0.8332031965255737
2250 - Training acc: 0.8264876008033752
2260

4070 - Training acc: 0.8285991549491882
4080 - Training acc: 0.8360313177108765
4090 - Training acc: 0.8204861879348755
4100 - Training acc: 0.8205528259277344
4110 - Training acc: 0.8212741613388062
4120 - Training acc: 0.8148775696754456
4130 - Training acc: 0.8336983323097229
4140 - Training acc: 0.8278802037239075
4150 - Training acc: 0.8288776874542236
4160 - Training acc: 0.8328818082809448
4170 - Training acc: 0.8282325267791748
4180 - Training acc: 0.8356337547302246
4190 - Training acc: 0.8304060101509094
4200 - Training acc: 0.8260138630867004
4210 - Training acc: 0.8205171227455139
4220 - Training acc: 0.8366811871528625
4230 - Training acc: 0.835831344127655
4240 - Training acc: 0.8286253213882446
4250 - Training acc: 0.8303131461143494
4260 - Training acc: 0.825092613697052
4270 - Training acc: 0.8323937654495239
4280 - Training acc: 0.8360955715179443
4290 - Training acc: 0.8354218602180481
4300 - Training acc: 0.8286967873573303
4310 - Training acc: 0.8277040719985962
43

In [10]:
bot_multi_prob_df

Unnamed: 0,probs_0,probs_1,probs_2,probs_3,probs_4,gts,tvt
0,0.704554,2.619193e-01,1.086907e-02,2.244496e-02,2.123700e-04,0,train
1,0.517022,4.539822e-01,1.341542e-02,1.558069e-02,3.908807e-09,0,train
2,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,train
3,0.513929,4.565644e-01,1.373505e-02,1.577194e-02,3.961761e-09,0,train
4,0.165381,7.686258e-01,3.182042e-02,3.212402e-02,2.048445e-03,0,train
...,...,...,...,...,...,...,...
180025,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,4,test
180026,0.919621,6.639876e-02,1.197097e-03,1.278253e-02,5.427218e-07,0,test
180027,0.143455,7.962033e-01,3.180051e-02,2.622961e-02,2.312066e-03,0,test
180028,1.000000,3.190462e-40,6.458865e-41,5.076800e-08,2.220121e-30,0,test


# ToN

In [8]:
ds_name = 'NF-ToN-IoT'
n_epochs = 5000
device = 'cuda:0'

In [7]:
cname_label = 'Label'
ton_bin_prob_df = run_baseline(
    ds_name,
    cname_label,
    n_epochs
)
ton_bin_prob_df.to_csv('../output/EGraphSAGE_nf_ton_binary.csv', index=False)



['PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'OUT_PKTS', 'IN_PKTS', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS']
['PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'OUT_PKTS', 'IN_PKTS', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS']
0010 - Training acc: 0.5341001749038696
0020 - Training acc: 0.45612844824790955
0030 - Training acc: 0.7232189178466797
0040 - Training acc: 0.5973123908042908
0050 - Training acc: 0.8347201943397522
0060 - Training acc: 0.428486168384552
0070 - Training acc: 0.7767063975334167
0080 - Training acc: 0.5672537088394165
0090 - Training acc: 0.5369464159011841
0100 - Training acc: 0.6157498955726624
0110 - Training acc: 0.5816155076026917
0120 - Training acc: 0.37375369668006897
0130 - Training acc: 0.5792653560638428
0140 - Training acc: 0.4614553451538086
0150 - Training acc: 0.8800738453865051
0160 - Training acc: 0.899925172328949
0170 - Training acc: 0.8864323496818542
0180 - Training acc: 0.906700074672699
0190 - Training acc: 0.9143035411834717
0200 - Tr

2010 - Training acc: 0.9697931408882141
2020 - Training acc: 0.9828830361366272
2030 - Training acc: 0.986673891544342
2040 - Training acc: 0.9875863790512085
2050 - Training acc: 0.984291672706604
2060 - Training acc: 0.9713716506958008
2070 - Training acc: 0.9821310639381409
2080 - Training acc: 0.9891576766967773
2090 - Training acc: 0.9713291525840759
2100 - Training acc: 0.9833709001541138
2110 - Training acc: 0.9879520535469055
2120 - Training acc: 0.9830560088157654
2130 - Training acc: 0.9829503893852234
2140 - Training acc: 0.9849421381950378
2150 - Training acc: 0.9881581664085388
2160 - Training acc: 0.9889691472053528
2170 - Training acc: 0.9830933213233948
2180 - Training acc: 0.9803371429443359
2190 - Training acc: 0.9813739657402039
2200 - Training acc: 0.9862989783287048
2210 - Training acc: 0.9829089045524597
2220 - Training acc: 0.9833750128746033
2230 - Training acc: 0.9824532270431519
2240 - Training acc: 0.9838918447494507
2250 - Training acc: 0.9840078949928284
22

4070 - Training acc: 0.9913855791091919
4080 - Training acc: 0.9834475517272949
4090 - Training acc: 0.9835935831069946
4100 - Training acc: 0.9895553588867188
4110 - Training acc: 0.9904430508613586
4120 - Training acc: 0.9838794469833374
4130 - Training acc: 0.971332311630249
4140 - Training acc: 0.9822242856025696
4150 - Training acc: 0.9879748225212097
4160 - Training acc: 0.9896144270896912
4170 - Training acc: 0.9833439588546753
4180 - Training acc: 0.991696298122406
4190 - Training acc: 0.9918485283851624
4200 - Training acc: 0.9831771850585938
4210 - Training acc: 0.9847214818000793
4220 - Training acc: 0.9816712141036987
4230 - Training acc: 0.9866220951080322
4240 - Training acc: 0.9887112379074097
4250 - Training acc: 0.990348756313324
4260 - Training acc: 0.9830808639526367
4270 - Training acc: 0.9898298382759094
4280 - Training acc: 0.9913855791091919
4290 - Training acc: 0.9919241666793823
4300 - Training acc: 0.9816049337387085
4310 - Training acc: 0.9836556911468506
432

In [9]:
n_epochs = 10000

In [10]:
cname_label = 'Attack'
ton_multi_prob_df = run_baseline(
    ds_name,
    cname_label,
    n_epochs
)
ton_multi_prob_df.to_csv('../output/EGraphSAGE_nf_ton_multi.csv', index=False)



['OUT_BYTES', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES', 'OUT_PKTS', 'IN_PKTS', 'PROTOCOL', 'TCP_FLAGS', 'L7_PROTO']
['OUT_BYTES', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES', 'OUT_PKTS', 'IN_PKTS', 'PROTOCOL', 'TCP_FLAGS', 'L7_PROTO']
0010 - Training acc: 0.07244893908500671
0020 - Training acc: 0.07244893908500671
0030 - Training acc: 0.07244893908500671
0040 - Training acc: 0.07254216074943542
0050 - Training acc: 0.07254216074943542
0060 - Training acc: 0.07254733890295029
0070 - Training acc: 0.07256080210208893
0080 - Training acc: 0.07258358597755432
0090 - Training acc: 0.07260948419570923
0100 - Training acc: 0.07263537496328354
0110 - Training acc: 0.07267266511917114
0120 - Training acc: 0.07271409034729004
0130 - Training acc: 0.07276173681020737
0140 - Training acc: 0.0728052407503128
0150 - Training acc: 0.07647388428449631
0160 - Training acc: 0.07289949059486389
0170 - Training acc: 0.080472931265831
0180 - Training acc: 0.08319281786680222
0190 - Training acc: 0.0833015739

1970 - Training acc: 0.32117703557014465
1980 - Training acc: 0.31641775369644165
1990 - Training acc: 0.31615155935287476
2000 - Training acc: 0.3201226592063904
2010 - Training acc: 0.3194297254085541
2020 - Training acc: 0.3188673257827759
2030 - Training acc: 0.2876621186733246
2040 - Training acc: 0.3173644244670868
2050 - Training acc: 0.2789483070373535
2060 - Training acc: 0.31663838028907776
2070 - Training acc: 0.3190433979034424
2080 - Training acc: 0.3207451403141022
2090 - Training acc: 0.31701332330703735
2100 - Training acc: 0.32291609048843384
2110 - Training acc: 0.3225908577442169
2120 - Training acc: 0.3195115625858307
2130 - Training acc: 0.32014337182044983
2140 - Training acc: 0.3189760744571686
2150 - Training acc: 0.3140324056148529
2160 - Training acc: 0.31765133142471313
2170 - Training acc: 0.28862327337265015
2180 - Training acc: 0.31409040093421936
2190 - Training acc: 0.314209520816803
2200 - Training acc: 0.3221299350261688
2210 - Training acc: 0.31667357

4000 - Training acc: 0.32266438007354736
4010 - Training acc: 0.3238834738731384
4020 - Training acc: 0.33193543553352356
4030 - Training acc: 0.32844701409339905
4040 - Training acc: 0.32761943340301514
4050 - Training acc: 0.3239104151725769
4060 - Training acc: 0.32379958033561707
4070 - Training acc: 0.32638999819755554
4080 - Training acc: 0.32584312558174133
4090 - Training acc: 0.3263092041015625
4100 - Training acc: 0.3260502815246582
4110 - Training acc: 0.3257405757904053
4120 - Training acc: 0.3207409977912903
4130 - Training acc: 0.32323405146598816
4140 - Training acc: 0.3280844986438751
4150 - Training acc: 0.33093902468681335
4160 - Training acc: 0.3290487825870514
4170 - Training acc: 0.33034244179725647
4180 - Training acc: 0.3281010687351227
4190 - Training acc: 0.33217158913612366
4200 - Training acc: 0.3343176543712616
4210 - Training acc: 0.32174670696258545
4220 - Training acc: 0.3256608247756958
4230 - Training acc: 0.32666343450546265
4240 - Training acc: 0.3274

6030 - Training acc: 0.36406245827674866
6040 - Training acc: 0.3638812005519867
6050 - Training acc: 0.3632742464542389
6060 - Training acc: 0.36437422037124634
6070 - Training acc: 0.3640303313732147
6080 - Training acc: 0.3591218888759613
6090 - Training acc: 0.3626859188079834
6100 - Training acc: 0.3617610037326813
6110 - Training acc: 0.36361292004585266
6120 - Training acc: 0.3281083106994629
6130 - Training acc: 0.3605419099330902
6140 - Training acc: 0.35807061195373535
6150 - Training acc: 0.36347103118896484
6160 - Training acc: 0.36350417137145996
6170 - Training acc: 0.3632524907588959
6180 - Training acc: 0.3631572127342224
6190 - Training acc: 0.3617558181285858
6200 - Training acc: 0.3529881536960602
6210 - Training acc: 0.3623410165309906
6220 - Training acc: 0.36293554306030273
6230 - Training acc: 0.3636305332183838
6240 - Training acc: 0.3602684736251831
6250 - Training acc: 0.3653654158115387
6260 - Training acc: 0.362265408039093
6270 - Training acc: 0.35257905721

8060 - Training acc: 0.36804699897766113
8070 - Training acc: 0.3612907826900482
8080 - Training acc: 0.36709100008010864
8090 - Training acc: 0.3682582974433899
8100 - Training acc: 0.36838775873184204
8110 - Training acc: 0.36873161792755127
8120 - Training acc: 0.3687274754047394
8130 - Training acc: 0.3684975504875183
8140 - Training acc: 0.36819717288017273
8150 - Training acc: 0.36870884895324707
8160 - Training acc: 0.36755189299583435
8170 - Training acc: 0.37375712394714355
8180 - Training acc: 0.36560365557670593
8190 - Training acc: 0.36744004487991333
8200 - Training acc: 0.3689097762107849
8210 - Training acc: 0.3678305149078369
8220 - Training acc: 0.3672422170639038
8230 - Training acc: 0.3629935383796692
8240 - Training acc: 0.36678647994995117
8250 - Training acc: 0.3689180612564087
8260 - Training acc: 0.36732715368270874
8270 - Training acc: 0.3686860501766205
8280 - Training acc: 0.3642737567424774
8290 - Training acc: 0.3686560094356537
8300 - Training acc: 0.36821

# Misc

In [18]:
ds_name = 'NF-BoT-IoT'
cname_label = 'Label'
cname_tvt = f'{cname_label}_tvt'
n_epochs = 10
device = 'cuda:0'

data = pd.read_csv(f'../datasets/{ds_name}_tvt.csv')
label2idx = pd.read_pickle(f'../datasets/{ds_name}_graph_multi.pkl')['label2idx']
data['Attack'] = data['Attack'].map(label2idx)

print(data.shape)
data.head()

(600100, 16)


Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack,Label_tvt,Attack_tvt
0,192.168.100.6,52670,192.168.100.1,53,17,5.212,71,126,1,1,0,4294966,0,0,train,train
1,192.168.100.6,49160,192.168.100.149,4444,6,0.0,217753000,199100,4521,4049,24,4176249,1,4,test,test
2,192.168.100.46,3456,192.168.100.5,80,17,0.0,8508021,8918372,9086,9086,0,4175916,0,0,train,train
3,192.168.100.3,80,192.168.100.55,8080,6,7.0,8442138,9013406,9086,9086,0,4175916,0,0,train,train
4,192.168.100.46,80,192.168.100.5,80,6,7.0,8374706,0,9086,0,0,4175916,0,0,train,train


In [19]:
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(
    lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'], inplace=True)

In [20]:
X_cnames = [c for c in data.columns if c not in ['Label_tvt', 'Attack_tvt']]
X_train, X_test, y_train, y_test = (data[data[cname_tvt]!='test'][X_cnames], 
                                    data[data[cname_tvt]=='test'][X_cnames], 
                                    data[data[cname_tvt]!='test'][cname_label], 
                                    data[data[cname_tvt]=='test'][cname_label])

In [36]:
def build_graph(scaler, encoder, X, y, cols_to_norm):
    X = encoder.transform(X)
    print(cols_to_norm)
    X[cols_to_norm] = scaler.transform(X[cols_to_norm])
    X['h'] = X[cols_to_norm].values.tolist()
    X['h'] = X['h'].apply(lambda x: torch.tensor(x))
    print(X, X.shape)
    G = nx.from_pandas_edgelist(
        X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.MultiGraph())
    G = G.to_directed()
    print(G.number_of_nodes())
    print(G.number_of_edges())
    G = from_networkx(G, edge_attrs=['h', cname_label])
    print('G nodes', G.number_of_nodes())
    print('G edges', G.number_of_edges())
    # Eq1
    G.ndata['h'] = torch.ones(G.num_nodes(), G.edata['h'].shape[1])
    G.edata['train_mask'] = torch.ones(len(G.edata['h']), dtype=torch.bool)
    print('G ndata', G.ndata['h'].shape)
    print('G edata', G.edata['train_mask'].shape)
    
    G.ndata['h'] = torch.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1, G.ndata['h'].shape[1]))
    G.edata['h'] = torch.reshape(G.edata['h'], (G.edata['h'].shape[0], 1, G.edata['h'].shape[1]))
    print('G ndata', G.ndata['h'].shape)
    print('G edata', G.edata['h'].shape)
    G = G.to(device)
    
    return G

In [37]:
G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm)

['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']
               IPV4_SRC_ADDR         IPV4_DST_ADDR  PROTOCOL  L7_PROTO  \
0       172.16.165.142:52670      192.168.100.1:53 -2.268430 -0.255669   
2        172.19.188.130:3456      192.168.100.5:80 -2.268430 -0.227547   
3          172.30.194.190:80   192.168.100.55:8080 -2.177751 -0.227166   
4            172.19.59.45:80      192.168.100.5:80 -2.177751 -0.227166   
5           172.29.116.152:0       192.168.100.3:0 -2.177751 -0.227547   
...                      ...                   ...       ...       ...   
600091    172.28.202.83:3456      192.168.100.5:80 -2.268430 -0.227547   
600092   172.29.191.120:8080      192.168.100.3:80 -2.177751 -0.227166   
600094      172.27.119.97:80      192.168.100.3:80 -2.177751 -0.227166   
600095     172.19.103.242:80      192.168.100.5:80 -2.177751 -0.227166   
600099  172.29.164.238:49160  192.168.100.149:4444 -2.177751 -0.227547

In [33]:
G_train.number_of_edges()

840140

In [47]:
G.number_of_edges()

420070

In [44]:
X.shape

(420070, 13)

In [48]:
X = encoder.transform(X_train)
print(cols_to_norm)
X[cols_to_norm] = scaler.transform(X[cols_to_norm])
X['h'] = X[cols_to_norm].values.tolist()
X['h'] = X['h'].apply(lambda x: torch.tensor(x))
G = nx.from_pandas_edgelist(
    X, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h', cname_label], create_using=nx.DiGraph())
# G = G.to_directed()

['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']


In [21]:
cols_to_norm = list(set(X_train.columns) - set(['Label', 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']))
scaler = StandardScaler()
scaler.fit(X_train[cols_to_norm])
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
encoder.fit(X_train, y_train)

G_train = build_graph(scaler, encoder, X_train, y_train, cols_to_norm)
G_test = build_graph(scaler, encoder, X_test, y_test, cols_to_norm)

node_features = G_train.ndata['h']
edge_features = G_train.edata['h']

node_features_test = G_test.ndata['h']
edge_features_test = G_test.edata['h']



['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']
['OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'PROTOCOL', 'L7_PROTO', 'FLOW_DURATION_MILLISECONDS', 'IN_BYTES']


In [27]:
print(X_train.shape)
print(X_test.shape)
print(edge_features.shape)
print(edge_features_test.shape)

(420070, 12)
(180030, 12)
torch.Size([840140, 1, 8])
torch.Size([360060, 1, 8])


In [22]:
ndim_in = G_train.ndata['h'].shape[2]
ndim_out = 128 
edim = G_train.ndata['h'].shape[2]
activation = F.relu
dropout = 0.2
n_classes = data[cname_label].nunique()

sage = SAGE(ndim_in, ndim_out, edim, activation, dropout).to(device)
mlp = MLPPredictor(ndim_out, 2).to(device)
model = Model(ndim_in, ndim_out, edim, n_classes, activation, dropout).to(device)
opt = torch.optim.Adam(model.parameters())

class_weights = class_weight.compute_class_weight(
    'balanced', 
    classes=data[cname_label].unique(),
    y=data[cname_label].values.tolist(),
)
class_weights = torch.FloatTensor(class_weights).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [23]:
edge_label = G_train.edata[cname_label]
train_mask = G_train.edata['train_mask']

for epoch in range(1, n_epochs+1):
    pred = model(G_train, node_features, edge_features).to(device)
    loss = criterion(pred[train_mask], edge_label[train_mask])
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 10 == 0:
        print(f'{epoch:04d} - Training acc:', compute_accuracy(pred[train_mask], edge_label[train_mask]))

0010 - Training acc: 0.8192979693412781


In [24]:
train_pred_prop = model(G_train, node_features, edge_features).to(device)
train_pred = train_pred_prop.argmax(1)
train_pred = torch.Tensor.cpu(train_pred).detach().numpy()
train_actual = G_train.edata.pop(cname_label)

In [25]:
test_pred_prop = model(G_test, node_features_test, edge_features_test).to(device)
test_pred = test_pred_prop.argmax(1)
test_pred = torch.Tensor.cpu(test_pred).detach().numpy()
test_actual = G_test.edata.pop(cname_label)

# actual = ["Normal" if i == 0 else "Attack" for i in actual]
# test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

NameError: name 'actual' is not defined

In [None]:
norm_test_pred_prop = torch.softmax(test_pred_prop, dim=1)

In [13]:
test_pred_prop

tensor([[ 3.6942e+01, -4.6637e+01],
        [ 3.9542e+01, -4.8324e+01],
        [ 1.9192e+01, -1.9510e+01],
        ...,
        [ 1.3778e+00, -1.5827e+00],
        [ 2.9585e+00, -3.1632e+00],
        [-1.5411e-01, -1.5061e-02]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [14]:
norm_test_pred_prop

tensor([[1.0000e+00, 5.0369e-37],
        [1.0000e+00, 6.9235e-39],
        [1.0000e+00, 1.5552e-17],
        ...,
        [9.5076e-01, 4.9239e-02],
        [9.9781e-01, 2.1900e-03],
        [4.6529e-01, 5.3471e-01]], device='cuda:0', grad_fn=<SoftmaxBackward0>)