In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, re, datetime, random, gzip, json, copy
import tqdm
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from itertools import accumulate
import argparse
from time import time
from math import ceil
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import Linear

import pytorch_lightning as pl
from pytorch_lightning.trainer.trainer import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.utilities.seed import seed_everything

from torch_geometric.data import Data, LightningLinkData
from torch_geometric.loader import DataLoader
from torch_geometric.nn import Sequential, HeteroConv, GINConv, GCNConv, SAGEConv, GATConv, TransformerConv

from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, roc_auc_score
from sklearn.utils import class_weight

PROJ_PATH = Path(os.path.join(re.sub("/TS-IDS.*$", '', os.getcwd()), 'TS-IDS'))
print(f'PROJ_PATH={PROJ_PATH}')
sys.path.insert(1, str(PROJ_PATH))
sys.path.insert(1, str(PROJ_PATH/'src'))
import utils
from utils import *
from dataset import build_datamodule
from trainer import build_trainer
from model import TSIDS
from pipeline import TSIDSPipeline

PROJ_PATH=/home/hoang/github/TS-IDS


In [2]:
ds_names = ['nf_bot_binary', 'nf_bot_multi', 'nf_ton_binary', 'nf_ton_multi']
for name in ds_names:
    print(name)
    ###
    config_path = str(PROJ_PATH / f'src/config/{name}.json')
    data_config = utils.read_json(config_path)
    g_data = pd.read_pickle(
        os.path.join(data_config['root'], data_config['ds_name']+'.pkl'))
    x = torch.tensor(g_data['n_features'], dtype=torch.float)
    edge_index = torch.tensor(g_data['edge_index'], dtype=torch.long)
    edge_attr = torch.tensor(g_data['e_features'], dtype=torch.float)
    y = torch.tensor(g_data['node_label'], dtype=torch.long)
    input_train_edges = torch.tensor(g_data['edge_index'][:, np.where(g_data['tvt']=='train')[0]], dtype=torch.long)
    input_train_labels = torch.tensor(g_data['edge_label'][np.where(g_data['tvt']=='train')[0]], dtype=torch.long)
    input_val_edges = torch.tensor(g_data['edge_index'][:, np.where(g_data['tvt']=='val')[0]], dtype=torch.long)
    input_val_labels = torch.tensor(g_data['edge_label'][np.where(g_data['tvt']=='val')[0]], dtype=torch.long)
    input_test_edges = torch.tensor(g_data['edge_index'][:, np.where(g_data['tvt']=='test')[0]], dtype=torch.long)
    input_test_labels = torch.tensor(g_data['edge_label'][np.where(g_data['tvt']=='test')[0]], dtype=torch.long)
    ###
    print(x.shape, edge_index.shape, edge_attr.shape)
    print(torch.isnan(x).any(), torch.isnan(edge_index).any(), torch.isnan(edge_attr).any())
    ###
    classes = np.unique(input_train_labels)
    y = input_train_labels.cpu().numpy()
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=y)
    print(classes)
    print(class_weights)
    print(Counter(input_train_labels.cpu().numpy()))
    print('\n')

nf_bot_binary
torch.Size([77177, 8]) torch.Size([2, 600100]) torch.Size([600100, 8])
tensor(False) tensor(False) tensor(False)
[0 1]
[21.6517535   0.51181935]
Counter({1: 293121, 0: 6929})


nf_bot_multi
torch.Size([77177, 32]) torch.Size([2, 600100]) torch.Size([600100, 8])
tensor(False) tensor(False) tensor(False)
[0 1 2 3 4]
[ 8.6607014   2.10465402  2.12086941  0.25499386 61.61190965]
Counter({3: 235339, 1: 28513, 2: 28295, 0: 6929, 4: 974})


nf_ton_binary
torch.Size([169562, 8]) torch.Size([2, 1379274]) torch.Size([1379274, 8])
tensor(False) tensor(False) tensor(False)
[0 1]
[2.55158762 0.62185685]
Counter({1: 554499, 0: 135139})


nf_ton_multi
torch.Size([169562, 72]) torch.Size([2, 1379274]) torch.Size([1379274, 8])
tensor(False) tensor(False) tensor(False)
[0 1 2 3 4 5 6 7 8 9]
[5.10317525e-01 8.01531846e+00 4.23311543e-01 7.82612347e+00
 2.94324661e-01 1.08433648e+02 8.77558344e-01 8.51404938e+02
 6.40272955e+00 1.38531598e+00]
Counter({4: 234312, 2: 162915, 0: 135139, 6: 785

In [1]:
# ds_names = ['nf_bot_binary', 'nf_bot_multi', 'nf_ton_binary', 'nf_ton_multi']
# name = ds_names[0]
# config_path = str(PROJ_PATH / f'src/config/{name}.json')
# tsids = TSIDSPipeline(config_path=config_path)
# data_module, model_module, trainer = tsids.initialize()
# tsids.train(data_module, model_module, trainer)

In [4]:
g_data

{'n_features': array([[-0.42454842, -0.42454842, -0.42454842, ..., -0.01798697,
         -0.35426996, -0.6061237 ],
        [-0.42454842, -0.42454842, -0.42454842, ..., -0.01798697,
         -0.35426996, -0.6061237 ],
        [-0.42454842, -0.42454842, -0.42454842, ..., -0.01798697,
         -0.35426996, -0.6061237 ],
        ...,
        [-0.43361206, -0.43361206, -0.43361206, ..., -0.01798697,
         -0.35426996, -0.6061237 ],
        [-0.43344499, -0.43344499, -0.43344499, ..., -0.01798697,
         -0.35426996, -0.6061237 ],
        [-0.43369559, -0.43369559, -0.43369559, ..., -0.01798697,
         -0.35426996, -0.6061237 ]]),
 'e_features': array([[-0.05243373, -0.00737229, -0.00544518, ..., -0.00621386,
          0.3048566 , -2.37758476],
        [-0.0551884 , -0.00805565, -0.00576938, ..., -0.00680971,
         -2.77156946, -1.14868915],
        [-0.0551884 , -0.00805565, -0.00576938, ..., -0.00680971,
         -2.77156946, -1.14868915],
        ...,
        [ 0.02615441, -0.0

In [10]:
config = utils.read_json('../src/config/nf_ton_binary.json')
tsids = TSIDSPipeline(config_dict=config)
data_module, model_module, trainer = tsids.initialize()

Global seed set to 2022


x: torch.Size([169562, 8])
edge_index: torch.Size([2, 1379274])
edge_attr: torch.Size([1379274, 8])
y: torch.Size([169562])
input_train_edges: torch.Size([2, 689638])
input_val_edges: torch.Size([2, 275854])
input_test_edges: torch.Size([2, 413782])
input_train_labels: torch.Size([689638])
input_val_labels: torch.Size([275854])
input_test_labels: torch.Size([413782])


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
for batch in data_module.train_dataloader():
    break

In [12]:
batch

Data(x=[33491, 8], edge_index=[2, 76336], edge_attr=[76336, 8], y=[33491], edge_label_index=[2, 32768], edge_label=[32768])

In [None]:
https://stackoverflow.com/questions/59705001/torch-find-indices-of-matching-rows-in-2-2d-tensors

In [36]:
edge_index_dense = batch.edge_index
edge_index2_dense = batch.edge_label_index
arr1 = edge_index_dense.numpy().view(np.int32)
arr2 = edge_index2_dense.numpy().view(np.int32)
arr1_view = arr1.view([('', arr1.dtype)] * arr1.shape[1])
arr2_view = arr2.view([('', arr2.dtype)] * arr2.shape[1])
intersected = np.intersect1d(arr1_view, arr2_view, return_indices=True)

TypeError: invalid type promotion

In [22]:
batch.edge_attr

tensor([[ 1.0524e+00, -7.3086e-03, -5.1210e-03,  ..., -6.8097e-03,
         -7.2393e+00, -1.1487e+00],
        [ 1.9480e+01, -1.8881e-03, -1.8790e-03,  ..., -6.8097e-03,
         -7.2393e+00, -1.1487e+00],
        [-5.5188e-02, -7.6792e-03, -5.4452e-03,  ..., -6.8097e-03,
         -7.2393e+00, -1.1487e+00],
        ...,
        [ 2.0601e-01,  1.7559e-01,  6.0367e-02,  ..., -6.8097e-03,
          3.0486e-01, -5.8427e-01],
        [-3.7304e-02,  1.0158e-02,  1.6600e-02,  ..., -6.8097e-03,
          3.0486e-01,  6.5093e-01],
        [-5.5188e-02, -8.1888e-03, -5.7694e-03,  ..., -6.2139e-03,
          3.0486e-01,  6.0939e-01]])

In [20]:
batch.edge_label_index

tensor([[18627, 23021, 12467,  ..., 21775, 13484, 21926],
        [  799,  2186,   103,  ..., 26788,   103,   103]])

In [21]:
edge_index

tensor([[ 52185, 167014, 167018,  ...,  94061,  94062,  94063],
        [169274,  58185, 169150,  ...,  39676,  39676,  39676]])

In [28]:
len(edge_index[0])

1379274

In [29]:
l = len(edge_index[0])
tuple_to_idx = {}
for i, (s, d) in enumerate(zip(edge_index[0], edge_index[1])):
    tuple_to_idx[(s, d)] = i

In [31]:
len(tuple_to_idx.keys())

1379274

In [None]:
for (s, d) in zip(batch.edge_label_index[0], batch.edge_label_index[1]):
    