In [None]:
import os

import pandas as pd

from framework import NamedDatasetSpecifications, EvaluationDatasetSampling, FlowTransformer, FlowTransformerParameters
from implementations.classification_heads import *
from implementations.input_encodings import *
from implementations.pre_processings import StandardPreProcessing
from implementations.transformers.basic_transformers import BasicTransformer
from implementations.transformers.named_transformers import *


In [None]:
# Candidate components for the FlowTransformer. Later cells pick one entry
# from each list by position, so the order of every list matters.

# Input encodings (indices 0-5).
encodings = [
    NoInputEncoder(),                     # 0
    RecordLevelEmbed(64),                 # 1
    # 2-4: one CategoricalFeatureEmbed per embed-layer type, all with size 16
    *(
        CategoricalFeatureEmbed(layer_type, 16)
        for layer_type in (
            EmbedLayerType.Dense,
            EmbedLayerType.Lookup,
            EmbedLayerType.Projection,
        )
    ),
    RecordLevelEmbed(64, project=True),   # 5
]

# Classification heads (indices 0-5).
classification_heads = [
    LastTokenClassificationHead(),             # 0
    FlattenClassificationHead(),               # 1
    GlobalAveragePoolingClassificationHead(),  # 2
    CLSTokenClassificationHead(),              # 3
    # 4-5: FeaturewiseEmbedding without, then with, projection
    *(FeaturewiseEmbedding(project=use_projection) for use_projection in (False, True)),
]

# Sequence models (indices 0-3).
transformers = [
    BasicTransformer(2, 128, n_heads=2),                   # 0
    BasicTransformer(2, 128, n_heads=2, is_decoder=True),  # 1
    GPTSmallTransformer(),                                 # 2
    BERTSmallTransformer(),                                # 3
]

In [None]:
# Create the local dataset directory (relative to the current working directory).
# exist_ok=True keeps this cell idempotent: re-running it after the directory
# already exists is a no-op instead of printing a "File exists" error.
os.makedirs("dataset", exist_ok=True)

In [None]:
! mv /content/drive/MyDrive/NF-UNSW-NB15-v2.csv dataset/.

In [None]:
# Directory that holds the flow CSV files.
flow_file_path = r"/content/dataset"

# One tuple per dataset, in the order the loading cell unpacks it:
# (name, CSV path, dataset specification, evaluation percent, evaluation sampling method).
datasets = [
    (
        "UNSW_NB15",
        os.path.join(flow_file_path, "NF-UNSW-NB15-v2.csv"),
        NamedDatasetSpecifications.unified_flow_format,
        0.025,
        EvaluationDatasetSampling.LastRows,
    ),
]


In [None]:
# Pick the dataset to load (first entry of `datasets`) and where to cache it.
dataset_name, dataset_path, dataset_specification, eval_percent, eval_method = datasets[0]
cache_folder = '/content/cache_folder'

# Pre-processing and model hyper-parameters.
pre_processing = StandardPreProcessing(n_categorical_levels=32)
ft_params = FlowTransformerParameters(window_size=8, mlp_layer_sizes=[128], mlp_dropout=0.1)

# Assemble the FlowTransformer from the first option in each component list.
ft = FlowTransformer(
    pre_processing=pre_processing,
    input_encoding=encodings[0],
    sequential_model=transformers[0],
    classification_head=classification_heads[0],
    params=ft_params,
)

# Load the selected dataset. This stays the cell's last expression so that
# whatever load_dataset returns is displayed as the cell output.
ft.load_dataset(
    dataset_name,
    dataset_path,
    dataset_specification,
    cache_folder,
    evaluation_dataset_sampling=eval_method,
    evaluation_percent=eval_percent,
)


Using cache file path: /content/cache_folder/UNSW_NB15_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_VzQ981ONg0PHPxtLtik6rZN0dGw0.feather
Attempting to read dataset from path /content/dataset/NF-UNSW-NB15-v2.csv...
Set y to = Attack
Converting numerical columns to floats, and removing out of range values...
Applying pre-processing to numerical values
[Numerical 1 / 28] Processing numerical column LONGEST_FLOW_PKT...
[Numerical 2 / 28] Processing numerical column DURATION_IN...
[Numerical 3 / 28] Processing numerical column IN_PKTS...
[Numerical 4 / 28] Processing numerical column NUM_PKTS_128_TO_256_BYTES...
[Numerical 5 / 28] Processing numerical column MAX_TTL...
[Numerical 6 / 28] Processing numerical column OUT_PKTS...
[Numerical 7 / 28] Processing numerical column MIN_IP_PKT_LEN...
[Numerical 8 / 28] Processing numerical column NUM_PKTS_256_TO_512_BYTES...
[Numerical 9 / 28] Processing numerical column IN_BYTES...
[Numerical 10 / 28] Processing numerical column FLOW_DURATION_MILLISECONDS...
[Num

Unnamed: 0,LONGEST_FLOW_PKT,DURATION_IN,IN_PKTS,NUM_PKTS_128_TO_256_BYTES,MAX_TTL,OUT_PKTS,MIN_IP_PKT_LEN,NUM_PKTS_256_TO_512_BYTES,IN_BYTES,FLOW_DURATION_MILLISECONDS,...,L7_PROTO_23,L7_PROTO_24,L7_PROTO_25,L7_PROTO_26,L7_PROTO_27,L7_PROTO_28,L7_PROTO_29,L7_PROTO_30,L7_PROTO_31,L7_PROTO_32
0,0.565324,0.0,0.000000,0.000000,0.630549,0.148859,0.608608,0.000000,0.127562,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.565324,0.0,0.160324,0.000000,0.630549,0.223288,0.608608,0.000000,0.323054,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.575948,0.0,0.218877,0.000000,0.630549,0.266827,0.608608,0.000000,0.358547,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.598516,0.0,0.255508,0.000000,0.630549,0.297718,0.608608,0.000000,0.380413,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.598516,0.0,0.293311,0.000000,0.630549,0.331913,0.608608,0.000000,0.402810,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390270,0.915812,0.0,0.247534,0.192724,0.630549,0.290788,0.608608,0.000000,0.404639,0.0,...,0,0,0,0,0,0,0,0,0,0
2390271,0.565324,0.0,0.069048,0.000000,0.625000,0.117968,0.659772,0.000000,0.289329,0.0,...,0,0,0,0,0,0,0,0,0,0
2390272,0.999258,0.0,0.229372,0.000000,0.630549,0.257484,0.608608,0.132745,0.400688,0.0,...,0,0,0,0,0,0,0,0,0,0
2390273,0.984754,0.0,0.420327,0.000000,0.630549,0.457722,0.608608,0.000000,0.481723,0.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Build the model from the FlowTransformer (`ft`) configured earlier and keep
# it in `m`, which the compile/evaluate cell uses; then print its layer summary.
m = ft.build_model()
m.summary()




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_LONGEST_FLOW_PKT (InputL  [(None, 8, 1)]      0           []                               
 ayer)                                                                                            
                                                                                                  
 input_DURATION_IN (InputLayer)  [(None, 8, 1)]      0           []                               
                                                                                                  
 input_IN_PKTS (InputLayer)     [(None, 8, 1)]       0           []                               
                                                                                                  
 input_NUM_PKTS_128_TO_256_BYTE  [(None, 8, 1)]      0           []                           

In [None]:
# Compile the model for binary classification: Adam optimizer, binary
# cross-entropy loss, binary accuracy as the reported metric, and JIT
# compilation requested via jit_compile=True.
m.compile(optimizer="adam", loss='binary_crossentropy', metrics=['binary_accuracy'], jit_compile=True)

# Train and evaluate. `eval_results` is annotated on its own line because a
# tuple-unpacking target cannot carry a type annotation.
# Note: early_stopping_patience equals epochs here, so with these settings
# early stopping presumably never triggers before the final epoch.
eval_results: pd.DataFrame
train_results, eval_results, final_epoch = ft.evaluate(
    m,
    batch_size=128,
    epochs=5,
    steps_per_epoch=64,
    early_stopping_patience=5,
)

# Bare last expression: the notebook renders the DataFrame with its rich
# (table) display instead of the plain-text repr that print() produces.
eval_results

Building eval dataset...
Splitting dataset to featurewise...
Evaluation dataset is built!
Positive samples in eval set: 2816
Negative samples in eval set: 56940
Epoch = 0 / 5 (early stop in 5), step = 0, loss = 0.70432, results = [0.7043224573135376, 0.4921875] -- elapsed (train): 0.00s
Epoch = 0 / 5 (early stop in 5), step = 31, loss = 0.12169, results = [0.12169043719768524, 0.9609375] -- elapsed (train): 1.02s
Epoch = 0 / 5 (early stop in 5), step = 63, loss = 0.01447, results = [0.014470361173152924, 1.0] -- elapsed (train): 2.02s
Epoch = 1 / 5 (early stop in 5), step = 28, loss = 0.08485, results = [0.08485106378793716, 0.9765625] -- elapsed (train): 2.96s
Epoch = 1 / 5 (early stop in 5), step = 50, loss = 0.08263, results = [0.08262961357831955, 0.984375] -- elapsed (train): 3.88s
Epoch = 2 / 5 (early stop in 5), step = 18, loss = 0.00547, results = [0.005471301265060902, 1.0] -- elapsed (train): 4.87s
Epoch = 2 / 5 (early stop in 5), step = 49, loss = 0.00368, results = [0.00368