# Dataset Preprocessing

In this notebook, we preprocess all standardized NetFlow datasets (version 2) to make them suitable for analysis by Large Language Models. Each dataset is stored locally in the efficient streaming *Arrow* format and published to Hugging Face as a private dataset.

The resulting datasets share a uniform structure with the following features:

| Feature Name                     | Description                                   |
|------------------------------|-----------------------------------------------|
| *input*                | A tabular NetFlow entry encoded as text: each feature is written as a `name: value` pair and the pairs are separated by commas. For instance, a network flow originally represented as a row within a CSV table is transformed into text as follows: ```IPV4_SRC_ADDR: 149.171.126.0 [...] TCP_FLAGS: 25, FLOW_DURATION_MILLISECONDS: 15``` |
| *output*                | Label associated with the network flow: 0 for benign and 1 for malicious. |

In [None]:
from dotenv import load_dotenv
from os import getenv
import pandas as pd
import datasets

# Raise pandas display limits (up to 500 rows and 500 characters per cell)
# so previews of the long encoded flow strings are less aggressively truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

# Load variables from a local .env file into the environment, then read the
# Hugging Face write token. getenv returns None when the variable is not set.
load_dotenv()
HUGGING_FACE_WRITE_TOKEN = getenv("HUGGING_FACE_WRITE_TOKEN")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Helper Function for all datasets
def encode_dataset(dataset_name, data_dir="./data_raw"):
    """Load a raw NetFlow CSV and encode every flow as one text string.

    Reads ``{data_dir}/{dataset_name}.csv``, renames the ``Label`` column to
    ``output``, discards the ``Attack`` column, and serialises the remaining
    columns of each row as ``"NAME: value, NAME: value, ..."``.

    Args:
        dataset_name: File stem of the CSV to load (without ``.csv``).
        data_dir: Directory holding the raw CSV files. Defaults to
            ``./data_raw``, the location that was previously hard-coded.

    Returns:
        pandas.DataFrame with exactly two columns: ``input`` (the encoded
        flow text) and ``output`` (the values of the original ``Label``
        column).

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV lacks the ``Label`` or ``Attack`` column.
    """
    df = pd.read_csv(f"{data_dir}/{dataset_name}.csv")

    # 'Label' becomes the prediction target; 'Attack' is dropped so that no
    # label information ends up inside the encoded text.
    df = df.rename(columns={'Label': 'output'}).drop(columns=['Attack'])

    # Resolve the feature columns once, instead of once per row inside the
    # row encoder.
    feature_columns = df.columns.drop("output")

    def create_key_value_pairs_str(row):
        # Merge all feature entries of one row into a single string.
        return ', '.join(f"{column}: {row[column]}" for column in feature_columns)

    df['input'] = df.apply(create_key_value_pairs_str, axis=1)

    return df[['input', 'output']]

def save_to_arrow_disk(df, dataset_name, test_size=0.1, seed=123):
    """Turn an encoded DataFrame into a stratified train/test split on disk.

    Args:
        df: DataFrame with an ``input`` column (encoded flow text) and an
            ``output`` column (labels), as produced by ``encode_dataset``.
        dataset_name: Name of the directory (relative to the working
            directory) the split is saved to.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.1.
        seed: Seed forwarded to ``train_test_split`` so the shuffled split is
            reproducible. Defaults to 123.

    Returns:
        The object returned by ``Dataset.train_test_split`` (the train and
        test splits), after it has been written to ``./{dataset_name}/``.
    """
    # Currently the identity template; kept as the single place to add prompt
    # text around each encoded flow.
    prompt_template_qa = """{input}"""

    # Extract the columns as plain lists in row order. This does not depend on
    # the DataFrame index being exactly 0..n-1, unlike indexing the nested
    # dict returned by df.to_dict() with range(n).
    inputs = [prompt_template_qa.format(input=text) for text in df["input"].tolist()]
    outputs = df["output"].tolist()

    finetuning_dataset = datasets.Dataset.from_dict({"input": inputs, "output": outputs})
    # Stratified splitting requires the label column to be a ClassLabel feature.
    finetuning_dataset = finetuning_dataset.class_encode_column("output")
    finetuning_dataset = finetuning_dataset.train_test_split(
        test_size=test_size, shuffle=True, seed=seed, stratify_by_column="output"
    )
    finetuning_dataset.save_to_disk(f"./{dataset_name}/")
    return finetuning_dataset

def push_dataset_to_hub(dataset, dataset_name):
    """Upload ``dataset`` to the private Hub repository ``Jetlime/{dataset_name}``.

    Authentication uses the module-level ``HUGGING_FACE_WRITE_TOKEN``.
    """
    repo_id = f"Jetlime/{dataset_name}"
    dataset.push_to_hub(
        repo_id,
        private=True,
        token=HUGGING_FACE_WRITE_TOKEN,
    )

## NF-UNSW-NB15-v2

In [10]:
# Encode the raw NF-UNSW-NB15-v2 CSV and show the first encoded row as a sanity check.
df = encode_dataset("NF-UNSW-NB15-v2")
df.head(1)

Unnamed: 0,input,output
0,"IPV4_SRC_ADDR: 59.166.0.5, L4_SRC_PORT: 1305, IPV4_DST_ADDR: 149.171.126.8, L4_DST_PORT: 21, PROTOCOL: 6, L7_PROTO: 1.0, IN_BYTES: 9, IN_PKTS: 1, OUT_BYTES: 193, OUT_PKTS: 3, TCP_FLAGS: 24, CLIENT_TCP_FLAGS: 24, SERVER_TCP_FLAGS: 16, FLOW_DURATION_MILLISECONDS: 0, DURATION_IN: 0, DURATION_OUT: 0, MIN_TTL: 31, MAX_TTL: 32, LONGEST_FLOW_PKT: 89, SHORTEST_FLOW_PKT: 52, MIN_IP_PKT_LEN: 52, MAX_IP_PKT_LEN: 89, SRC_TO_DST_SECOND_BYTES: 456.0, DST_TO_SRC_SECOND_BYTES: 435.0, RETRANSMITTED_IN_BYTES:...",0


In [21]:
# Class-encode the labels, split into train/test, and save the result under ./NF-UNSW-NB15-v2/.
dataset = save_to_arrow_disk(df, "NF-UNSW-NB15-v2")

Stringifying the column: 100%|██████████| 2390275/2390275 [00:02<00:00, 807168.72 examples/s] 
Casting to class labels: 100%|██████████| 2390275/2390275 [00:03<00:00, 623607.65 examples/s]
Saving the dataset (5/5 shards): 100%|██████████| 2151247/2151247 [00:09<00:00, 224425.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 239028/239028 [00:01<00:00, 223493.30 examples/s]


In [22]:
# Upload the split to the private Hugging Face repository Jetlime/NF-UNSW-NB15-v2.
push_dataset_to_hub(dataset, "NF-UNSW-NB15-v2")

Creating parquet from Arrow format: 100%|██████████| 431/431 [00:03<00:00, 116.53ba/s]
Creating parquet from Arrow format: 100%|██████████| 431/431 [00:03<00:00, 114.42ba/s]
Creating parquet from Arrow format: 100%|██████████| 431/431 [00:03<00:00, 115.81ba/s]
Creating parquet from Arrow format: 100%|██████████| 431/431 [00:03<00:00, 114.06ba/s]
Creating parquet from Arrow format: 100%|██████████| 431/431 [00:03<00:00, 116.20ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [00:22<00:00,  4.57s/it]
Creating parquet from Arrow format: 100%|██████████| 240/240 [00:02<00:00, 117.13ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.56s/it]


## NF-ToN-IoT-v2 - Ignored for Now

In [23]:
# Encode the raw NF-ToN-IoT-v2 CSV and show the first encoded row as a sanity check.
# Note: this rebinds `df`, replacing the frame from the previous section.
df = encode_dataset("NF-ToN-IoT-v2")
df.head(1)

Unnamed: 0,input,output
0,"IPV4_SRC_ADDR: 192.168.1.193, L4_SRC_PORT: 49235, IPV4_DST_ADDR: 192.168.1.33, L4_DST_PORT: 4444, PROTOCOL: 6, L7_PROTO: 0.0, IN_BYTES: 155392, IN_PKTS: 202, OUT_BYTES: 34552, OUT_PKTS: 149, TCP_FLAGS: 24, CLIENT_TCP_FLAGS: 24, SERVER_TCP_FLAGS: 24, FLOW_DURATION_MILLISECONDS: 4294952, DURATION_IN: 15, DURATION_OUT: 15, MIN_TTL: 128, MAX_TTL: 128, LONGEST_FLOW_PKT: 1500, SHORTEST_FLOW_PKT: 40, MIN_IP_PKT_LEN: 40, MAX_IP_PKT_LEN: 1500, SRC_TO_DST_SECOND_BYTES: 155392.0, DST_TO_SRC_SECOND_BYTE...",1


In [2]:
# Class-encode the labels, split into train/test, and save the result under ./NF-ToN-IoT-v2/.
# NOTE(review): the recorded output for this cell is a NameError, i.e. it was last run
# in a kernel where the helper-definition cell had not been executed; run that cell first.
dataset = save_to_arrow_disk(df, "NF-ToN-IoT-v2")

NameError: name 'save_to_arrow_disk' is not defined

In [None]:
# Upload the split to the private Hugging Face repository Jetlime/NF-ToN-IoT-v2.
push_dataset_to_hub(dataset, "NF-ToN-IoT-v2")

## NF-BoT-IoT-v2 - Ignored for Now

In [None]:
# Encode the raw NF-BoT-IoT-v2 CSV and show the first encoded row as a sanity check.
# Note: this rebinds `df`, replacing the frame from the previous section.
df = encode_dataset("NF-BoT-IoT-v2")
df.head(1)

In [None]:
# Class-encode the labels, split into train/test, and save the result under ./NF-BoT-IoT-v2/.
dataset = save_to_arrow_disk(df, "NF-BoT-IoT-v2")

In [None]:
# Upload the split to the private Hugging Face repository Jetlime/NF-BoT-IoT-v2.
push_dataset_to_hub(dataset, "NF-BoT-IoT-v2")

## NF-CSE-CIC-IDS2018-v2

In [5]:
# Encode the raw NF-CSE-CIC-IDS2018-v2 CSV and show the first encoded row as a sanity check.
# Note: this rebinds `df`, replacing the frame from the previous section.
df = encode_dataset("NF-CSE-CIC-IDS2018-v2")
df.head(1)

In [None]:
# Class-encode the labels, split into train/test, and save the result under ./NF-CSE-CIC-IDS2018-v2/.
dataset = save_to_arrow_disk(df, "NF-CSE-CIC-IDS2018-v2")

In [None]:
# Upload the split to the private Hugging Face repository Jetlime/NF-CSE-CIC-IDS2018-v2.
push_dataset_to_hub(dataset, "NF-CSE-CIC-IDS2018-v2")

## NF-UQ-NIDS-v2

In [None]:
# Encode the raw NF-UQ-NIDS-v2 CSV and show the first encoded row as a sanity check.
# Note: this rebinds `df`, replacing the frame from the previous section.
df = encode_dataset("NF-UQ-NIDS-v2")
df.head(1)

In [None]:
# Build and save the train/test split for NF-UQ-NIDS-v2 before publishing it.
# Without this step, `dataset` would still hold the split produced for an
# earlier section and would be uploaded under the wrong repository name.
dataset = save_to_arrow_disk(df, "NF-UQ-NIDS-v2")

In [None]:
# Upload `dataset` to the private Hugging Face repository Jetlime/NF-UQ-NIDS-v2.
# NOTE(review): `dataset` must be the split built from the NF-UQ-NIDS-v2 frame;
# confirm save_to_arrow_disk was run for this dataset before executing this cell.
push_dataset_to_hub(dataset, "NF-UQ-NIDS-v2")