In [36]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import glob
import os
from sys import platform
import datetime

# datasets[i] will hold the list of CSV paths collected for the i-th dataset directory found under root_dir.
datasets = []

# Folder that is scanned for dataset sub-directories (a relative path, so it depends on the working directory).
root_dir = "./ModbusDataset"

# Function to find all CSV files in 'output' directories
def find_csv_files_in_output(start_path):
    """Collect every CSV file that sits directly inside a directory named 'output'.

    Walks the whole tree below ``start_path`` (``start_path`` itself included).

    Args:
        start_path: directory at which the recursive search begins.

    Returns:
        list[str]: paths of the matching files, in the order ``os.walk`` yields
        them. The '.csv' extension is matched case-insensitively.
    """
    matches = []
    for current_dir, _subdirs, filenames in os.walk(start_path):
        # Only directories literally called "output" are of interest.
        if os.path.basename(current_dir) != "output":
            continue
        matches.extend(
            str(Path(current_dir) / filename)
            for filename in filenames
            if filename.lower().endswith('.csv')
        )
    return matches

# For every directory directly under root_dir, gather the CSV files found in its
# 'output' folders. dataset_dirs[i] records which directory datasets[i] came from,
# so the printed name cannot drift out of step with the data when root_dir also
# contains plain files (which are skipped and therefore do not get an entry).
dataset_dirs = []
for subdir in os.listdir(root_dir):
    subdir_path = os.path.join(root_dir, subdir)
    if os.path.isdir(subdir_path):
        dataset_dirs.append(subdir_path)
        # Find all CSV files in 'output' directories under this subdirectory
        datasets.append(find_csv_files_in_output(subdir_path))

print("Datasets structure:")
for i, (dataset_dir, csv_paths) in enumerate(zip(dataset_dirs, datasets)):
    print(f"Dataset {i} ( {dataset_dir})")
    for csv_path in csv_paths:
        # Strip the root folder name to keep the listing short.
        print( csv_path.replace("ModbusDataset/",""))



Datasets structure:
Dataset 0 ( ./ModbusDataset/attack)
attack/external/network-wide/output/network-wide-normal-1_fix_ord.pcap_Flow.csv
attack/external/network-wide/output/network-wide-normal-0_fix_ord.pcap_Flow.csv
attack/external/ied4c/ied4c-network-capture/output/veth8bc3408-0_fix_ord.pcap_Flow.csv
attack/external/external-attacker/external-attacker-network-capture/output/veth665f3cf-0_fix_ord.pcap_Flow.csv
attack/external/ied1b/ied1b-network-capture/output/vethd9e14c0-0_fix_ord.pcap_Flow.csv
attack/external/scada-hmi/scada-hmi-network-capture/output/veth5bbeaa2-0_fix_ord.pcap_Flow.csv
attack/external/central-agent/central-agent-network-capture/output/veth460b141-0_fix_ord.pcap_Flow.csv
attack/external/ied1a/ied1a-network-capture/output/veth4edc015-0_fix_ord.pcap_Flow.csv
attack/compromised-scada/ied4c/ied4c-network-captures/output/vethe685ac9-4_fix_ord.pcap_Flow.csv
attack/compromised-scada/ied4c/ied4c-network-captures/output/vethe685ac9-3_fix_ord.pcap_Flow.csv
attack/compromised-s

##### Modified labelling functions
- The "Attempted Category" label and the Int64 columns were not considered in this modified version.

In [None]:
# Basic preprocessing before getting started on labelling.
# Deletes rows with "Infinity" and NaNs, converts "Timestamp" to Pandas Datetime, and converts all necessary columns to
# numeric values

def format_csv_for_labelling(df):
    """Clean a raw flow DataFrame so that it is ready for labelling.

    Steps:
      * turn textual 'Infinity' / '-Infinity' markers into NaN,
      * parse 'Timestamp' into pandas datetimes,
      * convert every column except the identifier/label columns to numeric
        (unparseable values become NaN),
      * turn numeric +/-inf into NaN as well,
      * drop every row that contains a NaN.

    Args:
        df: DataFrame with at least a 'Timestamp' column. It is not modified;
            a new DataFrame is returned.

    Returns:
        pandas.DataFrame: the cleaned rows, keeping their original index labels.
    """
    non_numeric_columns = ['Flow ID', 'Timestamp', 'Src IP', 'Dst IP', 'Label']

    df = df.replace(['Infinity', '-Infinity'], np.nan)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    for column in df.columns:
        if column not in non_numeric_columns:
            numeric = pd.to_numeric(df[column], errors='coerce')
            # Infinite values may already be floats (pandas parses "Infinity" to inf
            # while reading), in which case the string replacement above misses them.
            df[column] = numeric.replace([np.inf, -np.inf], np.nan)

    return df.dropna()

def read_csvs_from_path_and_reformat(path):
    """Read one flow CSV, pre-process it and cast the integer-valued columns.

    The file is read with the cp1252 encoding, passed through
    ``format_csv_for_labelling`` and the label distribution is printed. The
    columns listed below are then cast to int32 / int16.

    Args:
        path: location of the CSV file to load.

    Returns:
        pandas.DataFrame: the pre-processed flows.
    """
    flows = format_csv_for_labelling(pd.read_csv(path, encoding='cp1252'))
    print("labels after pre-processing:", flows["Label"].value_counts())

    columns_by_dtype = {
        'int32': [
            "Src Port", "Dst Port", "Flow Duration", "Total Fwd Packet", "Total Bwd packets",
            "Total Length of Fwd Packet", "Total Length of Bwd Packet", "Fwd Packet Length Max",
            "Fwd Packet Length Min", "Bwd Packet Length Max", "Bwd Packet Length Min",
            "Flow IAT Max", "Flow IAT Min", "Fwd IAT Total", "Fwd IAT Max", "Fwd IAT Min",
            "Bwd IAT Total", "Bwd IAT Max", "Bwd IAT Min", "Fwd PSH Flags", "Bwd PSH Flags",
            "Fwd URG Flags", "Bwd URG Flags", "Packet Length Min", "Packet Length Max",
            "FIN Flag Count", "SYN Flag Count", "RST Flag Count", "PSH Flag Count",
            "ACK Flag Count", "URG Flag Count", "CWR Flag Count", "ECE Flag Count",
            "Subflow Fwd Packets", "Subflow Fwd Bytes", "Subflow Bwd Packets", "Subflow Bwd Bytes",
            "FWD Init Win Bytes", "Bwd Init Win Bytes", "Fwd Act Data Pkts", "Fwd Seg Size Min",
            "Active Max", "Active Min", "Idle Max", "Idle Min",
        ],
        'int16': ["Fwd Header Length", "Bwd Header Length", "ICMP Code", "ICMP Type"],
    }

    # Cast column by column, all int32 columns first and then the int16 ones.
    for dtype_name, column_names in columns_by_dtype.items():
        for column_name in column_names:
            flows[column_name] = flows[column_name].astype(dtype_name)

    return flows

# Main labelling function. Only used for labelling Malicious flows.
# Timestamps are in NANOSECONDS (!) Unix time. Note that the CSV files are in the UTC timezone.
# df = dataframe with flows. Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything
# label = the label that will be given to flows matching the criteria specified in the function
# additional_filters = add any additional constraints that cannot be covered by the already provided function arguments
# attempted_category = value written to the "Attempted Category" column of matching flows (default -1). This modified
# notebook does not use the Attempted categories; for details on how they are defined, see
# https://intrusion-detection.distrinet-research.be/CNS2022/Tools_Documentation.html
# payload_filter = When set to True, this automatically adds the constraint ["Total Length of Fwd Packet"] == 0.

def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,
                dst_ip_list= None, src_port_list=None, dst_port_list=None, additional_filters=None, attempted_category=-1, payload_filter=False):
    """Label, in place, every flow of ``df`` that matches all given criteria.

    Args:
        df: flows to label; needs 'Timestamp' (datetime) and 'Label' columns plus
            any column referenced by the filters that are used. Modified in place.
        label: value written to 'Label' for matching flows.
        attack_start_time_nanoseconds: start of the attack window (inclusive),
            Unix time in nanoseconds.
        attack_end_time_nanoseconds: end of the attack window (inclusive),
            Unix time in nanoseconds.
        src_ip_list / dst_ip_list: if given, 'Src IP' / 'Dst IP' must be in the list.
        src_port_list / dst_port_list: if given, 'Src Port' / 'Dst Port' must be in the list.
        additional_filters: optional iterable of boolean masks (aligned with ``df``)
            that are AND-ed into the selection.
        attempted_category: value written to 'Attempted Category' for matching
            flows; skipped when ``df`` has no such column.
        payload_filter: when True, additionally require
            ``df["Total Length of Fwd Packet"] == 0``.

    Returns:
        None. ``df`` is updated in place.
    """
    # Start with every row selected, then narrow the selection filter by filter.
    mask = pd.Series(True, index=df.index)

    attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns')
    attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')

    mask &= (df["Timestamp"] >= attack_start_datetime)
    mask &= (df["Timestamp"] <= attack_end_datetime)

    if src_ip_list is not None:
        mask &= (df["Src IP"].isin(src_ip_list))
    if dst_ip_list is not None:
        mask &= (df["Dst IP"].isin(dst_ip_list))

    if src_port_list is not None:
        mask &= (df["Src Port"].isin(src_port_list))
    if dst_port_list is not None:
        mask &= (df["Dst Port"].isin(dst_port_list))

    if payload_filter:
        mask &= (df["Total Length of Fwd Packet"] == 0)

    if additional_filters is not None:
        for extra_filter in additional_filters:
            mask &= extra_filter

    # Assign through df.loc: calling an inplace method on df[col] is chained
    # assignment and does not reliably write back to df (never under Copy-on-Write).
    df.loc[mask, "Label"] = label
    if "Attempted Category" in df.columns:
        df.loc[mask, "Attempted Category"] = attempted_category

# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label
# so far is labelled as Benign.
def label_rest_as_benign_and_write_csv(df, file_to_write, print_index=None):
    """Mark all still-unlabelled flows as BENIGN and write the result to CSV.

    Args:
        df: labelled flows with 'Label' and 'Flow ID' columns. Modified in place:
            labels are updated and, when the index is written, the index is
            replaced by a 1-based 'id' index.
        file_to_write: destination path of the CSV file.
        print_index: when True, a 1-based 'id' column is written as first column.
            When None (default), the module-level ``print_index`` variable is
            used if it exists; otherwise no index is written.

    Returns:
        None.
    """
    if print_index is None:
        # Fall back to the notebook-level setting so existing two-argument calls keep working.
        print_index = bool(globals().get("print_index", False))

    # Assign through df.loc: an inplace method on df[col] is chained assignment
    # and does not reliably write back to df.
    df.loc[df["Label"] == "NeedManualLabel", "Label"] = "BENIGN"

    # Relabel artefact flows with [Flow ID] = '8.0.6.4-8.6.0.1-0-0-0' as BENIGN
    df.loc[df["Flow ID"] == '8.0.6.4-8.6.0.1-0-0-0', "Label"] = "BENIGN"

    print("label count after labelling:\r\n", df["Label"].value_counts())
    if "Attempted Category" in df.columns:
        print("Attempted Category count after labelling:\r\n", df["Attempted Category"].value_counts())

    # Adds line numbers in the first column if print_index is set to true
    if print_index:
        df.reset_index(inplace=True, drop=True)
        df.index += 1
        df.index.name = 'id'
        df.to_csv(file_to_write)
    else:
        df.to_csv(file_to_write, index=False)
