### CICModbus2023 Flow labeling 

In [1]:
import os
import pandas as pd
import numpy as np
import os
import re
from bisect import bisect_left, bisect_right
import json 

class ModbusDataset():
    """
    A class to organize and manage the CICModbus 2023 directories with simple hierarchical structure
    and metadata.
    """
    def __init__(self, _root_dir = "./ModbusDataset"):
        self.root_dir = _root_dir
        datasets_dir = self.find_csv_in_folder(root_dir,"output")
        benign_datasets_dir = self.find_csv_in_list(datasets_dir,"benign")
        attack_dataset_dir = self.find_csv_in_list(datasets_dir,"attack")
        ext_attack_dataset_dir = self.find_csv_in_list(attack_dataset_dir,"external")
        comp_ied_attack_dataset_dir = self.find_csv_in_list(attack_dataset_dir,"compromised-ied")
        comp_scada_attack_dataset_dir =self.find_csv_in_list(attack_dataset_dir,"compromised-scada")
        attack_logs_dir = self.find_csv_in_folder(root_dir,"attack logs")
        attack_logs_dir.extend(self.find_csv_in_folder(root_dir,"attacker logs"))
        ## Corrupted TimeStamp (more detail in ./ModbusDataset/Readme.md)
        attack_logs_dir.remove("./ModbusDataset/attack/compromised-scada/attack logs/03-21-2023/03-21-2023-1-original.csv")
        ext_attack_log_dir = self.find_csv_in_list(attack_logs_dir,"external")
        comp_ied_attack_log_dir = self.find_csv_in_list(attack_logs_dir,"compromised-ied")
        comp_scada_attack_log_dir =self.find_csv_in_list(attack_logs_dir,"compromised-scada")


        self.dataset ={
            "metadata":{
                "total_dataset_num":len(datasets_dir),"benign_dataset_num":len(benign_datasets_dir),"attack_dataset_num":{
                "total_num":len(attack_dataset_dir),
                "external_num":len(ext_attack_dataset_dir),
                "compromised-ied_num":len(comp_ied_attack_dataset_dir),
                "compromised-scada_num":len(comp_scada_attack_dataset_dir),
                },"attack_logs_num":{
                    "total_num":len(attack_logs_dir),
                    "external_num":(ext_attack_log_dir),
                    "compromised-ied_num":len(comp_ied_attack_log_dir),
                    "compromised-scada_num":len(comp_scada_attack_log_dir),
                }
            },
            "benign_dataset_dir":benign_datasets_dir,
            "attack_dataset_dir":{"total":attack_dataset_dir,
                                  "external":ext_attack_dataset_dir,
                                  "compromised-ied":comp_ied_attack_dataset_dir,
                                  "compromised-scada":comp_scada_attack_dataset_dir}
            ,
            "attack_log_dir":{"total":attack_logs_dir,
                            "external":ext_attack_log_dir,
                            "compromised-ied":comp_ied_attack_log_dir,
                            "compromised-scada":comp_scada_attack_log_dir}}
        
    def find_csv_in_folder(self,_start_path,_folder_name):
        csv_files = []
        for root, _, files in os.walk(_start_path):
            if _folder_name in root.split(os.sep) :
                csv_files.extend([os.path.join(root, f) for f in files if f.endswith('.csv')])
        return csv_files

    def find_csv_in_list(self,_datasets_dir,_folder_name):
        return  [ds for ds in _datasets_dir  if ds.find(_folder_name)!=(-1)]
    
    def summary_print(self):
        print(json.dumps(modbus.dataset, indent = 4))

    def print_csv(datasets_dir_list,replace_dir):
        for i,dataset in enumerate(datasets_dir_list) :
            print(i+1,dataset.replace(replace_dir,""))

print("pandas current version ",pd.__version__)
print("numpy current version ",np.__version__)
root_dir = "./ModbusDataset"
modbus = ModbusDataset(root_dir)
modbus.summary_print()


pandas current version  2.2.0
numpy current version  1.26.3
{
    "metadata": {
        "total_dataset_num": 170,
        "benign_dataset_num": 62,
        "attack_dataset_num": {
            "total_num": 108,
            "external_num": 8,
            "compromised-ied_num": 43,
            "compromised-scada_num": 57
        },
        "attack_logs_num": {
            "total_num": 34,
            "external_num": [
                "./ModbusDataset/attack/external/external-attacker/attacker logs/02-01-2023/02-01-2023-1.csv",
                "./ModbusDataset/attack/external/external-attacker/attacker logs/12-29-2022/12-29-2022-1.csv",
                "./ModbusDataset/attack/external/external-attacker/attacker logs/01-17-2023/01-17-2023-1.csv",
                "./ModbusDataset/attack/external/external-attacker/attacker logs/01-02-2023/01-02-2023-1.csv",
                "./ModbusDataset/attack/external/external-attacker/attacker logs/12-30-2022/12-30-2022-1.csv",
                "./ModbusD

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

print("list of benign csv dataset directories under ./ModbusDataset/benign/*")
ModbusDataset.print_csv(modbus.dataset["benign_dataset_dir"],"./ModbusDataset/benign/")
print("list of attack csv dataset directories under ./ModbusDataset/attack/*")
ModbusDataset.print_csv(modbus.dataset["attack_dataset_dir"]["total"],"./ModbusDataset/attack/")


list of benign csv dataset directories under ./ModbusDataset/benign/*
1 network-wide-pcap-capture/network-wide/output/network-wide-normal-18_fix_ord.pcap_Flow.csv
2 network-wide-pcap-capture/network-wide/output/network-wide-normal-22_fix_ord.pcap_Flow.csv
3 network-wide-pcap-capture/network-wide/output/network-wide-normal-30_fix_ord.pcap_Flow.csv
4 network-wide-pcap-capture/network-wide/output/network-wide-normal-19_fix_ord.pcap_Flow.csv
5 network-wide-pcap-capture/network-wide/output/network-wide-normal-32_fix_ord.pcap_Flow.csv
6 network-wide-pcap-capture/network-wide/output/network-wide-normal-25_fix_ord.pcap_Flow.csv
7 network-wide-pcap-capture/network-wide/output/network-wide-normal-15_fix_ord.pcap_Flow.csv
8 network-wide-pcap-capture/network-wide/output/network-wide-normal-31_fix_ord.pcap_Flow.csv
9 network-wide-pcap-capture/network-wide/output/network-wide-normal-24_fix_ord.pcap_Flow.csv
10 network-wide-pcap-capture/network-wide/output/network-wide-normal-14_fix_ord.pcap_Flow.csv

#### Labeling functions


In [3]:
# Basic preprocessing before getting started on labelling.
# Deletes rows with "Infinity" and NaNs, converts "Timestamp" to Pandas Datetime, and converts all necessary columns to numeric values
# Int_64 Columns (Attempted-Category) not considered.

print_index = False

def format_csv_for_labelling(df):
    df = df.replace('Infinity', np.nan)
    # Clean the Timestamp strings to always include microseconds (append .0 if missing)
    df['Timestamp'] = pd.to_datetime(
        df['Timestamp'].apply(
            lambda x: x if '.' in x.split()[-1] else f"{x}.0"  # Split into date/time and check time part
        ),
        format='%Y-%m-%d %H:%M:%S.%f'  # Parse with microseconds
    )
    for column in df.columns:
        if column not in ['Flow ID' , 'Timestamp', 'Src IP', 'Dst IP', 'Label','Attack','TransactionID','TargetIP']:
            df[column] = pd.to_numeric(df[column], errors='coerce')
    df.dropna()
    return df.dropna()

def read_csvs_from_path_and_reformat(path):
    df = pd.read_csv(path, encoding='cp1252')

    df = format_csv_for_labelling(df)
    print("labels after pre-processing:\n", df["Label"].value_counts())

    int32_columns = ["Src Port", "Dst Port", "Flow Duration", "Total Fwd Packet", "Total Bwd packets", "Total Length of Fwd Packet", "Total Length of Bwd Packet", "Fwd Packet Length Max",
        "Fwd Packet Length Min", "Bwd Packet Length Max", "Bwd Packet Length Min", "Flow IAT Max", "Flow IAT Min", "Fwd IAT Total", "Fwd IAT Max", "Fwd IAT Min", "Bwd IAT Total",
        "Bwd IAT Max", "Bwd IAT Min", "Fwd PSH Flags", "Bwd PSH Flags", "Fwd URG Flags", "Bwd URG Flags", "Packet Length Min", "Packet Length Max", "FIN Flag Count", "SYN Flag Count", "RST Flag Count", "PSH Flag Count",
        "ACK Flag Count", "URG Flag Count", "CWR Flag Count", "ECE Flag Count", "Subflow Fwd Packets", "Subflow Fwd Bytes",
        "Subflow Bwd Packets", "Subflow Bwd Bytes", "FWD Init Win Bytes", "Bwd Init Win Bytes", "Fwd Act Data Pkts", "Fwd Seg Size Min", "Active Max",
        "Active Min", "Idle Max", "Idle Min"]

    int16_columns = ["Fwd Header Length", "Bwd Header Length", "ICMP Code", "ICMP Type"]

    for column in int32_columns:
        df[column] = df[column].astype('int32')

    for column in int16_columns:
        df[column] = df[column].astype('int16')

    return df


# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label
# so far is labelled as Benign.
# write dataframe to csv file in ready directory (create if not exist )
def label_rest_as_benign_and_write_csv(df, input_path):
    
    df["Label"]=df["Label"].mask(df["Label"] == "NeedManualLabel", "BENIGN")
    print("label count after labelling:\r\n", df["Label"].value_counts())
    # Construct the ready directory path alongside output
    ready_dir = os.path.join(os.path.dirname(os.path.dirname(input_path)), 'ready')
    os.makedirs(ready_dir, exist_ok=True)
    filename = os.path.basename(input_path)
    name, ext = os.path.splitext(filename)  # Split the extension (name, ".csv")
    cleaned_name = name.replace("_fix_ord.pcap_Flow", "-labeled")
    # Construct full output path
    output_path = os.path.join(ready_dir, f"{cleaned_name}{ext}")  # Append extension
    # Adds line numbers in the first column if print_index is set to true
    if print_index:
        df.reset_index(inplace=True, drop=True)
        df.index += 1
        df.index.name = 'id'
        df.to_csv(output_path)
    else:
        df.to_csv(output_path, index=False)

def sort_df(_df):
    """    
    return sorted dataframe vertically with respect to TimeStamps
    """    
    new_df_sorted = _df.sort_values(by='Timestamp', ascending=True)
    new_df_sorted = new_df_sorted.reset_index(drop=True)
    return new_df_sorted

def merge_datasets(_dataset_dir):
    """
        create dataframes from the directories then return the merged_file  
    """
    # take two columns (attack logs ) or four columns (attacker logs)    
    merged_new_df = format_csv_for_labelling(pd.read_csv(_dataset_dir[0], encoding='cp1252'))
    for _dir in _dataset_dir :
        merged_new_df = pd.concat([format_csv_for_labelling(pd.read_csv(_dir, encoding='cp1252')),merged_new_df],join="inner")
    return merged_new_df
    

#### Label All csv files in benign folder as "BENIGN"

In [4]:
# create new csv files with new labels in ready folder (run it once)
for _dir in modbus.dataset["benign_dataset_dir"]:
    print(_dir)
    # uncomment next line of code if the labeling of the benign dataset does not occured yet !
    #It may take some time !
    # label_rest_as_benign_and_write_csv(read_csvs_from_path_and_reformat(_dir) ,_dir)
            

./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-18_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-22_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-30_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-19_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-32_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-25_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-15_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal-31_fix_ord.pcap_Flow.csv
./ModbusDataset/benign/network-wide-pcap-capture/network-wide/output/network-wide-normal

#### Attack Labeling depends on the log Attack , Timestamp columns



In the log only the completed attacks are reliable. for example 
```md
2023-02-01 13:06:29.2, Brute force or specific coil. Address: 0
2023-02-01 13:06:29.201, Brute force or specific - Complete
```
🟢 It's okay but 
```md
2023-02-01 13:06:29.2, Brute force or specific coil. Address: 0
```
🔴 It may not be implemented literally at all! !

❗Unfortunately no exact logging happend in attack/attacker logs
Complete Timestamp of an attack might be before ending the flow/connection or after that about 2 minutes !!!
Or the timestamp in the log may be earlier than the timestamp of the .pcap files (SYN).
Received timestamps of a flow might be different in different nodes make it hard to label automatically.


In [None]:
def clean_label(label_str):
    """Clean label by removing multiple unwanted substrings/characters"""
    # Remove "Complete" in any casing, commas, hyphens, periods, and everything after a colon
    label_str = re.sub(r'ADDRESS|Complete|or specific|coil|RANGE|range|,|-|\.|:[^:]*$', '', label_str, flags=re.IGNORECASE)
    # Remove any resulting double spaces or trailing hyphens
    label_str = re.sub(r'\s+', ' ', label_str).strip()
    label_str = re.sub(r'\s-\s*$', '', label_str)
    label_str = label_str.upper()
    return label_str

def log_pairs_from_dir(_attack_log_dirs):
    """
    extract start , end , attack label pairs from logs (csv)
    make it ready to be used in label_flows second argument
    """
    # sort attak_df beacuse of overlapped flow durations results in multiple labels 
    # Priority: Earlier flows in attack_df take precedence for labeling when multiple flows contain attack interval
    attack_log_df = sort_df(merge_datasets(_attack_log_dirs))
    #  Find rows in attack_log_df where 'Attack' contains 'Complete'
    # Create a mask for rows where 'Attack' contains 'Complete'
    complete_mask = attack_log_df['Attack'].str.contains('Complete', na=False)
    # Find valid attack pairs and convert to numpy datetime64 upfront
    pairs = []
    for i in attack_log_df.index[complete_mask]:
        if i > 0:
            start = np.datetime64(attack_log_df.at[i-1, 'Timestamp'],'us')
            end = np.datetime64(attack_log_df.at[i, 'Timestamp'],'us')
            label = clean_label(attack_log_df.at[i, 'Attack'])
            pairs.append((start, end, label))


    no_complete_mask = ~(complete_mask | complete_mask.shift(-1, fill_value=False))

    for i in attack_log_df.index[no_complete_mask]:
        if i > 0:
            start = np.datetime64(attack_log_df.at[i, 'Timestamp'],'us')
            end = np.datetime64(attack_log_df.at[i, 'Timestamp'],'us')
            label = clean_label(attack_log_df.at[i, 'Attack'])
            pairs.append((start, end, label))
    return pairs 

def label_flows(_attack_dir,_attack_pairs,src_ip_list=None ,dst_ip_list=None,max_start_timestamp_tolerate= np.timedelta64(100_000, 'us'),max_end_timestamp_tolarate=np.timedelta64(1_000_000, 'us')):
    """
    label flows of the dataset csv file with respect to the logs overlaping timestamp 
    then change the _attack_dir "Label" Column.
    Timestamps are in MICROSECONDS (!) Unix time. Note that the CSV files are in the UTC timezone.
    """

    print("Attack directory ------------------------------",_attack_dir)
        
    attack_df= read_csvs_from_path_and_reformat(_attack_dir)
    attack_df['end_time'] = attack_df['Timestamp'] + pd.to_timedelta(attack_df['Flow Duration'], unit='us')
    attack_df=sort_df(attack_df)

    # print("after----------",(len([ pair[2]  for pair in pairs if "RECON" in pair[2]])))
    # Convert to numpy arrays for vectorized operations
    starts = attack_df['Timestamp'].values.astype('datetime64[us]')
    ends = attack_df['end_time'].values.astype('datetime64[us]')
    labels = attack_df['Label'].values
    orig_indices = attack_df.index.values
    custom_mask = (attack_df['Label'] == "NeedManualLabel").values

    # Precompute mask for unlabeled flows
    custom_mask &= (labels == "NeedManualLabel")
    if src_ip_list is not None:
        custom_mask &= attack_df["Src IP"].isin(src_ip_list).values
    if dst_ip_list is not None:
        custom_mask &= attack_df["Dst IP"].isin(dst_ip_list).values

   # Collect updates to minimize DataFrame operations
    updates = []

    # Process each attack pair
    for start_p, end_p, new_label in _attack_pairs:
        # Find flows that:
        # 1. logging attack Complete within 1 seconds of the flow end_time
        # 2. logging attack start within 0.01 seconds after the flow start
        # 3. Are still unlabeled
        # Find candidate flows using binary search ( sorted attack_df by start timestamp)
        start_left_interval = bisect_left(starts, start_p - max_start_timestamp_tolerate)
        end_right_interval = bisect_right(starts, start_p + max_start_timestamp_tolerate)

        for idx in range(start_left_interval,end_right_interval):
            if custom_mask[idx] and (np.abs(ends[idx] - end_p)<=max_end_timestamp_tolarate ):
                orig_idx = orig_indices[idx]
                updates.append((orig_idx, new_label))
                custom_mask[idx] = False
                break  # Only label the first matching flow

    # Apply all updates at once
    if updates:
        update_indices, update_labels = zip(*updates)
        attack_df.loc[list(update_indices), 'Label'] = list(update_labels)

    # Finalize by labeling remaining flows as benign and writing to CSV
    label_rest_as_benign_and_write_csv(attack_df, _attack_dir)

asdasd


##### External Attacks 
🟢 Attacks from an unknown IP address (185.175.0.7) targeting 185.175.0.4.

❗Attacks occured in separable detectable interval ( no need for manual )


In [87]:
# uncomment next lines of code if the labeling of the external attack dataset does not occured yet  !
attack_pairs = log_pairs_from_dir(modbus.dataset["attack_log_dir"]["external"])
for att_dir in modbus.dataset["attack_dataset_dir"]["external"] :
    label_flows(att_dir,attack_pairs,src_ip_list=["185.175.0.7"],
                         max_start_timestamp_tolerate=np.timedelta64(10_000, 'us'),
                         max_end_timestamp_tolarate=np.timedelta64(10_000, 'us'))

Attack directory ------------------------------ ./ModbusDataset/attack/external/network-wide/output/network-wide-normal-1_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    68931
Name: count, dtype: int64
label count after labelling:
 Label
BRUTE FORCE    35475
BENIGN         33456
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/external/network-wide/output/network-wide-normal-0_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    152398
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN                   151747
BRUTE FORCE                 613
RECON                        32
QUERY FLOODING                2
LENGTH MANIPULATION           1
REPLAY                        1
PAYLOAD INJECTION             1
STACKED MODBUS FRAMES         1
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/external/ied4c/ied4c-network-capture/output/

##### Compromised SCADA Attacks 
🔴 Attacks from 185.175.0.3 targeting 185.175.0.4 ,185.175.0.5 and 185.175.0.8.Days 12 to 14 in attack logs are missed in pcaps.

❗❗Attacks not occured in separable detectable interval and need  manually checking 


In [None]:
# uncomment next lines of code if the labeling of the external attack dataset does not occured yet  !

dates = ["03-12-2023", "03-13-2023", "03-14-2023"]
## exclude dates 12,13,14 from attack log dirs
filtered_list = list(filter(lambda x: all(date not in x for date in dates), modbus.dataset["attack_log_dir"]["compromised-scada"]))
attack_pairs = log_pairs_from_dir(filtered_list)

for att_dir in modbus.dataset["attack_dataset_dir"]["compromised-scada"]  :
    label_flows(att_dir,attack_pairs,src_ip_list=["185.175.0.3"],
                dst_ip_list=["185.175.0.4","185.175.0.5","185.175.0.8"],
             max_start_timestamp_tolerate=np.timedelta64(1_000, 'us'),
                         max_end_timestamp_tolarate=np.timedelta64(100_000_000, 'us'))




Attack directory ------------------------------ ./ModbusDataset/attack/compromised-scada/ied4c/ied4c-network-captures/output/vethe685ac9-4_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    160689
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN                   160588
BRUTE FORCE                 100
STACKED MODBUS FRAMES         1
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/compromised-scada/ied4c/ied4c-network-captures/output/vethe685ac9-3_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    158034
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN                   131005
BRUTE FORCE               26868
RECON                        48
PAYLOAD INJECTION            36
QUERY FLOODING               28
STACKED MODBUS FRAMES        22
REPLAY                       20
LENGTH MANIPULATION           7
Name: count, dtype: int64
Attack directory ---------

In [12]:
modbus.dataset["attack_dataset_dir"]["compromised-scada"].index("./ModbusDataset/attack/compromised-scada/substation-wide-capture/output/substation-1_fix_ord.pcap_Flow.csv")


45

##### Compromised IED Attacks
- Scada will initiate the connections 
- Attacks from 185.175.0.5 targeting 185.175.0.2 ( reverse direction of the flow).

In [43]:
for ext_att_dir in modbus.dataset["attack_dataset_dir"]["compromised-ied"] :
    # uncomment next line of code if the labeling of the external attack dataset does not occured yet  !
    #It may take some time !
    if  ("./ModbusDataset/attack/compromised-ied/trust-scada-hmi/trust-scada-network-captures/output/veth3efd353-15_fix_ord.pcap_Flow.csv" not in ext_att_dir) :
        continue
    label_flows_from_log(ext_att_dir,modbus.dataset["attack_log_dir"]["compromised-ied"],
                         src_ip_list=["185.175.0.2"],dst_ip_list=["185.175.0.5"])


Attack directory ------------------------------ ./ModbusDataset/attack/compromised-ied/trust-scada-hmi/trust-scada-network-captures/output/veth3efd353-15_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    128025
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN                         128003
Length manipulation                 5
Delay Response Attack               4
False Data Injection Attack         4
Baseline Replay                     3
Starting Query flooding             2
Frame Stacking                      2
Query flooding                      1
Payload Injection                   1
Name: count, dtype: int64
