### CICModbus2023 Flow labeling 

In [1]:
import pandas as pd
import numpy as np
import re
from bisect import bisect_left, bisect_right
import zipfile
from modbus import ModbusDataset
import os 
# Report the library versions used for this run
print("pandas current version ",pd.__version__)
print("numpy current version ",np.__version__)
# Root folder of the dataset, relative to the notebook's working directory
root_dir = "./ModbusDataset"
# NOTE(review): the second argument presumably names the sub-folder whose CSV files are indexed
# ("output" here, "ready" in the archive cell) — confirm in modbus.py
modbus = ModbusDataset(root_dir,"output")
modbus.summary_print()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


pandas current version  2.2.0
numpy current version  1.26.3
 The CIC Modbus Dataset contains network (pcap) captures and attack logs from a simulated substation network.
                The dataset is categorized into two groups: an attack dataset and a benign dataset
                The attack dataset includes network traffic captures that simulate various types of Modbus protocol attacks in a substation environment.
                The attacks are reconnaissance, query flooding, loading payloads, delay response, modify length parameters, false data injection, stacking Modbus frames, brute force write and baseline replay.
                These attacks are based of some techniques in the MITRE ICS ATT&CK framework.
                On the other hand, the benign dataset consists of normal network traffic captures representing legitimate Modbus communication within the substation network.
                The purpose of this dataset is to facilitate research, analysis, and development of i

In [2]:

# Print a numbered list of the benign CSV files, then of all attack CSV files.
# NOTE(review): the second argument of print_csv looks like a path prefix stripped from the printed entries — confirm in modbus.py
print("list of benign csv dataset directories under ./ModbusDataset/benign/*")
ModbusDataset.print_csv(modbus.dataset["benign_dataset_dir"],"./ModbusDataset/benign/")
print("list of attack csv dataset directories under ./ModbusDataset/attack/*")
ModbusDataset.print_csv(modbus.dataset["attack_dataset_dir"]["total"],"./ModbusDataset/attack/")


list of benign csv dataset directories under ./ModbusDataset/benign/*
1 network-wide-pcap-capture/network-wide/output/network-wide-normal-18_fix_ord.pcap_Flow.csv
2 network-wide-pcap-capture/network-wide/output/network-wide-normal-22_fix_ord.pcap_Flow.csv
3 network-wide-pcap-capture/network-wide/output/network-wide-normal-30_fix_ord.pcap_Flow.csv
4 network-wide-pcap-capture/network-wide/output/network-wide-normal-19_fix_ord.pcap_Flow.csv
5 network-wide-pcap-capture/network-wide/output/network-wide-normal-32_fix_ord.pcap_Flow.csv
6 network-wide-pcap-capture/network-wide/output/network-wide-normal-25_fix_ord.pcap_Flow.csv
7 network-wide-pcap-capture/network-wide/output/network-wide-normal-15_fix_ord.pcap_Flow.csv
8 network-wide-pcap-capture/network-wide/output/network-wide-normal-31_fix_ord.pcap_Flow.csv
9 network-wide-pcap-capture/network-wide/output/network-wide-normal-24_fix_ord.pcap_Flow.csv
10 network-wide-pcap-capture/network-wide/output/network-wide-normal-14_fix_ord.pcap_Flow.csv

#### Labeling functions


In [3]:
from utils import format_csv_for_labeling,merge_datasets


def read_csvs_from_path_and_reformat(path):
    """
    Load one flow CSV, pass it through format_csv_for_labeling and downcast its integer columns.

    path: location of the CSV file; it is read with the cp1252 encoding.

    Prints the "Label" value counts right after pre-processing and returns the
    dataframe with the columns listed below cast to int32 / int16.
    """
    frame = format_csv_for_labeling(pd.read_csv(path, encoding='cp1252'))
    print("labels after pre-processing:\n", frame["Label"].value_counts())

    wide_columns = (
        "Src Port", "Dst Port", "Flow Duration",
        "Total Fwd Packet", "Total Bwd packets",
        "Total Length of Fwd Packet", "Total Length of Bwd Packet",
        "Fwd Packet Length Max", "Fwd Packet Length Min",
        "Bwd Packet Length Max", "Bwd Packet Length Min",
        "Flow IAT Max", "Flow IAT Min",
        "Fwd IAT Total", "Fwd IAT Max", "Fwd IAT Min",
        "Bwd IAT Total", "Bwd IAT Max", "Bwd IAT Min",
        "Fwd PSH Flags", "Bwd PSH Flags", "Fwd URG Flags", "Bwd URG Flags",
        "Packet Length Min", "Packet Length Max",
        "FIN Flag Count", "SYN Flag Count", "RST Flag Count", "PSH Flag Count",
        "ACK Flag Count", "URG Flag Count", "CWR Flag Count", "ECE Flag Count",
        "Subflow Fwd Packets", "Subflow Fwd Bytes",
        "Subflow Bwd Packets", "Subflow Bwd Bytes",
        "FWD Init Win Bytes", "Bwd Init Win Bytes",
        "Fwd Act Data Pkts", "Fwd Seg Size Min",
        "Active Max", "Active Min", "Idle Max", "Idle Min",
    )
    narrow_columns = ("Fwd Header Length", "Bwd Header Length", "ICMP Code", "ICMP Type")

    # One column -> dtype lookup; the int32 entries come first, so they are cast before the int16 ones
    target_dtypes = {
        **dict.fromkeys(wide_columns, 'int32'),
        **dict.fromkeys(narrow_columns, 'int16'),
    }
    for column_name, dtype_name in target_dtypes.items():
        frame[column_name] = frame[column_name].astype(dtype_name)

    return frame


def label_rest_as_benign_and_write_csv(df, input_path,print_index=False):
    """
    Final labelling step: every flow still tagged "NeedManualLabel" becomes "BENIGN",
    then the dataframe is saved as a CSV file.

    The file is written to a 'ready' directory located next to the directory that
    holds input_path (created when missing). Its name is the input file name with
    "_fix_ord.pcap_Flow" replaced by "-labeled".
    When print_index is true the rows are renumbered from 1 and that index is written
    as a leading 'id' column; otherwise no index column is written.
    The "Label" column (and, with print_index, the index) of df is modified in place.
    """
    still_unlabelled = df["Label"] == "NeedManualLabel"
    df["Label"] = df["Label"].where(~still_unlabelled, "BENIGN")
    print("label count after labelling:\r\n", df["Label"].value_counts())

    # <parent of the input file's directory>/ready
    source_dir, source_file = os.path.split(input_path)
    ready_dir = os.path.join(os.path.dirname(source_dir), 'ready')
    os.makedirs(ready_dir, exist_ok=True)

    # Swap the capture suffix for "-labeled" while keeping the extension
    stem, extension = os.path.splitext(source_file)
    output_path = os.path.join(ready_dir, stem.replace("_fix_ord.pcap_Flow", "-labeled") + extension)

    if not print_index:
        df.to_csv(output_path, index=False)
        return

    # Number the rows 1..n and emit that numbering as the first column
    df.reset_index(drop=True, inplace=True)
    df.index = df.index + 1
    df.index.name = 'id'
    df.to_csv(output_path)

def sort_df(_df):
    """    
    return sorted dataframe vertically with respect to TimeStamps
    """    
    new_df_sorted = _df.sort_values(by='Timestamp', ascending=True)
    new_df_sorted = new_df_sorted.reset_index(drop=True)
    return new_df_sorted


    

#### Label All csv files in benign folder as "BENIGN"

Uncomment the next lines of code if the benign dataset has not been labeled yet.


In [None]:
# for _dir in modbus.dataset["benign_dataset_dir"]:
#     label_rest_as_benign_and_write_csv(read_csvs_from_path_and_reformat(_dir) ,_dir)
                    

labels after pre-processing:
 Label
NeedManualLabel    153041
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    153041
Name: count, dtype: int64
labels after pre-processing:
 Label
NeedManualLabel    153868
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    153868
Name: count, dtype: int64
labels after pre-processing:
 Label
NeedManualLabel    152394
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    152394
Name: count, dtype: int64
labels after pre-processing:
 Label
NeedManualLabel    153812
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    153812
Name: count, dtype: int64
labels after pre-processing:
 Label
NeedManualLabel    71348
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    71348
Name: count, dtype: int64
labels after pre-processing:
 Label
NeedManualLabel    153378
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    153378
Name: count, dtype: int64


#### Attack labeling depends on the `Attack` and `Timestamp` columns of the attack logs



In the logs, only completed attacks are reliable. For example:
```md
2023-02-01 13:06:29.2, Brute force or specific coil. Address: 0
2023-02-01 13:06:29.201, Brute force or specific - Complete
```
🟢 This is fine (the attack has a matching `Complete` entry), but
```md
2023-02-01 13:06:29.2, Brute force or specific coil. Address: 0
```
🔴 this attack may not actually have been carried out at all!

❗Unfortunately, the attack/attacker logs are not precisely timed.
The `Complete` timestamp of an attack may come before the end of the flow/connection, or up to about 2 minutes after it!
The timestamp in the log may also be earlier than the first packet (SYN) of the flow in the .pcap file.
The timestamps recorded for the same flow can differ between nodes, which makes automatic labeling hard.


In [11]:
def clean_label(label_str):
    """
    Turn a raw attack-log message into a canonical upper-case label.

    Filler words and punctuation (BASELINE, STARTING, ATTACK, ADDRESS, Complete,
    "or specific", coil, range, commas, hyphens, dots) and everything from the last
    colon onwards are removed case-insensitively, whitespace is collapsed, the text is
    upper-cased and a few attack names are mapped to their final spelling.
    """
    renames = (
        ('REPLAY', 'BASELINE REPLAY'),
        ('STACKED MODBUS FRAMES', 'FRAME STACKING'),
        ('FALSE DATA INJECTION', 'PAYLOAD INJECTION'),
    )

    noise_pattern = r'BASELINE|STARTING|ATTACK|ADDRESS|Complete|or specific|coil|RANGE|range|,|-|\.|:[^:]*$'
    without_noise = re.sub(noise_pattern, '', label_str, flags=re.IGNORECASE)
    single_spaced = re.sub(r'\s+', ' ', without_noise).strip()
    # Hyphens were already stripped by the noise pattern, so this trailing-dash cleanup finds nothing to remove
    result = re.sub(r'\s-\s*$', '', single_spaced).upper()

    # Applied in order on the upper-cased text
    for needle, replacement in renames:
        result = result.replace(needle, replacement)

    return result

def log_pairs_from_dir(_attack_log_dirs):
    """
    Extract (start, end, attack label) tuples from the attack logs (csv), ready to be
    used as the second argument of label_flows.

    _attack_log_dirs: passed unchanged to merge_datasets; the merged frame is sorted by
        'Timestamp' and must provide the 'Timestamp' and 'Attack' columns.

    Two kinds of tuples are produced, in this order:
      1. For every row whose 'Attack' text contains 'Complete' or 'Sending' and that has
         a predecessor: start is the previous row's timestamp, end is the row's own
         timestamp, and the label comes from the row itself.
      2. For every row that is neither such a marker nor directly followed by one:
         start == end == the row's own timestamp.

    Returns a list of (np.datetime64[us], np.datetime64[us], str) tuples.
    """
    # Sort the log because overlapping flow durations can result in multiple labels.
    # Priority: earlier entries take precedence when several of them match the same flow.
    attack_log_df = sort_df(merge_datasets(_attack_log_dirs))

    # Rows that close an attack: 'Complete' entries ...
    complete_mask = attack_log_df['Attack'].str.contains('Complete', na=False)
    # ... and 'Sending' entries (responses sent to the client in the compromised-ied logs)
    complete_mask |= attack_log_df['Attack'].str.contains('Sending', na=False)

    def _timestamp_us(row):
        """Timestamp of the given log row as a microsecond-resolution numpy datetime64."""
        return np.datetime64(attack_log_df.at[row, 'Timestamp'], 'us')

    pairs = []
    for i in attack_log_df.index[complete_mask]:
        # Row 0 has no predecessor that could provide the start timestamp
        if i > 0:
            label = clean_label(attack_log_df.at[i, 'Attack'])
            pairs.append((_timestamp_us(i - 1), _timestamp_us(i), label))

    # Rows that are not a closing marker and are not the entry right before one
    no_complete_mask = ~(complete_mask | complete_mask.shift(-1, fill_value=False))

    # These rows only use their own timestamp, so the first log row (index 0) is valid too
    for i in attack_log_df.index[no_complete_mask]:
        moment = _timestamp_us(i)
        label = clean_label(attack_log_df.at[i, 'Attack'])
        pairs.append((moment, moment, label))
    return pairs

def label_flows(_attack_dir,_attack_pairs,src_ip_list=None ,dst_ip_list=None,
                max_start_timestamp_tolerate= np.timedelta64(100_000, 'us'),
                max_end_timestamp_tolarate=np.timedelta64(1_000_000, 'us')):
    """
    Label the flows of one dataset CSV file by matching them against attack-log
    intervals, then write the labelled file via label_rest_as_benign_and_write_csv.

    _attack_dir: path of the flow CSV file to label.
    _attack_pairs: iterable of (start, end, label) tuples as built by log_pairs_from_dir.
    src_ip_list / dst_ip_list: when given, only flows whose "Src IP" / "Dst IP" is in
        the list can receive an attack label.
    max_start_timestamp_tolerate: largest allowed distance (either direction) between
        the logged start and the flow's 'Timestamp'.
    max_end_timestamp_tolarate: largest allowed distance (either direction) between
        the logged end and the flow's end time ('Timestamp' + 'Flow Duration').

    Each log tuple labels at most one flow: the earliest-starting eligible flow that
    satisfies both tolerances. A flow is labelled at most once. Everything left
    unlabelled is marked BENIGN when the file is written.

    'Flow Duration' is interpreted in MICROSECONDS (!). Note that the CSV files are in
    the UTC timezone.
    """

    print("Attack directory ------------------------------",_attack_dir)

    attack_df = read_csvs_from_path_and_reformat(_attack_dir)
    # NOTE(review): this helper column stays in attack_df and is therefore also written to the labelled CSV
    attack_df['end_time'] = attack_df['Timestamp'] + pd.to_timedelta(attack_df['Flow Duration'], unit='us')
    # The binary search below relies on the flows being ordered by start timestamp
    attack_df = sort_df(attack_df)

    # Plain numpy arrays for the matching loop
    starts = attack_df['Timestamp'].values.astype('datetime64[us]')
    ends = attack_df['end_time'].values.astype('datetime64[us]')
    orig_indices = attack_df.index.values

    # Flows that may still receive an attack label. The mask is modified in place below,
    # so take an owned copy: arrays handed out by pandas can be read-only (Copy-on-Write).
    custom_mask = (attack_df['Label'] == "NeedManualLabel").to_numpy(copy=True)
    if src_ip_list is not None:
        custom_mask &= attack_df["Src IP"].isin(src_ip_list).values
    if dst_ip_list is not None:
        custom_mask &= attack_df["Dst IP"].isin(dst_ip_list).values

    # Collect updates to minimize DataFrame operations
    updates = []

    # Process each attack pair
    for start_p, end_p, new_label in _attack_pairs:
        # Candidate flows start within max_start_timestamp_tolerate of the logged start;
        # they are located by binary search on the sorted start timestamps.
        start_left_interval = bisect_left(starts, start_p - max_start_timestamp_tolerate)
        end_right_interval = bisect_right(starts, start_p + max_start_timestamp_tolerate)

        for idx in range(start_left_interval,end_right_interval):
            # The flow must still be eligible and must end within
            # max_end_timestamp_tolarate of the logged end
            if custom_mask[idx] and (np.abs(ends[idx] - end_p)<=max_end_timestamp_tolarate ):
                updates.append((orig_indices[idx], new_label))
                custom_mask[idx] = False  # a flow is labelled at most once
                break  # Only label the first matching flow

    # Apply all updates at once
    if updates:
        update_indices, update_labels = zip(*updates)
        attack_df.loc[list(update_indices), 'Label'] = list(update_labels)

    # Finalize by labeling remaining flows as benign and writing to CSV
    label_rest_as_benign_and_write_csv(attack_df, _attack_dir)




##### External Attacks 
🟢 Attacks from an unknown IP address (185.175.0.7) targeting 185.175.0.4.

❗The attacks occurred in separable, detectable intervals (no manual checking needed).


In [5]:
# Build the (start, end, label) tuples from the external-attacker logs once and reuse them for every capture
attack_pairs = log_pairs_from_dir(modbus.dataset["attack_log_dir"]["external"])
for att_dir in modbus.dataset["attack_dataset_dir"]["external"] :
    # Only flows sent by 185.175.0.7 can be labelled; both tolerances are 10_000 us (10 ms)
    label_flows(att_dir,attack_pairs,src_ip_list=["185.175.0.7"],
                         max_start_timestamp_tolerate=np.timedelta64(10_000, 'us'),
                         max_end_timestamp_tolarate=np.timedelta64(10_000, 'us'))

Attack directory ------------------------------ ./ModbusDataset/attack/external/network-wide/output/network-wide-normal-1_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    68931
Name: count, dtype: int64
label count after labelling:
 Label
BRUTE FORCE    35407
BENIGN         33524
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/external/network-wide/output/network-wide-normal-0_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    152398
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN                 151746
BRUTE FORCE               615
RECON                      31
QUERY FLOODING              2
LENGTH MANIPULATION         1
BASELINE REPLAY             1
PAYLOAD INJECTION           1
FRAME STACKING              1
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/external/ied4c/ied4c-network-capture/output/veth8bc3408-0_fi

##### Compromised SCADA Attacks 
🔴 Attacks from 185.175.0.3 targeting 185.175.0.4, 185.175.0.5 and 185.175.0.8. Days 12 to 14 of the attack logs are missing from the pcaps.

❗❗The attacks did not occur in separable, detectable intervals and need manual checking.


In [6]:
# Label the compromised-SCADA captures.

dates = ["03-12-2023", "03-13-2023", "03-14-2023"]
# Exclude the logs of days 12, 13 and 14: keep only log paths that contain none of the date strings above
filtered_list = list(filter(lambda x: all(date not in x for date in dates), modbus.dataset["attack_log_dir"]["compromised-scada"]))
attack_pairs = log_pairs_from_dir(filtered_list)

for att_dir in modbus.dataset["attack_dataset_dir"]["compromised-scada"]  :
    # Only flows from 185.175.0.3 to one of the three listed hosts can be labelled;
    # start tolerance is 1_000 us (1 ms), end tolerance is 100_000_000 us (100 s)
    label_flows(att_dir,attack_pairs,src_ip_list=["185.175.0.3"],
                dst_ip_list=["185.175.0.4","185.175.0.5","185.175.0.8"],
             max_start_timestamp_tolerate=np.timedelta64(1_000, 'us'),
                         max_end_timestamp_tolarate=np.timedelta64(100_000_000, 'us'))




Attack directory ------------------------------ ./ModbusDataset/attack/compromised-scada/ied4c/ied4c-network-captures/output/vethe685ac9-4_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    160689
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN             160588
BRUTE FORCE            99
FRAME STACKING          1
BASELINE REPLAY         1
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/compromised-scada/ied4c/ied4c-network-captures/output/vethe685ac9-3_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    158034
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN                 131007
BRUTE FORCE             26868
RECON                      47
PAYLOAD INJECTION          35
QUERY FLOODING             28
FRAME STACKING             22
BASELINE REPLAY            20
LENGTH MANIPULATION         7
Name: count, dtype: int64
Attack directory -----------------

##### Compromised IED Attacks
🟢 The SCADA initiates the connections, so attacks from 185.175.0.5 targeting 185.175.0.2 run in the reverse direction of the flow.
❗The attacks occur in separable, detectable intervals, but there are only a few of them.


In [12]:
# Build the (start, end, label) tuples from the compromised-IED logs once and reuse them for every capture
attack_pairs = log_pairs_from_dir(modbus.dataset["attack_log_dir"]["compromised-ied"])
for att_dir in modbus.dataset["attack_dataset_dir"]["compromised-ied"]  :
    # Only flows from 185.175.0.2 to 185.175.0.5 can be labelled;
    # start tolerance is 3_000_000 us (3 s), end tolerance is 100_000_000 us (100 s)
    label_flows(att_dir,attack_pairs,src_ip_list=["185.175.0.2"],
                dst_ip_list=["185.175.0.5"],
             max_start_timestamp_tolerate=np.timedelta64(3_000_000, 'us'),
                         max_end_timestamp_tolarate=np.timedelta64(100_000_000, 'us'))


Attack directory ------------------------------ ./ModbusDataset/attack/compromised-ied/ied4c/ied4c-network-captures/output/vethe685ac9-4_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    176157
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    176157
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/compromised-ied/ied4c/ied4c-network-captures/output/vethe685ac9-3_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    173315
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    173315
Name: count, dtype: int64
Attack directory ------------------------------ ./ModbusDataset/attack/compromised-ied/ied4c/ied4c-network-captures/output/vethe685ac9-1_fix_ord.pcap_Flow.csv
labels after pre-processing:
 Label
NeedManualLabel    176179
Name: count, dtype: int64
label count after labelling:
 Label
BENIGN    176179
Name: count, dtype: int64
Attack directory -----

#### Save the ready folders in a zip archive for later use.

In [13]:


def create_archive(archive_name, files_to_archive):
    """
    Write the given files into a new zip archive.

    archive_name: path of the zip file to create; it is opened in 'w' mode.
    files_to_archive: iterable of file paths. Paths that do not exist are skipped
        and reported with a printed warning.
    """
    with zipfile.ZipFile(archive_name, mode='w') as archive:
        for path in files_to_archive:
            if not os.path.exists(path):
                print(f"Warning: File not found: {path}")
                continue
            archive.write(path)

# NOTE(review): presumably indexes the labelled CSV files inside the 'ready' folders — confirm in modbus.py
modbus_ready = ModbusDataset(root_dir,"ready")
# Work on a copy so that extending the list does not also grow modbus_ready.dataset["benign_dataset_dir"]
files_to_archive = list(modbus_ready.dataset["benign_dataset_dir"])
files_to_archive.extend(modbus_ready.dataset["attack_dataset_dir"]["total"])
# Number of dataset files that the "output" index expects to be labelled
dataset_total_number =modbus.dataset["metadata"]["founded_files_num"]["total_dataset_num"]
if len(files_to_archive)==dataset_total_number:
    print("all dataset csv files are labeled and ready to use!")
    archive_name = 'Labeled_CICMODBUS2023.zip'
    create_archive(archive_name, files_to_archive)
else:
    # Report how many labelled files are still missing
    print(dataset_total_number-len(files_to_archive)," files are not ready yet! ")


all dataset csv files are labeled and ready to use!
