In [2]:
import pandas as pd
import numpy as np
import os 

### File preparation

In [3]:
# All dataset locations are resolved relative to the current working directory.
CWD = os.getcwd()
DATA_FOLDER = os.path.join(CWD, 'data')
# The two dataset folders are siblings directly under DATA_FOLDER.
MACHINE_LEARNING_CVE_FOLDER, TRAFFIC_LABELLING_FOLDER = (
    os.path.join(DATA_FOLDER, sub_folder)
    for sub_folder in ('MachineLearningCVE', 'TrafficLabelling')
)

In [4]:
def get_file_path(folder, file):
    """Return the path obtained by joining `file` onto `folder`.

    Parameters
    ----------
    folder : str
        Directory part of the path.
    file : str
        File name (or relative path) to append to `folder`.

    Returns
    -------
    str
        The result of ``os.path.join(folder, file)``.
    """
    full_path = os.path.join(folder, file)
    return full_path

In [5]:
# os.listdir() returns entries in arbitrary order; sort them so the listings
# shown below are reproducible across runs and platforms.
ml_cve_files = sorted(os.listdir(MACHINE_LEARNING_CVE_FOLDER))
traffic_lab_files = sorted(os.listdir(TRAFFIC_LABELLING_FOLDER))

In [6]:
# Show the entries found in MACHINE_LEARNING_CVE_FOLDER.
ml_cve_files

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Monday-WorkingHours.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 'Tuesday-WorkingHours.pcap_ISCX.csv',
 'Wednesday-workingHours.pcap_ISCX.csv']

In [7]:
# Show the entries found in TRAFFIC_LABELLING_FOLDER.
traffic_lab_files

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Monday-WorkingHours.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 'Tuesday-WorkingHours.pcap_ISCX.csv',
 'Wednesday-workingHours.pcap_ISCX.csv']

### Load CSV files into a dict of DataFrames keyed by filename

In [None]:
def load_csv_into_dict(folder) -> dict[str, pd.DataFrame]:
    """Read every CSV file directly inside `folder` into a DataFrame.

    Only regular files whose name ends in ``.csv`` (case-insensitive) are
    loaded; sub-directories and other files are skipped so that stray
    entries in the data folder do not break or pollute the load.

    Each file is first read with pandas' default encoding. If that raises
    ``UnicodeDecodeError`` the file is re-read as ISO-8859-1.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).

    Returns
    -------
    dict[str, pd.DataFrame]
        Maps each CSV file name (without the folder part) to its DataFrame.
        Keys are inserted in sorted file-name order.

    Raises
    ------
    OSError
        If `folder` cannot be listed (propagated from ``os.listdir``).
    """
    dfs = {}
    # Sort because os.listdir() gives no ordering guarantee.
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)

        # Skip directories and anything that is not a CSV file.
        if not file_name.lower().endswith('.csv') or not os.path.isfile(file_path):
            continue

        try:
            df = pd.read_csv(file_path)
        except UnicodeDecodeError:
            # ISO-8859-1 maps every byte to a character, so this retry cannot
            # fail on decoding. NOTE(review): if the file is really in another
            # encoding, non-ASCII text may come out garbled - verify labels.
            df = pd.read_csv(file_path, encoding='ISO-8859-1')

        dfs[file_name] = df

    return dfs

In [10]:
# Load the MachineLearningCVE folder into a dict of DataFrames keyed by file name.
ml_cve_dfs = load_csv_into_dict(MACHINE_LEARNING_CVE_FOLDER)

In [11]:
# traffic_lab_dfs = load_csv_into_dict(TRAFFIC_LABELLING_FOLDER)

### Columns

In [13]:
# Use the Wednesday capture as the working frame for the column inspection below.
df = ml_cve_dfs['Wednesday-workingHours.pcap_ISCX.csv']

In [19]:
# List the column names. Note that many of them carry a leading space
# (e.g. ' Source IP'), so column lookups must include it.
df.columns.to_list()

['Flow ID',
 ' Source IP',
 ' Source Port',
 ' Destination IP',
 ' Destination Port',
 ' Protocol',
 ' Timestamp',
 ' Flow Duration',
 ' Total Fwd Packets',
 ' Total Backward Packets',
 'Total Length of Fwd Packets',
 ' Total Length of Bwd Packets',
 ' Fwd Packet Length Max',
 ' Fwd Packet Length Min',
 ' Fwd Packet Length Mean',
 ' Fwd Packet Length Std',
 'Bwd Packet Length Max',
 ' Bwd Packet Length Min',
 ' Bwd Packet Length Mean',
 ' Bwd Packet Length Std',
 'Flow Bytes/s',
 ' Flow Packets/s',
 ' Flow IAT Mean',
 ' Flow IAT Std',
 ' Flow IAT Max',
 ' Flow IAT Min',
 'Fwd IAT Total',
 ' Fwd IAT Mean',
 ' Fwd IAT Std',
 ' Fwd IAT Max',
 ' Fwd IAT Min',
 'Bwd IAT Total',
 ' Bwd IAT Mean',
 ' Bwd IAT Std',
 ' Bwd IAT Max',
 ' Bwd IAT Min',
 'Fwd PSH Flags',
 ' Bwd PSH Flags',
 ' Fwd URG Flags',
 ' Bwd URG Flags',
 ' Fwd Header Length',
 ' Bwd Header Length',
 'Fwd Packets/s',
 ' Bwd Packets/s',
 ' Min Packet Length',
 ' Max Packet Length',
 ' Packet Length Mean',
 ' Packet Length Std'

### Unique values in each column

In [16]:
# For every column of `df`, print its name followed by its distinct values,
# with a blank line separating consecutive columns.
for column in df.columns:
    print(f'{column = }:', df[column].unique(), sep='\n', end='\n\n')

column = 'Flow ID':
['192.168.10.14-209.48.71.168-49459-80-6'
 '192.168.10.3-192.168.10.17-389-49453-6'
 '192.168.10.3-192.168.10.17-88-46124-6' ...
 '192.168.10.3-192.168.10.14-53-51114-17'
 '192.168.10.3-192.168.10.16-53-24054-17'
 '192.168.10.3-192.168.10.14-53-51694-17']

column = ' Source IP':
['192.168.10.14' '192.168.10.17' '192.168.10.16' ... '104.17.27.15'
 '52.55.1.224' '54.165.169.242']

column = ' Source Port':
[49459 49453 46124 ...  9015 27034 29671]

column = ' Destination IP':
['209.48.71.168' '192.168.10.3' '91.189.88.161' ... '94.31.29.99'
 '88.85.71.60' '17.253.20.125']

column = ' Destination Port':
[   80   389    88 ... 45294 46356 37907]

column = ' Protocol':
[ 6 17  0]

column = ' Timestamp':
['5/7/2017 8:42' '5/7/2017 8:43' '5/7/2017 8:44' '5/7/2017 8:45'
 '5/7/2017 8:46' '5/7/2017 8:47' '5/7/2017 8:48' '5/7/2017 8:49'
 '5/7/2017 8:50' '5/7/2017 8:51' '5/7/2017 8:52' '5/7/2017 8:53'
 '5/7/2017 8:54' '5/7/2017 8:55' '5/7/2017 8:56' '5/7/2017 8:57'
 '5/7/2017 8:

### Label values in each dataset file

In [21]:
# Print the distinct values of the ' Label' column for every loaded file.
# The column name really does start with a space.
# NOTE(review): this loop rebinds the notebook-level `df`; after it runs, `df`
# refers to the last DataFrame in `ml_cve_dfs`.
for file_name, df in ml_cve_dfs.items():
    print(f'{file_name = }')
    # The outer quotes are double so the inner ' Label' literal does not end
    # the f-string: reusing the same quote is a SyntaxError before Python 3.12.
    print(f"Label values: {df[' Label'].unique()}")
    print()

file_name = 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
Label values: ['BENIGN' 'DDoS']

file_name = 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'
Label values: ['BENIGN' 'PortScan']

file_name = 'Friday-WorkingHours-Morning.pcap_ISCX.csv'
Label values: ['BENIGN' 'Bot']

file_name = 'Monday-WorkingHours.pcap_ISCX.csv'
Label values: ['BENIGN']

file_name = 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv'
Label values: ['BENIGN' 'Infiltration']

file_name = 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv'
Label values: ['BENIGN' 'Web Attack ï¿½ Brute Force' 'Web Attack ï¿½ XSS'
 'Web Attack ï¿½ Sql Injection']

file_name = 'Tuesday-WorkingHours.pcap_ISCX.csv'
Label values: ['BENIGN' 'FTP-Patator' 'SSH-Patator']

file_name = 'Wednesday-workingHours.pcap_ISCX.csv'
Label values: ['BENIGN' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye'
 'Heartbleed']




### Summary of Attack Types

| Category           | Labels                                 |
|--------------------|-----------------------------------------|
| **Brute Force**     | `FTP-Patator`, `SSH-Patator`, `Web Attack – Brute Force` |
| **DoS**             | `DoS Hulk`, `DoS GoldenEye`, `DoS slowloris`, `DoS Slowhttptest` |
| **DDoS**            | `DDoS`                                 |
| **Port Scanning**   | `PortScan`                             |
| **Web Attacks**     | `Web Attack – XSS`, `Web Attack – Sql Injection` |
| **Botnet**          | `Bot`                                  |
| **Infiltration**    | `Infiltration`                         |
| **Exploit**         | `Heartbleed`                           |

### What models can I build?

1) **Binary classifier**  
Detects whether traffic is an attack (1) or benign (0)

2) **Specific attack classifiers**

   2.1) **One model per attack type**  
   Separate binary model for each attack (e.g., DDoS vs benign, PortScan vs benign)

   2.2) **Multiclass classifier**  
   One model that predicts the exact type of attack (e.g., DDoS, PortScan, Infiltration, etc.)
