# Base de Dados

## POR TEMPO

In [None]:
import os
import pandas as pd
import numpy as np
import math
from collections import Counter
from datetime import datetime
from typing import List, Dict, Union


In [None]:
DATASET_PATH = 'datasets/CICDDoS2019/'

ATTACK_ORDER = {
    '03-11': [
        'Portmap.csv', 'NetBIOS.csv', 'LDAP.csv', 'MSSQL.csv', 'UDP.csv', 'UDPLag.csv', 'Syn.csv'
    ],
    '01-12': [
        'DrDoS_NTP.csv', 'DrDoS_DNS.csv', 'DrDoS_LDAP.csv', 'DrDoS_MSSQL.csv', 'DrDoS_NetBIOS.csv', 'DrDoS_SNMP.csv', 'DrDoS_SSDP.csv', 'DrDoS_UDP.csv', 
        'UDPLag.csv', 'Syn.csv', 'TFTP.csv' 
    ]
}

OUTPUT_FILES = {
    '03-11': 'CICDDoS2019_03_11_Aggregated_Features_1sWindow.csv', 
    '01-12': 'CICDDoS2019_01_12_Aggregated_Features_1sWindow.csv' 
}

# Tamanho do chunking 
PANDAS_CHUNK_SIZE = 100000 

# Tamanho da janela temporal 
TIME_WINDOW_SECONDS = 0.05

# Colunas que serão tratadas de forma especial
TIMESTAMP_COL = 'Timestamp'
ATTACK_LABEL_COL = 'Label' 

# Colunas para cálculo de Entropia 
DIVERSITY_COLS = ['Source IP', 'Destination IP', 'Source Port', 'Destination Port', 'Protocol'] 

# Colunas lixo a ser removidas 
COLUMNS_TO_DROP = ['Unnamed: 0', 'Flow ID', 'SimillarHTTP']

# Constante para o rótulo Benigno
BENIGN_LABEL = 'BENIGN'

In [4]:
def clean_column_names(columns: pd.Index) -> List[str]:
    return [col.replace(' ', '_') for col in columns]

def shannon_entropy(data: pd.Series) -> float:
    if data.empty:
        return 0.0
    
    data = data.astype(str).dropna()
    counts = data.value_counts(normalize=True)
    entropy = -np.sum(counts * np.log2(counts))
    return entropy

def aggregate_window_by_time(df_window: pd.DataFrame) -> pd.Series:
    timestamp_col_clean = TIMESTAMP_COL.replace(' ', '_')
    attack_label_col_clean = ATTACK_LABEL_COL.replace(' ', '_')
    diversity_cols_clean = [col.replace(' ', '_') for col in DIVERSITY_COLS]
    
    # Identifica colunas numéricas 
    numeric_cols = [col for col in df_window.columns 
                    if col not in [timestamp_col_clean, attack_label_col_clean] + diversity_cols_clean]

    # Calcular a Média para Colunas Numéricas
    numeric_data = df_window[numeric_cols].apply(pd.to_numeric, errors='coerce')
    aggregated_row = numeric_data.mean(axis=0)

    # Agregação Temporal
    try:
        # Garante que o Timestamp seja tratado como datetime, erros se tornam NaT
        timestamps = pd.to_datetime(df_window[timestamp_col_clean], errors='coerce')
        valid_timestamps = timestamps.dropna()
        
        delta_seconds = 0.0
        start_time = None
        
        if len(valid_timestamps) >= 2:
            delta_seconds = (valid_timestamps.iloc[-1] - valid_timestamps.iloc[0]).total_seconds()
        
        if not valid_timestamps.empty:
            start_time = valid_timestamps.iloc[0].strftime('%Y-%m-%d %H:%M:%S.%f')
        
    except Exception:
        delta_seconds = 0.0 
        start_time = 'Error'

    aggregated_row[f'{timestamp_col_clean}_Delta_Seconds'] = delta_seconds
    aggregated_row[f'{timestamp_col_clean}_Start'] = start_time
    
    if timestamp_col_clean in aggregated_row.index:
         aggregated_row = aggregated_row.drop(timestamp_col_clean)

    # Entropia de Shannon 
    for col_clean in diversity_cols_clean:
        entropy = shannon_entropy(df_window[col_clean].astype(str).dropna())
        aggregated_row[f'{col_clean}_Shannon_Entropy'] = entropy


      # O rótulo final é o mais frequente no intervalo de 1 segundo
    attack_labels = df_window[attack_label_col_clean].astype(str).str.strip().str.upper().replace('NAN', BENIGN_LABEL).dropna()
    final_string_label = attack_labels.mode().iloc[0] if not attack_labels.empty else BENIGN_LABEL

    # MANTÉM O RÓTULO AGREGADO NO LUGAR DA COLUNA ORIGINAL
    aggregated_row[attack_label_col_clean] = final_string_label 
    aggregated_row['Window_Packet_Count'] = len(df_window)
    
    return aggregated_row.to_frame().T


def concatenate_and_aggregate(date_folder: str, output_filename: str, data_path: str, file_names: List[str]):
    ordered_files = []
    base_path = os.path.join(data_path, date_folder)
    
    if not os.path.isdir(base_path):
        print(f"O caminho base '{base_path}' não foi encontrado.")
        return
    for file_name in file_names:
        file_path = os.path.join(base_path, file_name)
        if os.path.exists(file_path):
            ordered_files.append(file_path)
    if not ordered_files:
        print(f"Nenhum arquivo encontrado para o padrão '{base_path}/*.csv'.")
        return
    
    print(f"\nProcessando data: {date_folder} ({len(ordered_files)} arquivos)")
    print(f"-> Escrevendo o arquivo agregado para: {output_filename}")
    
    global_first_write = True
    
    # Buffer unificado para janelamento temporal
    aggregation_buffer = pd.DataFrame() 

    for file_path in ordered_files:
        print(f"-> Concatenando e agregando arquivo: {os.path.basename(file_path)}")
        
        try:
            chunker = pd.read_csv(
                file_path, 
                chunksize=PANDAS_CHUNK_SIZE, 
                low_memory=False, 
                skipinitialspace=True,
                dtype=str 
            )
        except Exception as e:
            print(f"!!! ERRO ao abrir o arquivo {os.path.basename(file_path)}: {e}")
            continue

        for chunk in chunker:
            
            chunk.columns = clean_column_names(chunk.columns)
            
            # Remove as colunas
            cols_to_drop_clean = [col.replace(' ', '_') for col in COLUMNS_TO_DROP]
            chunk = chunk.drop(columns=cols_to_drop_clean, errors='ignore')
            
            # Concatena o chunk lido ao buffer
            aggregation_buffer = pd.concat([aggregation_buffer, chunk], ignore_index=True)
            
            # Converte timestamps no buffer 
            buffer_ts = pd.to_datetime(aggregation_buffer[TIMESTAMP_COL.replace(' ', '_')], errors='coerce')
            valid_ts_buffer = buffer_ts.dropna()
            
            if valid_ts_buffer.empty:
                continue

            # Início da janela de tempo é o primeiro timestamp válido no buffer
            start_time = valid_ts_buffer.iloc[0]
            
            # Encontra o índice da última linha que está DENTRO da janela de 1 segundo
            time_diffs = (valid_ts_buffer - start_time).dt.total_seconds()
            
            # Índices de todas as linhas que estão dentro da janela de 1s
            window_indices = time_diffs[time_diffs < TIME_WINDOW_SECONDS].index
            
            # Verifica se há linhas suficientes para fechar uma janela de tempo
            while not window_indices.empty:
                # O último índice válido que faz parte da janela de 1s
                last_index_in_window = window_indices[-1]

                # A janela é tudo do início até o último índice
                window = aggregation_buffer.iloc[:last_index_in_window + 1]
                
                # Agrega a janela
                aggregated_row_df = aggregate_window_by_time(window)
                
                # Escrita
                header = global_first_write
                mode = 'w' if global_first_write else 'a'
                aggregated_row_df.to_csv(output_filename, mode=mode, header=header, index=False)
                global_first_write = False
                
                # Remove a janela processada do buffer e reseta o index
                aggregation_buffer = aggregation_buffer.iloc[last_index_in_window + 1:].reset_index(drop=True)

                # Re-calcula os tempos e índices para o próximo loop
                buffer_ts = pd.to_datetime(aggregation_buffer[TIMESTAMP_COL.replace(' ', '_')], errors='coerce')
                valid_ts_buffer = buffer_ts.dropna()
                
                if valid_ts_buffer.empty:
                    window_indices = pd.Index([]) # Força saída do while
                else:
                    start_time = valid_ts_buffer.iloc[0]
                    time_diffs = (valid_ts_buffer - start_time).dt.total_seconds()
                    window_indices = time_diffs[time_diffs < TIME_WINDOW_SECONDS].index
    
    # Processa o que sobrou no buffer como a última janela 
    if not aggregation_buffer.empty and global_first_write:
        # Se o buffer não estiver vazio e não houver sido escrito nada
        aggregated_row_df = aggregate_window_by_time(aggregation_buffer)
        aggregated_row_df.to_csv(output_filename, mode='w', header=True, index=False)
    elif not aggregation_buffer.empty:
        # Se o buffer não estiver vazio e já houver sido escrito algo
        aggregated_row_df = aggregate_window_by_time(aggregation_buffer)
        aggregated_row_df.to_csv(output_filename, mode='a', header=False, index=False)


    print(f"--- Processamento concluído para {date_folder}. O arquivo '{output_filename}' foi criado. ---")


In [None]:
if __name__ == '__main__':
    for date, output_file in OUTPUT_FILES.items():
        concatenate_and_aggregate(date, output_file, DATASET_PATH, ATTACK_ORDER[date])

## Análise dos datasets gerados

In [1]:
import pandas as pd
import numpy as np

In [4]:
FILE_NAME = 'datasets/CICDDoS2019/03-11/CICDDoS2019_03_11_Aggregated_Features_1sWindow.csv'
df = pd.read_csv(FILE_NAME)
print(f"Amostras: {df.shape[0]}")
print(f"\n{df['Label'].value_counts()}")
print(f"\nColunas: {print(df.columns)}")
print(f"\n{df['Window_Packet_Count'].sum()}")
df.head()

Amostras: 10341

Label
BENIGN     3923
SYN        2937
MSSQL       935
UDP         853
LDAP        548
NETBIOS     521
PORTMAP     313
UDPLAG      311
Name: count, dtype: int64
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
       'Total_Length_of_Fwd_Packets', 'Total_Length_of_Bwd_Packets',
       'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Min',
       'Fwd_Packet_Length_Mean', 'Fwd_Packet_Length_Std',
       'Bwd_Packet_Length_Max', 'Bwd_Packet_Length_Min',
       'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
       'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
       'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
       'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
       'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Flags',
       'Bwd_PSH_Flags', 'Fwd_URG_Flags', 'Bwd_URG_Flags', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Min_

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,...,Inbound,Timestamp_Delta_Seconds,Timestamp_Start,Source_IP_Shannon_Entropy,Destination_IP_Shannon_Entropy,Source_Port_Shannon_Entropy,Destination_Port_Shannon_Entropy,Protocol_Shannon_Entropy,Label,Window_Packet_Count
0,114457000.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2018-11-03 09:18:16.964447,-0.0,-0.0,-0.0,-0.0,-0.0,BENIGN,1
1,41209820.0,13.888889,5.111111,209.333333,30.666667,50.666667,37.333333,41.777778,6.885304,15.333333,...,0.222222,0.64933,2018-11-03 09:18:18.506537,1.879965,2.725481,2.725481,2.19716,1.224394,BENIGN,9
2,34432080.0,6.0,2.0,116.0,92.0,46.0,6.0,19.333333,20.655911,46.0,...,0.0,2.6e-05,2018-11-03 09:18:20.610850,-0.0,1.584963,1.584963,-0.0,-0.0,BENIGN,3
3,3687.909,2.636364,1.181818,43.818182,3.272727,19.090909,0.0,11.868182,10.147708,0.545455,...,0.454545,0.107515,2018-11-03 09:18:26.989249,1.685816,1.858555,1.858555,1.685816,-0.0,BENIGN,11
4,34.25,3.0,0.5,38.5,0.0,19.25,0.0,11.5625,10.90139,0.0,...,0.5,0.053799,2018-11-03 09:18:27.989374,1.0,1.0,1.0,1.0,-0.0,BENIGN,4


In [24]:
FILE_NAME = 'datasets/CICDDoS2019/01-12/CICDDoS2019_01_12_Aggregated_Features_1sWindow.csv'
df = pd.read_csv(FILE_NAME)
print(f"Amostras: {df.shape[0]}")
print(f"\n{df['Label'].value_counts()}")
print(f"\nColunas: {print(df.columns)}")
df.head()

Amostras: 17213

Label
TFTP             4212
BENIGN           3690
DRDOS_NTP        1800
DRDOS_DNS        1429
DRDOS_UDP        1354
DRDOS_SNMP        946
DRDOS_SSDP        809
DRDOS_NETBIOS     698
UDP-LAG           683
DRDOS_MSSQL       680
DRDOS_LDAP        598
SYN               246
WEBDDOS            68
Name: count, dtype: int64
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
       'Total_Length_of_Fwd_Packets', 'Total_Length_of_Bwd_Packets',
       'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Min',
       'Fwd_Packet_Length_Mean', 'Fwd_Packet_Length_Std',
       'Bwd_Packet_Length_Max', 'Bwd_Packet_Length_Min',
       'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
       'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
       'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
       'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
       'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Fla

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,Bwd_Packet_Length_Max,...,Inbound,Timestamp_Delta_Seconds,Timestamp_Start,Source_IP_Shannon_Entropy,Destination_IP_Shannon_Entropy,Source_Port_Shannon_Entropy,Destination_Port_Shannon_Entropy,Protocol_Shannon_Entropy,Label,Window_Packet_Count
0,8932564.0,8.5,5.5,964.0,2145.0,254.5,0.0,80.333333,118.939124,536.5,...,1.0,0.021826,2018-12-01 09:17:11.183810,-0.0,-0.0,1.0,-0.0,-0.0,DRDOS_NTP,2
1,38109746.0,18.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.666667,0.836344,2018-12-01 09:17:12.634569,0.918296,0.918296,0.918296,1.584963,0.918296,BENIGN,3
2,56463455.25,14.5,13.5,1493.5,219.0,741.25,0.0,53.414664,165.825649,109.5,...,0.0,0.312552,2018-12-01 09:17:13.931484,-0.0,2.0,2.0,1.0,-0.0,BENIGN,4
3,29164144.5,12.25,2.5,188.5,536.5,94.25,0.0,23.5625,43.629272,268.25,...,0.5,0.589891,2018-12-01 09:17:16.230610,1.5,1.5,2.0,1.5,0.811278,DRDOS_NTP,4
4,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.64193,2018-12-01 09:17:17.363505,-0.0,-0.0,-0.0,1.0,-0.0,BENIGN,2


In [26]:
print(f"\n{df['Timestamp_Delta_Seconds'].sum()}")


11380.057132000002


# Algoritmos

## LeveragingBagging

https://capymoa.org/api/modules/capymoa.classifier.LeveragingBagging.html

## HoeffdingAdaptiveTree

https://capymoa.org/api/modules/capymoa.classifier.HoeffdingAdaptiveTree.html