# Introdução aos Ataques DDoS no Dataset CICDDoS2019

O dataset contém múltiplos cenários de ataques, registrados em arquivos CSV, com detalhes sobre tráfego malicioso e legítimo. Abaixo, são listados os períodos de tempo (em horas e minutos) em que os ataques ocorreram, organizados por dia e tipo de ataque.


## Ataques coloetados no dia (01/12)

```csv
data_path, length, columns, hour
DrDos_NTP.csv, 1217007, 88, 10:35 - 10:45
DrDos_DNS.csv, 5074413, 88, 10:52 - 11:05
DrDos_LDAP.csv, 2181542, 88, 11:22 - 11:32
DrDos_MSSQL.csv, 4524498, 88, 11:36 - 11:45
DrDos_NetBIOS.csv, 4094986, 88, 11:50 - 12:00
DrDos_SNMP.csv, 5161377, 88, 12:12 - 12:23
DrDos_SSDP.csv, 2611374, 88, 12:27 - 12:37
DrDos_UDP.csv, 3136802, 88, 12:45 - 13:09
UDPLag.csv, 370605, 88, 13:11 - 13:15
Syn.csv, 1582681, 88, 13:29 - 13:34
TFTP.csv, 20107827, 88, 13:35 - 17:15
```

## Ataques coloetados no dia (03/11)

```csv
data_path, length, columns, hour
NetBIOS.csv, 3455899, 88, 10:00 - 10:09
LDAP.csv, 2113234, 88, 10:21 - 10:30
MSSQL.csv, 5775786, 88, 10:33 - 10:42
UDP.csv, 3782206, 88,  10:53 - 11:03
UDPLag.csv, 725165, 88, 11:14 - 11:24
Syn.csv, 4320541, 88, 11:28 - 17:35
```

Portmap.csv não tem período de ataque 

# Pré-Processamento UEL - Gerando dados para treinamento

In [None]:
# selected_columns = [
#     ' Source IP',
#     ' Source Port',
#     ' Destination IP',
#     ' Destination Port',
#     ' Timestamp',
#     ' Flow Duration',
#     ' Total Fwd Packets',
#     ' Total Backward Packets',
#     'Total Length of Fwd Packets',
#     ' Total Length of Bwd Packets',
#     'Flow Bytes/s',
#     ' Flow Packets/s',
#     'Fwd Packets/s',
#     ' Bwd Packets/s',
# ]

import pandas as pd
from itertools import cycle
import random

# 1. Carregar os arquivos
teste_ataque = pd.read_csv('data/cic_puro/teste_ataque_ordenado.csv', sep=';')
teste_normal = pd.read_csv('data/cic_puro/teste_sem_ataque_ordenado.csv', sep=';')
treino_ataque = pd.read_csv('data/cic_puro/treino_ataque_ordenado.csv', sep=';')
treino_normal = pd.read_csv('data/cic_puro/treino_sem_ataque_ordenado.csv', sep=';')


# 2. Concatenar para treino e teste
teste_full = pd.concat([teste_normal, teste_ataque], ignore_index=True)
treino_full = pd.concat([treino_normal, treino_ataque], ignore_index=True)

# 3. Separar normais e ataques
def prepare_data(df, max_per_attack=1000, max_normal=5000):
    normal = df[df['label'] == 0].sample(frac=1).reset_index(drop=True)  # embaralhar normais
    attacks = df[df['label'] == 1].reset_index(drop=True)

    # Agora limitar por tipo de ataque
    attack_types = {}
    for name, group in attacks.groupby('attack_name'):
        attack_types[name] = group.sample(n=min(len(group), max_per_attack)).reset_index(drop=True)

    # Limitar normais
    if max_normal is not None:
        normal = normal.sample(n=min(len(normal), max_normal)).reset_index(drop=True)

    return normal, attack_types

train_normal, train_attacks = prepare_data(treino_full, max_per_attack=1000, max_normal=10000)
test_normal, test_attacks = prepare_data(teste_full, max_per_attack=500, max_normal=5000)

# 4. Função para criar sequências aleatórias
def create_random_sequences(normal_df, attack_dict, min_seq=30, max_seq=150):
    final_rows = []
    
    normal_iter = normal_df.iterrows()
    attack_iters = {k: v.iterrows() for k, v in attack_dict.items()}
    attack_cycle = cycle(list(attack_iters.keys()))
    
    normal_remaining = True
    attack_remaining = True

    while normal_remaining or attack_remaining:
        choice = random.choice(['normal', 'attack'])  # Aleatoriamente decidir normal ou ataque primeiro
        
        if choice == 'normal' and normal_remaining:
            seq_len = random.randint(min_seq, max_seq)
            for _ in range(seq_len):
                try:
                    idx, row = next(normal_iter)
                    final_rows.append(row)
                except StopIteration:
                    normal_remaining = False
                    break
        
        elif choice == 'attack' and attack_remaining:
            attack_type = next(attack_cycle)
            seq_len = random.randint(min_seq, max_seq)
            for _ in range(seq_len):
                try:
                    idx, row = next(attack_iters[attack_type])
                    final_rows.append(row)
                except StopIteration:
                    # Se esgotar ataques desse tipo, remover do ciclo
                    del attack_iters[attack_type]
                    if attack_iters:
                        attack_cycle = cycle(list(attack_iters.keys()))
                    else:
                        attack_remaining = False
                    break
        else:
            # Se o tipo escolhido acabou, tenta o outro
            continue

    return pd.DataFrame(final_rows)

# 5. Criar datasets
train_final = create_random_sequences(train_normal, train_attacks, min_seq=30, max_seq=120)
test_final = create_random_sequences(test_normal, test_attacks, min_seq=30, max_seq=120)

# 6. Salvar
train_final.to_csv('treino_final_estratificado_random.csv', sep=';', index=False)
test_final.to_csv('teste_final_estratificado_random.csv', sep=';', index=False)

print('Arquivos treino_final_estratificado_random.csv e teste_final_estratificado_random.csv gerados com sequências aleatórias!')



Arquivos treino_final_estratificado_random.csv e teste_final_estratificado_random.csv gerados com sequências aleatórias!


In [None]:
# Contar a quantidade de cada valor na coluna 'attack_name'
attack_counts_train = train_final['attack_name'].value_counts()
attack_counts_test = test_final['attack_name'].value_counts()

# Exibir os resultados
print('Tamanho:', len(train_final), 'Treino:', attack_counts_train)
print('Total de linhas no conjunto de treino:', len(train_final))

print('Tamanho:', len(test_final), 'Teste:', attack_counts_test)
print('Total de linhas no conjunto de teste:', len(test_final))

Tamanho: 13 Treino: attack_name
normal           8074
DrDoS_DNS        1000
DrDoS_NTP        1000
DrDoS_SNMP       1000
DrDoS_UDP        1000
TFTP             1000
UDP-lag           885
DrDoS_SSDP        822
DrDoS_NetBIOS     726
DrDoS_MSSQL       687
DrDoS_LDAP        592
Syn               237
WebDDoS           125
Name: count, dtype: int64
Tamanho: 8 Teste: attack_name
normal     5000
LDAP        500
MSSQL       500
NetBIOS     500
Syn         500
UDP         500
UDPLag      470
Portmap     449
Name: count, dtype: int64


# Testes CIC

In [17]:
import pandas as pd
import os

def contar_rotulos_csv(lista_arquivos, coluna_label=' Label'):
    for arquivo in lista_arquivos:
        try:
            print(f"\n📄 Analisando: {arquivo}")
            df = pd.read_csv(arquivo, usecols=[coluna_label], low_memory=False)
            
            total_linhas = len(df)
            contagem_rotulos = df[coluna_label].value_counts(dropna=False)
            soma_rotulos = contagem_rotulos.sum()

            print(f"  ➤ Total de linhas: {total_linhas}")
            print("  ➤ Contagem de rótulos:")
            for rotulo, qtd in contagem_rotulos.items():
                print(f"     - {rotulo}: {qtd}")

            if soma_rotulos == total_linhas:
                print("  ✅ A soma de todos os rótulos é igual ao total de linhas.")
            else:
                print("  ❌ A soma de todos os rótulos NÃO é igual ao total de linhas.")

        except Exception as e:
            print(f"❌ Erro ao ler {arquivo}: {e}")


In [None]:
csv_files1 = [
    'data/03-11/attacks_labeled/LDAP_labeled.csv',
    'data/03-11/attacks_labeled/Mssql_labeled.csv',
    'data/03-11/attacks_labeled/NetBIOS_labeled.csv',
    'data/03-11/attacks_labeled/Syn_labeled.csv',
    'data/03-11/attacks_labeled/UDP_labeled.csv',
    'data/03-11/attacks_labeled/UDPLag_labeled.csv'
]

csv_files2 = [
    'data/01-12/attacks_labeled/DrDos_DNS_labeled.csv',
    'data/01-12/attacks_labeled/DrDos_LDAP_labeled.csv',
    'data/01-12/attacks_labeled/DrDos_MSSQL_labeled.csv',
    'data/01-12/attacks_labeled/DrDos_NetBIOS_labeled.csv',
    'data/01-12/attacks_labeled/DrDos_NTP_labeled.csv',
    'data/01-12/attacks_labeled/DrDos_SNMP_labeled.csv',
    'data/01-12/attacks_labeled/DrDos_SSDP_labeled.csv',
    'data/01-12/attacks_labeled/DrDos_UDP_labeled.csv',
    'data/01-12/attacks_labeled/Syn_labeled.csv',
    'data/01-12/attacks_labeled/UDPLag_labeled.csv'   
]

csv_files3 = [
    'data/Full_dataset/03-11/LDAP.csv',
    'data/Full_dataset/03-11/Mssql.csv',
    'data/Full_dataset/03-11/NetBIOS.csv',
    'data/Full_dataset/03-11/Syn.csv',
    'data/Full_dataset/03-11/UDP.csv',
    'data/Full_dataset/03-11/UDPLag.csv'
]

csv_files4 = [
    'data/Full_dataset/01-12/DrDos_DNS.csv',
    'data/Full_dataset/01-12/DrDos_LDAP.csv',
    'data/Full_dataset/01-12/DrDos_MSSQL.csv',
    'data/Full_dataset/01-12/DrDos_NetBIOS.csv',
    'data/Full_dataset/01-12/DrDos_NTP.csv',
    'data/Full_dataset/01-12/DrDos_SNMP.csv',
    'data/Full_dataset/01-12/DrDos_SSDP.csv',
    'data/Full_dataset/01-12/DrDos_UDP.csv',
    'data/Full_dataset/01-12/Syn.csv',
    'data/Full_dataset/01-12/UDPLag.csv'   
]
    
contar_rotulos_csv(csv_files3)



📄 Analisando: data/Full_dataset/03-11/LDAP.csv
  ➤ Total de linhas: 2113234
  ➤ Contagem de rótulos:
     - LDAP: 1905191
     - NetBIOS: 202919
     - BENIGN: 5124
  ✅ A soma de todos os rótulos é igual ao total de linhas.

📄 Analisando: data/Full_dataset/03-11/Mssql.csv
  ➤ Total de linhas: 5775786
  ➤ Contagem de rótulos:
     - MSSQL: 5763061
     - LDAP: 9931
     - BENIGN: 2794
  ✅ A soma de todos os rótulos é igual ao total de linhas.

📄 Analisando: data/Full_dataset/03-11/NetBIOS.csv
  ➤ Total de linhas: 3455899
  ➤ Contagem de rótulos:
     - NetBIOS: 3454578
     - BENIGN: 1321
  ✅ A soma de todos os rótulos é igual ao total de linhas.

📄 Analisando: data/Full_dataset/03-11/Syn.csv
  ➤ Total de linhas: 4320541
  ➤ Contagem de rótulos:
     - Syn: 4284751
     - BENIGN: 35790
  ✅ A soma de todos os rótulos é igual ao total de linhas.

📄 Analisando: data/Full_dataset/03-11/UDP.csv
  ➤ Total de linhas: 3782206
  ➤ Contagem de rótulos:
     - UDP: 3754680
     - MSSQL: 24392
   

In [25]:
import pandas as pd

def visualizar_primeiras_linhas_formatado(caminho_csv, n_linhas=3):
    try:
        print(f"\n📂 Lendo primeiras {n_linhas} linhas do arquivo: {caminho_csv}")
        df = pd.read_csv(caminho_csv, nrows=n_linhas, low_memory=False)

        for idx, linha in df.iterrows():
            print(f"\n🧾 Linha {idx + 1}:")
            for coluna, valor in linha.items():
                print(f"  {coluna:<30} ➜ {valor}")

    except Exception as e:
        print(f"❌ Erro ao ler o arquivo {caminho_csv}: {e}")


In [29]:
csv_files4 = [
    'data/Full_dataset/01-12/DrDos_DNS.csv',
    'data/Full_dataset/01-12/DrDos_LDAP.csv',
    'data/Full_dataset/01-12/DrDos_MSSQL.csv',
    'data/Full_dataset/01-12/DrDos_NetBIOS.csv',
    'data/Full_dataset/01-12/DrDos_NTP.csv',
    'data/Full_dataset/01-12/DrDos_SNMP.csv',
    'data/Full_dataset/01-12/DrDos_SSDP.csv',
    'data/Full_dataset/01-12/DrDos_UDP.csv',
    'data/Full_dataset/01-12/Syn.csv',
    'data/Full_dataset/01-12/UDPLag.csv'   
]

visualizar_primeiras_linhas_formatado('data/Full_dataset/01-12/DrDos_LDAP.csv', n_linhas=1)



📂 Lendo primeiras 1 linhas do arquivo: data/Full_dataset/01-12/DrDos_LDAP.csv

🧾 Linha 1:
  Unnamed: 0                     ➜ 21010
  Flow ID                        ➜ 172.16.0.5-192.168.50.1-0-0-0
   Source IP                     ➜ 172.16.0.5
   Source Port                   ➜ 0
   Destination IP                ➜ 192.168.50.1
   Destination Port              ➜ 0
   Protocol                      ➜ 0
   Timestamp                     ➜ 2018-12-01 11:22:40.254769
   Flow Duration                 ➜ 9141643
   Total Fwd Packets             ➜ 85894
   Total Backward Packets        ➜ 28
  Total Length of Fwd Packets    ➜ 0.0
   Total Length of Bwd Packets   ➜ 0.0
   Fwd Packet Length Max         ➜ 0.0
   Fwd Packet Length Min         ➜ 0.0
   Fwd Packet Length Mean        ➜ 0.0
   Fwd Packet Length Std         ➜ 0.0
  Bwd Packet Length Max          ➜ 0.0
   Bwd Packet Length Min         ➜ 0.0
   Bwd Packet Length Mean        ➜ 0.0
   Bwd Packet Length Std         ➜ 0.0
  Flow Bytes/s          

# Correção do CICDDoS - Calculo da entropia e concatenação de tempo em 1 segundo

# Análise de correlação (Random Forest - RFE)

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

def analyze_csv_with_feature_importance(file_paths, label_column='label', chunk_size=5000, output_file='importancia_colunas_todas.csv'):
    importance_data = []

    for file_path in file_paths:
        print(f"\n🔄 Processando arquivo: {file_path}")
        try:
            reader = pd.read_csv(file_path, chunksize=chunk_size, low_memory=False)
            importance_sum = {}
            total_chunks = 0

            for i, chunk in enumerate(reader):
                print(f"  📦 Chunk {i+1}")
                chunk.dropna(axis=1, how='all', inplace=True)
                numeric_chunk = chunk.select_dtypes(include=[np.number])

                if label_column not in numeric_chunk.columns:
                    print(f"  ⚠️ Coluna '{label_column}' não encontrada, pulando chunk.")
                    continue

                X = numeric_chunk.drop(columns=[label_column])
                y = numeric_chunk[label_column]

                X.replace([np.inf, -np.inf], np.nan, inplace=True)
                X.dropna(inplace=True)
                y = y.loc[X.index]

                if len(X) < 10 or len(set(y)) < 2:
                    print("  ⚠️ Poucos dados ou classes únicas, pulando chunk.")
                    continue

                scaler = StandardScaler()
                X_scaled = scaler.fit_transform(X)

                model = RandomForestClassifier(n_estimators=30, random_state=42)
                model.fit(X_scaled, y)

                importances = model.feature_importances_
                total_importance = importances.sum()
                if total_importance == 0:
                    print("  ⚠️ Importância total zero, pulando.")
                    continue

                importances_percent = 100.0 * (importances / total_importance)

                for col, imp in zip(X.columns, importances_percent):
                    importance_sum[col] = importance_sum.get(col, 0) + imp

                print(f"    ✅ Chunk {i+1} processado com {len(X.columns)} colunas.")
                total_chunks += 1

            if total_chunks == 0:
                print("⚠️ Nenhum chunk válido neste arquivo.")
                continue

            avg_importances = {col: imp / total_chunks for col, imp in importance_sum.items()}
            all_columns = sorted(avg_importances.keys())
            file_name = os.path.basename(file_path)

            row = {'arquivo': file_name}
            for col in all_columns:
                row[col] = avg_importances.get(col, 0.0)

            importance_data.append(row)
            print(f"📊 Importâncias médias para '{file_name}':")
            for k, v in row.items():
                if k != 'arquivo':
                    print(f"    {k}: {v:.2f}%")

        except Exception as e:
            print(f"❌ Erro ao processar {file_path}: {e}")

    if importance_data:
        df_output = pd.DataFrame(importance_data).fillna(0)
        df_output.to_csv(output_file, index=False)
        print(f"\n✅ Arquivo final salvo como: {output_file}")
    else:
        print("⚠️ Nenhum dado foi processado.")


In [12]:
csv_files = [
    'data/03-11/attacks_labeled/LDAP_labeled.csv',
    'data/03-11/attacks_labeled/Mssql_labeled.csv',
    'data/03-11/attacks_labeled/NetBIOS_labeled.csv',
    'data/03-11/attacks_labeled/Syn_labeled.csv',
    'data/03-11/attacks_labeled/UDP_labeled.csv',
    'data/03-11/attacks_labeled/UDPLag_labeled.csv'
]


analyze_csv_with_rfe(csv_files, output_file='importancia_colunas_percentual.csv', label_column='label')



Processando: data/03-11/attacks_labeled/LDAP_labeled.csv


KeyboardInterrupt: 

# LSTM

In [94]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
import torch.optim as optim

SEED = 42
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

class SequenceDataset(Dataset):
    def __init__(self, path, sequence_length, column_to_remove=None):
        df = pd.read_csv(path, sep=';')

        if column_to_remove and column_to_remove in df.columns:
            df = df.drop(columns=[column_to_remove])

        # Separar features e labels
        features = df.iloc[:, :-1].values  # Assume que a última coluna é a label
        labels = df.iloc[:, -1].values

        # Normalizar os dados 
        scaler = StandardScaler()
        features = scaler.fit_transform(features)

        # Criar sequências
        sequences = []
        sequence_labels = []
        for i in range(len(features) - sequence_length + 1):
            seq = features[i:i+sequence_length]
            label = labels[i+sequence_length-1]  
            sequences.append(seq)
            sequence_labels.append(label)

        # Converter para tensores
        self.sequences = torch.tensor(sequences, dtype=torch.float32).view(-1, sequence_length, features.shape[1])
        self.labels = torch.tensor(sequence_labels, dtype=torch.long)
        # self.labels = torch.tensor(sequence_labels, dtype=torch.float32).unsqueeze(1) # Para binário / nn.BCEWithLogitsLoss()

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return sequence, label
    
    
class LSTM_model(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM_model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Primeira camada LSTM
        self.lstm1 = nn.LSTM(input_size=input_size,
                             hidden_size=hidden_size//2,
                             num_layers=num_layers,
                             batch_first=True
                             
                             )
        
        # Segunda camada LSTM
        self.lstm2 = nn.LSTM(input_size=hidden_size//2,
                             hidden_size=hidden_size,
                             num_layers=num_layers,
                             batch_first=True,  
                             dropout=0.2  # Dropout entre as camadas LSTM                           
                            )
        
        self.lstm3 = nn.LSTM(input_size=hidden_size,
                             hidden_size=hidden_size//2,
                             num_layers=num_layers,
                             batch_first=True,
                             dropout=0.2  # Dropout entre as camadas LSTM
                             )
        
        # Camada fully-connected
        # self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()  # Para binário
        # self.fc = torch.nn.Linear(hidden_size, 1) # nn.BCEWithLogitsLoss() / nn.BCELoss()
        self.fc = torch.nn.Linear(hidden_size//2, output_size) # nn.CrossEntropyLoss()
        # Camada de ativação softmax
    
    def forward(self, x):
        out, _ = self.lstm1(x)  # Primeira camada LSTM
      
        out, _ = self.lstm2(out)  # Segunda camada LSTM
        
        out, _ = self.lstm3(out)  # Terceira camada LSTM
        
        out = torch.sigmoid(out)  # Aplicar sigmoid para obter probabilidades / Remover para nn.BCEWithLogitsLoss()
        # out = self.softmax(out)  # Aplicar softmax para obter probabilidades / Remover para nn.BCEWithLogitsLoss()
        out = self.fc(out[:, -1, :])  # Usar a última saída do LSTM como entrada para fc
        
        return out
    
    def save_model(self, filename):
        torch.save(self.state_dict(), filename)
        print(f"Modelo salvo em: {filename}")


# out = self.dropout(out)  # Aplicar dropout se necessário
# out = self.batch_norm(out)  # Aplicar batch normalization se necessário
# out = self.relu(out)  # Aplicar ReLU se necessário
# out = self.tanh(out)  # Aplicar Tanh se necessário
# out = self.sigmoid(out)  # Aplicar Sigmoid se necessário
# out = self.leaky_relu(out)  # Aplicar Leaky ReLU se necessário
# out = self.prelu(out)  # Aplicar PReLU se necessário
# out = self.elu(out)  # Aplicar ELU se necessário

Usando dispositivo: cuda


In [95]:
# Configurar os parâmetros da rede LSTM
input_size = 9         # Número de features no dataset / Tamanho do vetor de entrada por tempo
hidden_size = 256       # Tamanho do hidden state / Nº de unidades ocultas por célula
num_layers = 2         # Número de camadas LSTM / Nº de camadas LSTM empilhadas
output_size = 2        # Classes: normal (0), anomalia (1) 
batch_size = 64        # Batch size / 
num_epochs = 100         # Número de epochs
lr = 0.0011             # Learning rate
sequence_length = 1   # Tamanho da sequência de entrada para a LSTM
column_to_remove = 'attack_name'  # Coluna a ser removida

# Criar os datasets
train_dataset = SequenceDataset('data/cic_puro/treino_final_estratificado_random.csv', sequence_length, column_to_remove)
test_dataset = SequenceDataset('data/cic_puro/teste_final_estratificado_random.csv', sequence_length, column_to_remove)

# Criar os DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Formato dos loaders
for batch in train_loader:
    inputs, labels = batch
    print(f"Train Loader - Inputs shape: {inputs.shape}, Labels shape: {labels.shape}")
    break

for batch in test_loader:
    inputs, labels = batch
    print(f"Test Loader - Inputs shape: {inputs.shape}, Labels shape: {labels.shape}")
    break


Train Loader - Inputs shape: torch.Size([64, 1, 9]), Labels shape: torch.Size([64])
Test Loader - Inputs shape: torch.Size([64, 1, 9]), Labels shape: torch.Size([64])


In [96]:
# Criar o modelo
model = LSTM_model(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_size=output_size)
print(model)

# Testar a dimensão da entrada e saída do modelo
x, y = next(iter(train_loader))  
output = model(x)
print(f"Dimensão da entrada: {x.size()}")   # shape: [batch_size, sequence_length, input_size]
print(f"Dimensão da saída: {y.size()}")    # shape: [batch_size]
print(f"Dimensão da saída do modelo: {output.size()}") # shape: [batch_size, output_size]

criterion = nn.CrossEntropyLoss() # CrossEntropyLoss: Para classificação multi-classe (softmax já incluído)
optimizer = optim.Adam(model.parameters(), lr=lr) # Adam: muito usado, bom para a maioria dos casos
# optimizer = optim.RMSprop(model.parameters(), lr=lr) # RMSprop: bom para dados sequenciais e LSTM


# Treinamento do modelo
best_loss = float('inf') # Inicializa a melhor perda como infinito
for epoch in range(num_epochs):
    # Treinamento
    model.train()
    for inputs, labels in train_loader:
        # inputs, labels = inputs.to(device), labels.to(device)
        inputs = inputs.permute(0, 1, 2)
        outputs = model(inputs)   
        # labels = labels.view(-1, 1).float() # Ajustar o formato de labels para [batch_size, 1] / nn.BCEWithLogitsLoss()
        # loss = criterion(outputs.squeeze(1), labels.float()) # nn.BCEWithLogitsLoss()
        loss = criterion(outputs.squeeze(), labels) #  nn.BCELoss()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Validação
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.permute(0, 1, 2)
            outputs = model(inputs)
            # labels = labels.view(-1, 1).float() # Ajustar o formato de labels para [batch_size, 1] / nn.BCEWithLogitsLoss()
            val_loss += criterion(outputs.squeeze(), labels).item()
    
    avg_val_loss = val_loss / len(test_loader)

    # Calcular a acurácia
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.permute(0, 1, 2)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
   
    acc = accuracy_score(y_true, y_pred)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {avg_val_loss:.4f}, Accuracy: {acc:.4f}')
    
    # Salvar o melhor modelo
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        torch.save(model.state_dict(), f'best_lstm_model_{acc:.2f}.pth')
        print('Melhor modelo salvo!')

LSTM_model(
  (lstm1): LSTM(9, 128, num_layers=2, batch_first=True)
  (lstm2): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.2)
  (lstm3): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.2)
  (sigmoid): Sigmoid()
  (fc): Linear(in_features=128, out_features=2, bias=True)
)
Dimensão da entrada: torch.Size([64, 1, 9])
Dimensão da saída: torch.Size([64])
Dimensão da saída do modelo: torch.Size([64, 2])
Epoch [1/100], Loss: 0.4529, Val Loss: 0.8944, Accuracy: 0.4654
Melhor modelo salvo!
Epoch [2/100], Loss: 0.3712, Val Loss: 0.7187, Accuracy: 0.5864
Melhor modelo salvo!
Epoch [3/100], Loss: 0.4228, Val Loss: 0.5879, Accuracy: 0.7041
Melhor modelo salvo!
Epoch [4/100], Loss: 0.2935, Val Loss: 0.9191, Accuracy: 0.5445
Epoch [5/100], Loss: 0.4467, Val Loss: 0.7766, Accuracy: 0.6225
Epoch [6/100], Loss: 0.3432, Val Loss: 0.7228, Accuracy: 0.6538
Epoch [7/100], Loss: 0.2184, Val Loss: 0.6046, Accuracy: 0.6659
Epoch [8/100], Loss: 0.2359, Val Loss: 0.4953, Accuracy: 0.7686
Me

KeyboardInterrupt: 

# CNN

In [19]:
# 📦 CÉLULA 1 – IMPORTAÇÕES E CONFIGURAÇÕES
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

SEED = 42
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo em uso: {device}")


Dispositivo em uso: cuda


In [20]:
# 📄 CÉLULA 2 – DATASET COM REMOÇÃO DE COLUNA E DIVISÃO EM SEQUÊNCIAS
class SequenceDataset(Dataset):
    def __init__(self, path, sequence_length, column_to_remove=None):
        df = pd.read_csv(path)

        # Remover coluna opcional (como timestamp ou id)
        if column_to_remove and column_to_remove in df.columns:
            df = df.drop(columns=[column_to_remove])

        # Separar features e labels
        data = df.drop(columns=['label']).values
        labels = df['label'].values

        self.sequences = []
        self.labels = []

        for i in range(len(data) - sequence_length + 1):
            self.sequences.append(data[i:i+sequence_length])
            self.labels.append(labels[i+sequence_length-1])

        # Formato final: (batch, channels=1, time_steps)
        self.sequences = torch.tensor(self.sequences, dtype=torch.float32).unsqueeze(1)
        self.labels = torch.tensor(self.labels, dtype=torch.long)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]


In [21]:
# 🧠 CÉLULA 3 – DEFINIÇÃO DO MODELO CNN 1D
class CNN1DNet(nn.Module):
    def __init__(self, input_length, output_size=2):
        super(CNN1DNet, self).__init__()
        self.net = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Flatten(),
            nn.Linear((input_length // 4) * 64, 100),
            nn.ReLU(),
            nn.Linear(100, output_size)
        )

    def forward(self, x):
        return self.net(x)


In [None]:
sequence_length = 20
input_size = 9     
output_size = 2
batch_size = 32
num_epochs = 20
lr = 0.001
column_to_remove = 'attack_name'

train_dataset = SequenceDataset('data/cic_puro_enhanced/01-12-train.csv', sequence_length, column_to_remove)
test_dataset  = SequenceDataset('data/cic_puro_enhanced/03-11-test.csv', sequence_length, column_to_remove)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
model = CNN1DNet(input_length=sequence_length, output_size=output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

best_accuracy = 0.0

for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 🔍 Validação
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            predicted = torch.argmax(outputs, dim=1).cpu()
            y_pred.extend(predicted.numpy())
            y_true.extend(y_batch.numpy())

    acc = accuracy_score(y_true, y_pred)
    print(f"Época {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Accuracy: {acc:.4f}")

    if acc > best_accuracy:
        best_accuracy = acc
        torch.save(model.state_dict(), 'best_cnn_model.pth')
        print("✅ Novo melhor modelo salvo com accuracy:", best_accuracy)


RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [32, 1, 20, 9]

# LSTM-CNN-SVM

In [None]:
class CNN_LSTM_model(nn.Module):
    def __init__(self, input_size=9, hidden_size=64, num_layers=2, output_size=2):
        super(CNN_LSTM_model, self).__init__()
        
        # Camada CNN para extrair características locais
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=input_size, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2)
        )
        
        # Camada LSTM
        self.lstm = nn.LSTM(
            input_size=32,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        
        # Camada全连接 após a LSTM
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # A CNN espera (batch_size, channels, sequence_length), então precisamos transpor
        x = x.permute(0, 2, 1)  # [batch_size, seq_length, input_size] -> [batch_size, input_size, seq_length]
        
        out = self.cnn(x)
        out = out.permute(0, 2, 1)  # Volta para [batch_size, sequence_length, channels]
        
        out, _ = self.lstm(out)
        out = self.fc(out[:, -1, :])  # Pega o último timestep
        return out
