# Introdução aos Ataques DDoS no Dataset CICDDoS2019

O dataset contém múltiplos cenários de ataques, registrados em arquivos CSV, com detalhes sobre tráfego malicioso e legítimo.


# Pré-Processamento UEL - Gerando dados para treinamento

In [None]:
import pandas as pd
from itertools import cycle
import random

# 1. Load the four raw CSV splits (semicolon-separated)
treino_normal = pd.read_csv('data/cic_puro/treino_sem_ataque_ordenado.csv', sep=';')
treino_ataque = pd.read_csv('data/cic_puro/treino_ataque_ordenado.csv', sep=';')
teste_normal = pd.read_csv('data/cic_puro/teste_sem_ataque_ordenado.csv', sep=';')
teste_ataque = pd.read_csv('data/cic_puro/teste_ataque_ordenado.csv', sep=';')


# 2. For each split, stack the benign rows on top of the attack rows and
#    renumber the index from zero
treino_full = pd.concat([treino_normal, treino_ataque], ignore_index=True)
teste_full = pd.concat([teste_normal, teste_ataque], ignore_index=True)

# 3. Split benign and attack rows
def prepare_data(df, max_per_attack=1000, max_normal=5000, random_state=None):
    """Split ``df`` into capped, shuffled benign rows and per-attack samples.

    Args:
        df: DataFrame with a ``label`` column (rows with 0 go to the benign
            set, rows with 1 to the attack set; any other value is ignored)
            and an ``attack_name`` column used to group the attack rows.
        max_per_attack: Maximum number of rows kept for each attack name.
        max_normal: Maximum number of benign rows kept; ``None`` keeps all.
        random_state: Optional seed (or NumPy random generator) forwarded to
            every ``DataFrame.sample`` call so the selection can be
            reproduced. ``None`` (default) keeps the unseeded behaviour.

    Returns:
        A ``(normal, attack_types)`` tuple: ``normal`` is the shuffled (and
        possibly capped) benign DataFrame; ``attack_types`` maps each attack
        name to its sampled DataFrame. All frames have a fresh 0..n-1 index.
    """
    # Shuffle the benign rows
    normal = (
        df[df['label'] == 0]
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    attacks = df[df['label'] == 1].reset_index(drop=True)

    # Cap the number of rows per attack type
    attack_types = {
        name: group.sample(
            n=min(len(group), max_per_attack), random_state=random_state
        ).reset_index(drop=True)
        for name, group in attacks.groupby('attack_name')
    }

    # Cap the benign rows
    if max_normal is not None:
        normal = normal.sample(
            n=min(len(normal), max_normal), random_state=random_state
        ).reset_index(drop=True)

    return normal, attack_types

# Sample each split: at most 1000 rows per attack type and 10000 benign rows
# for training; at most 500 rows per attack type and 5000 benign rows for testing.
train_normal, train_attacks = prepare_data(treino_full, max_per_attack=1000, max_normal=10000)
test_normal, test_attacks = prepare_data(teste_full, max_per_attack=500, max_normal=5000)

# 4. Build randomly interleaved runs of benign and attack rows
def create_random_sequences(normal_df, attack_dict, min_seq=30, max_seq=150):
    """Interleave benign rows and attack rows in runs of random length.

    On every step a coin flip picks "normal" or "attack". A run of
    ``random.randint(min_seq, max_seq)`` consecutive rows is then taken from
    the benign frame or from the next attack type in round-robin order.
    Rows are consumed in their existing order and each row is used at most
    once; the loop ends when every source has been exhausted.

    Args:
        normal_df: DataFrame of benign rows.
        attack_dict: Mapping of attack name -> DataFrame of that attack's
            rows. May be empty.
        min_seq: Smallest run length (inclusive).
        max_seq: Largest run length (inclusive).

    Returns:
        A DataFrame with all consumed rows in emission order. Source index
        labels and column dtypes are preserved. An empty DataFrame is
        returned when there are no rows at all.

    Raises:
        ValueError: If ``max_seq`` is smaller than 1 (no run could ever
            consume a row, so the loop would never finish), or, from
            ``random.randint``, if ``max_seq < min_seq``.
    """
    if max_seq < 1:
        raise ValueError("max_seq must be at least 1")

    chunks = []

    # Positional read cursors replace per-row iterators: slicing keeps the
    # column dtypes and avoids building one Series object per row.
    normal_pos = 0
    attack_pos = {name: 0 for name in attack_dict}
    attack_cycle = cycle(list(attack_pos))

    normal_remaining = True
    # With no attack types there is nothing to draw from; starting this flag
    # as True would make next() on an empty cycle raise StopIteration.
    attack_remaining = bool(attack_pos)

    while normal_remaining or attack_remaining:
        # Randomly decide whether the next run is benign or attack traffic
        choice = random.choice(['normal', 'attack'])

        if choice == 'normal' and normal_remaining:
            seq_len = random.randint(min_seq, max_seq)
            chunk = normal_df.iloc[normal_pos:normal_pos + seq_len]
            normal_pos += len(chunk)
            if len(chunk):
                chunks.append(chunk)
            # A short run means the benign rows ran out
            if len(chunk) < seq_len:
                normal_remaining = False

        elif choice == 'attack' and attack_remaining:
            attack_type = next(attack_cycle)
            seq_len = random.randint(min_seq, max_seq)
            start = attack_pos[attack_type]
            chunk = attack_dict[attack_type].iloc[start:start + seq_len]
            attack_pos[attack_type] = start + len(chunk)
            if len(chunk):
                chunks.append(chunk)
            if len(chunk) < seq_len:
                # This attack type is exhausted: drop it and rebuild the
                # round-robin over the types that still have rows
                del attack_pos[attack_type]
                if attack_pos:
                    attack_cycle = cycle(list(attack_pos))
                else:
                    attack_remaining = False
        # Otherwise the chosen kind is already exhausted; flip again

    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks)

# 5. Build the interleaved train and test datasets
train_final = create_random_sequences(train_normal, train_attacks, min_seq=30, max_seq=120)
test_final = create_random_sequences(test_normal, test_attacks, min_seq=30, max_seq=120)

# 6. Save next to the raw inputs. The model cells load these two files from
#    'data/cic_puro/', so writing them to the working directory would leave
#    the models reading a stale or missing file.
train_final.to_csv('data/cic_puro/treino_final_estratificado_random.csv', sep=';', index=False)
test_final.to_csv('data/cic_puro/teste_final_estratificado_random.csv', sep=';', index=False)

print('Arquivos treino_final_estratificado_random.csv e teste_final_estratificado_random.csv gerados com sequências aleatórias!')

Arquivos treino_final_estratificado_random.csv e teste_final_estratificado_random.csv gerados com sequências aleatórias!


In [None]:
# Count how many rows each 'attack_name' value has in the generated datasets
attack_counts_train = train_final['attack_name'].value_counts()
attack_counts_test = test_final['attack_name'].value_counts()

# Show the results. 'Tamanho' is the number of distinct classes (the length of
# the value_counts Series); the total row count is printed on its own line.
print('Tamanho:', len(attack_counts_train), 'Treino:', attack_counts_train)
print('Total de linhas no conjunto de treino:', len(train_final))

print('Tamanho:', len(attack_counts_test), 'Teste:', attack_counts_test)
print('Total de linhas no conjunto de teste:', len(test_final))

Tamanho: 13 Treino: attack_name
normal           8074
DrDoS_DNS        1000
DrDoS_NTP        1000
DrDoS_SNMP       1000
DrDoS_UDP        1000
TFTP             1000
UDP-lag           885
DrDoS_SSDP        822
DrDoS_NetBIOS     726
DrDoS_MSSQL       687
DrDoS_LDAP        592
Syn               237
WebDDoS           125
Name: count, dtype: int64
Tamanho: 8 Teste: attack_name
normal     5000
LDAP        500
MSSQL       500
NetBIOS     500
Syn         500
UDP         500
UDPLag      470
Portmap     449
Name: count, dtype: int64


# Modelos

## LSTM

In [4]:
from models.LSTM.ModelLSTM import LSTM
from models.Sequence import SequenceDataset
from models.LSTM.TrainerLSTM import TrainerLSTM
from torch.utils.data import DataLoader
import torch

# Seed PyTorch's random number generator
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Usando dispositivo: {device}")

# LSTM settings
column_to_remove = 'attack_name'  # Column to be removed
sequence_length = 10              # Length of each input sequence fed to the LSTM
batch_size = 128                  # Batch size
input_size = 9                    # Number of features / input vector size per time step
hidden_size = 256                 # Hidden-state size / hidden units per cell
num_layers = 3                    # Number of stacked LSTM layers
output_size = 2                   # Classes: normal (0), anomaly (1)

Usando dispositivo: cuda


In [None]:
# Build the train and test sequence datasets in 'lstm' mode
train_dataset = SequenceDataset(
    'data/cic_puro/treino_final_estratificado_random.csv',
    sequence_length,
    column_to_remove=column_to_remove,
    normalize=True,
    mode='lstm',
)
test_dataset = SequenceDataset(
    'data/cic_puro/teste_final_estratificado_random.csv',
    sequence_length,
    column_to_remove=column_to_remove,
    normalize=True,
    mode='lstm',
)

print(f"Total de amostras no conjunto de treino: {len(train_dataset)}")
print(f"Total de amostras no conjunto de teste: {len(test_dataset)}")
print("Train Dataset Shape:", train_dataset.sequences.shape)
print("Test Dataset Shape:", test_dataset.sequences.shape)

# Wrap the datasets in DataLoaders; only the training batches are shuffled
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
)

print(f"Total de batches no conjunto de treino: {len(train_loader)}")
print(f"Total de batches no conjunto de teste: {len(test_loader)}")

# Instantiate the LSTM and move it to the selected device
model = LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
).to(device)
print(model)

Total de amostras no conjunto de treino: 17139
Total de amostras no conjunto de teste: 8410
Train Dataset Shape: torch.Size([17139, 10, 9])
Test Dataset Shape: torch.Size([8410, 10, 9])


In [None]:
# Train the LSTM for up to 100 epochs (dir_save is presumably where the
# trainer writes its checkpoints — confirm in TrainerLSTM).
# NOTE(review): test_loader is passed as the third argument and the training
# log reports "Val Loss" plus best-model saves, so the test set appears to be
# used for model selection; consider a separate validation split — confirm
# against TrainerLSTM.fit.
trainer = TrainerLSTM(dir_save="output/LSTM", num_epochs=100)
trainer.fit(model, train_loader, test_loader, device)

Epoch [1/100] Train Loss: 0.4143 Val Loss:   0.3635 Accuracy:   0.8725
🔖 Melhor modelo salvo!
Epoch [2/100] Train Loss: 0.1607 Val Loss:   0.3591 Accuracy:   0.8926
🔖 Melhor modelo salvo!
Epoch [3/100] Train Loss: 0.1231 Val Loss:   0.3107 Accuracy:   0.9001
🔖 Melhor modelo salvo!
Epoch [4/100] Train Loss: 0.0989 Val Loss:   0.2797 Accuracy:   0.8907
🔖 Melhor modelo salvo!
Epoch [5/100] Train Loss: 0.0911 Val Loss:   0.4735 Accuracy:   0.8895
Epoch [6/100] Train Loss: 0.0964 Val Loss:   0.2631 Accuracy:   0.9043
🔖 Melhor modelo salvo!
Epoch [7/100] Train Loss: 0.0857 Val Loss:   0.3939 Accuracy:   0.8484
Epoch [8/100] Train Loss: 0.0873 Val Loss:   0.2725 Accuracy:   0.8898
Epoch [9/100] Train Loss: 0.0873 Val Loss:   0.2713 Accuracy:   0.9100
Epoch [10/100] Train Loss: 0.0909 Val Loss:   0.2565 Accuracy:   0.8930
🔖 Melhor modelo salvo!
Epoch [11/100] Train Loss: 0.0752 Val Loss:   0.2610 Accuracy:   0.9074
Epoch [12/100] Train Loss: 0.0775 Val Loss:   0.2517 Accuracy:   0.9127
🔖 Melho

## CNN

In [None]:
from models.CNN import CNN
from models.Sequence import SequenceDataset
from torch.utils.data import DataLoader
import torch

# Seed PyTorch's random number generator
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Usando dispositivo: {device}")

# CNN settings
column_to_remove = 'attack_name'
batch_size = 64
output_size = 2
sequence_length = 70
input_size = 9

Usando dispositivo: cuda


In [6]:
# Build the train and test sequence datasets in 'cnn1d' mode
train_dataset = SequenceDataset(
    path='data/cic_puro/treino_final_estratificado_random.csv',
    sequence_length=sequence_length,
    column_to_remove=column_to_remove,
    normalize=True,
    mode='cnn1d',
)
test_dataset = SequenceDataset(
    path='data/cic_puro/teste_final_estratificado_random.csv',
    sequence_length=sequence_length,
    column_to_remove=column_to_remove,
    normalize=True,
    mode='cnn1d',
)

print(f"Total de amostras no conjunto de treino: {len(train_dataset)}")
print(f"Total de amostras no conjunto de teste: {len(test_dataset)}")
print("Train Dataset Shape:", train_dataset.sequences.shape)
print("Test Dataset Shape:", test_dataset.sequences.shape)

# DataLoaders; only the training batches are shuffled
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
)

print(f"Total de batches no conjunto de treino: {len(train_loader)}")
print(f"Total de batches no conjunto de teste: {len(test_loader)}")

# Model. Axis 1 of the 'cnn1d' tensor is used as the channel count (the
# recorded shape is samples x 9 x 70, i.e. features before time steps).
n_feat = train_dataset.sequences.shape[1]
model = CNN(input_channels=n_feat, input_length=sequence_length, num_classes=output_size).to(device)
# Bare expression as the cell's last statement so the notebook displays the
# model summary; the model was already moved to `device` on the line above.
model.to(device)

Total de amostras no conjunto de treino: 17079
Total de amostras no conjunto de teste: 8350
Train Dataset Shape: torch.Size([17079, 9, 70])
Test Dataset Shape: torch.Size([8350, 9, 70])
Total de batches no conjunto de treino: 267
Total de batches no conjunto de teste: 131


CNN(
  (conv1): Conv1d(9, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc1): Linear(in_features=17920, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)

In [None]:
# CNN usage example
# Train on the training loader for 20 epochs with a learning rate of 1e-3
model.train_model(train_loader, device=device, epochs=20, lr=1e-3)

Epoch 1/20 - Loss: 0.1187
Epoch 2/20 - Loss: 0.0512
Epoch 3/20 - Loss: 0.0381
Epoch 4/20 - Loss: 0.0284
Epoch 5/20 - Loss: 0.0195
Epoch 6/20 - Loss: 0.0175
Epoch 7/20 - Loss: 0.0139
Epoch 8/20 - Loss: 0.0104
Epoch 9/20 - Loss: 0.0119
Epoch 10/20 - Loss: 0.0090
Epoch 11/20 - Loss: 0.0113
Epoch 12/20 - Loss: 0.0066
Epoch 13/20 - Loss: 0.0086
Epoch 14/20 - Loss: 0.0047
Epoch 15/20 - Loss: 0.0032
Epoch 16/20 - Loss: 0.0089
Epoch 17/20 - Loss: 0.0072
Epoch 18/20 - Loss: 0.0053
Epoch 19/20 - Loss: 0.0042
Epoch 20/20 - Loss: 0.0046

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      4965
           1       0.95      0.90      0.92      3385

    accuracy                           0.94      8350
   macro avg       0.94      0.93      0.93      8350
weighted avg       0.94      0.94      0.94      8350

Accuracy: 0.9377245508982036


In [None]:
# Evaluate the CNN on the test loader
model.evaluate(test_loader, device=device)

## Hybrid V2

In [1]:
from models.Hybrid.ModelHybridAttnSVM import ModelHybridAttnSVM
from torch.utils.data import DataLoader
from models.Sequence import SequenceDataset
import torch

# General settings
column_to_remove = 'attack_name'
sequence_length = 50
batch_size = 64
num_classes = 2
hidden_size = 64
num_layers = 3
learning_rate = 1e-4
epochs = 100
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the train and test sequence datasets in 'lstm' mode
train_dataset = SequenceDataset(
    path='data/cic_puro/treino_final_estratificado_random.csv',
    sequence_length=sequence_length,
    column_to_remove=column_to_remove,
    normalize=True,
    mode='lstm',
)
test_dataset = SequenceDataset(
    path='data/cic_puro/teste_final_estratificado_random.csv',
    sequence_length=sequence_length,
    column_to_remove=column_to_remove,
    normalize=True,
    mode='lstm',
)

print(f"Total de amostras no conjunto de treino: {len(train_dataset)}")
print(f"Total de amostras no conjunto de teste:  {len(test_dataset)}")
print("Train Dataset Shape:", train_dataset.sequences.shape)
print("Test  Dataset Shape:", test_dataset.sequences.shape)

# Wrap the datasets in DataLoaders; only the training batches are shuffled
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
)

print(f"Total de batches no treino: {len(train_loader)}")
print(f"Total de batches no teste:  {len(test_loader)}")

# Take the feature count from the dataset: axis 2 of the 'lstm' tensor
# (the recorded shape is samples x 50 x 9)
n_features = train_dataset.sequences.shape[2]

# Instantiate the hybrid model and move it to the selected device
model = ModelHybridAttnSVM(
    seq_len=sequence_length,
    n_features=n_features,
    lstm_hidden=hidden_size,
    lstm_layers=num_layers,
    num_classes=num_classes,
).to(device)
print(model)


Total de amostras no conjunto de treino: 17099
Total de amostras no conjunto de teste:  8370
Train Dataset Shape: torch.Size([17099, 50, 9])
Test  Dataset Shape: torch.Size([8370, 50, 9])
Total de batches no treino: 268
Total de batches no teste:  131
ModelHybridAttnSVM(
  (conv1): Conv1d(9, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (drop): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(128, 64, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)


In [2]:
# Training: fit the CNN+LSTM network on the training loader using the epochs
# and learning rate configured above (save_path is presumably where the
# trained weights are written — confirm in ModelHybridAttnSVM.train_model)
print("### Treinando CNN+LSTM ###")
model.train_model(
    train_loader,
    device=device,
    epochs=epochs,
    lr=learning_rate,
    save_path='output/Hybrid/hybrid_attn.pth'
)


### Treinando CNN+LSTM ###
Epoch 1/100 - Loss: 0.5094
Epoch 2/100 - Loss: 0.1693
Epoch 3/100 - Loss: 0.1238
Epoch 4/100 - Loss: 0.1104
Epoch 5/100 - Loss: 0.1021
Epoch 6/100 - Loss: 0.0973
Epoch 7/100 - Loss: 0.0935
Epoch 8/100 - Loss: 0.0871
Epoch 9/100 - Loss: 0.0841
Epoch 10/100 - Loss: 0.0828
Epoch 11/100 - Loss: 0.0799
Epoch 12/100 - Loss: 0.0743
Epoch 13/100 - Loss: 0.0736
Epoch 14/100 - Loss: 0.0723
Epoch 15/100 - Loss: 0.0697
Epoch 16/100 - Loss: 0.0683
Epoch 17/100 - Loss: 0.0664
Epoch 18/100 - Loss: 0.0668
Epoch 19/100 - Loss: 0.0656
Epoch 20/100 - Loss: 0.0629
Epoch 21/100 - Loss: 0.0624
Epoch 22/100 - Loss: 0.0608
Epoch 23/100 - Loss: 0.0598
Epoch 24/100 - Loss: 0.0604
Epoch 25/100 - Loss: 0.0587
Epoch 26/100 - Loss: 0.0586
Epoch 27/100 - Loss: 0.0566
Epoch 28/100 - Loss: 0.0559
Epoch 29/100 - Loss: 0.0554
Epoch 30/100 - Loss: 0.0541
Epoch 31/100 - Loss: 0.0520
Epoch 32/100 - Loss: 0.0527
Epoch 33/100 - Loss: 0.0517
Epoch 34/100 - Loss: 0.0511
Epoch 35/100 - Loss: 0.0483
Ep

ModelHybridAttnSVM(
  (conv1): Conv1d(9, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (drop): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(128, 64, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)

In [None]:
# 2) Train PCA + SVM on the extracted features, passing the paths for the
#    PCA and SVM joblib files
print("\n### Treinando PCA + SVM ###")
svm = model.train_svm(
    train_loader,
    pca_path='output/Hybrid/pca.joblib',
    svm_path='output/Hybrid/hybrid_svm.joblib'
)


### Treinando PCA + SVM ###
PCA salvo em output/Hybrid/pca.joblib
SVM salvo em output/Hybrid/hybrid_svm.joblib

### Avaliando Modelo Híbrido no Conjunto de Teste ###
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4965
           1       0.90      0.90      0.90      3405

    accuracy                           0.92      8370
   macro avg       0.92      0.92      0.92      8370
weighted avg       0.92      0.92      0.92      8370

Accuracy: 0.9200716845878136


In [None]:
# 3) Evaluate the whole pipeline (CNN+LSTM -> PCA -> SVM) on the test set,
#    pointing at the same PCA and SVM joblib paths used in the training step
print("\n### Avaliando Modelo Híbrido no Conjunto de Teste ###")
model.evaluate(
    test_loader,
    pca_path='output/Hybrid/pca.joblib',
    svm_path='output/Hybrid/hybrid_svm.joblib'
)