# Introdução aos Ataques DDoS no Dataset CICDDoS2019

O dataset contém múltiplos cenários de ataques, registrados em arquivos CSV, com detalhes sobre tráfego malicioso e legítimo. Abaixo, são listados os períodos de tempo (em horas e minutos) em que os ataques ocorreram, organizados por dia e tipo de ataque.


# Pré-Processamento UEL - Gerando dados para treinamento

In [None]:
import pandas as pd
from itertools import cycle
import random

# 1. Carregar os arquivos
teste_ataque = pd.read_csv('data/cic_puro/teste_ataque_ordenado.csv', sep=';')
teste_normal = pd.read_csv('data/cic_puro/teste_sem_ataque_ordenado.csv', sep=';')
treino_ataque = pd.read_csv('data/cic_puro/treino_ataque_ordenado.csv', sep=';')
treino_normal = pd.read_csv('data/cic_puro/treino_sem_ataque_ordenado.csv', sep=';')


# 2. Concatenar para treino e teste
teste_full = pd.concat([teste_normal, teste_ataque], ignore_index=True)
treino_full = pd.concat([treino_normal, treino_ataque], ignore_index=True)

# 3. Separar normais e ataques
def prepare_data(df, max_per_attack=1000, max_normal=5000):
    normal = df[df['label'] == 0].sample(frac=1).reset_index(drop=True)  # embaralhar normais
    attacks = df[df['label'] == 1].reset_index(drop=True)

    # Agora limitar por tipo de ataque
    attack_types = {}
    for name, group in attacks.groupby('attack_name'):
        attack_types[name] = group.sample(n=min(len(group), max_per_attack)).reset_index(drop=True)

    # Limitar normais
    if max_normal is not None:
        normal = normal.sample(n=min(len(normal), max_normal)).reset_index(drop=True)

    return normal, attack_types

train_normal, train_attacks = prepare_data(treino_full, max_per_attack=1000, max_normal=10000)
test_normal, test_attacks = prepare_data(teste_full, max_per_attack=500, max_normal=5000)

# 4. Função para criar sequências aleatórias
def create_random_sequences(normal_df, attack_dict, min_seq=30, max_seq=150):
    final_rows = []
    
    normal_iter = normal_df.iterrows()
    attack_iters = {k: v.iterrows() for k, v in attack_dict.items()}
    attack_cycle = cycle(list(attack_iters.keys()))
    
    normal_remaining = True
    attack_remaining = True

    while normal_remaining or attack_remaining:
        choice = random.choice(['normal', 'attack'])  # Aleatoriamente decidir normal ou ataque primeiro
        
        if choice == 'normal' and normal_remaining:
            seq_len = random.randint(min_seq, max_seq)
            for _ in range(seq_len):
                try:
                    idx, row = next(normal_iter)
                    final_rows.append(row)
                except StopIteration:
                    normal_remaining = False
                    break
        
        elif choice == 'attack' and attack_remaining:
            attack_type = next(attack_cycle)
            seq_len = random.randint(min_seq, max_seq)
            for _ in range(seq_len):
                try:
                    idx, row = next(attack_iters[attack_type])
                    final_rows.append(row)
                except StopIteration:
                    # Se esgotar ataques desse tipo, remover do ciclo
                    del attack_iters[attack_type]
                    if attack_iters:
                        attack_cycle = cycle(list(attack_iters.keys()))
                    else:
                        attack_remaining = False
                    break
        else:
            # Se o tipo escolhido acabou, tenta o outro
            continue

    return pd.DataFrame(final_rows)

# 5. Criar datasets
train_final = create_random_sequences(train_normal, train_attacks, min_seq=30, max_seq=120)
test_final = create_random_sequences(test_normal, test_attacks, min_seq=30, max_seq=120)

# 6. Salvar
train_final.to_csv('treino_final_estratificado_random.csv', sep=';', index=False)
test_final.to_csv('teste_final_estratificado_random.csv', sep=';', index=False)

print('Arquivos treino_final_estratificado_random.csv e teste_final_estratificado_random.csv gerados com sequências aleatórias!')

Arquivos treino_final_estratificado_random.csv e teste_final_estratificado_random.csv gerados com sequências aleatórias!


In [None]:
# Contar a quantidade de cada valor na coluna 'attack_name'
attack_counts_train = train_final['attack_name'].value_counts()
attack_counts_test = test_final['attack_name'].value_counts()

# Exibir os resultados
print('Tamanho:', len(train_final), 'Treino:', attack_counts_train)
print('Total de linhas no conjunto de treino:', len(train_final))

print('Tamanho:', len(test_final), 'Teste:', attack_counts_test)
print('Total de linhas no conjunto de teste:', len(test_final))

Tamanho: 13 Treino: attack_name
normal           8074
DrDoS_DNS        1000
DrDoS_NTP        1000
DrDoS_SNMP       1000
DrDoS_UDP        1000
TFTP             1000
UDP-lag           885
DrDoS_SSDP        822
DrDoS_NetBIOS     726
DrDoS_MSSQL       687
DrDoS_LDAP        592
Syn               237
WebDDoS           125
Name: count, dtype: int64
Tamanho: 8 Teste: attack_name
normal     5000
LDAP        500
MSSQL       500
NetBIOS     500
Syn         500
UDP         500
UDPLag      470
Portmap     449
Name: count, dtype: int64


# Modelos

## LSTM

In [15]:
from models.LSTM.ModelLSTM import LSTM
from models.LSTM.SequenceLSTM import SequenceDataset
from models.LSTM.TrainerLSTM import TrainerLSTM
from torch.utils.data import DataLoader
import torch

SEED = 42
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Configurar os parâmetros da rede LSTM
input_size = 9         # Número de features no dataset / Tamanho do vetor de entrada por tempo
hidden_size = 256       # Tamanho do hidden state / Nº de unidades ocultas por célula
num_layers = 3         # Número de camadas LSTM / Nº de camadas LSTM empilhadas
output_size = 2        # Classes: normal (0), anomalia (1) 
batch_size = 128        # Batch size / 
sequence_length = 5   # Tamanho da sequência de entrada para a LSTM
column_to_remove = 'attack_name'  # Coluna a ser removida

Usando dispositivo: cuda


In [16]:
# Criar os datasets
train_dataset = SequenceDataset('data/cic_puro/treino_final_estratificado_random.csv', sequence_length, column_to_remove, normalize=True, mode='lstm')
test_dataset = SequenceDataset('data/cic_puro/teste_final_estratificado_random.csv', sequence_length, column_to_remove, normalize=True, mode='lstm')


print(f"Total de amostras no conjunto de treino: {len(train_dataset)}")
print(f"Total de amostras no conjunto de teste: {len(test_dataset)}")

Total de amostras no conjunto de treino: 17144
Total de amostras no conjunto de teste: 8415


In [17]:
# Criar os DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print(f"Total de batches no conjunto de treino: {len(train_loader)}")
print(f"Total de batches no conjunto de teste: {len(test_loader)}")

Total de batches no conjunto de treino: 134
Total de batches no conjunto de teste: 66


In [18]:
# Criar o modelo
model = LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_size=output_size).to(device)
print(model)

LSTM(
  (lstm1): LSTM(9, 128, num_layers=3, batch_first=True)
  (lstm2): LSTM(128, 256, num_layers=3, batch_first=True, dropout=0.2)
  (lstm3): LSTM(256, 128, num_layers=3, batch_first=True, dropout=0.2)
  (sigmoid): Sigmoid()
  (fc): Linear(in_features=128, out_features=2, bias=True)
)


In [19]:
trainer = TrainerLSTM(dir_save="output/UEL/LSTM", num_epochs=100)
trainer.fit(model, train_loader, test_loader, device)

Epoch [1/100] Train Loss: 0.4158 Val Loss:   0.4716 Accuracy:   0.8449
🔖 Melhor modelo salvo em: output/UEL/LSTM\LSTM_Epoca-1_Acc-0.84.pth
Epoch [2/100] Train Loss: 0.2251 Val Loss:   0.4735 Accuracy:   0.8323
Epoch [3/100] Train Loss: 0.1760 Val Loss:   0.4217 Accuracy:   0.8532
🔖 Melhor modelo salvo em: output/UEL/LSTM\LSTM_Epoca-3_Acc-0.85.pth
Epoch [4/100] Train Loss: 0.1607 Val Loss:   0.7465 Accuracy:   0.7450
Epoch [5/100] Train Loss: 0.1494 Val Loss:   1.0015 Accuracy:   0.6927
Epoch [6/100] Train Loss: 0.1420 Val Loss:   0.6754 Accuracy:   0.5468
Epoch [7/100] Train Loss: 0.1308 Val Loss:   0.8149 Accuracy:   0.6101
Epoch [8/100] Train Loss: 0.1247 Val Loss:   0.5917 Accuracy:   0.6872
Epoch [9/100] Train Loss: 0.1247 Val Loss:   0.6217 Accuracy:   0.7777
Epoch [10/100] Train Loss: 0.1187 Val Loss:   0.5170 Accuracy:   0.7728
Epoch [11/100] Train Loss: 0.1149 Val Loss:   0.6521 Accuracy:   0.7243
Epoch [12/100] Train Loss: 0.1101 Val Loss:   0.5270 Accuracy:   0.7348
Epoch [13

## CNN

In [None]:
input_size = 9
sequence_length = 5
output_size = 2
batch_size = 128
num_epochs = 50
learning_rate = 0.0001
column_to_remove = 'attack_name'

train_dataset = SequenceDataset('data/cic_puro/treino_final_estratificado_random.csv', sequence_length, column_to_remove, normalize=True, mode='cnn1d')
test_dataset = SequenceDataset('data/cic_puro/teste_final_estratificado_random.csv', sequence_length, column_to_remove, normalize=True, mode='cnn1d')

print("Train Dataset Shape:", train_dataset.sequences.shape)
print("Test Dataset Shape:", test_dataset.sequences.shape)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Modelo
in_ch  = train_dataset.sequences.shape[1]   
length = train_dataset.sequences.shape[2]
model = CNN(input_channels=in_ch, input_length=length, num_classes=output_size).to(device)
model.to(device)

Train Dataset Shape: torch.Size([17144, 9, 5])
Test Dataset Shape: torch.Size([8415, 9, 5])


CNN(
  (conv1): Conv1d(9, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc1): Linear(in_features=1280, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)

In [None]:
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=lr) 
dir_save = "output/UEL/CNN"
os.makedirs(dir_save, exist_ok=True)
best_loss = float('inf') 

for num_epochs in range(1, num_epochs+1):
    # Treino
    model.train()
    train_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * x.size(0)
    train_loss /= len(train_loader.dataset)

    # Validação
    model.eval()
    val_loss, preds, trues = 0, [], []
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            val_loss += criterion(out, y).item() * x.size(0)
            preds.extend(out.argmax(1).cpu().numpy())
            trues.extend(y.cpu().numpy())
    val_loss /= len(test_loader.dataset)
    val_acc  = accuracy_score(trues, preds)

    print(f"num_epochs [{num_epochs}] "
          f"Train Loss: {avg_train_loss:.4f} "
          f"Val Loss: {val_loss:.4f} "
          f"Accuracy: {val_acc:.4f}")

    # ---- Salvar o melhor modelo ----
    if val_loss < best_loss and val_acc > 0.80:
        best_loss = val_loss
        save_path = os.path.join(dir_save, f"CNN_Epoca-{num_epochs}_Acc-{val_acc:.2f}.pth")
        torch.save({
            'epoch': num_epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_loss
        }, save_path)
        print(f"Melhor modelo salvo")

print("Treinamento concluído.")

num_epochs [1] Train Loss: 0.3576 Val Loss: 0.7702 Accuracy: 0.7051
num_epochs [2] Train Loss: 0.3576 Val Loss: 1.0510 Accuracy: 0.6134
num_epochs [3] Train Loss: 0.3576 Val Loss: 1.1179 Accuracy: 0.6330
num_epochs [4] Train Loss: 0.3576 Val Loss: 0.8157 Accuracy: 0.7176
num_epochs [5] Train Loss: 0.3576 Val Loss: 0.9511 Accuracy: 0.7144
num_epochs [6] Train Loss: 0.3576 Val Loss: 0.9229 Accuracy: 0.6942
num_epochs [7] Train Loss: 0.3576 Val Loss: 0.5452 Accuracy: 0.8007
Melhor modelo salvo
num_epochs [8] Train Loss: 0.3576 Val Loss: 1.1932 Accuracy: 0.6270
num_epochs [9] Train Loss: 0.3576 Val Loss: 0.8717 Accuracy: 0.7630
num_epochs [10] Train Loss: 0.3576 Val Loss: 1.4044 Accuracy: 0.6585
num_epochs [11] Train Loss: 0.3576 Val Loss: 3.8234 Accuracy: 0.4597
num_epochs [12] Train Loss: 0.3576 Val Loss: 0.9889 Accuracy: 0.7616
num_epochs [13] Train Loss: 0.3576 Val Loss: 1.6415 Accuracy: 0.6485
num_epochs [14] Train Loss: 0.3576 Val Loss: 2.0549 Accuracy: 0.6147
num_epochs [15] Train L

## LSTM-CNN-SVM

In [None]:
import torch
from sequence_dataset import FlowSequenceDataset
from hybrid_model import train_feature_extractor, extract_features, train_svm, evaluate_hybrid
from torch.utils.data import DataLoader

# Configurações
device   = 'cuda' if torch.cuda.is_available() else 'cpu'
train_csv = "path/to/train_data.csv"
test_csv  = "path/to/test_data.csv"

# Passo 1: treina o extrator CNN-LSTM
extractor, train_loader = train_feature_extractor(
    train_csv=train_csv,
    seq_len=10,
    batch_size=64,
    epochs=15,
    lr=1e-3,
    device=device
)

# Passo 2: treina o SVM sobre features extraídas
dados_X, dados_y = extract_features(extractor, train_loader, device)
svm_model = train_svm(dados_X, dados_y)

# Passo 3: avalia no conjunto de teste test_csv
test_ds     = FlowSequenceDataset(test_csv, seq_len=10)
test_loader = DataLoader(test_ds, batch_size=64)
evaluate_hybrid(extractor, test_loader, svm_model, device)

KeyError: "['label', 'attack_name'] not found in axis"