# Introdução aos Ataques DDoS no Dataset CICDDoS2019

O dataset contém múltiplos cenários de ataques, registrados em arquivos CSV, com detalhes sobre tráfego malicioso e legítimo. Abaixo, são listados os períodos de tempo (em horas e minutos) em que os ataques ocorreram, organizados por dia e tipo de ataque.


## Ataques coloetados no dia (03/11)

```csv
DrDos_NTP.csv, 10:35 - 10:45
DrDos_DNS.csv, 10:52 - 11:05
DrDos_LDAP.csv, 11:22 - 11:32
DrDos_MSSQL.csv, 11:36 - 11:45
DrDos_NetBIOS.csv, 11:50 - 12:00
DrDos_SNMP.csv, 12:12 - 12:23
DrDos_SSDP.csv, 12:27 - 12:37
DrDos_UDP.csv, 12:45 - 13:09
UDPLag.csv, 13:11 - 13:15
Syn.csv, 13:29 - 13:34
TFTP.csv, 13:35 - 17:15
```

## Ataques coloetados no dia (01/12)


```csv
PortMap.csv, 09:43 - 09:51
DrDos_NetBIOS.csv, 10:00 - 10:09
DrDos_LDAP.csv, 10:21 - 10:30
DrDos_MSSQL.csv, 10:33 - 10:42
DrDos_UDP.csv, 10:53 - 11:03
DrDos_UDP-Lag.csv, 11:14 - 11:24
Syn.csv, 11:28 - 17:35
```

# Pré-Processamento CICDDoS

## Concatena os dias da coleta em um único arquivo

### 01-12

In [1]:
import pandas as pd
import os

data_path = "data/01-12"

files = [
    "DrDos_DNS.csv", "DrDos_LDAP.csv", "DrDos_MSSQL.csv",
    "DrDos_NetBIOS.csv", "DrDos_NTP.csv", "DrDos_SNMP.csv",
    "DrDos_SSDP.csv", "DrDos_UDP.csv", "Syn.csv",
    "TFTP.csv", "UDPLag.csv"
]

all_data = []

for file_name in files:
    file_path = os.path.join(data_path, file_name)
    
    if os.path.exists(file_path):
        try:
            df = pd.read_csv(file_path)
            df = df.rename(columns=lambda x: x.strip())
            df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
            all_data.append(df)
            print(f"{file_name} processado com {len(df)} linhas.")
        except Exception as e:
            print(f"Erro ao processar {file_name}: {e}")
    else:
        print(f"Arquivo não encontrado: {file_name}")

if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    final_df = final_df.sort_values(by="Timestamp")
    output_file = os.path.join(data_path, "combined_attacks_01_12.csv")
    final_df.to_csv(output_file, index=False)
    print(f"Arquivo combinado salvo corretamente em ordem cronológica: {output_file}")
else:
    print("Nenhum dado válido encontrado para gerar o arquivo combinado.")


  df = pd.read_csv(file_path)


DrDos_DNS.csv processado com 5074413 linhas.


  df = pd.read_csv(file_path)


DrDos_LDAP.csv processado com 2181542 linhas.


  df = pd.read_csv(file_path)


KeyboardInterrupt: 

#### Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

combined_df = pd.read_csv("data/01-12/combined_attacks_01_12.csv")
combined_df["Timestamp"] = pd.to_datetime(combined_df["Timestamp"])
combined_df.set_index("Timestamp", inplace=True)
combined_df = combined_df.sort_index()
print(len(combined_df))

plt.figure(figsize=(12, 6))
plt.plot(combined_df.index, combined_df["Total Fwd Packets"], label="Total Fwd Packets", color="blue")

plt.xlabel("Timestamp")
plt.ylabel("Total Fwd Packets")
plt.title("Combined Attacks Time Series")
plt.legend()
plt.grid()
plt.show()


### 03-11

In [None]:
import pandas as pd
import os

data_path = "data/03-11"

files = [
    "LDAP.csv", "MSSQL.csv", "NetBIOS.csv", "Portmap.csv",
    "Syn.csv", "UDP.csv", "UDPLag.csv"
]

all_data = []

for file_name in files:
    file_path = os.path.join(data_path, file_name)
    
    if os.path.exists(file_path):
        try:
            df = pd.read_csv(file_path)
            df = df.rename(columns=lambda x: x.strip())
            df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
            all_data.append(df)
            print(f"{file_name} processado com {len(df)} linhas.")
        except Exception as e:
            print(f"Erro ao processar {file_name}: {e}")
    else:
        print(f"Arquivo não encontrado: {file_name}")

if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    final_df = final_df.sort_values(by="Timestamp")
    output_file = os.path.join(data_path, "combined_attacks_03_11.csv")
    final_df.to_csv(output_file, index=False)
    print(f"Arquivo combinado salvo corretamente em ordem cronológica: {output_file}")
else:
    print("Nenhum dado válido encontrado para gerar o arquivo combinado.")


#### Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

combined_df = pd.read_csv("data/03-11/combined_attacks_03_11.csv")
combined_df["Timestamp"] = pd.to_datetime(combined_df["Timestamp"])
combined_df.set_index("Timestamp", inplace=True)
combined_df = combined_df.sort_index()
print(len(combined_df))

plt.figure(figsize=(12, 6))
plt.plot(combined_df.index, combined_df["Total Fwd Packets"], label="Total Fwd Packets", color="blue")

plt.xlabel("Timestamp")
plt.ylabel("Total Fwd Packets")
plt.title("Combined Attacks Time Series")
plt.legend()
plt.grid()
plt.show()


## Análise de correlação

# Pré-processamento UEL

# Modelos - dataset UEL

## LSTM

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Usando dispositivo: {device}')


Usando dispositivo: cuda


In [None]:
class NetworkDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx].unsqueeze(0), self.y[idx]  # [1, seq_len]


In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size=64):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # última saída da sequência
        out = self.dropout(out)
        out = self.relu(self.fc1(out))
        out = self.sigmoid(self.fc2(out))
        return out.squeeze()


In [None]:
# Dataset
train_dataset = NetworkDataset(X_train, y_train)
test_dataset = NetworkDataset(X_test, y_test)

# Split em treino e validação
val_size = int(0.2 * len(train_dataset))
train_size = len(train_dataset) - val_size
train_data, val_data = random_split(train_dataset, [train_size, val_size])

# DataLoaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)


In [None]:
model = LSTMClassifier(input_size=X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

best_val_loss = float('inf')
epochs = 30
patience = 5
patience_counter = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validação
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_val, y_val in val_loader:
            X_val, y_val = X_val.to(device), y_val.to(device)
            val_outputs = model(X_val)
            val_loss += criterion(val_outputs, y_val).item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_lstm_model.pt")
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Parando cedo (early stopping)")
            break


In [None]:
# Carregar o melhor modelo
model.load_state_dict(torch.load("best_lstm_model.pt"))
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = (outputs > 0.5).int().cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(y_batch.numpy().astype(int))

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("Report:\n", classification_report(y_true, y_pred))


## CNN

## LSTM-CNN-SVM

# Backup 

## 1 - Abrir aquivo e fazer o plot

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

data_path = os.path.join("data", "01-12")
csv_files = [f for f in os.listdir(data_path) if f.endswith(".csv")]

selected_columns = [
    "Flow ID", " Source IP", " Source Port", " Destination IP", 
    " Destination Port", " Protocol", " Timestamp", " Flow Duration",
    " Total Fwd Packets"
]

for file_name in csv_files:
    file_path = os.path.join(data_path, file_name)
    df = pd.read_csv(file_path, usecols=selected_columns)
    plt.figure(figsize=(10, 5))
    df["Timestamp"] = pd.to_datetime(df[" Timestamp"], errors="coerce")
    df = df.dropna(subset=["Timestamp"])
    
    plt.plot(df["Timestamp"], df[" Total Fwd Packets"], label="Total Fwd Packets", color="blue")
    plt.xlabel("Tempo")
    plt.ylabel("Pacotes Enviados")
    plt.title(f"Tráfego de Pacotes - {file_name}")
    plt.xticks(rotation=45)
    plt.legend()
    plt.show()
