# Introdução aos Ataques DDoS no Dataset CICDDoS2019

O dataset contém múltiplos cenários de ataques, registrados em arquivos CSV, com detalhes sobre tráfego malicioso e legítimo. Abaixo, são listados os períodos de tempo (em horas e minutos) em que os ataques ocorreram, organizados por dia e tipo de ataque.


## Ataques coloetados no dia (01/12)

```csv
data_path, length, columns, hour
DrDos_NTP.csv, 1217007, 88, 10:35 - 10:45
DrDos_DNS.csv, 5074413, 88, 10:52 - 11:05
DrDos_LDAP.csv, 2181542, 88, 11:22 - 11:32
DrDos_MSSQL.csv, 4524498, 88, 11:36 - 11:45
DrDos_NetBIOS.csv, 4094986, 88, 11:50 - 12:00
DrDos_SNMP.csv, 5161377, 88, 12:12 - 12:23
DrDos_SSDP.csv, 2611374, 88, 12:27 - 12:37
DrDos_UDP.csv, 3136802, 88, 12:45 - 13:09
UDPLag.csv, 370605, 88, 13:11 - 13:15
Syn.csv, 1582681, 88, 13:29 - 13:34
TFTP.csv, 20107827, 88, 13:35 - 17:15
```

## Ataques coloetados no dia (03/11)

```csv
data_path, length, columns, hour
NetBIOS.csv, 3455899, 88, 10:00 - 10:09
LDAP.csv, 2113234, 88, 10:21 - 10:30
MSSQL.csv, 5775786, 88, 10:33 - 10:42
UDP.csv, 3782206, 88,  10:53 - 11:03
UDPLag.csv, 725165, 88, 11:14 - 11:24
Syn.csv, 4320541, 88, 11:28 - 17:35
```

# Pré-Processamento CICDDoS

In [9]:
import pandas as pd
import os

class DatasetBuilder:
    def __init__(self, data_path, file_list, output_filename="combined_output.csv", chunk_size=100_000):
        self.data_path = data_path
        self.file_list = file_list
        self.chunk_size = chunk_size
        self.output_file = os.path.join(data_path, output_filename)

    def inspect_files(self):
        for file_name in self.file_list:
            file_path = os.path.join(self.data_path, file_name)

            if os.path.exists(file_path):
                try:
                    total_rows = 0
                    column_count = None

                    for chunk in enumerate(pd.read_csv(file_path, chunksize=self.chunk_size, low_memory=False)):
                        chunk.columns = chunk.columns.str.strip()
                        total_rows += len(chunk)
                        if column_count is None:
                            column_count = len(chunk.columns)

                    print(f"✅ {file_name}: {total_rows} linhas, {column_count} colunas")
                except Exception as e:
                    print(f"❌ Erro ao inspecionar {file_name}: {e}")
            else:
                print(f"⚠️ Arquivo não encontrado: {file_name}")

    def concatenate_selected_columns(self, selected_columns):
        if os.path.exists(self.output_file):
            os.remove(self.output_file)

        first_chunk = True

        for file_name in self.file_list:
            file_path = os.path.join(self.data_path, file_name)

            if os.path.exists(file_path):
                try:
                    for chunk in pd.read_csv(file_path, chunksize=self.chunk_size, usecols=selected_columns, low_memory=False):
                        chunk.columns = chunk.columns.str.strip()

                        if "Timestamp" in chunk.columns:
                            chunk["Timestamp"] = pd.to_datetime(chunk["Timestamp"], errors="coerce")

                        chunk.to_csv(self.output_file, mode='a', header=first_chunk, index=False)
                        first_chunk = False

                    print(f"✅ {file_name} processado e salvo com colunas selecionadas.")
                except Exception as e:
                    print(f"❌ Erro ao processar {file_name}: {e}")
            else:
                print(f"⚠️ Arquivo não encontrado: {file_name}")

    def sort_output_by_timestamp(self):
        try:
            df = pd.read_csv(self.output_file, low_memory=False)
            if "Timestamp" in df.columns:
                df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
                df = df.sort_values(by="Timestamp")
                df.to_csv(self.output_file, index=False)
                print(f"✅ Arquivo ordenado por Timestamp salvo em: {self.output_file}")
            else:
                print("⚠️ Coluna 'Timestamp' não encontrada para ordenação.")
        except Exception as e:
            print(f"❌ Erro ao ordenar o arquivo: {e}")
            
    def label_anomaly_period(self, file_name, start_time_str, end_time_str):
        file_path = os.path.join(self.data_path, file_name)

        if not os.path.exists(file_path):
            print(f"⚠️ Arquivo não encontrado: {file_name}")
            return

        try:
            df = pd.read_csv(file_path, low_memory=False)
            if " Timestamp" not in df.columns:
                print("❌ Coluna 'Timestamp' não encontrada.")
                return

            df[" Timestamp"] = pd.to_datetime(df[" Timestamp"], errors="coerce")
            df["TimeOnly"] = df[" Timestamp"].dt.time

            from datetime import time

            start_time = time.fromisoformat(start_time_str)
            end_time = time.fromisoformat(end_time_str)

            # Criação da coluna 'label'
            df["label"] = df["TimeOnly"].apply(lambda t: 1 if start_time <= t <= end_time else 0)

            df.drop(columns=["TimeOnly"], inplace=True)

            # Novo nome de arquivo com "_labeled"
            labeled_file_path = os.path.join(self.data_path, file_name.replace(".csv", "_labeled.csv"))
            df.to_csv(labeled_file_path, index=False)

            print(f"✅ Rotulagem aplicada e salva em: {labeled_file_path}")

        except Exception as e:
            print(f"❌ Erro ao rotular {file_name}: {e}")



In [3]:
# Lista de arquivos
files_1day = [
    "LDAP.csv", "MSSQL.csv", "NetBIOS.csv", "Portmap.csv",
    "Syn.csv", "UDP.csv", "UDPLag.csv",
]


files_2day = [
    "DrDos_DNS.csv", "DrDos_LDAP.csv", "DrDos_MSSQL.csv",
    "DrDos_NetBIOS.csv", "DrDos_NTP.csv", "DrDos_SNMP.csv",
    "DrDos_SSDP.csv", "DrDos_UDP.csv", "Syn.csv",
    "TFTP.csv", "UDPLag.csv"
]

selected_columns = ['Timestamp', 'Flow Duration', 'Total Fwd Packets', 'Label']

# Caminho para os arquivos
data_path_1day = "data/03-11"
data_path_2day = "data/01-12"


In [10]:
# Criando o objeto
builder = DatasetBuilder(data_path_2day, files_2day, output_filename="output_2day.csv")
builder.label_anomaly_period("DrDos_NTP.csv", "10:35:00", "10:45:00")


✅ Rotulagem aplicada e salva em: data/01-12\DrDos_NTP_labeled.csv


## Análise de correlação

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def analyze_rfe_feature_importance(file_path, label_column='label', top_k=15, sample_frac=1.0, random_state=42):
    df = pd.read_csv(file_path)

    # Amostragem dos dados
    if sample_frac < 1.0:
        df = df.sample(frac=sample_frac, random_state=random_state).reset_index(drop=True)

    # Salvar colunas categóricas (ex: IPs)
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    preserved_cols = df[categorical_cols]

    # Limpar dados
    df.dropna(axis=1, how='all', inplace=True)  # remove colunas 100% nulas

    # Trabalhar só com numéricas
    df_numeric = df.select_dtypes(include=['number'])

    if label_column not in df_numeric.columns:
        raise ValueError(f"Coluna '{label_column}' não encontrada no dataset.")

    # X e y
    X = df_numeric.drop(columns=[label_column])
    y = df_numeric[label_column]

    # Remover infs e nulos
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.dropna(inplace=True)
    y = y.loc[X.index]
    preserved_cols = preserved_cols.loc[X.index]

    # Normalização (opcional para RF, mas ajuda no desempenho)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Estimador
    estimator = RandomForestClassifier(n_estimators=100, random_state=random_state)

    # RFE com o número de features que queremos ranquear
    rfe = RFE(estimator=estimator, n_features_to_select=top_k)
    rfe.fit(X_scaled, y)

    # Obter os nomes das colunas selecionadas
    selected_features = X.columns[rfe.support_]
    feature_ranking = pd.Series(rfe.ranking_, index=X.columns)

    # Plotar as top features selecionadas
    top_features = feature_ranking[feature_ranking == 1].index
    importances = estimator.fit(X, y).feature_importances_
    importance_series = pd.Series(importances, index=X.columns).loc[top_features]

    # Gráfico de barras
    plt.figure(figsize=(10, 6))
    sns.barplot(x=importance_series.values, y=importance_series.index, palette='magma')
    plt.title(f'Top {top_k} Features Relevantes (RFE + Random Forest)')
    plt.xlabel("Importância")
    plt.tight_layout()
    plt.show()

    return top_features, preserved_cols


In [None]:
top_rfe_features, preserved_data = analyze_rfe_feature_importance(
    data_path_2day + "DrDos_NTP_labeled.csv",
    top_k=20,
    sample_frac=0.1  # Usa apenas 10% dos dados
)



FileNotFoundError: [Errno 2] No such file or directory: 'DrDos_NTP_labeled.csv'

# Modelos - dataset UEL

## LSTM

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Usando dispositivo: {device}')


Usando dispositivo: cuda


In [None]:
class NetworkDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx].unsqueeze(0), self.y[idx]  # [1, seq_len]


In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size=64):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # última saída da sequência
        out = self.dropout(out)
        out = self.relu(self.fc1(out))
        out = self.sigmoid(self.fc2(out))
        return out.squeeze()


In [None]:
# Dataset
train_dataset = NetworkDataset(X_train, y_train)
test_dataset = NetworkDataset(X_test, y_test)

# Split em treino e validação
val_size = int(0.2 * len(train_dataset))
train_size = len(train_dataset) - val_size
train_data, val_data = random_split(train_dataset, [train_size, val_size])

# DataLoaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)


In [None]:
model = LSTMClassifier(input_size=X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

best_val_loss = float('inf')
epochs = 30
patience = 5
patience_counter = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validação
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_val, y_val in val_loader:
            X_val, y_val = X_val.to(device), y_val.to(device)
            val_outputs = model(X_val)
            val_loss += criterion(val_outputs, y_val).item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_lstm_model.pt")
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Parando cedo (early stopping)")
            break


In [None]:
# Carregar o melhor modelo
model.load_state_dict(torch.load("best_lstm_model.pt"))
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = (outputs > 0.5).int().cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(y_batch.numpy().astype(int))

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("Report:\n", classification_report(y_true, y_pred))


## CNN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Carregamento dos CSVs já normalizados
train_df = pd.read_csv('train_final.csv', sep=';')
test_df = pd.read_csv('test_final.csv', sep=';')

# Features selecionadas
features = ['bytes', 'src_ip_entropy', 'dst_ip_entropy', 'src_port_entropy',
            'dst_port_entropy', 'packets', 'flow_count', 'bytes_mean', 'packets_mean']

# Separação de variáveis e labels
X_train = train_df[features].values.astype(np.float32)
y_train = train_df['label'].values.astype(np.float32)

X_test = test_df[features].values.astype(np.float32)
y_test = test_df['label'].values.astype(np.float32)

# Ajuste para CNN: (batch_size, channels, length)
X_train_tensor = torch.tensor(X_train).unsqueeze(1)
y_train_tensor = torch.tensor(y_train).unsqueeze(1)

X_test_tensor = torch.tensor(X_test).unsqueeze(1)
y_test_tensor = torch.tensor(y_test).unsqueeze(1)

# Criação de DataLoaders
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
class CNN1D(nn.Module):
    def __init__(self):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=2)

        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=2)

        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 2, 32)  # 9 -> pool(4) -> pool(2) ≈ 2 posições
        self.fc2 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return self.sigmoid(x)

# Instancia e envia para GPU
model = CNN1D().to(device)


In [None]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
best_loss = float('inf')

train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Validação
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

    val_loss /= len(test_loader)
    val_losses.append(val_loss)

    print(f"Época {epoch+1}/{num_epochs} - Loss treino: {avg_loss:.4f} - Loss validação: {val_loss:.4f}")

    # Salva o melhor modelo
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "melhor_cnn_modelo.pth")
        print("✅ Modelo salvo!")


In [None]:
# Avaliação nos dados de teste
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = (outputs.cpu().numpy() > 0.5).astype(int)
        y_preds.extend(preds)
        y_true.extend(y_batch.numpy())

print("\n📊 Relatório de Classificação:")
print(classification_report(y_true, y_preds))


## LSTM-CNN-SVM

# Backup 

## 1 - Abrir aquivo e fazer o plot

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

data_path = os.path.join("data", "01-12")
csv_files = [f for f in os.listdir(data_path) if f.endswith(".csv")]

selected_columns = [
    "Flow ID", " Source IP", " Source Port", " Destination IP", 
    " Destination Port", " Protocol", " Timestamp", " Flow Duration",
    " Total Fwd Packets"
]

for file_name in csv_files:
    file_path = os.path.join(data_path, file_name)
    df = pd.read_csv(file_path, usecols=selected_columns)
    plt.figure(figsize=(10, 5))
    df["Timestamp"] = pd.to_datetime(df[" Timestamp"], errors="coerce")
    df = df.dropna(subset=["Timestamp"])
    
    plt.plot(df["Timestamp"], df[" Total Fwd Packets"], label="Total Fwd Packets", color="blue")
    plt.xlabel("Tempo")
    plt.ylabel("Pacotes Enviados")
    plt.title(f"Tráfego de Pacotes - {file_name}")
    plt.xticks(rotation=45)
    plt.legend()
    plt.show()
