# Introdução aos Ataques DDoS no Dataset CICDDoS2019

O dataset contém múltiplos cenários de ataques, registrados em arquivos CSV, com detalhes sobre tráfego malicioso e legítimo. Abaixo, são listados os períodos de tempo (em horas e minutos) em que os ataques ocorreram, organizados por dia e tipo de ataque.


## Ataques coletados no dia (01/12)

```csv
DrDos_NTP.csv, 10:35 - 10:45
DrDos_DNS.csv, 10:52 - 11:05
DrDos_LDAP.csv, 11:22 - 11:32
DrDos_MSSQL.csv, 11:36 - 11:45
DrDos_NetBIOS.csv, 11:50 - 12:00
DrDos_SNMP.csv, 12:12 - 12:23
DrDos_SSDP.csv, 12:27 - 12:37
DrDos_UDP.csv, 12:45 - 13:09
UDPLag.csv, 13:11 - 13:15
Syn.csv, 13:29 - 13:34
TFTP.csv, 13:35 - 17:15
```

## Ataques coletados no dia (03/11)


```csv
PortMap.csv, 09:43 - 09:51
DrDos_NetBIOS.csv, 10:00 - 10:09
DrDos_LDAP.csv, 10:21 - 10:30
DrDos_MSSQL.csv, 10:33 - 10:42
DrDos_UDP.csv, 10:53 - 11:03
DrDos_UDP-Lag.csv, 11:14 - 11:24
Syn.csv, 11:28 - 17:35
```

# Pré-Processamento

## Concatena os dias da coleta em um único arquivo

### 01-12

In [None]:
import pandas as pd
import os

data_path = "data/01-12"

# Per-attack capture files expected inside ``data_path``.
files = [
    "DrDos_DNS.csv", "DrDos_LDAP.csv", "DrDos_MSSQL.csv",
    "DrDos_NetBIOS.csv", "DrDos_NTP.csv", "DrDos_SNMP.csv",
    "DrDos_SSDP.csv", "DrDos_UDP.csv", "Syn.csv",
    "TFTP.csv", "UDPLag.csv"
]

# Columns to keep. Most names carry a leading blank; the loader below
# compares stripped names, so headers with or without that blank both match.
selected_columns = [
    "Flow ID", " Source IP", " Source Port", " Destination IP", 
    " Destination Port", " Protocol", " Timestamp", " Flow Duration",
    " Total Fwd Packets"
]


def load_flow_csv(file_path, columns):
    """Read one flow CSV, keeping only ``columns`` and parsing ``Timestamp``.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.
        columns: Wanted column names; surrounding whitespace is ignored when
            matching them against the CSV header.

    Returns:
        DataFrame whose column names are stripped of surrounding whitespace
        and whose ``Timestamp`` column is datetime (unparseable values
        become ``NaT``).

    Raises:
        ValueError: If any wanted column is absent from the file.
    """
    wanted = {name.strip() for name in columns}
    # A callable ``usecols`` lets us match on the stripped header, so files
    # whose headers were already normalized load as well as the raw ones.
    frame = pd.read_csv(file_path, usecols=lambda name: name.strip() in wanted)
    frame = frame.rename(columns=str.strip)

    missing = wanted - set(frame.columns)
    if missing:
        raise ValueError(f"Colunas ausentes: {sorted(missing)}")

    frame["Timestamp"] = pd.to_datetime(frame["Timestamp"], errors="coerce")
    return frame


all_data = []

for file_name in files:
    file_path = os.path.join(data_path, file_name)

    try:
        df = load_flow_csv(file_path, selected_columns)
    except FileNotFoundError:
        print(f"Arquivo n√£o encontrado: {file_name}")
        continue
    except Exception as e:
        # Best effort: one unreadable capture must not abort the whole merge.
        print(f"Erro ao processar {file_name}: {e}")
        continue

    # ``errors="coerce"`` hides bad timestamps as NaT; surface how many.
    unparsed = int(df["Timestamp"].isna().sum())
    if unparsed:
        print(f"Aviso: {file_name} possui {unparsed} linhas com Timestamp ausente (NaT).")

    all_data.append(df)
    print(f"{file_name} processado com {len(df)} linhas.")

if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    # mergesort is stable: rows sharing a timestamp keep their file order.
    final_df = final_df.sort_values(by="Timestamp", kind="mergesort")
    output_file = os.path.join(data_path, "combined_attacks_01_12.csv")
    final_df.to_csv(output_file, index=False)
    print(f"Arquivo combinado salvo corretamente em ordem cronol√≥gica: {output_file}")
else:
    print("Nenhum dado v√°lido encontrado para gerar o arquivo combinado.")


❌ Erro ao processar DrDos_DNS.csv: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
✔ DrDos_LDAP.csv processado com 2181542 linhas.
✔ DrDos_MSSQL.csv processado com 4524498 linhas.
❌ Erro ao processar DrDos_NetBIOS.csv: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
✔ DrDos_NTP.csv processado com 1217007 linhas.


#### Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the merged 01-12 capture and index it by parsed timestamp,
# in ascending time order.
combined_df = (
    pd.read_csv("data/01-12/combined_attacks_01_12.csv")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"]))
    .set_index("Timestamp")
    .sort_index()
)
print(len(combined_df))

# Single line trace: forward-packet count of each flow against its timestamp.
plt.figure(figsize=(12, 6))
plt.plot(
    combined_df.index,
    combined_df["Total Fwd Packets"],
    label="Total Fwd Packets",
    color="blue",
)
plt.gca().set(
    xlabel="Timestamp",
    ylabel="Total Fwd Packets",
    title="Combined Attacks Time Series",
)
plt.legend()
plt.grid()
plt.show()


### 03-11

In [None]:
import pandas as pd
import os

data_path = "data/03-11"

# Per-attack capture files expected inside ``data_path``.
files = [
    "LDAP.csv", "MSSQL.csv", "NetBIOS.csv", "Portmap.csv",
    "Syn.csv", "UDP.csv", "UDPLag.csv"
]

# Columns to keep. Most names carry a leading blank; the loader below
# compares stripped names, so headers with or without that blank both match.
selected_columns = [
    "Flow ID", " Source IP", " Source Port", " Destination IP", 
    " Destination Port", " Protocol", " Timestamp", " Flow Duration",
    " Total Fwd Packets"
]


def load_flow_csv(file_path, columns):
    """Read one flow CSV, keeping only ``columns`` and parsing ``Timestamp``.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.
        columns: Wanted column names; surrounding whitespace is ignored when
            matching them against the CSV header.

    Returns:
        DataFrame whose column names are stripped of surrounding whitespace
        and whose ``Timestamp`` column is datetime (unparseable values
        become ``NaT``).

    Raises:
        ValueError: If any wanted column is absent from the file.
    """
    wanted = {name.strip() for name in columns}
    # A callable ``usecols`` lets us match on the stripped header, so files
    # whose headers were already normalized load as well as the raw ones.
    frame = pd.read_csv(file_path, usecols=lambda name: name.strip() in wanted)
    frame = frame.rename(columns=str.strip)

    missing = wanted - set(frame.columns)
    if missing:
        raise ValueError(f"Colunas ausentes: {sorted(missing)}")

    frame["Timestamp"] = pd.to_datetime(frame["Timestamp"], errors="coerce")
    return frame


all_data = []

for file_name in files:
    file_path = os.path.join(data_path, file_name)

    try:
        df = load_flow_csv(file_path, selected_columns)
    except FileNotFoundError:
        print(f"Arquivo n√£o encontrado: {file_name}")
        continue
    except Exception as e:
        # Best effort: one unreadable capture must not abort the whole merge.
        print(f"Erro ao processar {file_name}: {e}")
        continue

    # ``errors="coerce"`` hides bad timestamps as NaT; surface how many.
    unparsed = int(df["Timestamp"].isna().sum())
    if unparsed:
        print(f"Aviso: {file_name} possui {unparsed} linhas com Timestamp ausente (NaT).")

    all_data.append(df)
    print(f"{file_name} processado com {len(df)} linhas.")

if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    # mergesort is stable: rows sharing a timestamp keep their file order.
    final_df = final_df.sort_values(by="Timestamp", kind="mergesort")
    output_file = os.path.join(data_path, "combined_attacks_03_11.csv")
    final_df.to_csv(output_file, index=False)
    print(f"Arquivo combinado salvo corretamente em ordem cronol√≥gica: {output_file}")
else:
    print("Nenhum dado v√°lido encontrado para gerar o arquivo combinado.")


#### Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the merged 03-11 capture and index it by parsed timestamp,
# in ascending time order.
combined_df = (
    pd.read_csv("data/03-11/combined_attacks_03_11.csv")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"]))
    .set_index("Timestamp")
    .sort_index()
)
print(len(combined_df))

# Single line trace: forward-packet count of each flow against its timestamp.
plt.figure(figsize=(12, 6))
plt.plot(
    combined_df.index,
    combined_df["Total Fwd Packets"],
    label="Total Fwd Packets",
    color="blue",
)
plt.gca().set(
    xlabel="Timestamp",
    ylabel="Total Fwd Packets",
    title="Combined Attacks Time Series",
)
plt.legend()
plt.grid()
plt.show()


## Normalização e criação de sequências

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Merged 03-11 capture written by the concatenation cell.
data_path = "data/03-11/combined_attacks_03_11.csv"
df = pd.read_csv(data_path)

# Numeric columns used as model input. Each one is min-max scaled on its
# own, and the scaled values overwrite the raw ones in ``df``.
features = ["Flow Duration", "Total Fwd Packets", "Protocol"]
scaler = MinMaxScaler()
df[features] = scaler.fit(df[features]).transform(df[features])

def create_sequences(data, seq_length=10):
    """Slice ``data`` into overlapping windows plus their next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` and its label is the row that
    immediately follows it, ``data[i + seq_length]``.

    Args:
        data: Array-like whose first axis is the time/row axis.
        seq_length: Number of consecutive rows per window.

    Returns:
        Tuple ``(sequences, labels)`` of freshly allocated arrays with shapes
        ``(n, seq_length, *data.shape[1:])`` and ``(n, *data.shape[1:])``,
        where ``n = max(len(data) - seq_length, 0)``. When there are not
        enough rows for a single window, both arrays are empty but keep
        those trailing dimensions.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError("seq_length must be non-negative")

    data = np.asarray(data)
    n_windows = len(data) - seq_length
    trailing = data.shape[1:]

    if n_windows <= 0:
        # Keep the rank consistent so downstream shape logic still works.
        empty_sequences = np.empty((0, seq_length) + trailing, dtype=data.dtype)
        empty_labels = np.empty((0,) + trailing, dtype=data.dtype)
        return empty_sequences, empty_labels

    # sliding_window_view yields len(data) - seq_length + 1 windows with the
    # window axis last; the final window has no following row to use as a
    # label, so it is dropped, and the window axis is moved next to the
    # batch axis.
    windows = np.lib.stride_tricks.sliding_window_view(
        data, seq_length, axis=0
    )[:n_windows]
    # The view is read-only and aliases ``data``; copy it into an owned,
    # C-ordered array.
    sequences = np.array(np.moveaxis(windows, -1, 1), order="C")
    labels = data[seq_length:].copy()
    return sequences, labels

# Turn the scaled feature matrix into fixed-length sliding windows.
seq_length = 10
data_values = df[features].values
X, y = create_sequences(data_values, seq_length)

# Each window is flattened into one CSV row.
# NOTE(review): only X is written; the next-step targets in ``y`` are not saved.
processed_file = "data/03-11/03_11_processado.csv"
pd.DataFrame(X.reshape(len(X), -1)).to_csv(processed_file, index=False)

print(f"Dados preparados para LSTM salvos em: {processed_file}")


Dados preparados para LSTM salvos em: data/03-11/03_11_processado.csv


In [8]:
# Quick sanity check: header line, the column names, then the first rows.
print("Nomes das colunas:", df.columns.tolist(), df.head(), sep="\n")


Nomes das colunas:
['Flow ID', 'Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packets']
                                  Flow ID       Source IP  Source Port  \
0          192.168.50.254-224.0.0.5-0-0-0  192.168.50.254            0   
1          192.168.50.253-224.0.0.5-0-0-0  192.168.50.253            0   
2  172.217.10.98-192.168.50.6-443-54799-6    192.168.50.6        54799   
3    172.217.7.2-192.168.50.6-443-54800-6    192.168.50.6        54800   
4  172.217.10.98-192.168.50.6-443-54801-6    192.168.50.6        54801   

  Destination IP  Destination Port  Protocol                   Timestamp  \
0      224.0.0.5                 0  0.000000  2018-11-03 09:18:16.964447   
1      224.0.0.5                 0  0.000000  2018-11-03 09:18:18.506537   
2  172.217.10.98               443  0.352941  2018-11-03 09:18:18.610576   
3    172.217.7.2               443  0.352941  2018-11-03 09:18:18.610579   
4  172.217.10.9

# Modelos

## LSTM

## CNN

## LSTM-CNN-SVM

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# 1. Load the pre-processed dataset.
# The normalization cell of this notebook writes this file under data/03-11,
# so it is read from there (the former "data/01-12/..." path pointed at a
# directory the file is never written to).
# NOTE(review): as written by that cell, the CSV holds only flattened windows
# with positional column names and no "Attack Type" column. This cell needs a
# file that exposes the columns named below. Confirm which file is intended.
data_path = "data/03-11/03_11_processado.csv"
df = pd.read_csv(data_path)

# 2. Select the feature columns and the label column.
features = ["Flow Duration", "Total Fwd Packets", "Protocol"]
label = "Attack Type"

X = df[features].values
y = df[label].values

# Conv1D consumes (batch, steps, channels). Add the trailing channel axis so
# the data matches the declared input_shape=(n_features, 1).
X = X[..., np.newaxis]

# 3. Positional 80/20 split: first 80% of rows for training, the rest for test.
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# 4. Hybrid CNN + LSTM network with a single sigmoid output.
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation="relu", input_shape=(X_train.shape[1], 1)),
    LSTM(50, return_sequences=True),
    LSTM(50),
    Dropout(0.2),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# 5. Train the network, validating on the held-out rows.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# 6. Reuse the network up to its penultimate layer (the 64-unit Dense) as a
# feature extractor for the SVM.
feature_extractor = tf.keras.Model(inputs=model.input, outputs=model.layers[-2].output)
X_train_features = feature_extractor.predict(X_train)
X_test_features = feature_extractor.predict(X_test)

# 7. Fit an RBF-kernel SVM on the extracted features.
svm = SVC(kernel="rbf")
svm.fit(X_train_features, y_train)

# 8. Predict on the held-out features.
y_pred = svm.predict(X_test_features)

# 9. Report accuracy and the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
print(f"üîπ Precis√£o do Modelo H√≠brido: {accuracy:.4f}")

print("\nüîπ Relat√≥rio de Classifica√ß√£o:")
print(classification_report(y_test, y_pred))

# 10. Confusion matrix heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Normal", "Ataque"], yticklabels=["Normal", "Ataque"])
plt.xlabel("Previsto")
plt.ylabel("Real")
plt.title("Matriz de Confus√£o - CNN+LSTM+SVM")
plt.show()

# 11. Overlay the first 100 true labels and predictions.
plt.figure(figsize=(10, 5))
plt.plot(y_test[:100], label="Real", linestyle="dashed")
plt.plot(y_pred[:100], label="Previsto", alpha=0.7)
plt.legend()
plt.title("üîπ Predi√ß√µes vs Valores Reais")
plt.show()

# 12. Persist both trained models.
joblib.dump(svm, "svm_model.pkl")
model.save("cnn_lstm_model.h5")


# Backup 

## 1 - Abrir arquivo e fazer o plot

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

data_path = os.path.join("data", "01-12")
# os.listdir returns entries in arbitrary order; sort for a reproducible
# plotting sequence.
csv_files = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))

selected_columns = [
    "Flow ID", " Source IP", " Source Port", " Destination IP", 
    " Destination Port", " Protocol", " Timestamp", " Flow Duration",
    " Total Fwd Packets"
]

# Header matching is done on stripped names. The directory scan also picks up
# combined_attacks_01_12.csv, whose headers are already stripped and would
# not match the space-prefixed names above.
wanted_columns = {name.strip() for name in selected_columns}
plot_columns = ("Timestamp", "Total Fwd Packets")

for file_name in csv_files:
    file_path = os.path.join(data_path, file_name)
    df = pd.read_csv(file_path, usecols=lambda name: name.strip() in wanted_columns)
    df = df.rename(columns=str.strip)

    # Skip CSVs in this directory that do not carry the plotted columns.
    missing = [name for name in plot_columns if name not in df.columns]
    if missing:
        print(f"Ignorando {file_name}: colunas ausentes {missing}")
        continue

    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
    df = df.dropna(subset=["Timestamp"])

    # Open the figure only once the data has loaded, so a failed read does
    # not leave an empty figure behind.
    plt.figure(figsize=(10, 5))
    plt.plot(df["Timestamp"], df["Total Fwd Packets"], label="Total Fwd Packets", color="blue")
    plt.xlabel("Tempo")
    plt.ylabel("Pacotes Enviados")
    plt.title(f"Tr√°fego de Pacotes - {file_name}")
    plt.xticks(rotation=45)
    plt.legend()
    plt.show()
