In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE

In [2]:

warnings.filterwarnings('ignore')

In [3]:
DATASET_DIRECTORY = '../dataset/CICIoT2023/'

In [4]:
# Obtém a lista de arquivos CSV no diretório
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# Divide os conjuntos de treinamento e teste com base na lista
split_ratio = 0.8
training_sets = df_sets[:int(len(df_sets) * split_ratio)]
test_sets = df_sets[int(len(df_sets) * split_ratio):]

In [5]:
# Define as colunas de entrada X e a coluna de saída y
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'

In [6]:

# Inicializa o modelo Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Você pode ajustar os hiperparâmetros conforme necessário

In [7]:
# Inicializa um dicionário para mapear rótulos de classe
dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 'Attack'
dict_2classes['DDoS-PSHACK_Flood'] = 'Attack'
dict_2classes['DDoS-SYN_Flood'] = 'Attack'
dict_2classes['DDoS-UDP_Flood'] = 'Attack'
dict_2classes['DDoS-TCP_Flood'] = 'Attack'
dict_2classes['DDoS-ICMP_Flood'] = 'Attack'
dict_2classes['DDoS-SynonymousIP_Flood'] = 'Attack'
dict_2classes['DDoS-ACK_Fragmentation'] = 'Attack'
dict_2classes['DDoS-UDP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-ICMP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-SlowLoris'] = 'Attack'
dict_2classes['DDoS-HTTP_Flood'] = 'Attack'

dict_2classes['DoS-UDP_Flood'] = 'Attack'
dict_2classes['DoS-SYN_Flood'] = 'Attack'
dict_2classes['DoS-TCP_Flood'] = 'Attack'
dict_2classes['DoS-HTTP_Flood'] = 'Attack'

dict_2classes['Mirai-greeth_flood'] = 'Attack'
dict_2classes['Mirai-greip_flood'] = 'Attack'
dict_2classes['Mirai-udpplain'] = 'Attack'

dict_2classes['Recon-PingSweep'] = 'Attack'
dict_2classes['Recon-OSScan'] = 'Attack'
dict_2classes['Recon-PortScan'] = 'Attack'
dict_2classes['VulnerabilityScan'] = 'Attack'
dict_2classes['Recon-HostDiscovery'] = 'Attack'

dict_2classes['DNS_Spoofing'] = 'Attack'
dict_2classes['MITM-ArpSpoofing'] = 'Attack'

dict_2classes['BenignTraffic'] = 'Benign'

dict_2classes['BrowserHijacking'] = 'Attack'
dict_2classes['Backdoor_Malware'] = 'Attack'
dict_2classes['XSS'] = 'Attack'
dict_2classes['Uploading_Attack'] = 'Attack'
dict_2classes['SqlInjection'] = 'Attack'
dict_2classes['CommandInjection'] = 'Attack'

dict_2classes['DictionaryBruteForce'] = 'Attack'

In [8]:

# Inicializa o StandardScaler
scaler = StandardScaler()

In [9]:

# Inicializa o SMOTE para lidar com desequilíbrio de classes
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [10]:

# Loop pelos conjuntos de treinamento e ajusta o modelo
for train_set in tqdm(training_sets):
    data = pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    data[y_column] = data[y_column].map(dict_2classes)  # Mapeia as classes para 'Attack' ou 'Benign'
    
    X = data[X_columns]
    y = data[y_column]
    
    # Ajusta o scaler com base no conjunto de treinamento
    scaler.fit(X)
    
    # Transforma as features do conjunto de treinamento
    X_scaled = scaler.transform(X)
    
    # Aplica o SMOTE para balancear as classes
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
    
    rf_classifier.fit(X_resampled, y_resampled)

100%|██████████| 135/135 [6:48:41<00:00, 181.64s/it]  


In [11]:


# Loop pelos conjuntos de teste e previsões

y_test = []
y_pred = []

for test_set in tqdm(test_sets):
    data_test = pd.read_csv(os.path.join(DATASET_DIRECTORY, test_set))
    data_test[y_column] = data_test[y_column].map(dict_2classes)  # Mapeia as classes para 'Attack' ou 'Benign'
    
    X_test = data_test[X_columns]
    
    # Transforma as features do conjunto de teste usando o scaler do conjunto de treinamento
    X_test_scaled = scaler.transform(X_test)
    
    y_test += list(data_test[y_column])
    y_pred += list(rf_classifier.predict(X_test_scaled))


100%|██████████| 34/34 [01:58<00:00,  3.49s/it]


In [12]:
# Calcule as métricas de desempenho
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

Accuracy: 0.9961354566916318
Recall: 0.9892456647087695
Precision: 0.9349736804441207
F1 Score: 0.9604265978532878


In [13]:

from sklearn.metrics import accuracy_score, classification_report

# Avalia o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Acurácia:", accuracy)
print("Relatório de Classificação:")
print(report)


Acurácia: 0.9961354566916318
Relatório de Classificação:
              precision    recall  f1-score   support

      Attack       1.00      1.00      1.00  10096839
      Benign       0.87      0.98      0.92    243322

    accuracy                           1.00  10340161
   macro avg       0.93      0.99      0.96  10340161
weighted avg       1.00      1.00      1.00  10340161

