In [1]:
#use "pip install xgboost" in anaconda

import joblib
import pandas as pd
import os
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from xgboost import XGBClassifier


In [2]:
DATASET_DIRECTORY = '../dataset/CICIoT2023/'

In [3]:
# Obtém a lista de arquivos CSV no diretório
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# Divide os conjuntos de treinamento e teste com base na lista
split_ratio = 0.8
training_sets = df_sets[:int(len(df_sets) * split_ratio)]
test_sets = df_sets[int(len(df_sets) * split_ratio):]

In [4]:
# Define as colunas de entrada X e a coluna de saída y
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'

In [5]:
# Inicializa o scaler fora do loop
scaler = StandardScaler()

In [6]:
# Inicializa um dicionário para mapear rótulos de classe
dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 1
dict_2classes['DDoS-PSHACK_Flood'] = 1
dict_2classes['DDoS-SYN_Flood'] = 1
dict_2classes['DDoS-UDP_Flood'] = 1
dict_2classes['DDoS-TCP_Flood'] = 1
dict_2classes['DDoS-ICMP_Flood'] = 1
dict_2classes['DDoS-SynonymousIP_Flood'] = 1
dict_2classes['DDoS-ACK_Fragmentation'] = 1
dict_2classes['DDoS-UDP_Fragmentation'] = 1
dict_2classes['DDoS-ICMP_Fragmentation'] = 1
dict_2classes['DDoS-SlowLoris'] = 1
dict_2classes['DDoS-HTTP_Flood'] = 1

dict_2classes['DoS-UDP_Flood'] = 1
dict_2classes['DoS-SYN_Flood'] = 1
dict_2classes['DoS-TCP_Flood'] = 1
dict_2classes['DoS-HTTP_Flood'] = 1


dict_2classes['Mirai-greeth_flood'] = 1
dict_2classes['Mirai-greip_flood'] = 1
dict_2classes['Mirai-udpplain'] = 1

dict_2classes['Recon-PingSweep'] = 1
dict_2classes['Recon-OSScan'] = 1
dict_2classes['Recon-PortScan'] = 1
dict_2classes['VulnerabilityScan'] = 1
dict_2classes['Recon-HostDiscovery'] = 1

dict_2classes['DNS_Spoofing'] = 1
dict_2classes['MITM-ArpSpoofing'] = 1

dict_2classes['BenignTraffic'] = 0

dict_2classes['BrowserHijacking'] = 1
dict_2classes['Backdoor_Malware'] = 1
dict_2classes['XSS'] = 1
dict_2classes['Uploading_Attack'] = 1
dict_2classes['SqlInjection'] = 1
dict_2classes['CommandInjection'] = 1

dict_2classes['DictionaryBruteForce'] = 1

In [8]:
# Loop de treinamento
for train_set in tqdm(training_sets):
    df_train = pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    df_train[X_columns] = scaler.fit_transform(df_train[X_columns])
    df_train[y_column] = df_train[y_column].map(dict_2classes)
    
    # Aplica o SMOTE para balanceamento
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(df_train[X_columns], df_train[y_column])
    
    xgb_model = XGBClassifier()
    xgb_model.fit(X_resampled, y_resampled)
    
    # Salva o modelo treinado
    model_file = 'modelo_xgboost.pkl'
    joblib.dump(xgb_model, model_file)

100%|██████████| 135/135 [21:40<00:00,  9.64s/it]


In [9]:
# Avaliação no conjunto de teste
y_test = []
y_pred_xgb = []

for test_set in tqdm(test_sets):
    df_test = pd.read_csv(os.path.join(DATASET_DIRECTORY, test_set))
    df_test[X_columns] = scaler.transform(df_test[X_columns])
    df_test[y_column] = df_test[y_column].map(dict_2classes)
    
    y_test += list(df_test[y_column].values)
    
    # Carrega o modelo treinado correspondente ao conjunto de treinamento
    xgb_model = joblib.load(model_file)
    
    y_pred = list(xgb_model.predict(df_test[X_columns]))
    y_pred_xgb += y_pred   

100%|██████████| 34/34 [01:53<00:00,  3.34s/it]


In [10]:
# Avalie o desempenho do modelo XGBoost
print("##### XGBoost (2 classes) #####")
print('accuracy_score: ', accuracy_score(y_pred_xgb, y_test))
print('recall_score: ', recall_score(y_pred_xgb, y_test, average='macro'))
print('precision_score: ', precision_score(y_pred_xgb, y_test, average='macro'))
print('f1_score: ', f1_score(y_pred_xgb, y_test, average='macro'))

##### XGBoost (2 classes) #####
accuracy_score:  0.995967954464152
recall_score:  0.9380562309444259
precision_score:  0.9802981662385408
f1_score:  0.9581516550088425


In [11]:
from sklearn.metrics import accuracy_score, classification_report

# Avalia o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred_xgb)
report = classification_report(y_test, y_pred_xgb)

print("Acurácia:", accuracy)
print("Relatório de Classificação:")
print(report)

Acurácia: 0.995967954464152
Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92    243322
           1       1.00      1.00      1.00  10096839

    accuracy                           1.00  10340161
   macro avg       0.94      0.98      0.96  10340161
weighted avg       1.00      1.00      1.00  10340161

