In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


In [2]:
warnings.filterwarnings('ignore')

In [3]:
DATASET_DIRECTORY = '../dataset/CICIoT2023/'

In [4]:
# Obtém a lista de arquivos CSV no diretório
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# Divide os conjuntos de treinamento e teste com base na lista
split_ratio = 0.8
training_sets = df_sets[:int(len(df_sets) * split_ratio)]
test_sets = df_sets[int(len(df_sets) * split_ratio):]

In [5]:
# Define as colunas de entrada X e a coluna de saída y
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'

In [6]:
# Inicializa o modelo Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Você pode ajustar os hiperparâmetros conforme necessário

In [7]:
# Inicializa um dicionário para mapear rótulos de classe
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [8]:
# Inicializa o StandardScaler
scaler = StandardScaler()

In [9]:
# Loop pelos conjuntos de treinamento e ajusta o modelo
for train_set in tqdm(training_sets):
    data = pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    data[y_column] = data[y_column].map(dict_7classes)  # Mapeia as classes para 'Attack' ou 'Benign'
    
    X = data[X_columns]
    y = data[y_column]
    
    # Ajusta o scaler com base no conjunto de treinamento
    scaler.fit(X)
    
    # Transforma as features do conjunto de treinamento
    X_scaled = scaler.transform(X)
    
    rf_classifier.fit(X_scaled, y)

100%|██████████| 135/135 [1:13:41<00:00, 32.75s/it]


In [10]:

# Agora você tem um modelo Random Forest treinado

In [10]:
# Loop pelos conjuntos de teste e previsões

y_test = []
y_pred = []

for test_set in tqdm(test_sets):
    data_test = pd.read_csv(os.path.join(DATASET_DIRECTORY, test_set))
    data_test[y_column] = data_test[y_column].map(dict_7classes)  # Mapeia as classes para 'Attack' ou 'Benign'
    
    X_test = data_test[X_columns]
    
    # Transforma as features do conjunto de teste usando o scaler do conjunto de treinamento
    X_test_scaled = scaler.transform(X_test)
    
    y_test += list(data_test[y_column])
    y_pred += list(rf_classifier.predict(X_test_scaled))

100%|██████████| 34/34 [02:56<00:00,  5.19s/it]


In [11]:
# Calcule as métricas de desempenho
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)


Accuracy: 0.9945281316219351
Recall: 0.701941647221552
Precision: 0.9070614818481462
F1 Score: 0.7133683847270598


In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Avalia o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Acurácia:", accuracy)
print("Relatório de Classificação:")
print(report)

Acurácia: 0.9945281316219351
Relatório de Classificação:
              precision    recall  f1-score   support

      Benign       0.89      0.97      0.93    243322
  BruteForce       0.93      0.03      0.05      2983
        DDoS       1.00      1.00      1.00   7526151
         DoS       1.00      1.00      1.00   1792167
       Mirai       1.00      1.00      1.00    583677
       Recon       0.87      0.75      0.81     78630
    Spoofing       0.85      0.83      0.84    107798
         Web       0.71      0.04      0.08      5433

    accuracy                           0.99  10340161
   macro avg       0.91      0.70      0.71  10340161
weighted avg       0.99      0.99      0.99  10340161



In [14]:
# Salvando o Modelo Treinado
import joblib

# Salve o modelo treinado em um arquivo
joblib.dump(rf_classifier, 'modelo_random_forest.pkl')


['modelo_random_forest.pkl']

In [15]:
# Carregue o modelo treinado a partir do arquivo
loaded_rf_model = joblib.load('modelo_random_forest.pkl')

# Agora você pode usar loaded_rf_model para fazer previsões