In [1]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import joblib

In [2]:
# Defina suas colunas de entrada X e a coluna de saída y
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'

In [3]:
DATASET_DIRECTORY = '../dataset/CICIoT2023/'

In [4]:
# Inicializa um dicionário para mapear rótulos de classe
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [5]:
# Obtém a lista de arquivos CSV no diretório
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# Divide os conjuntos de treinamento e teste com base na lista
split_ratio = 0.8
training_sets = df_sets[:int(len(df_sets) * split_ratio)]
test_sets = df_sets[int(len(df_sets) * split_ratio):]

In [6]:
# Inicializa o scaler fora do loop
scaler = StandardScaler()

In [7]:
# Loop de treinamento
for train_set in tqdm(training_sets):
    df_train = pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    df_train[X_columns] = scaler.fit_transform(df_train[X_columns])
     # Mapeia as classes usando o dicionário dict_2classes
    df_train[y_column] = df_train[y_column].map(dict_7classes)
    
    svm_model = SVC()  # Usa o SVM (Support Vector Machine)
    svm_model.fit(df_train[X_columns], df_train[y_column])

  1%|          | 1/135 [49:48<111:14:41, 2988.67s/it]

In [None]:
# Avaliação no conjunto de teste
y_test = []
y_pred_svm = []

for test_set in tqdm(test_sets):
    df_test = pd.read_csv(os.path.join(DATASET_DIRECTORY, test_set))
    df_test[X_columns] = scaler.transform(df_test[X_columns])
    
    # Mapeia as classes usando o dicionário dict_2classes
    df_test[y_column] = df_test[y_column].map(dict_7classes)
    
    y_test += list(df_test[y_column].values)
    
    # Usa o modelo SVM treinado
    y_pred = list(svm_model.predict(df_test[X_columns]))
    y_pred_svm += y_pred

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [2:03:08<00:00, 217.30s/it] 


In [None]:
# Avalie o desempenho do modelo SVM
print("##### SVM (2 classes) #####")
print('accuracy_score: ', accuracy_score(y_pred_svm, y_test))
print('recall_score: ', recall_score(y_pred_svm, y_test, average='macro'))
print('precision_score: ', precision_score(y_pred_svm, y_test, average='macro'))
print('f1_score: ', f1_score(y_pred_svm, y_test, average='macro'))

##### SVM (2 classes) #####
accuracy_score:  0.9924763260456002
recall_score:  0.9010777308382814
precision_score:  0.946598830659894
f1_score:  0.9225444652374691


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Avalie o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred_svm)
report = classification_report(y_test, y_pred_svm)

print("Acurácia:", accuracy)
print("Relatório de Classificação:")
print(report)

Acurácia: 0.9924763260456002
Relatório de Classificação:
              precision    recall  f1-score   support

      Attack       1.00      0.99      1.00  10096839
      Benign       0.80      0.90      0.85    243322

    accuracy                           0.99  10340161
   macro avg       0.90      0.95      0.92  10340161
weighted avg       0.99      0.99      0.99  10340161



In [15]:
#Salvando o Modelo Treinado
import joblib
# Salve o modelo treinado em um arquivo
joblib.dump(svm_model, 'modelo_svm.pkl')

['modelo_svm.pkl']

In [None]:
# Carregue o modelo treinado a partir do arquivo
loaded_ebm_model = joblib.load('modelo_ebm.pkl')

# Agora você pode usar loaded_ebm_model para fazer previsões