In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from pipeline import build_preprocessing_pipeline
from sklearn.metrics import accuracy_score, f1_score, make_scorer, balanced_accuracy_score, precision_score, recall_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint
from scipy.stats import uniform, randint
from modelos import XGBWithThreshold, CascadedXGBClassifier
from metricas import custom_fbeta
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

pd.set_option('display.max_columns', 50)

In [2]:
df = pd.read_csv('data/attack_dataset.csv')

# Separação: Dev-Teste

In [15]:
y.type.value_counts()

type
7    20000
2    20000
3    20000
1    20000
5    20000
8    20000
6    20000
0    20000
4     1043
Name: count, dtype: int64

In [3]:
X = df.iloc[:, :-2]
y = df.iloc[:, -2:]

le = LabelEncoder()
y.type = le.fit_transform(y.type)

#X = X.drop(columns=['src_ip', 'src_port', 'dst_ip', 'dst_port'])

Xdev, Xtest, ydev, ytest = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

Xdev = Xdev.reset_index(drop=True)
Xtest = Xtest.reset_index(drop=True)
ydev = ydev.reset_index(drop=True)
ytest = ytest.reset_index(drop=True)

In [4]:
Xdev.head(5)

Unnamed: 0,ts,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_query,dns_qclass,dns_qtype,dns_rcode,dns_AA,dns_RD,dns_RA,dns_rejected,ssl_version,ssl_cipher,ssl_resumed,ssl_established,ssl_subject,ssl_issuer,http_trans_depth,http_method,http_uri,http_version,http_request_body_len,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice
0,1556021714,192.168.1.32,53921,192.168.1.46,1054,tcp,-,0.0,0,0,S0,0,1,44,0,0,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
1,1556129787,192.168.1.30,30,192.168.1.190,30,tcp,-,0.002459,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
2,1556130002,192.168.1.30,52,192.168.1.190,52,tcp,-,0.000212,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
3,1556129974,192.168.1.30,49,192.168.1.152,49,tcp,-,0.022902,0,0,REJ,0,6,240,6,240,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
4,1556021450,192.168.1.30,62947,192.168.1.169,9944,tcp,-,0.0,0,0,S0,0,1,44,0,0,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-


# Pré-processamento

In [5]:
# http_response_body_len nao e exatamente categorica, mas se considerarmos todas
# as entradas infrequentes como uma coisa so, ela vira categorica

# Textuais que podem ser consideradas categoricas:
#ssl_subject, ssl_issuer, dns_query
#Sinonimo para features textuais - features descritivas
features_textuais = ['http_user_agent', 'http_uri', 'ssl_subject', 'ssl_issuer', 'dns_query']
features_categoricas = ['weird_notice', 'weird_addl', 'weird_name', 'http_resp_mime_types', 'http_orig_mime_types', 'http_status_code', 'http_version',
                        'http_method', 'http_trans_depth', 'ssl_established',
                        'ssl_resumed', 'ssl_cipher', 'ssl_version', 'dns_rejected', 'dns_RA', 'dns_RD', 'dns_AA', 'dns_rcode', 'dns_qtype', 'dns_qclass',
                        'service', 'proto', 'conn_state']
features_numericas = ['duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes', 'http_response_body_len', 'dst_bytes',
                     'missed_bytes', 'src_pkts', 'http_request_body_len']

features_ip = ['src_ip', 'dst_ip']

features_port = ['src_port', 'dst_port']

pipeline = build_preprocessing_pipeline(features_numericas, features_categoricas, features_textuais, features_ip, features_port)

Xdev_pre = pipeline.fit_transform(Xdev)
Xtest_pre = pipeline.transform(Xtest)



In [6]:
len(Xdev_pre.columns)

95

# Otimização de Hiperparâmetros

In [9]:
if False:
    estimator = xgb.XGBClassifier(importance_type='gain')

    param_distributions = {
        'max_depth': randint(2, 40),
        'n_estimators': randint(10, 200),
        'learning_rate': uniform(1e-3, 10),  # Valores contínuos entre 0.001 e 0.3
        'reg_lambda': uniform(0, 1e-1),  # Valores contínuos para regularização L2
        'random_state': [42]  # Valor fixo
    }

    rscv = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=100,  # Número de combinações aleatórias a serem testadas
        scoring='f1_macro',
        cv=5,  # Stratified K-Fold Cross Validation
        verbose=3,
        return_train_score=True,
        random_state=42,  # Garante reprodutibilidade
        n_jobs=10,
        error_score='raise'
    )

    # Treinar o modelo com RandomizedSearchCV
    rscv.fit(Xdev_pre, ydev.type)

    # Exibir os melhores hiperparâmetros encontrados
    print("Melhores hiperparâmetros:", rscv.best_params_)
    print("Melhor score de validação cruzada:", rscv.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Melhores hiperparâmetros: {'learning_rate': 0.1573640674119393, 'max_depth': 10, 'n_estimators': 97, 'random_state': 42, 'reg_lambda': 0.03948815181755697}
Melhor score de validação cruzada: 0.9628761920467829


In [9]:
best_params = {'learning_rate': 0.1573640674119393, 'max_depth': 10, 'n_estimators': 97, 'random_state': 42, 'reg_lambda': 0.03948815181755697}

# Seleção de Features

In [8]:
if False:
    estimator = xgb.XGBClassifier(**best_params)

    rfecv = RFECV(
        estimator=estimator, 
        step=1,  # Número de features removidas por vez
        cv=5,  # Validação cruzada estratificada
        scoring='f1_macro',  # Métrica usada para avaliação
        n_jobs=10,  # Usar todos os núcleos disponíveis
        verbose=3
    )

    # 6. Executar a seleção de features no conjunto de treino
    rfecv.fit(Xdev_pre, ydev.type)

    # 7. Analisar os resultados
    print(f"Número ótimo de features selecionadas: {rfecv.n_features_}")
    print("Features selecionadas:", Xdev_pre.columns[rfecv.support_])

In [10]:
features_selecionadas = ['duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes',
       'dst_bytes', 'missed_bytes', 'src_pkts', 'weird_notice_F',
       'dns_rejected_-', 'dns_rejected_F', 'dns_rejected_T', 'dns_RA_F',
       'dns_RA_T', 'dns_RD_F', 'dns_RD_T', 'dns_AA_F', 'dns_rcode_0',
       'dns_rcode_3', 'dns_rcode_5', 'dns_qtype_1', 'dns_qtype_6',
       'dns_qtype_28', 'service_-', 'service_dce_rpc', 'service_ftp',
       'service_http', 'service_smb', 'service_smb;gssapi', 'service_ssl',
       'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH',
       'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0',
       'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1',
       'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH',
       'conn_state_SHR']

# Treinamento

In [10]:
model = xgb.XGBClassifier(**best_params)

Xtrain, Xval, ytrain, yval = train_test_split(Xdev_pre[features_selecionadas], ydev, test_size=0.2, random_state=42)

model.fit(Xtrain, ytrain.type)

In [11]:
print('Acurácia:', accuracy_score(model.predict(Xtrain), ytrain.type))
print('F1-score:', f1_score(model.predict(Xtrain), ytrain.type, average='macro'))

Acurácia: 0.9792076998457314
F1-score: 0.9742627701840719


In [12]:
model = xgb.XGBClassifier(**best_params)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_macro = []
accuracy = []
for train_index, val_index in skf.split(Xdev_pre, ydev.type):
    X_train, X_val = Xdev_pre.loc[train_index, features_selecionadas], Xdev_pre.loc[val_index, features_selecionadas]
    y_train, y_val = ydev.loc[train_index, 'type'], ydev.loc[val_index, 'type']
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    f1_macro.append(f1_score(y_val, y_val_pred, average='macro'))
    accuracy.append(accuracy_score(y_val, y_val_pred))

print(f'Acurácia: {np.mean(accuracy):.4f} (média); {np.std(accuracy):.4f} (desvio padrão)')
print(f'F1-score: {np.mean(f1_macro):.4f} (média); {np.std(f1_macro):.4f} (desvio padrão)')


Acurácia: 0.9742 (média); 0.0008 (desvio padrão)
F1-score: 0.9523 (média); 0.0024 (desvio padrão)


# Teste

In [11]:
model = xgb.XGBClassifier(**best_params)
model.fit(Xdev_pre[features_selecionadas], ydev.type)
ytest_pred = model.predict(Xtest_pre[features_selecionadas])

print('Acurácia:', balanced_accuracy_score(ytest.type, ytest_pred))
print('F1 score:', f1_score(ytest.type, ytest_pred, average='macro'))
print('Precisão macro:', precision_score(ytest.type, ytest_pred, average='macro'))
print('Recall macro:', recall_score(ytest.type, ytest_pred, average='macro'))

Acurácia: 0.9572185007974482
F1 score: 0.9539182430860459
Precisão macro: 0.9513450790787006
Recall macro: 0.9572185007974482


In [33]:
classes = sorted(ytest.type.unique())
metrics = {'Precision':[], 'Recall':[], 'Accuracy':[], 'F1score':[]}
for c in classes:
    mask_c = ytest.type == c
    pred = ytest_pred[mask_c]
    mask_pos = pred == c
    mask_neg = pred != c
    pred[mask_pos] = 1
    pred[mask_neg] = 0
    real = np.ones_like(ytest.type[mask_c].values)

    metrics['Precision'].append(precision_score(real, pred))
    metrics['Recall'].append(recall_score(real, pred))
    metrics['Accuracy'].append(accuracy_score(real, pred))
    metrics['F1score'].append(f1_score(real, pred))

pd.DataFrame(metrics, index=le.classes_)

Unnamed: 0,Precision,Recall,Accuracy,F1score
backdoor,1.0,0.99975,0.99975,0.999875
ddos,1.0,0.971,0.971,0.985287
dos,1.0,0.98375,0.98375,0.991808
injection,1.0,0.96675,0.96675,0.983094
mitm,1.0,0.822967,0.822967,0.902887
password,1.0,0.9775,0.9775,0.988622
ransomware,1.0,0.9845,0.9845,0.992189
scanning,1.0,0.994,0.994,0.996991
xss,1.0,0.91475,0.91475,0.955477


In [32]:
sorted(classes)

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [29]:
le.classes_

array(['backdoor', 'ddos', 'dos', 'injection', 'mitm', 'password',
       'ransomware', 'scanning', 'xss'], dtype=object)