In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from pipeline import build_preprocessing_pipeline
from sklearn.metrics import accuracy_score, f1_score, make_scorer, balanced_accuracy_score, precision_score, recall_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint
from scipy.stats import uniform, randint
from modelos import XGBWithThreshold, CascadedXGBClassifier
from metricas import custom_fbeta
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Render up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
dataset_path = 'data/attack_dataset.csv'
df = pd.read_csv(dataset_path)

# Separação: Dev-Teste

In [3]:
# Features are every column except the last two; the last two columns are the
# targets (one of them is `type`, which is label-encoded below).
X = df.iloc[:, :-2]
# Take an explicit copy: an `iloc` slice may be a view of `df`, and writing into
# it raises SettingWithCopyWarning and is not guaranteed to behave consistently.
y = df.iloc[:, -2:].copy()

# Encode the `type` target as integers. Bracket assignment is used on purpose:
# attribute assignment (`y.type = ...`) only updates a column when it already
# exists and otherwise silently creates a plain Python attribute.
le = LabelEncoder()
y['type'] = le.fit_transform(y['type'])

#X = X.drop(columns=['src_ip', 'src_port', 'dst_ip', 'dst_port'])

# Hold out 20% of the rows as the test set. `stratify=y` passes both target
# columns to the stratification.
Xdev, Xtest, ydev, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Discard the shuffled original row labels so every split is indexed 0..n-1.
Xdev = Xdev.reset_index(drop=True)
Xtest = Xtest.reset_index(drop=True)
ydev = ydev.reset_index(drop=True)
ytest = ytest.reset_index(drop=True)

In [4]:
# Preview the first rows of the development features (head() defaults to 5 rows).
Xdev.head()

Unnamed: 0,ts,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_query,dns_qclass,dns_qtype,dns_rcode,dns_AA,dns_RD,dns_RA,dns_rejected,ssl_version,ssl_cipher,ssl_resumed,ssl_established,ssl_subject,ssl_issuer,http_trans_depth,http_method,http_uri,http_version,http_request_body_len,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice
0,1556021714,192.168.1.32,53921,192.168.1.46,1054,tcp,-,0.0,0,0,S0,0,1,44,0,0,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
1,1556129787,192.168.1.30,30,192.168.1.190,30,tcp,-,0.002459,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
2,1556130002,192.168.1.30,52,192.168.1.190,52,tcp,-,0.000212,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
3,1556129974,192.168.1.30,49,192.168.1.152,49,tcp,-,0.022902,0,0,REJ,0,6,240,6,240,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-
4,1556021450,192.168.1.30,62947,192.168.1.169,9944,tcp,-,0.0,0,0,S0,0,1,44,0,0,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-


# Pré-processamento

In [5]:
# Column groups handed to the preprocessing pipeline.
#
# Notes on the grouping:
# - http_response_body_len is not strictly categorical, but if all infrequent
#   values were lumped into a single bucket it could be treated as categorical.
# - Textual columns that could arguably be treated as categorical:
#   ssl_subject, ssl_issuer, dns_query.
# - "Textual features" is used here as a synonym for "descriptive features".
features_ip = ['src_ip', 'dst_ip']

features_port = ['src_port', 'dst_port']

features_textuais = [
    'http_user_agent',
    'http_uri',
    'ssl_subject',
    'ssl_issuer',
    'dns_query',
]

features_numericas = [
    'duration',
    'dst_pkts',
    'src_ip_bytes',
    'dst_ip_bytes',
    'src_bytes',
    'http_response_body_len',
    'dst_bytes',
    'missed_bytes',
    'src_pkts',
    'http_request_body_len',
]

features_categoricas = [
    # weird_* columns
    'weird_notice',
    'weird_addl',
    'weird_name',
    # http_* columns
    'http_resp_mime_types',
    'http_orig_mime_types',
    'http_status_code',
    'http_version',
    'http_method',
    'http_trans_depth',
    # ssl_* columns
    'ssl_established',
    'ssl_resumed',
    'ssl_cipher',
    'ssl_version',
    # dns_* columns
    'dns_rejected',
    'dns_RA',
    'dns_RD',
    'dns_AA',
    'dns_rcode',
    'dns_qtype',
    'dns_qclass',
    # connection-level columns
    'service',
    'proto',
    'conn_state',
]

pipeline = build_preprocessing_pipeline(
    features_numericas,
    features_categoricas,
    features_textuais,
    features_ip,
    features_port,
)

# Fit the preprocessing on the development set only, then apply the already
# fitted pipeline to the test set.
Xdev_pre = pipeline.fit_transform(Xdev)
Xtest_pre = pipeline.transform(Xtest)



# Otimização de Hiperparâmetros

In [None]:
if False:
    # Disabled: exhaustive grid search over LogisticRegression hyperparameters.
    # The grid has 10 values of C x 2 class_weight options = 20 candidates.
    estimator = LogisticRegression()

    search_space = {
        'C': np.logspace(-5, 4, 10),
        'max_iter': [3000],
        'class_weight': [None, 'balanced'],
        'random_state': [42],  # fixed value
    }

    gscv = GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        scoring='f1_macro',
        cv=5,  # 5-fold cross-validation
        n_jobs=4,
        verbose=3,
        return_train_score=True,
        error_score='raise',  # fail loudly instead of scoring a failed fit as NaN
    )

    # Run the grid search on the preprocessed development set.
    gscv.fit(Xdev_pre, ydev.type)

    # Report the best hyperparameters and their cross-validated score.
    print("Melhores hiperparâmetros:", gscv.best_params_)
    print("Melhor score de validação cruzada:", gscv.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Melhores hiperparâmetros: {'C': 1000.0, 'class_weight': None, 'max_iter': 3000, 'random_state': 42}
Melhor score de validação cruzada: 0.8417132936979028


In [7]:
# Hyperparameters hard-coded so the model cells below do not need a search to run first.
best_params = dict(C=1000.0, class_weight=None, max_iter=3000, random_state=42)

# Seleção de Features

In [8]:
if False:
    # Disabled: recursive feature elimination with cross-validation (RFECV)
    # using LogisticRegression configured with `best_params`.
    estimator = LogisticRegression(**best_params)

    rfecv_options = dict(
        step=1,              # features removed per elimination round
        cv=5,                # 5-fold cross-validation
        scoring='f1_macro',  # metric used to rank feature subsets
        n_jobs=10,           # up to 10 parallel jobs
        verbose=3,
    )
    rfecv = RFECV(estimator=estimator, **rfecv_options)

    # Run the feature selection on the preprocessed development set.
    rfecv.fit(Xdev_pre, ydev.type)

    # Report how many features were kept and which ones.
    selected_columns = Xdev_pre.columns[rfecv.support_]
    print(f"Número ótimo de features selecionadas: {rfecv.n_features_}")
    print("Features selecionadas:", selected_columns)

In [9]:
# Hard-coded subset of preprocessed columns used by every model cell below.
# The order is significant: it fixes the column order of the model inputs.
features_selecionadas = [
    # IP address flags
    'src_ip_privado', 'dst_ip_broadcast', 'dst_ip_privado',
    # port range flags
    'src_port_well_known', 'src_port_registered', 'src_port_dynamic',
    'dst_port_well_known', 'dst_port_registered', 'dst_port_dynamic',
    # numeric columns
    'duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_pkts',
    # weird_*
    'weird_notice_F', 'weird_name_possible_split_routing',
    # http_*
    'http_orig_mime_types_-',
    'http_status_code_0', 'http_status_code_200', 'http_status_code_404',
    'http_method_-',
    # dns_*
    'dns_rejected_-', 'dns_rejected_F', 'dns_rejected_T',
    'dns_RA_-', 'dns_RA_F', 'dns_RA_T',
    'dns_RD_-', 'dns_RD_F', 'dns_RD_T',
    'dns_AA_-', 'dns_AA_F', 'dns_AA_T',
    'dns_rcode_0', 'dns_rcode_3', 'dns_rcode_5',
    'dns_qtype_0', 'dns_qtype_1', 'dns_qtype_6',
    'dns_qclass_1',
    # service
    'service_-', 'service_dce_rpc', 'service_dns', 'service_ftp',
    'service_http', 'service_smb', 'service_smb;gssapi', 'service_ssl',
    # proto
    'proto_icmp', 'proto_tcp', 'proto_udp',
    # conn_state
    'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO',
    'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH',
    'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3',
    'conn_state_SF', 'conn_state_SH', 'conn_state_SHR',
    # text-derived
    'dns_query_infrequent_sklearn',
]

# Treinamento

In [18]:
# Split the feature-selected development data 80/20 into train and validation.
# NOTE(review): Xval / yval do not appear to be used by any cell visible here,
# and this split is not stratified, unlike the dev/test split - confirm intent.
Xdev_selected = Xdev_pre[features_selecionadas]
Xtrain, Xval, ytrain, yval = train_test_split(
    Xdev_selected,
    ydev,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression on the training part only, predicting the `type` target.
model = LogisticRegression(**best_params)
model.fit(Xtrain, ytrain.type)

In [19]:
# Score the current `model` on the test set. Predict once and reuse the result
# for both metrics instead of running inference twice on the same input.
ytest_pred_holdout = model.predict(Xtest_pre[features_selecionadas])

print(f'Acurácia: {accuracy_score(ytest.type, ytest_pred_holdout):.4f}')
print(f'F1-score: {f1_score(ytest.type, ytest_pred_holdout, average="macro"):.4f}')

Acurácia: 0.8849
F1-score: 0.8362


In [17]:
# Manual 5-fold stratified cross-validation on the development set, reporting
# the mean and standard deviation of accuracy and macro F1 across folds.
model = LogisticRegression(**best_params)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Select the columns once, outside the loop.
Xdev_sel = Xdev_pre[features_selecionadas]
ydev_type = ydev['type']

f1_macro = []
accuracy = []
for train_index, val_index in skf.split(Xdev_pre, ydev_type):
    # `split` yields positional row indices, so select rows with `.iloc`.
    # Label-based `.loc` is only equivalent while the index is exactly 0..n-1.
    X_train, X_val = Xdev_sel.iloc[train_index], Xdev_sel.iloc[val_index]
    y_train, y_val = ydev_type.iloc[train_index], ydev_type.iloc[val_index]
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    f1_macro.append(f1_score(y_val, y_val_pred, average='macro'))
    accuracy.append(accuracy_score(y_val, y_val_pred))

print(f'Acurácia: {np.mean(accuracy):.4f} (média); {np.std(accuracy):.4f} (desvio padrão)')
print(f'F1-score: {np.mean(f1_macro):.4f} (média); {np.std(f1_macro):.4f} (desvio padrão)')


Acurácia: 0.8889 (média); 0.0049 (desvio padrão)
F1-score: 0.8392 (média); 0.0074 (desvio padrão)


In [10]:
# Final evaluation: refit on the whole development set (selected features only)
# and score the predictions on the test set.
model = LogisticRegression(**best_params)
model.fit(Xdev_pre[features_selecionadas], ydev.type)
ytest_pred = model.predict(Xtest_pre[features_selecionadas])

# The first metric is balanced accuracy (the mean of per-class recall), not
# plain accuracy, so it is labelled as such; it matches the macro recall below.
print('Acurácia balanceada:', balanced_accuracy_score(ytest.type, ytest_pred))
print('F1 score:', f1_score(ytest.type, ytest_pred, average='macro'))
print('Precisão macro:', precision_score(ytest.type, ytest_pred, average='macro'))
print('Recall macro:', recall_score(ytest.type, ytest_pred, average='macro'))

Acurácia: 0.8242420255183414
F1 score: 0.8329222713096064
Precisão macro: 0.8557067507917068
Recall macro: 0.8242420255183414
