In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from pipeline import build_preprocessing_pipeline
from sklearn.metrics import accuracy_score, f1_score, make_scorer, balanced_accuracy_score, precision_score, recall_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint
from scipy.stats import uniform, randint
from modelos import XGBWithThreshold, CascadedXGBClassifier
from metricas import custom_fbeta
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

pd.set_option('display.max_columns', 50)

In [29]:
df_original = pd.read_csv('data/train_test_network.csv')
df_ataques = pd.read_csv('data/attack_dataset.csv')

# Separação: Dev-Teste

In [30]:
X_original = df_original.iloc[:, :-2]
y_original = df_original.iloc[:, -2:]
X_ataques = df_ataques.iloc[:, :-2]
y_ataques = df_ataques.iloc[:, -2:]

#le = LabelEncoder()
#y_original.type = le.fit_transform(y_original.type)
#y_ataques.type = le.fit_transform(y_ataques.type)

Xdev_original, Xtest_original, ydev_original, ytest_original = train_test_split(X_original, y_original, test_size=0.2, stratify=y_original, random_state=42)
Xdev_ataques, Xtest_ataques, ydev_ataques, ytest_ataques = train_test_split(X_ataques, y_ataques, test_size=0.2, stratify=y_ataques, random_state=42)

Xdev_original = Xdev_original.reset_index(drop=True)
Xtest_original = Xtest_original.reset_index(drop=True)
ydev_original = ydev_original.reset_index(drop=True)
ytest_original = ytest_original.reset_index(drop=True)

Xdev_ataques = Xdev_ataques.reset_index(drop=True)
Xtest_ataques = Xtest_ataques.reset_index(drop=True)
ydev_ataques = ydev_ataques.reset_index(drop=True)
ytest_ataques = ytest_ataques.reset_index(drop=True)

# Pré-processamento

In [31]:
# http_response_body_len nao e exatamente categorica, mas se considerarmos todas
# as entradas infrequentes como uma coisa so, ela vira categorica

# Textuais que podem ser consideradas categoricas:
#ssl_subject, ssl_issuer, dns_query
#Sinonimo para features textuais - features descritivas
features_textuais = ['http_user_agent', 'http_uri', 'ssl_subject', 'ssl_issuer', 'dns_query']
features_categoricas = ['weird_notice', 'weird_addl', 'weird_name', 'http_resp_mime_types', 'http_orig_mime_types', 'http_status_code', 'http_version',
                        'http_method', 'http_trans_depth', 'ssl_established',
                        'ssl_resumed', 'ssl_cipher', 'ssl_version', 'dns_rejected', 'dns_RA', 'dns_RD', 'dns_AA', 'dns_rcode', 'dns_qtype', 'dns_qclass',
                        'service', 'proto', 'conn_state']
features_numericas = ['duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes', 'http_response_body_len', 'dst_bytes',
                     'missed_bytes', 'src_pkts', 'http_request_body_len']

colunas_para_excluir = ['src_ip', 'src_port', 'dst_ip', 'dst_port']

features_ip = ['src_ip', 'dst_ip']

features_port = ['src_port', 'dst_port']

pipeline_original = build_preprocessing_pipeline(features_numericas, features_categoricas, features_textuais, features_ip, features_port)
pipeline_ataques = build_preprocessing_pipeline(features_numericas, features_categoricas, features_textuais, features_ip, features_port)

Xdev_pre_original = pipeline_original.fit_transform(Xdev_original)
Xtest_pre_original = pipeline_original.transform(Xtest_original)

Xdev_pre_ataques = pipeline_ataques.fit_transform(Xdev_ataques)
Xtest_pre_ataques = pipeline_ataques.transform(Xtest_ataques)



# Avaliação

In [53]:
primary_params = {'learning_rate': 0.23162425041415757, 'max_depth': 28, 'n_estimators': 68, 'random_state': 42, 'reg_lambda': 0.011768119524349979, 'threshold': 0.5233328316068078}
secondary_params = {'learning_rate': 0.1573640674119393, 'max_depth': 10, 'n_estimators': 97, 'random_state': 42, 'reg_lambda': 0.03948815181755697}

primary_features = ['src_ip_ipv6', 'src_ip_privado', 'dst_ip_broadcast', 'dst_ip_ipv6', 'dst_ip_privado', 'dst_ip_multicast', 'src_port_well_known',
       'src_port_registered', 'src_port_dynamic', 'dst_port_well_known', 'dst_port_registered', 'dst_port_dynamic', 'duration', 'dst_pkts',
       'src_ip_bytes', 'dst_ip_bytes', 'src_bytes', 'dst_bytes', 'missed_bytes', 'src_pkts', 'weird_notice_F', 'http_status_code_0',
       'ssl_established_-', 'ssl_resumed_T', 'dns_rejected_-', 'dns_rejected_F', 'dns_rejected_T', 'dns_RA_F', 'dns_RA_T', 'dns_RD_F',
       'dns_RD_T', 'dns_AA_F', 'dns_rcode_0', 'dns_rcode_2', 'dns_rcode_3', 'dns_qtype_0', 'dns_qtype_1', 'dns_qtype_12', 'dns_qtype_28',
       'dns_qtype_33', 'dns_qclass_1', 'service_-', 'service_dns', 'service_ftp', 'service_http', 'service_ssl', 'proto_icmp', 'proto_tcp',
       'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH',
       'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR', 'dns_query_infrequent_sklearn']
secondary_features = ['duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes', 'dst_bytes', 'missed_bytes', 'src_pkts', 'weird_notice_F',
       'dns_rejected_-', 'dns_rejected_F', 'dns_rejected_T', 'dns_RA_F', 'dns_RA_T', 'dns_RD_F', 'dns_RD_T', 'dns_AA_F', 'dns_rcode_0',
       'dns_rcode_3', 'dns_rcode_5', 'dns_qtype_1', 'dns_qtype_6', 'dns_qtype_28', 'service_-', 'service_dce_rpc', 'service_ftp',
       'service_http', 'service_smb', 'service_smb;gssapi', 'service_ssl', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH',
       'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1',
       'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR']


# Primary model
primary_model = XGBWithThreshold(**primary_params)
primary_model.fit(Xdev_pre_original[primary_features], ydev_original['label'])

primary_pred = primary_model.predict(Xtest_pre_original[primary_features])

mask_ataques = (primary_pred == 1) & (ytest_original['label'] == 1)
input_module2 = Xtest_original.loc[mask_ataques, :].reset_index(drop=True)

Xtest_pre_ataques = pipeline_ataques.transform(input_module2)

ytest_ataques = ytest_original.loc[mask_ataques].reset_index(drop=True)
le = LabelEncoder()
ydev_ataques.loc[:, 'type'] = le.fit_transform(ydev_ataques.type)
ytest_ataques.loc[:, 'type'] = le.fit_transform(ytest_ataques.type)

# Secondary model
secondary_model = xgb.XGBClassifier(**secondary_params)
secondary_model.fit(Xdev_pre_ataques[secondary_features], ydev_ataques.type)
secondary_pred = secondary_model.predict(Xtest_pre_ataques[secondary_features])

print(f'Accuracy: {balanced_accuracy_score(ytest_ataques.type.values.astype(int), secondary_pred)}')
print(f'F1 macro: {f1_score(ytest_ataques.type.values.astype(int), secondary_pred, average="macro")}')
print(f'Precision macro', precision_score(ytest_ataques.type.values.astype(int), secondary_pred, average="macro"))
print(f'Recall macro:', recall_score(ytest_ataques.type.values.astype(int), secondary_pred, average="macro"))



Accuracy: 0.9750845851534441
F1 macro: 0.9709341321604114
Precision macro 0.9675294623259342
Recall macro: 0.9750845851534441


In [56]:
print(accuracy_score(ytest_original.label, primary_pred))

0.9956511837239315


In [63]:
classes = sorted(ytest_ataques.type.unique())
metrics = {'Precision':[], 'Recall':[], 'Accuracy':[], 'F1score':[]}
for c in classes:
    mask_c = ytest_ataques.type == c
    pred = secondary_pred[mask_c]
    mask_pos = pred == c
    mask_neg = pred != c
    pred[mask_pos] = 1
    pred[mask_neg] = 0
    real = np.ones_like(ytest_ataques.type[mask_c].values).astype(int)

    metrics['Precision'].append(precision_score(real, pred))
    metrics['Recall'].append(recall_score(real, pred))
    metrics['Accuracy'].append(accuracy_score(real, pred))
    metrics['F1score'].append(f1_score(real, pred))

pd.DataFrame(metrics, index=le.classes_)

Unnamed: 0,Precision,Recall,Accuracy,F1score
backdoor,1.0,0.99975,0.99975,0.999875
ddos,1.0,0.980461,0.980461,0.990134
dos,1.0,0.990481,0.990481,0.995218
injection,1.0,0.977233,0.977233,0.988485
mitm,1.0,0.944444,0.944444,0.971429
password,1.0,0.985496,0.985496,0.992695
ransomware,1.0,0.98302,0.98302,0.991437
scanning,1.0,0.995997,0.995997,0.997994
xss,1.0,0.918878,0.918878,0.957724


In [62]:
np.unique(pred)

array([0, 1], dtype=int64)

In [40]:
print(f'Accuracy: {balanced_accuracy_score(ytest_ataques.type, secondary_pred)}')


Accuracy: 0.5447965124257834


