In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from pipeline import build_preprocessing_pipeline
from sklearn.metrics import accuracy_score, f1_score, make_scorer, balanced_accuracy_score, precision_score, recall_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint
from scipy.stats import uniform, randint
from modelos import LogRegWithThreshold, CascadedLogisticRegression
from metricas import custom_fbeta
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

# Show up to 50 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 50

In [20]:
# Read the two CSV files used throughout the notebook: the full train/test
# network dataset and the attack dataset.
df_original, df_ataques = map(
    pd.read_csv,
    ('data/train_test_network.csv', 'data/attack_dataset.csv'),
)

# Separação: Dev-Teste

In [21]:
# In both datasets the last two columns are the targets and everything before
# them is the feature matrix.
# (In df_original the targets are 'label' and 'type'; presumably the same holds
# for df_ataques -- TODO confirm.)
X_original, y_original = df_original.iloc[:, :-2], df_original.iloc[:, -2:]
X_ataques, y_ataques = df_ataques.iloc[:, :-2], df_ataques.iloc[:, -2:]

# 80/20 development/test split, stratified on both target columns at once.
# Every resulting piece gets a fresh 0..n-1 index so that positional masks
# built later line up with the frames.
Xdev_original, Xtest_original, ydev_original, ytest_original = (
    parte.reset_index(drop=True)
    for parte in train_test_split(
        X_original, y_original, test_size=0.2, stratify=y_original, random_state=42
    )
)

Xdev_ataques, Xtest_ataques, ydev_ataques, ytest_ataques = (
    parte.reset_index(drop=True)
    for parte in train_test_split(
        X_ataques, y_ataques, test_size=0.2, stratify=y_ataques, random_state=42
    )
)

# Encode the attack type of the development split as integers. The encoder is
# fitted on the development labels only and reused later for the test labels.
le = LabelEncoder()
ydev_ataques.loc[:, 'type'] = le.fit_transform(ydev_ataques['type'])

# Pré-processamento

In [22]:
# Feature groups handed to build_preprocessing_pipeline.
#
# http_response_body_len is not strictly categorical, but it becomes
# categorical if all infrequent entries are treated as one single value.
#
# Textual features that could be considered categorical:
#   ssl_subject, ssl_issuer, dns_query
# ("descriptive features" is used as a synonym for textual features.)
features_textuais = [
    'http_user_agent', 'http_uri', 'ssl_subject', 'ssl_issuer', 'dns_query',
]
features_categoricas = [
    'weird_notice', 'weird_addl', 'weird_name',
    'http_resp_mime_types', 'http_orig_mime_types', 'http_status_code', 'http_version',
    'http_method', 'http_trans_depth',
    'ssl_established', 'ssl_resumed', 'ssl_cipher', 'ssl_version',
    'dns_rejected', 'dns_RA', 'dns_RD', 'dns_AA', 'dns_rcode', 'dns_qtype', 'dns_qclass',
    'service', 'proto', 'conn_state',
]
features_numericas = [
    'duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes',
    'http_response_body_len', 'dst_bytes',
    'missed_bytes', 'src_pkts', 'http_request_body_len',
]

colunas_para_excluir = ['src_ip', 'src_port', 'dst_ip', 'dst_port']

features_ip = ['src_ip', 'dst_ip']

features_port = ['src_port', 'dst_port']

# Two independent pipelines with the same configuration: one per dataset.
pipeline_original, pipeline_ataques = (
    build_preprocessing_pipeline(
        features_numericas, features_categoricas, features_textuais, features_ip, features_port
    )
    for _ in range(2)
)

# Each pipeline is fitted on its development split only and then applied to
# the matching test split.
Xdev_pre_original = pipeline_original.fit_transform(Xdev_original)
Xtest_pre_original = pipeline_original.transform(Xtest_original)

Xdev_pre_ataques = pipeline_ataques.fit_transform(Xdev_ataques)
Xtest_pre_ataques = pipeline_ataques.transform(Xtest_ataques)



# Avaliação

In [24]:
# Keyword arguments for the two models built further down in this cell:
# primary_params is unpacked into LogRegWithThreshold and secondary_params into
# sklearn's LogisticRegression.
# NOTE(review): the literal values (including the decision threshold) look like
# the result of an earlier hyperparameter search -- TODO confirm their origin.
primary_params = {'C': 10000.0, 'class_weight': 'balanced', 'max_iter': 250, 'random_state': 42, 'threshold': 0.5552008115994623}
secondary_params = {'C': 1000.0, 'class_weight': None, 'max_iter': 3000, 'random_state': 42}

# Columns of the preprocessed "original" frames that are fed to the primary model.
primary_features = ['src_ip_host', 'src_ip_ipv6', 'src_ip_privado', 'src_ip_multicast', 'dst_ip_host', 'dst_ip_broadcast', 'dst_ip_ipv6', 'dst_ip_privado', 'dst_ip_multicast', 'src_port_well_known', 'src_port_registered', 'src_port_dynamic', 'dst_port_well_known', 'dst_port_registered', 'dst_port_dynamic', 'duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes', 'http_response_body_len', 'missed_bytes', 'http_request_body_len', 'weird_notice_F', 'weird_addl_-', 'weird_addl_46', 'weird_name_-', 'weird_name_DNS_RR_unknown_type', 'weird_name_above_hole_data_without_any_acks', 'weird_name_active_connection_reuse', 'weird_name_bad_TCP_checksum', 'weird_name_bad_UDP_checksum', 'weird_name_connection_originator_SYN_ack', 'weird_name_data_before_established', 'weird_name_inappropriate_FIN', 'http_resp_mime_types_-', 'http_resp_mime_types_application/ocsp-response', 'http_resp_mime_types_application/vnd.ms-cab-compressed', 'http_resp_mime_types_application/xml', 'http_resp_mime_types_image/jpeg', 'http_resp_mime_types_image/png', 'http_resp_mime_types_text/html', 'http_resp_mime_types_text/json', 'http_orig_mime_types_-', 'http_status_code_0', 'http_status_code_200', 'http_status_code_206', 'http_status_code_302', 'http_status_code_404', 'http_version_1.1', 'http_method_-', 'http_method_GET', 'http_trans_depth_-', 'http_trans_depth_1', 'http_trans_depth_2', 'ssl_established_-', 'ssl_established_F', 'ssl_established_T', 'ssl_resumed_-', 'ssl_resumed_F', 'ssl_resumed_T', 'ssl_cipher_-', 'ssl_cipher_TLS_AES_128_GCM_SHA256', 'ssl_cipher_TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', 'ssl_cipher_TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', 'ssl_version_-', 'ssl_version_TLSv12', 'ssl_version_TLSv13', 'dns_rejected_-', 'dns_rejected_F', 'dns_rejected_T', 'dns_RA_-', 'dns_RA_F', 'dns_RA_T', 'dns_RD_-', 'dns_RD_F', 'dns_RD_T', 'dns_AA_-', 'dns_AA_F', 'dns_AA_T', 'dns_rcode_0', 'dns_rcode_1', 'dns_rcode_2', 'dns_rcode_3', 'dns_rcode_5', 'dns_qtype_0', 'dns_qtype_1', 'dns_qtype_2', 
'dns_qtype_12', 'dns_qtype_28', 'dns_qtype_32', 'dns_qtype_33', 'dns_qtype_43', 'dns_qtype_48', 'dns_qtype_255', 'dns_qclass_0', 'dns_qclass_1', 'dns_qclass_32769', 'service_-', 'service_dce_rpc', 'service_dhcp', 'service_dns', 'service_ftp', 'service_gssapi', 'service_http', 'service_smb', 'service_ssl', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR', 'http_user_agent_infrequent_sklearn', 'http_uri_infrequent_sklearn', 'ssl_subject_infrequent_sklearn', 'ssl_issuer_infrequent_sklearn', 'dns_query_infrequent_sklearn']
# Columns of the preprocessed "attack" frames that are fed to the secondary model.
secondary_features = ['src_ip_privado', 'dst_ip_broadcast', 'dst_ip_privado',
       'src_port_well_known', 'src_port_registered', 'src_port_dynamic',
       'dst_port_well_known', 'dst_port_registered', 'dst_port_dynamic',
       'duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_pkts',
       'weird_notice_F', 'weird_name_possible_split_routing',
       'http_orig_mime_types_-', 'http_status_code_0', 'http_status_code_200',
       'http_status_code_404', 'http_method_-', 'dns_rejected_-',
       'dns_rejected_F', 'dns_rejected_T', 'dns_RA_-', 'dns_RA_F', 'dns_RA_T',
       'dns_RD_-', 'dns_RD_F', 'dns_RD_T', 'dns_AA_-', 'dns_AA_F', 'dns_AA_T',
       'dns_rcode_0', 'dns_rcode_3', 'dns_rcode_5', 'dns_qtype_0',
       'dns_qtype_1', 'dns_qtype_6', 'dns_qclass_1', 'service_-',
       'service_dce_rpc', 'service_dns', 'service_ftp', 'service_http',
       'service_smb', 'service_smb;gssapi', 'service_ssl', 'proto_icmp',
       'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ',
       'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR',
       'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2',
       'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR',
       'dns_query_infrequent_sklearn']


# Primary model (stage 1): binary detection on the 'label' target, fitted on
# the development split and applied to the test split.
primary_model = LogRegWithThreshold(**primary_params)
primary_model.fit(Xdev_pre_original[primary_features], ydev_original['label'])

primary_pred = primary_model.predict(Xtest_pre_original[primary_features])

# Only test rows that are real attacks (label == 1) AND were flagged by the
# primary model are forwarded to the second stage.
mask_ataques = (primary_pred == 1) & (ytest_original['label'] == 1)
input_module2 = Xtest_original.loc[mask_ataques, :].reset_index(drop=True)

# NOTE: from here on Xtest_pre_ataques and ytest_ataques refer to the rows
# forwarded by the primary model, replacing the values produced by the
# dev/test split of the attack dataset. Later cells rely on this rebinding.
Xtest_pre_ataques = pipeline_ataques.transform(input_module2)

ytest_ataques = ytest_original.loc[mask_ataques].reset_index(drop=True)
ytest_ataques.loc[:, 'type'] = le.transform(ytest_ataques.type)

# Secondary model (stage 2): multiclass attack-type classification.
secondary_model = LogisticRegression(**secondary_params)
secondary_model.fit(Xdev_pre_ataques[secondary_features], ydev_ataques.type.astype(int))
secondary_pred = secondary_model.predict(Xtest_pre_ataques[secondary_features])

# Integer ground truth, computed once and shared by every metric below.
ytest_tipo_int = ytest_ataques.type.values.astype(int)

# The first metric is *balanced* accuracy (the mean of per-class recall), so it
# is labelled as such instead of the misleading plain "Accuracy".
print(f'Balanced accuracy: {balanced_accuracy_score(ytest_tipo_int, secondary_pred)}')
print(f'F1 macro: {f1_score(ytest_tipo_int, secondary_pred, average="macro")}')
print(f'Precision macro: {precision_score(ytest_tipo_int, secondary_pred, average="macro")}')
print(f'Recall macro: {recall_score(ytest_tipo_int, secondary_pred, average="macro")}')



Accuracy: 0.8048225047499771
F1 macro: 0.812413414731864
Precision macro 0.8654608528119723
Recall macro: 0.8048225047499771


In [25]:
# Per-class (one-vs-rest) precision, recall, accuracy and F1 of the secondary
# model on the attack rows forwarded by the primary model.
classes = sorted(ytest_ataques.type.unique())
metrics = {'Precision':[], 'Recall':[], 'Accuracy':[], 'F1score':[]}
for c in classes:
    # Binarise both vectors: 1 where the (encoded) class equals c, 0 elsewhere.
    real = (ytest_ataques.type == c).values.astype(int)
    pred = (secondary_pred == c).astype(int)

    metrics['Precision'].append(precision_score(real, pred))
    metrics['Recall'].append(recall_score(real, pred))
    metrics['Accuracy'].append(accuracy_score(real, pred))
    metrics['F1score'].append(f1_score(real, pred))

# Label the rows with the names of the classes that were actually evaluated.
# Using le.classes_ directly would fail (length mismatch) whenever a class is
# missing from the filtered test subset.
class_names = le.inverse_transform(np.asarray(classes, dtype=int))

# Scale to percentages first and round afterwards, so that the multiplication
# cannot reintroduce floating-point noise (e.g. 96.89000000000001).
(pd.DataFrame(metrics, index=class_names) * 100).round(2)

Unnamed: 0,Precision,Recall,Accuracy,F1score
backdoor,80.75,100.0,96.89,89.35
ddos,93.48,92.84,98.24,93.16
dos,88.46,93.02,97.53,90.68
injection,88.85,74.34,95.61,80.95
mitm,70.97,15.71,99.59,25.73
password,78.66,81.89,94.76,80.24
ransomware,90.63,99.25,98.99,94.74
scanning,97.06,79.51,97.02,87.41
xss,90.06,87.77,97.16,88.9


In [50]:
# Row positions of the "hard" mitm samples.
# NOTE(review): it is not visible here which frame these positions were taken
# from. They are compared below against the index of Xtest_pre_original --
# confirm both refer to the same index space.
mitm_dificeis = [350, 400, 994, 1025, 1207, 1230, 1314, 1888, 2048, 2054, 2557, 2652, 3068, 3221, 4088, 4228, 4563, 5238, 5364, 5521, 5591, 5809, 5921, 6379, 6459, 6664, 7142, 7348, 7533, 7794, 7830, 7983, 8072, 8106, 8184, 8196, 8254, 8264, 9025, 9257, 9344, 9536, 9542, 9586, 10262, 10460, 10575, 11466, 11787, 12134, 12268, 12590, 12735, 13365, 13476, 13544, 13551, 13697, 13904, 14475, 14486, 14544, 14660, 14916, 14955, 15190, 15592, 15753, 15842, 15888, 15958, 16181, 16604, 16640, 17149, 17463, 17762, 17833, 17901, 18064, 18369, 18507, 18603, 19300, 19666, 19736, 19950, 20137, 20412, 20474, 20652, 20998, 21696, 22066, 22270, 22331, 22552, 22572, 22918, 23351, 23570, 23844, 23914, 24231, 24458, 24878, 25598, 25855, 26011, 26039, 26212, 26871, 26985, 27513, 27623, 27985, 28218, 28275, 28375, 28455, 28456, 28716, 28983, 29062, 29235, 29458, 30197, 30216, 30507, 30808, 30834, 30859, 31093, 31100, 31536, 31783, 31787, 31979, 32059]

# Test rows whose binary label was predicted correctly by the primary model ...
mask_acertos = primary_pred == ytest_original['label']
# ... and test rows whose true attack type is mitm.
mask_mitm = ytest_original['type'] == 'mitm'

# Correctly handled mitm rows; count how many of them are in the "hard" list.
mitm_passados = Xtest_pre_original[mask_acertos & mask_mitm]
mitm_passados.index.isin(mitm_dificeis).sum()

1

In [52]:
# Display the test-split targets (columns 'label' and 'type') for inspection.
ytest_original

Unnamed: 0,label,type
0,1,dos
1,0,normal
2,1,ddos
3,0,normal
4,0,normal
...,...,...
92204,1,backdoor
92205,0,normal
92206,0,normal
92207,0,normal


In [28]:
# Recall of the primary (binary) model on the test split.
recall_score(y_true=ytest_original['label'], y_pred=primary_pred)

0.9518768046198267

In [40]:
# Balanced accuracy of the secondary model on the forwarded attack rows.
# The 'type' column is cast to int exactly as in the evaluation cell: it was
# filled through a .loc assignment and may still carry object dtype, which
# would not match the integer predictions. The label names the metric that is
# actually computed (balanced accuracy, not plain accuracy).
print(f'Balanced accuracy: {balanced_accuracy_score(ytest_ataques.type.values.astype(int), secondary_pred)}')


Accuracy: 0.5447965124257834




In [35]:
# Absolute number of forwarded test rows per (encoded) attack type.
ytest_ataques['type'].value_counts()

type
0    3993
7    3992
5    3987
8    3975
2    3955
1    3952
3    3850
6    2815
4     140
Name: count, dtype: int64