In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from pipeline import build_preprocessing_pipeline
from sklearn.metrics import accuracy_score, f1_score, make_scorer, balanced_accuracy_score, precision_score, recall_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint
from scipy.stats import uniform, randint
from modelos import LogRegWithThreshold, CascadedLogisticRegression
from metricas import custom_fbeta
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

pd.set_option('display.max_columns', 50)

In [2]:
df_original = pd.read_csv('data/train_test_network.csv')
df_ataques = pd.read_csv('data/attack_dataset.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/train_test_network.csv'

# Separação: Dev-Teste

In [None]:
X_original = df_original.iloc[:, :-2]
y_original = df_original.iloc[:, -2:]
X_ataques = df_ataques.iloc[:, :-2]
y_ataques = df_ataques.iloc[:, -2:]

#le = LabelEncoder()
#y_original.type = le.fit_transform(y_original.type)
#y_ataques.type = le.fit_transform(y_ataques.type)

Xdev_original, Xtest_original, ydev_original, ytest_original = train_test_split(X_original, y_original, test_size=0.2, stratify=y_original, random_state=42)
Xdev_ataques, Xtest_ataques, ydev_ataques, ytest_ataques = train_test_split(X_ataques, y_ataques, test_size=0.2, stratify=y_ataques, random_state=42)

Xdev_original = Xdev_original.reset_index(drop=True)
Xtest_original = Xtest_original.reset_index(drop=True)
ydev_original = ydev_original.reset_index(drop=True)
ytest_original = ytest_original.reset_index(drop=True)

Xdev_ataques = Xdev_ataques.reset_index(drop=True)
Xtest_ataques = Xtest_ataques.reset_index(drop=True)
ydev_ataques = ydev_ataques.reset_index(drop=True)
ytest_ataques = ytest_ataques.reset_index(drop=True)

# Pré-processamento

In [4]:
# http_response_body_len nao e exatamente categorica, mas se considerarmos todas
# as entradas infrequentes como uma coisa so, ela vira categorica

# Textuais que podem ser consideradas categoricas:
#ssl_subject, ssl_issuer, dns_query
#Sinonimo para features textuais - features descritivas
features_textuais = ['http_user_agent', 'http_uri', 'ssl_subject', 'ssl_issuer', 'dns_query']
features_categoricas = ['weird_notice', 'weird_addl', 'weird_name', 'http_resp_mime_types', 'http_orig_mime_types', 'http_status_code', 'http_version',
                        'http_method', 'http_trans_depth', 'ssl_established',
                        'ssl_resumed', 'ssl_cipher', 'ssl_version', 'dns_rejected', 'dns_RA', 'dns_RD', 'dns_AA', 'dns_rcode', 'dns_qtype', 'dns_qclass',
                        'service', 'proto', 'conn_state']
features_numericas = ['duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes', 'http_response_body_len', 'dst_bytes',
                     'missed_bytes', 'src_pkts', 'http_request_body_len']

colunas_para_excluir = ['src_ip', 'src_port', 'dst_ip', 'dst_port']

features_ip = ['src_ip', 'dst_ip']

features_port = ['src_port', 'dst_port']

pipeline_original = build_preprocessing_pipeline(features_numericas, features_categoricas, features_textuais, features_ip, features_port)
pipeline_ataques = build_preprocessing_pipeline(features_numericas, features_categoricas, features_textuais, features_ip, features_port)

Xdev_pre_original = pipeline_original.fit_transform(Xdev_original)
Xtest_pre_original = pipeline_original.transform(Xtest_original)

Xdev_pre_ataques = pipeline_ataques.fit_transform(Xdev_ataques)
Xtest_pre_ataques = pipeline_ataques.transform(Xtest_ataques)



# Avaliação

In [6]:
primary_params = {'C': 10000.0, 'class_weight': 'balanced', 'max_iter': 250, 'random_state': 42, 'threshold': 0.5539457134966522}
secondary_params = {'C': 1000.0, 'class_weight': None, 'max_iter': 3000, 'random_state': 42}

primary_features = [
    'src_ip_host', 'src_ip_ipv6', 'src_ip_privado', 'src_ip_multicast', 'dst_ip_host', 'dst_ip_broadcast', 'dst_ip_ipv6', 'dst_ip_privado', 'dst_ip_multicast', 'src_port_well_known', 'src_port_registered', 
'src_port_dynamic', 'dst_port_well_known', 'dst_port_registered', 'dst_port_dynamic', 'duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_bytes', 'http_response_body_len', 
'missed_bytes', 'http_request_body_len', 'weird_notice_F', 'weird_addl_-', 'weird_addl_46', 'weird_name_-', 'weird_name_DNS_RR_unknown_type', 'weird_name_above_hole_data_without_any_acks', 'weird_name_active_connection_reuse', 'weird_name_bad_TCP_checksum', 
'weird_name_bad_UDP_checksum', 'weird_name_connection_originator_SYN_ack', 'weird_name_data_before_established', 'weird_name_inappropriate_FIN', 'http_resp_mime_types_-', 'http_resp_mime_types_application/ocsp-response', 'http_resp_mime_types_application/vnd.ms-cab-compressed', 'http_resp_mime_types_application/xml', 'http_resp_mime_types_image/jpeg', 'http_resp_mime_types_image/png', 
'http_resp_mime_types_text/html', 'http_resp_mime_types_text/json', 'http_orig_mime_types_-', 'http_status_code_0', 'http_status_code_200', 'http_status_code_206', 'http_status_code_302', 'http_status_code_404', 'http_version_1.1', 'http_method_-', 
'http_method_GET', 'http_trans_depth_-', 'http_trans_depth_1', 'http_trans_depth_2', 'ssl_established_-', 'ssl_established_F', 'ssl_established_T', 'ssl_resumed_-', 'ssl_resumed_F', 'ssl_resumed_T', 
'ssl_cipher_-', 'ssl_cipher_TLS_AES_128_GCM_SHA256', 'ssl_cipher_TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', 'ssl_cipher_TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', 'ssl_version_-', 'ssl_version_TLSv12', 'ssl_version_TLSv13', 'dns_rejected_-', 'dns_rejected_F', 'dns_rejected_T', 
'dns_RA_-', 'dns_RA_F', 'dns_RA_T', 'dns_RD_-', 'dns_RD_F', 'dns_RD_T', 'dns_AA_-', 'dns_AA_F', 'dns_AA_T', 'dns_rcode_0', 
'dns_rcode_1', 'dns_rcode_2', 'dns_rcode_3', 'dns_rcode_5', 'dns_qtype_0', 'dns_qtype_1', 'dns_qtype_2', 'dns_qtype_12', 'dns_qtype_28', 'dns_qtype_32', 
'dns_qtype_33', 'dns_qtype_43', 'dns_qtype_48', 'dns_qtype_255', 'dns_qclass_0', 'dns_qclass_1', 'dns_qclass_32769', 'service_-', 'service_dce_rpc', 'service_dhcp', 
'service_dns', 'service_ftp', 'service_gssapi', 'service_http', 'service_smb', 'service_ssl', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 
'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_SF', 'conn_state_SH', 
'conn_state_SHR', 'http_user_agent_infrequent_sklearn', 'http_uri_infrequent_sklearn', 'ssl_subject_infrequent_sklearn', 'ssl_issuer_infrequent_sklearn', 'dns_query_infrequent_sklearn'
]
secondary_features = ['src_ip_privado', 'dst_ip_broadcast', 'dst_ip_privado',
       'src_port_well_known', 'src_port_registered', 'src_port_dynamic',
       'dst_port_well_known', 'dst_port_registered', 'dst_port_dynamic',
       'duration', 'dst_pkts', 'src_ip_bytes', 'dst_ip_bytes', 'src_pkts',
       'weird_notice_F', 'weird_name_possible_split_routing',
       'http_orig_mime_types_-', 'http_status_code_0', 'http_status_code_200',
       'http_status_code_404', 'http_method_-', 'dns_rejected_-',
       'dns_rejected_F', 'dns_rejected_T', 'dns_RA_-', 'dns_RA_F', 'dns_RA_T',
       'dns_RD_-', 'dns_RD_F', 'dns_RD_T', 'dns_AA_-', 'dns_AA_F', 'dns_AA_T',
       'dns_rcode_0', 'dns_rcode_3', 'dns_rcode_5', 'dns_qtype_0',
       'dns_qtype_1', 'dns_qtype_6', 'dns_qclass_1', 'service_-',
       'service_dce_rpc', 'service_dns', 'service_ftp', 'service_http',
       'service_smb', 'service_smb;gssapi', 'service_ssl', 'proto_icmp',
       'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ',
       'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR',
       'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2',
       'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR',
       'dns_query_infrequent_sklearn']


# Primary model
primary_model = LogRegWithThreshold(**primary_params)
primary_model.fit(Xdev_pre_original[primary_features], ydev_original['label'])

primary_pred = primary_model.predict(Xtest_pre_original[primary_features])

mask_ataques = (primary_pred == 1) & (ytest_original['label'] == 1)
input_module2 = Xtest_original.loc[mask_ataques, :].reset_index(drop=True)

Xtest_pre_ataques = pipeline_ataques.transform(input_module2)

ytest_ataques = ytest_original.loc[mask_ataques].reset_index(drop=True)
le = LabelEncoder()
ydev_ataques.loc[:, 'type'] = le.fit_transform(ydev_ataques.type)
ytest_ataques.loc[:, 'type'] = le.fit_transform(ytest_ataques.type)

# Secondary model
secondary_model = xgb.XGBClassifier(**secondary_params)
secondary_model.fit(Xdev_pre_ataques[secondary_features], ydev_ataques.type)
secondary_pred = secondary_model.predict(Xtest_pre_ataques[secondary_features])

print(f'Accuracy: {balanced_accuracy_score(ytest_ataques.type.values.astype(int), secondary_pred)}')
print(f'F1 macro: {f1_score(ytest_ataques.type.values.astype(int), secondary_pred, average="macro")}')
print(f'Precision macro', precision_score(ytest_ataques.type.values.astype(int), secondary_pred, average="macro"))
print(f'Recall macro:', recall_score(ytest_ataques.type.values.astype(int), secondary_pred, average="macro"))

Parameters: { "C", "max_iter" } are not used.



Accuracy: 0.9852605485751018
F1 macro: 0.9777475704332667
Precision macro 0.9712707702602749
Recall macro: 0.9852605485751018


In [7]:
print(accuracy_score(ytest_original.label, primary_pred))

0.9666301554078235


In [10]:
ytest_ataques.type.value_counts()

type
7    3995
0    3993
5    3989
8    3975
2    3962
1    3952
3    3850
6    2815
4     159
Name: count, dtype: int64

In [23]:
classes = sorted(ytest_ataques.type.unique())
metrics = {'Precision':[], 'Recall':[], 'Accuracy':[], 'F1score':[]}
for c in classes:
    real = ytest_ataques.type.copy()
    real[ytest_ataques.type == c] = 1
    real[ytest_ataques.type != c] = 0
    real = real.values.astype(int)

    pred = secondary_pred.copy()
    pred[secondary_pred == c] = 1
    pred[secondary_pred != c] = 0
    pred = pred.astype(int)

    metrics['Precision'].append(precision_score(real, pred))
    metrics['Recall'].append(recall_score(real, pred))
    metrics['Accuracy'].append(accuracy_score(real, pred))
    metrics['F1score'].append(f1_score(real, pred))

pd.DataFrame(metrics, index=le.classes_).round(4) * 100

Unnamed: 0,Precision,Recall,Accuracy,F1score
backdoor,100.0,100.0,100.0,100.0
ddos,99.26,98.23,99.68,98.74
dos,99.7,99.12,99.85,99.41
injection,98.9,97.64,99.57,98.26
mitm,82.51,94.97,99.87,88.3
password,99.47,99.12,99.82,99.3
ransomware,97.17,100.0,99.73,98.56
scanning,99.1,99.75,99.85,99.43
xss,98.04,97.91,99.48,97.97


In [13]:
pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [40]:
print(f'Accuracy: {balanced_accuracy_score(ytest_ataques.type, secondary_pred)}')


Accuracy: 0.5447965124257834


