In [None]:
!pip install x-transformers

> Embedding generation: scale the selected features and encode them, window by window, with a Transformer encoder

In [None]:
import pandas as pd
import torch
import numpy as np
from x_transformers import ContinuousTransformerWrapper, Encoder
from sklearn.preprocessing import RobustScaler
import os


# --- Configuration -----------------------------------------------------------
early_warning_url = "../../data/early_warning_exp_2.csv"
output_dir = '.'
os.makedirs(output_dir, exist_ok=True)

# Seed for torch's RNG. The encoder below is used with its freshly initialised
# weights, so without a fixed seed every run would write different embeddings.
seed = 0

# Feature columns fed to the encoder.
# NOTE(review): 'menor_ttime_delta', 'menor_ttcp_time_delta' and
# 'menor_ttcp_time_relative' are spelled with a doubled "t"; presumably this
# matches the CSV header -- confirm before "fixing" the names.
colunas = [
    'total_pacotes', 'total_pacotes_icmp', 'total_pacotes_udp', 'total_pacotes_tcp', 'maior_pacote',
    'menor_pacote', 'soma_pacotes', 'total_ips_origem', 'total_ips_destino', 'porta_origem_mais_frequente',
    'porta_destino_mais_frequente', 'total_mac_source', 'total_mac_dst', 'ip_version', 'maior_ttl', 'menor_ttl',
    'std_ttl', 'mean_ttl', 'total_flags_tcp', 'total_tcp_flags_fin', 'total_tcp_flags_syn', 'total_tcp_flags_reset',
    'total_tcp_flags_push', 'total_tcp_flags_ack', 'total_tcp_flags_urg', 'maior_tcp_window_size_value', 'menor_tcp_window_size_value',
    'soma_tcp_window_size_value', 'std_tcp_window_size_value', 'mean_tcp_window_size_value', 'maior_tcp_seq', 'menor_tcp_seq', 'soma_tcp_seq',
    'std_tcp_seq', 'mean_tcp_seq', 'maior_time_delta', 'menor_ttime_delta', 'soma_time_delta', 'std_time_delta', 'mean_time_delta',
    'maior_tcp_time_delta', 'menor_ttcp_time_delta', 'soma_tcp_time_delta', 'std_tcp_time_delta', 'mean_tcp_time_delta', 'maior_tcp_time_relative',
    'menor_ttcp_time_relative', 'soma_tcp_time_relative', 'std_tcp_time_relative', 'mean_tcp_time_relative'
]

# Row range to encode: `sup` is the first row (inclusive) and `inf` the end
# (exclusive). The names are kept as-is because other cells may reference them.
sup, inf = 0, 778

# --- Load and scale the data -------------------------------------------------
early_warning = pd.read_csv(early_warning_url, sep=";")
# Explicit positional selection, so the result does not depend on index labels.
x_test_df = early_warning.iloc[sup:inf][colunas].copy()

scaler = RobustScaler()
x_scaled = scaler.fit_transform(x_test_df)
# Add a leading batch axis: (rows, features) -> (1, rows, features).
x_test_tensor = torch.tensor(x_scaled, dtype=torch.float32).unsqueeze(0)

# --- Build the encoder -------------------------------------------------------
dim = 12
depth = 64
heads = 64
dim_out = 12
num_features = x_test_tensor.shape[2]

# Seed immediately before construction so the initial weights are reproducible.
torch.manual_seed(seed)

model = ContinuousTransformerWrapper(
    dim_in=num_features,
    dim_out=dim_out,
    max_seq_len=6000,
    attn_layers=Encoder(
        dim=dim,
        depth=depth,
        heads=heads,
        dynamic_pos_bias=True
    )
)
# The model is only used for inference; switch off any training-only behaviour
# (e.g. dropout, if configured).
model.eval()

def split_sequence(tensor, window_size):
    """Cut a tensor into consecutive, non-overlapping windows along dimension 1.

    Args:
        tensor: Tensor with at least three dimensions; it is sliced as
            ``tensor[:, start:stop, :]``.
        window_size: Number of positions along dimension 1 per window.

    Returns:
        A list of slices in order. The last slice is shorter than
        ``window_size`` when the length of dimension 1 is not a multiple of it;
        the list is empty when dimension 1 has length 0.
    """
    total_steps = tensor.shape[1]
    chunks = []
    for start in range(0, total_steps, window_size):
        stop = start + window_size
        chunks.append(tensor[:, start:stop, :])
    return chunks

# Encode the series in windows of 30 positions and collect the model outputs.
windows = split_sequence(x_test_tensor, window_size=30)

embeddings_list = []
with torch.no_grad():  # inference only: no autograd graph is recorded
    for window in windows:
        # All-True boolean mask with one entry per (batch, position) of the window.
        mask = torch.ones(window.shape[:2], dtype=torch.bool, device=window.device)
        emb = model(window, mask=mask)
        # Remove the leading batch axis before storing the window's output on CPU.
        embeddings_list.append(emb.squeeze(0).cpu())

# Concatenate the per-window outputs along the first axis into one array.
embeddings = torch.cat(embeddings_list, dim=0).numpy()

# Persist the embeddings as .npy (read by the analysis cell) and as .csv.
npy_path = f"{output_dir}/embeddings_ex2.npy"
np.save(npy_path, embeddings)

embedding_columns = [f"embedding_{i}" for i in range(embeddings.shape[1])]
embeddings_df = pd.DataFrame(embeddings, columns=embedding_columns)
csv_path = f"{output_dir}/embeddings_ex2.csv"
embeddings_df.to_csv(csv_path, index=False)



> Anomaly detection on the embeddings: an SGD One-Class SVM is fitted and evaluated on each 30-row window

In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Feature columns copied into the result file next to the predicted/real labels.
# NOTE(review): 'menor_ttime_delta', 'menor_ttcp_time_delta' and
# 'menor_ttcp_time_relative' are spelled with a doubled "t"; presumably this
# matches the CSV header -- confirm before "fixing" the names.
colunas = [
    'total_pacotes', 'total_pacotes_icmp', 'total_pacotes_udp', 'total_pacotes_tcp', 'maior_pacote',
    'menor_pacote', 'soma_pacotes', 'total_ips_origem', 'total_ips_destino', 'porta_origem_mais_frequente',
    'porta_destino_mais_frequente', 'total_mac_source', 'total_mac_dst', 'ip_version', 'maior_ttl', 'menor_ttl',
    'std_ttl', 'mean_ttl', 'total_flags_tcp', 'total_tcp_flags_fin', 'total_tcp_flags_syn', 'total_tcp_flags_reset',
    'total_tcp_flags_push', 'total_tcp_flags_ack', 'total_tcp_flags_urg', 'maior_tcp_window_size_value', 'menor_tcp_window_size_value',
    'soma_tcp_window_size_value', 'std_tcp_window_size_value', 'mean_tcp_window_size_value', 'maior_tcp_seq', 'menor_tcp_seq', 'soma_tcp_seq',
    'std_tcp_seq', 'mean_tcp_seq', 'maior_time_delta', 'menor_ttime_delta', 'soma_time_delta', 'std_time_delta', 'mean_time_delta',
    'maior_tcp_time_delta', 'menor_ttcp_time_delta', 'soma_tcp_time_delta', 'std_tcp_time_delta', 'mean_tcp_time_delta', 'maior_tcp_time_relative',
    'menor_ttcp_time_relative', 'soma_tcp_time_relative', 'std_tcp_time_relative', 'mean_tcp_time_relative'
]

# --- Paths -------------------------------------------------------------------
early_warning_url = "../../data/early_warning_exp_2.csv"
embeddings_path = "embeddings_ex2.npy"
resultados = "../../data/early_warning_exp_2.csv"
csv_novo = "resultado_ex2.csv"  # output file written at the end of this cell

# NOTE(review): df_anterior / df_novo are not used in this cell; they are kept
# in case later cells rely on them.
df_anterior = pd.read_csv(resultados, sep=";")
df_novo = df_anterior.copy()

early_warning = pd.read_csv(early_warning_url, sep=";")
embeddings = np.load(embeddings_path)

# --- Select the evaluated row range (start inclusive, end exclusive) ---------
slice_init_melhor_run = 0
slice_end_melhor_run = 778
expected_rows = slice_end_melhor_run - slice_init_melhor_run

x = embeddings[slice_init_melhor_run:slice_end_melhor_run]
# Positional selection for both the labels and the feature columns, so the two
# cannot drift apart if the frame's index is ever not 0..n-1.
rows_avaliadas = early_warning.iloc[slice_init_melhor_run:slice_end_melhor_run]
y_real = rows_avaliadas['has_bot'].to_numpy()

# Fail early, with a useful message, if the inputs do not cover the range.
assert len(x) == expected_rows, (
    f"{embeddings_path} provides {len(x)} rows for the selected range, expected {expected_rows}"
)
assert len(y_real) == expected_rows, (
    f"{early_warning_url} provides {len(y_real)} rows for the selected range, expected {expected_rows}"
)

# --- One-Class SVM, fitted and evaluated per window --------------------------
clf = linear_model.SGDOneClassSVM(learning_rate = 'constant', nu = 0.02, eta0= 0.1, random_state=0)

preds = []
# Chunk length; same as the 30-position windows used when the embeddings were generated.
step = 30

for i in range(0, len(x), step):
    it = x[i:i+step]  # never empty: i < len(x) for every value produced by range()
    # The detector is re-fitted on each chunk and then labels that same chunk.
    y_test = clf.fit_predict(it)
    # Detector output 1 (inlier) -> label 0; anything else (outlier) -> label 1.
    y_test_final = np.where(y_test == 1, 0, 1).tolist()
    preds.extend(y_test_final)

assert len(preds) == expected_rows, (
    f"got {len(preds)} predictions, expected {expected_rows}"
)

# --- Persist and report ------------------------------------------------------
df_resultado_ocsvm = rows_avaliadas[colunas].copy()
df_resultado_ocsvm["predicted_label"] = preds
df_resultado_ocsvm["real_label"] = y_real

df_resultado_ocsvm.to_csv(csv_novo, index=False)

print("\n- Confusion Matrix:")
print(confusion_matrix(y_real, preds))
print("\n- Classification Report:")
print(classification_report(y_real, preds, digits=4, zero_division=0))
