> Original data projection and separation index:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

# --- Configuration ---------------------------------------------------------
# Input file, the positional window of rows shown in this cell, and how many
# leading columns are fed to PCA.
DATA_PATH = "../../data/early_warning_exp_1.csv"
ROW_START, ROW_STOP = 4681, 4711  # half-open positional range (at most 30 rows)
N_FEATURE_COLS = 12

# --- Load the window and its labels ------------------------------------------
df = pd.read_csv(DATA_PATH, sep=";", encoding="utf-8")
subset = df.iloc[ROW_START:ROW_STOP].copy()
# Cells that cannot be parsed as numbers become NaN and are then replaced by 0.
dataset = (
    subset.iloc[:, :N_FEATURE_COLS]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .values
)
real_label = subset["has_bot"].values
# Label 1 is drawn in red ("Malicious traffic"), every other value in blue.
colors = ["red" if val == 1 else "blue" for val in real_label]

# Row positions (within the window) of each class.
idx_red = [i for i, r in enumerate(real_label) if r == 1]
idx_blue = [i for i, r in enumerate(real_label) if r != 1]
# Fail fast: with an empty window or a single class the index below would
# silently evaluate to NaN (mean of an empty selection).
if not idx_red or not idx_blue:
    raise ValueError(
        f"Rows {ROW_START}:{ROW_STOP} must contain both classes to compute the "
        f"separation index (has_bot == 1: {len(idx_red)}, other: {len(idx_blue)})."
    )

# --- Project to 3 principal components ---------------------------------------
pca = PCA(n_components=3)
pca_result = pca.fit_transform(dataset)

# Min-max scaling is applied only to the coordinates that are plotted.
scaler = MinMaxScaler()
pca_result_scaled = scaler.fit_transform(pca_result)

# --- 3D scatter ------------------------------------------------------------
fig_pca = plt.figure(figsize=(7, 5))
ax_pca = fig_pca.add_subplot(111, projection='3d')
ax_pca.scatter(
    pca_result_scaled[:, 0],
    pca_result_scaled[:, 1],
    pca_result_scaled[:, 2],
    c=colors,
    s=50
)
# Empty scatters exist only to create the legend entries.
ax_pca.scatter([], [], c="blue", s=80, label="Normal traffic")
ax_pca.scatter([], [], c="red", s=80, label="Malicious traffic")
ax_pca.legend(loc='best', fontsize=14)

# --- Separation index ------------------------------------------------------
# Distances are taken on the unscaled PCA coordinates, not on the plotted ones.
dist_matrix = pairwise_distances(pca_result, metric='euclidean')

# For every red point: its mean distance to all blue points.
medias_por_vermelho = [dist_matrix[r, idx_blue].mean() for r in idx_red]

# Not used further in this cell; kept because later cells may read it.
soma_das_medias = sum(medias_por_vermelho)
media_das_medias = np.mean(medias_por_vermelho)
# Mean over the whole square matrix, i.e. including the zero diagonal.
distancia_media_geral = dist_matrix.mean()
indice_separacao = media_das_medias / distancia_media_geral


print(f"Separation Index: {indice_separacao:.4f}")

# On a 3D axes, axis='both' applies to the x, y and z axes.
ax_pca.tick_params(axis='both', which='major', labelsize=13)

plt.subplots_adjust(left=0.1, right=1.2, bottom=0.1, top=0.9)

plt.show()

> Embeddings projection and separation index:

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

# --- Configuration ---------------------------------------------------------
# Input files, the positional window of rows shown in this cell, the number of
# leading embedding columns kept, and which three of them are plotted.
EMBEDDINGS_PATH = "embeddings_ex1.npy"
RESULTS_PATH = "resultado_ex1.csv"
ROW_START, ROW_STOP = 4681, 4711  # half-open positional range (at most 30 rows)
N_EMBEDDING_COLS = 12
cols = (5, 6, 9)

# --- Load embeddings and labels for the same window --------------------------
embeddings = np.load(EMBEDDINGS_PATH)
embeddings = embeddings[ROW_START:ROW_STOP, :N_EMBEDDING_COLS]

df = pd.read_csv(RESULTS_PATH)
real_label = df['real_label'].values[ROW_START:ROW_STOP]
# The two slices come from different files; if one file is shorter, colours and
# class indices would no longer line up with the embedding rows.
if len(real_label) != embeddings.shape[0]:
    raise ValueError(
        f"Rows {ROW_START}:{ROW_STOP} yield {embeddings.shape[0]} embeddings "
        f"but {len(real_label)} labels."
    )
# Label 1 is drawn in red ("Malicious traffic"), every other value in blue.
colors = ['red' if r == 1 else 'blue' for r in real_label]

# Row positions (within the window) of each class.
idx_red = [i for i, r in enumerate(real_label) if r == 1]
idx_blue = [i for i, r in enumerate(real_label) if r != 1]
# Fail fast: with an empty window or a single class the index below would
# silently evaluate to NaN (mean of an empty selection).
if not idx_red or not idx_blue:
    raise ValueError(
        f"Rows {ROW_START}:{ROW_STOP} must contain both classes to compute the "
        f"separation index (real_label == 1: {len(idx_red)}, other: {len(idx_blue)})."
    )

# Select the three plotted embedding dimensions (column-wise fancy indexing).
embedding_subset = embeddings[:, list(cols)]

# Min-max scaling is applied only to the coordinates that are plotted.
scaler = MinMaxScaler()
embedding_subset_scaled = scaler.fit_transform(embedding_subset)

# --- 3D scatter ------------------------------------------------------------
fig = plt.figure(figsize=(5, 9))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    embedding_subset_scaled[:, 0],
    embedding_subset_scaled[:, 1],
    embedding_subset_scaled[:, 2],
    c=colors,
    s=50,
    alpha=0.6
)
ax.set_xlabel(f"Embedding_{cols[0]}", labelpad=7, fontsize=16)
ax.set_ylabel(f"Embedding_{cols[1]}", labelpad=7, fontsize=16)
ax.set_zlabel(f"Embedding_{cols[2]}", labelpad=7, fontsize=16)

# Empty scatters exist only to create the legend entries.
ax.scatter([], [], c="blue", s=80, label="Normal traffic")
ax.scatter([], [], c="red", s=80, label="Malicious traffic")
ax.legend(loc='best', fontsize=14)


ax.view_init(elev=30, azim=45)

# --- Separation index ------------------------------------------------------
# Distances are taken on the unscaled embedding columns, not on the plotted ones.
dist_matrix = pairwise_distances(embedding_subset, metric='euclidean')

# For every red point: its mean distance to all blue points.
medias_por_vermelho = [dist_matrix[r, idx_blue].mean() for r in idx_red]


# Not used further in this cell; kept because later cells may read it.
soma_das_medias = sum(medias_por_vermelho)
media_das_medias = np.mean(medias_por_vermelho)
# Mean over the whole square matrix, i.e. including the zero diagonal.
distancia_media_geral = dist_matrix.mean()
indice_separacao = media_das_medias / distancia_media_geral

print(f"Separation index: {indice_separacao:.4f}")


# On a 3D axes, axis='both' applies to the x, y and z axes.
ax.tick_params(axis='both', which='major', labelsize=13)

plt.subplots_adjust(left=0.0, right=0.8, bottom=0.1, top=0.9)

plt.show()
