In [None]:
import torch

with_storage_dbscan = False
with_storage_dbocsvm = False
dbscan_optuna_storage_path = "sqlite:///optuna_storage/dbscan_study.db"
ocsvm_optuna_storage_path = "sqlite:///optuna_storage/dbocsvm_study.db"

sample_size = 0.01
use_sample = True
best_model_path = "best_models/best_model_proposed.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"
export_model = False
onnx_path = "autoencoder.onnx"

use_existing_model = True
existing_model_path = "saved_models/val_score_pca/autoencoder_pca_Model_pca_1_pca_hidden[40, 25, 17]_latent8_best.pth"
existing_model_architecture = {
    "input_dim": 68,
    "hidden_dims": [40, 25, 17],
    "latent_dim": 8,
    "activation_type": "LeakyReLU",
    "output_activation_type": "Sigmoid",
}

train_set_path = (
    "/home/jbct/Projects/thesis/db-ocsvm/data/processed/NSL-KDD/train_set_full.csv"
)
test_set_path = (
    "/home/jbct/Projects/thesis/db-ocsvm/data/processed/NSL-KDD/test_set.csv"
)

In [4]:
# model architecture
hidden_dims = [56, 32, 16]
latent_dim = 4

# Learning parameters
lr = 0.001
batch_size = 128

epochs = 10
improvement_threshold = 0.0001
good_model_threshold = 0.00015
early_stopping_patience = 5

dbscan_evaluation_metric = "silhouette"  # "davies_bouldin" or "calinski_harabasz"
dbscan_tuning_distance_metric = "euclidean"  # "euclidean" or "cosine" or "manhattan"

# Used by the DBOCSVM clustering
dbocsvm_dbscan_distance_metric = "euclidean"  # "euclidean or "cosine" or "manhattan"
dbocsvm_tree_algorithm = "kd_tree"  # "ball_tree" or "kd_tree"

# Used for DBOCSVM ocsvm tuning
ocsvm_trials = 10

import dataset

In [5]:
import pandas as pd

train_df = pd.read_csv(train_set_path)
if use_sample:
    train_df = train_df.sample(frac=sample_size, random_state=42).reset_index(drop=True)
print(train_df.shape)
train_df.head(1)

(673, 122)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,5.833486e-07,2.572642e-07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

X_train_full = train_df.values

X_train, X_val = train_test_split(train_df, test_size=0.2, random_state=42)
autoencoder_training_pca = PCA(n_components=68)

X_train = autoencoder_training_pca.fit_transform(X_train.values)
X_val = autoencoder_training_pca.transform(X_val.values)

print(X_train.shape, X_val.shape, X_train_full.shape)

(538, 68) (135, 68) (673, 122)


training the autoencoder

In [7]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
X_val_tensor = torch.FloatTensor(X_val)

# Create data loaders
train_dataset = TensorDataset(X_train_tensor)
val_dataset = TensorDataset(X_val_tensor)

input_dim = X_train.shape[1]
input_dim

68

In [8]:
from models import BatchNormAutoencoder
from torch import nn, optim

if use_existing_model:
    autoencoder = BatchNormAutoencoder(
        input_dim=existing_model_architecture["input_dim"],
        hidden_dims=existing_model_architecture["hidden_dims"],
        latent_dim=existing_model_architecture["latent_dim"],
        activation_type=existing_model_architecture["activation_type"],
        output_activation_type=existing_model_architecture["output_activation_type"],
    )
else:
    # Create model
    autoencoder = BatchNormAutoencoder(
        input_dim=input_dim,
        hidden_dims=hidden_dims,
        latent_dim=latent_dim,
        activation_type="LeakyReLU",
        output_activation_type="Sigmoid",
    )

    # loss and optimizer
    optimizer = optim.Adam(autoencoder.parameters(), lr=lr)
    criterion = nn.MSELoss()

In [9]:
if not use_existing_model:
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
from utils import train_autoencoder

if not use_existing_model:
    history, is_good_model = train_autoencoder(
        model=autoencoder,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        epochs=epochs,
        best_model_path=best_model_path,
        verbose=True,
        early_stopping_patience=early_stopping_patience,
        improvement_threshold=improvement_threshold,
        good_model_threshold=good_model_threshold,
        plot_results=True,
    )

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Load best model
if use_existing_model:
    checkpoint = torch.load(existing_model_path)
    autoencoder.load_state_dict(checkpoint["model_state_dict"])
else:
    checkpoint = torch.load(best_model_path)
    autoencoder.load_state_dict(checkpoint["model_state_dict"])

autoencoder.eval()

BatchNormAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=68, out_features=40, bias=True)
    (1): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): Linear(in_features=40, out_features=25, bias=True)
    (4): BatchNorm1d(25, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=25, out_features=17, bias=True)
    (7): BatchNorm1d(17, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): LeakyReLU(negative_slope=0.01)
    (9): Linear(in_features=17, out_features=8, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=8, out_features=17, bias=True)
    (1): BatchNorm1d(17, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): Linear(in_features=17, out_features=25, bias=True)
    (4): BatchNorm1d(25, eps=1e-05, momentum=0.1, 

saving the model

In [12]:
import torch.onnx

if export_model and not use_existing_model:
    # Prepare a sample input tensor with the correct shape
    dummy_input = torch.randn(1, input_dim, device=device)

    torch.onnx.export(
        autoencoder,  # model being run
        dummy_input,  # model input
        onnx_path,  # where to save the model
        export_params=True,  # store trained parameters inside model file
        opset_version=17,  # ONNX version
        do_constant_folding=True,  # optimize constant folding
        input_names=["input"],  # model's input names
        output_names=["output"],  # model's output names
        dynamic_axes={
            "input": {0: "batch_size"},  # variable length axes
            "output": {0: "batch_size"},
        },
    )

    print(f"Model exported to ONNX format: {onnx_path}")

dbscan tuning

In [13]:
import torch
from torch.utils.data import DataLoader, TensorDataset

dbocsvm_pca = PCA(n_components=68)
X_train_full = dbocsvm_pca.fit_transform(X_train_full)

# Convert to PyTorch tensors
X_train_full_tensor = torch.FloatTensor(X_train_full)

# Create data loaders
X_train_full_dataset = TensorDataset(X_train_full_tensor)

# create data loader
X_train_full_loader = DataLoader(X_train_full_dataset, batch_size=128)

input_dim = X_train_full.shape[1]
input_dim

68

In [14]:
import numpy as np

# Extract in batches to prevent memory issues
X_encoded_full = []
with torch.no_grad():
    for data in X_train_full_loader:
        data_x = data[0].to(device)
        encoded = autoencoder.encode(data_x)
        X_encoded_full.append(encoded.cpu().numpy())
X_encoded_full = np.vstack(X_encoded_full)
X_encoded_full.shape

(673, 8)

In [15]:
from utils import find_eps_range_with_elbow_method

input_dim_encoded = X_encoded_full.shape[1]

k_for_elbow = int((20 + input_dim_encoded * 2) / 2)
min_eps, max_eps = find_eps_range_with_elbow_method(
    X_encoded_full,
    k=k_for_elbow,
    plot=False,
)

min_eps, max_eps

(np.float64(1.2888195337727666), np.float64(5.155278135091066))

In [16]:
from utils import objective_dbscan
import optuna

dbscan_objective_lambda = lambda trial: objective_dbscan(
    trial,
    X_encoded=X_encoded_full,
    evaluation_metric=dbscan_evaluation_metric,
    eps_range=(min_eps, max_eps),
    min_samples_range=(1, input_dim_encoded * 2),
    distance_metric=dbscan_tuning_distance_metric,
    n_jobs=-1,
)

dbscan_trials = 10

if with_storage_dbscan:
    dbscan_study = optuna.create_study(
        direction="maximize",
        storage=dbscan_optuna_storage_path,
        study_name="dbscan_study",
        load_if_exists=True,
    )
    dbscan_study.optimize(
        dbscan_objective_lambda,
        n_trials=dbscan_trials,
    )
else:
    dbscan_study = optuna.create_study(direction="maximize")
    dbscan_study.optimize(
        dbscan_objective_lambda,
        n_trials=dbscan_trials,
    )

[I 2025-03-06 18:47:30,375] A new study created in memory with name: no-name-c1e4cf74-9440-4ba2-b8f1-446e8521a3c9
[I 2025-03-06 18:47:30,429] Trial 0 finished with value: -inf and parameters: {'eps': 3.7781914607207128, 'min_samples': 13}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,472] Trial 1 finished with value: -inf and parameters: {'eps': 3.756273791900667, 'min_samples': 7}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,564] Trial 2 finished with value: -inf and parameters: {'eps': 1.5447075460072976, 'min_samples': 12}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,615] Trial 3 finished with value: -inf and parameters: {'eps': 3.7462394668731895, 'min_samples': 1}. Best is trial 0 with value: -inf.


not enough clusters
not enough clusters
not enough clusters
not enough clusters


[I 2025-03-06 18:47:30,657] Trial 4 finished with value: -inf and parameters: {'eps': 2.9957571735077497, 'min_samples': 9}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,696] Trial 5 finished with value: -inf and parameters: {'eps': 4.64618732949849, 'min_samples': 6}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,718] Trial 6 finished with value: -inf and parameters: {'eps': 4.8580466573356595, 'min_samples': 12}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,760] Trial 7 finished with value: -inf and parameters: {'eps': 3.6358249034036376, 'min_samples': 1}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,806] Trial 8 finished with value: -inf and parameters: {'eps': 4.9871553130448625, 'min_samples': 2}. Best is trial 0 with value: -inf.
[I 2025-03-06 18:47:30,827] Trial 9 finished with value: -inf and parameters: {'eps': 4.081148383578258, 'min_samples': 11}. Best is trial 0 with value: -inf.


not enough clusters
not enough clusters
not enough clusters
not enough clusters
not enough clusters
not enough clusters


In [17]:
import pprint

# get dbscan best parameters
eps = dbscan_study.best_params["eps"]
min_samples = dbscan_study.best_params["min_samples"]

# get dbscan best trial
best_trial_dbscan = dbscan_study.best_trial
best_trial_dbscan_user_attrs = best_trial_dbscan.user_attrs

n_clusters = best_trial_dbscan_user_attrs["n_clusters"]
cluster_data_points = best_trial_dbscan_user_attrs["cluster_data_points"]

print(f"eps = {eps}")
print(f"min_samples = {min_samples}")
print(f"n_clusters = {n_clusters}")
print("cluster_data_points")
pprint.pprint(cluster_data_points)

eps = 3.7781914607207128
min_samples = 13
n_clusters = 1
cluster_data_points
{0: 673}


fit the DBSCAN

In [18]:
from models import DBOCSVM_V2

# Create DB-OC-SVM model with default ocsvm parameters
dbocsvm = DBOCSVM_V2(
    kernel="rbf",
    gamma="auto",
    nu=0.2,
    eps=eps,
    min_samples=min_samples,
    dbscan_metric=dbocsvm_dbscan_distance_metric,
    algorithm=dbocsvm_tree_algorithm,  # ball_tree, kd_tree,
)

In [19]:
dbocsvm.fit_cluster(X_encoded_full, verbose=True)

Fitting DBSCAN...
DBSCAN Fitted...
Unique Clusters: [0]
Cluster Sizes: {0: 673}


importing test set

In [20]:
test_df = pd.read_csv(test_set_path)
print(test_df.shape)
test_df.head(1)

(22543, 125)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_binary,attack_categorical,attack_class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,neptune,DoS


In [21]:
# Splitting into X and y
X_test = test_df.drop(
    columns=["attack_binary", "attack_categorical", "attack_class"]
).values

X_test = dbocsvm_pca.transform(X_test)

y_test = test_df["attack_binary"].values
y_test_class = test_df["attack_class"]
X_test.shape, y_test.shape

((22543, 68), (22543,))

extract features from test data

In [22]:
X_test_tensor = torch.FloatTensor(X_test).to(device)
X_test_encoded = []

test_dataset = TensorDataset(X_test_tensor, torch.zeros(len(X_test_tensor)))
test_loader = DataLoader(test_dataset, batch_size=128)

with torch.no_grad():
    for data, _ in test_loader:
        encoded = autoencoder.encode(data)
        X_test_encoded.append(encoded.cpu().numpy())

X_test_encoded = np.vstack(X_test_encoded)
print(X_test_encoded.shape)

(22543, 8)


tuning the ocsvms

In [23]:
from utils import objective_dbocsvm_fit_ocsvm

# Inner Optuna study for DBSCAN
dbocsvm_fit_ocsvm_objective_lambda = lambda trial: objective_dbocsvm_fit_ocsvm(
    trial,
    model=dbocsvm,
    X_encoded_train=X_encoded_full,
    X_encoded_test=X_test_encoded,
    y_test=y_test,
    cluster_count=n_clusters,
)

if with_storage_dbocsvm:
    dbocsvm_study = optuna.create_study(
        direction="maximize",
        storage=ocsvm_optuna_storage_path,
        study_name="dbocsvm_study",
        load_if_exists=True,
    )
    dbocsvm_study.optimize(
        dbocsvm_fit_ocsvm_objective_lambda,
        n_trials=ocsvm_trials,
    )
else:
    dbocsvm_study = optuna.create_study(direction="maximize")
    dbocsvm_study.optimize(
        dbocsvm_fit_ocsvm_objective_lambda,
        n_trials=ocsvm_trials,
    )

[I 2025-03-06 18:47:31,640] A new study created in memory with name: no-name-f56e45ca-7356-4510-b6cc-160bd673bc8a
[I 2025-03-06 18:47:34,867] Trial 0 finished with value: 0.74337931952269 and parameters: {'gamma_0': 0.25676691717207983, 'nu_0': 0.2478138055508375}. Best is trial 0 with value: 0.74337931952269.
[I 2025-03-06 18:47:37,176] Trial 1 finished with value: 0.5677593931597391 and parameters: {'gamma_0': 0.01539200028913066, 'nu_0': 0.19603369436221424}. Best is trial 0 with value: 0.74337931952269.
[I 2025-03-06 18:47:39,455] Trial 2 finished with value: 0.6302621656390011 and parameters: {'gamma_0': 0.11968653850944348, 'nu_0': 0.08597888060137186}. Best is trial 0 with value: 0.74337931952269.
[I 2025-03-06 18:47:41,589] Trial 3 finished with value: 0.5921572106640642 and parameters: {'gamma_0': 0.04891441511385903, 'nu_0': 0.037458129824031655}. Best is trial 0 with value: 0.74337931952269.
[I 2025-03-06 18:47:43,842] Trial 4 finished with value: 0.5891851128953556 and para

In [24]:
parameter_list = {}

for key, value in dbocsvm_study.best_params.items():
    cluster = key.split("_")[1]
    cluster = int(cluster)

    parameter_list[cluster] = {
        "kernel": "rbf",
        "gamma": dbocsvm_study.best_params[f"gamma_{cluster}"],
        "nu": dbocsvm_study.best_params[f"nu_{cluster}"],
    }

best parameters and values

In [25]:
autoencoder_architecture = {
    "input_dim": input_dim,
    "hidden_dims": hidden_dims,
    "latent_dim": latent_dim,
    "activation_type": "LeakyReLU",
    "output_activation_type": "Sigmoid",
    "learning_rate": lr,
    "batch_size": batch_size,
    "val_loss": checkpoint["val_loss"],
}
print("Best autoencoder model:")
pprint.pprint(autoencoder_architecture, sort_dicts=False)
print("")
best_dbscan_parameters = {
    "eps": eps,
    "min_samples": min_samples,
    "distance_metric": dbocsvm_dbscan_distance_metric,
    "score": best_trial_dbscan.value,
}
print("Best dbscan parameters")
pprint.pprint(best_dbscan_parameters, sort_dicts=False)
print("")
print("Best ocsvm parameters")
print(f"Tree algorithm: {dbocsvm_tree_algorithm}")
print(f"Accuracy: {dbocsvm_study.best_value}")
pprint.pprint(parameter_list, sort_dicts=False)

Best autoencoder model:
{'input_dim': 68,
 'hidden_dims': [56, 32, 16],
 'latent_dim': 4,
 'activation_type': 'LeakyReLU',
 'output_activation_type': 'Sigmoid',
 'learning_rate': 0.001,
 'batch_size': 128,
 'val_loss': 0.01944045111853278}

Best dbscan parameters
{'eps': 3.7781914607207128,
 'min_samples': 13,
 'distance_metric': 'euclidean',
 'score': -inf}

Best ocsvm parameters
Tree algorithm: kd_tree
Accuracy: 0.7667125049904626
{0: {'kernel': 'rbf', 'gamma': 0.5698829485600022, 'nu': 0.06324164848666507}}
