In [8]:
import torch
import os

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# SQLite files Optuna uses to persist the two studies (only read when storage is enabled).
dbscan_optuna_storage_path = "sqlite:///optuna_storage/dbscan_study.db"
ocsvm_optuna_storage_path = "sqlite:///optuna_storage/dbocsvm_study.db"

# Directory holding the processed CIDDS-001 CSV files. The default is the
# original author's location; set DB_OCSVM_DATA_DIR to run the notebook on
# another machine without editing this cell.
data_dir = os.environ.get(
    "DB_OCSVM_DATA_DIR",
    "/home/jbct/Projects/thesis/db-ocsvm/data/processed/CIDDS-001",
)
train_set_path = os.path.join(data_dir, "train_set_full.csv")
test_set_path = os.path.join(data_dir, "test_set.csv")

results_path = "tuning_results/results_dbocsvm_config_3.json"

# Master switch: True = quick smoke run (3% sample, in-memory studies, 10 OC-SVM
# trials); False = full run (all rows, SQLite-backed studies, 1000 OC-SVM trials).
test_run = True

# Studies are persisted only for full runs.
with_storage_dbscan = not test_run
with_storage_dbocsvm = not test_run

# Sub-sampling of the training set is applied only for smoke runs.
use_sample = test_run
sample_size = 0.03 if test_run else 1.0

ocsvm_trials = 10 if test_run else 1000

if not test_run:
    # The SQLite storage URLs point into this directory, so it must exist.
    os.makedirs("optuna_storage", exist_ok=True)


# DBSCAN search settings.
#   evaluation_metric: silhouette, calinski_harabasz, davies_bouldin
#   distance_metric:   manhattan, euclidean, cosine
#   trials:            number of Optuna trials for the DBSCAN study
dbscan_tuning_parameters = dict(
    evaluation_metric="silhouette",
    distance_metric="manhattan",
    trials=10,
)
dbocsvm_tree_algorithm = "ball_tree"  # "ball_tree" or "kd_tree"

# Checkpoint of the already-trained autoencoder that is loaded further down.
existing_model_path = "saved_models/config 4/autoencoder_Model_1_hidden[14, 12]_latent10_lr0.001_bs256_optrmsprop_actLeakyReLU_slp0.2_wd0.pth"

# Constructor arguments used to rebuild the network before loading the weights;
# they have to describe the same architecture the checkpoint was saved from.
existing_model_architecture = dict(
    input_dim=16,
    hidden_dims=[14, 12],
    latent_dim=10,
    activation_type="LeakyReLU",
    negative_slope=0.2,
    output_activation_type="Sigmoid",
)

In [9]:
# CHANGE
# Set to True to bypass the DBSCAN search and use the fixed values below instead.
override_dbscan_tuning = False

# NOTE(review): the cluster labels here are strings ("-1", "0", ...), while the
# tuned path reports integer labels - confirm downstream consumers accept both.
# NOTE(review): n_clusters is 5 but only two non-noise clusters are listed in
# cluster_data_points - looks like placeholder data; verify before enabling.
dbscan_override_params = dict(
    eps=0.5,
    min_samples=5,
    distance_metric="manhattan",
    n_clusters=5,
    cluster_data_points={"-1": 293, "0": 66992, "1": 57},
)

import dataset

In [10]:
import pandas as pd

# Read the processed training CSV.
train_df = pd.read_csv(train_set_path)

# For quick runs keep only a reproducible random fraction of the rows
# (fixed seed) and renumber the index from zero.
train_df = (
    train_df.sample(frac=sample_size, random_state=42).reset_index(drop=True)
    if use_sample
    else train_df
)

print(train_df.shape)
train_df.head(1)

(5400, 18)


Unnamed: 0,duration,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,attack_id,proto_ICMP,proto_IGMP,proto_TCP,proto_UDP,attack_binary,attack_categorical
0,0.001965,3.2e-05,7e-06,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,benign


In [11]:
# Features are every column except the two label columns; the binary label is
# kept separately.
# NOTE(review): `attack_id` stays in the feature matrix. In the rows displayed in
# this notebook it is 0.0 for benign traffic and non-zero for attacks, so it may
# leak the label - confirm this is intended (removing it changes input_dim and
# would require retraining the autoencoder).
X_train, y_train = (
    train_df.drop(columns=["attack_binary", "attack_categorical"]).values,
    train_df["attack_binary"].values,
)

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Wrap the training features in a float tensor and a single-tensor dataset.
X_train_tensor = torch.FloatTensor(X_train)
train_dataset = TensorDataset(X_train_tensor)

# Number of feature columns; it has to match the input size of the autoencoder.
_, input_dim = X_train.shape
print(f"Input dimension: {input_dim}")

Input dimension: 16


use existing autoencoder

In [13]:
from torch import nn
from autoencoder import BatchNormAutoencoder

# Rebuild the network with exactly the constructor arguments recorded in
# `existing_model_architecture`, so the saved weights can be loaded into it.
_architecture_keys = (
    "input_dim",
    "hidden_dims",
    "latent_dim",
    "activation_type",
    "negative_slope",
    "output_activation_type",
)
autoencoder = BatchNormAutoencoder(
    **{key: existing_model_architecture[key] for key in _architecture_keys}
)

In [14]:
# Load best model
# map_location remaps the stored tensors onto the device chosen in the config
# cell, so a checkpoint saved on a GPU can still be opened on a CPU-only machine.
checkpoint = torch.load(existing_model_path, map_location=device)
autoencoder.load_state_dict(checkpoint["model_state_dict"])

# Later cells move their input tensors to `device`; the model has to live on the
# same device or the forward pass fails with a device-mismatch error.
autoencoder.to(device)

# Inference mode: the BatchNorm layers use their stored running statistics.
autoencoder.eval()

BatchNormAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=16, out_features=14, bias=True)
    (1): BatchNorm1d(14, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2)
    (3): Linear(in_features=14, out_features=12, bias=True)
    (4): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.2)
    (6): Linear(in_features=12, out_features=10, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=10, out_features=12, bias=True)
    (1): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2)
    (3): Linear(in_features=12, out_features=14, bias=True)
    (4): BatchNorm1d(14, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.2)
    (6): Linear(in_features=14, out_features=16, bias=True)
    (7): Sigmoid()
  )
)

dbscan tuning

In [15]:
import numpy as np

# Encode the training features with the loaded autoencoder, 256 rows per batch.
X_train_tensor = torch.FloatTensor(X_train)
X_train_dataset = TensorDataset(X_train_tensor)
X_train_loader = DataLoader(X_train_dataset, batch_size=256)

# No gradients are needed for feature extraction. Each batch is moved to
# `device` for the encoder pass and copied back to a NumPy array on the CPU.
encoded_batches = []
with torch.no_grad():
    for (batch_features,) in X_train_loader:
        latent = autoencoder.encode(batch_features.to(device))
        encoded_batches.append(latent.cpu().numpy())

# Stack the per-batch arrays into one matrix with one row per training sample.
X_encoded = np.vstack(encoded_batches)

In [16]:
from utils import find_eps_range_with_elbow_method

# Width of the encoded (latent) feature vectors.
input_dim_encoded = X_encoded.shape[1]

# Neighbour count handed to the elbow search: (20 + 2 * latent width) / 2,
# truncated to an integer.
k_for_elbow = int((20 + input_dim_encoded * 2) / 2)
# CHANGE
# The eps search range is only needed when DBSCAN is actually tuned; with the
# override enabled min_eps / max_eps are deliberately left undefined.
if not override_dbscan_tuning:
    min_eps, max_eps = find_eps_range_with_elbow_method(
        X_encoded,
        k=k_for_elbow,
        plot=False,
    )
    print(min_eps, max_eps)

0.12987014651298523 0.5194805860519409


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
from utils import objective_dbscan
import optuna

def dbscan_objective_lambda(trial):
    """Optuna objective for DBSCAN: forwards the trial to `objective_dbscan`.

    eps is searched inside the elbow-derived (min_eps, max_eps) range and
    min_samples between 1 and twice the latent width. The globals are read at
    call time, so this must only be invoked after the elbow cell has run.
    """
    return objective_dbscan(
        trial,
        X_encoded=X_encoded,
        evaluation_metric=dbscan_tuning_parameters["evaluation_metric"],
        eps_range=(min_eps, max_eps),
        min_samples_range=(1, input_dim_encoded * 2),
        distance_metric=dbscan_tuning_parameters["distance_metric"],
        n_jobs=-1,
    )


if override_dbscan_tuning:
    # The elbow cell is skipped under the override, so min_eps / max_eps do not
    # exist and running the study would raise NameError. The fixed parameters in
    # dbscan_override_params are used instead.
    dbscan_study = None
    print("DBSCAN tuning skipped: using dbscan_override_params")
else:
    # NOTE(review): the study always maximizes. That suits silhouette and
    # calinski_harabasz; for davies_bouldin (lower is better) confirm that
    # objective_dbscan flips the sign.
    dbscan_study_kwargs = {"direction": "maximize"}
    if with_storage_dbscan:
        # Persist the study in SQLite and resume it if it already exists.
        dbscan_study_kwargs.update(
            storage=dbscan_optuna_storage_path,
            study_name="dbscan_study",
            load_if_exists=True,
        )

    dbscan_study = optuna.create_study(**dbscan_study_kwargs)
    dbscan_study.optimize(
        dbscan_objective_lambda,
        n_trials=dbscan_tuning_parameters["trials"],
    )

[I 2025-03-23 17:15:45,160] A new study created in memory with name: no-name-6f5f0782-3022-4045-b03d-9c2e4b5a57c0


12
{0: 1556, 1: 1188, 2: 1058, 3: 689, 4: 91, 5: 326, 6: 213, 7: 46, 8: 69, 9: 19, 10: 20, 11: 16, -1: 109}


[I 2025-03-23 17:15:45,846] Trial 0 finished with value: 0.952646791934967 and parameters: {'eps': 0.27825760406705596, 'min_samples': 16}. Best is trial 0 with value: 0.952646791934967.


15
{0: 1555, 1: 1186, 2: 1056, 3: 689, 4: 91, 5: 326, 6: 212, 7: 46, 8: 69, 9: 19, 10: 20, 11: 16, 12: 9, 13: 10, 14: 9, -1: 87}


[I 2025-03-23 17:15:46,343] Trial 1 finished with value: 0.9532589912414551 and parameters: {'eps': 0.19511251087684434, 'min_samples': 9}. Best is trial 1 with value: 0.9532589912414551.


45
{0: 1556, 1: 1189, 2: 1068, 3: 689, 4: 91, 5: 326, 6: 6, 7: 213, 8: 8, 9: 46, 10: 10, 11: 7, 12: 69, 13: 4, 14: 19, 15: 1, 16: 12, 17: 1, 18: 21, 19: 3, 20: 2, 21: 16, 22: 4, 23: 11, 24: 4, 25: 1, 26: 1, 27: 1, 28: 3, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 2, 35: 1, 36: 1, 37: 2, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1}


[I 2025-03-23 17:15:47,083] Trial 2 finished with value: 0.9360581040382385 and parameters: {'eps': 0.5014008340021165, 'min_samples': 1}. Best is trial 1 with value: 0.9532589912414551.


17
{0: 1556, 1: 1189, 2: 1066, 3: 689, 4: 91, 5: 326, 6: 213, 7: 8, 8: 46, 9: 10, 10: 7, 11: 69, 12: 19, 13: 12, 14: 21, 15: 16, 16: 11, -1: 51}


[I 2025-03-23 17:15:47,690] Trial 3 finished with value: 0.9520520567893982 and parameters: {'eps': 0.49107284870121476, 'min_samples': 7}. Best is trial 1 with value: 0.9532589912414551.


26
{0: 1556, 1: 1189, 2: 1068, 3: 689, 4: 91, 5: 326, 6: 6, 7: 213, 8: 8, 9: 46, 10: 10, 11: 7, 12: 69, 13: 4, 14: 19, 15: 12, 16: 21, 17: 3, 18: 2, 19: 16, 20: 4, 21: 11, 22: 4, 23: 3, 24: 2, 25: 2, -1: 19}


[I 2025-03-23 17:15:48,502] Trial 4 finished with value: 0.9398307800292969 and parameters: {'eps': 0.5060313896812529, 'min_samples': 2}. Best is trial 1 with value: 0.9532589912414551.


17
{0: 1556, 1: 1189, 2: 1064, 3: 689, 4: 91, 5: 326, 6: 213, 7: 8, 8: 46, 9: 10, 10: 7, 11: 69, 12: 19, 13: 21, 14: 16, 15: 10, 16: 11, -1: 55}


[I 2025-03-23 17:15:49,555] Trial 5 finished with value: 0.9522518515586853 and parameters: {'eps': 0.31556405770710044, 'min_samples': 7}. Best is trial 1 with value: 0.9532589912414551.


18
{0: 1556, 1: 1189, 2: 1066, 3: 689, 4: 91, 5: 326, 6: 6, 7: 213, 8: 8, 9: 46, 10: 10, 11: 7, 12: 69, 13: 19, 14: 12, 15: 21, 16: 16, 17: 11, -1: 45}


[I 2025-03-23 17:15:50,473] Trial 6 finished with value: 0.9520823955535889 and parameters: {'eps': 0.49313029962872834, 'min_samples': 6}. Best is trial 1 with value: 0.9532589912414551.


12
{0: 1556, 1: 1188, 2: 1059, 3: 689, 4: 91, 5: 326, 6: 213, 7: 46, 8: 69, 9: 19, 10: 21, 11: 16, -1: 107}


[I 2025-03-23 17:15:51,183] Trial 7 finished with value: 0.9525811672210693 and parameters: {'eps': 0.29154772573498966, 'min_samples': 16}. Best is trial 1 with value: 0.9532589912414551.


12
{0: 1556, 1: 1189, 2: 1064, 3: 689, 4: 91, 5: 326, 6: 213, 7: 46, 8: 69, 9: 19, 10: 21, 11: 16, -1: 101}


[I 2025-03-23 17:15:51,983] Trial 8 finished with value: 0.952174961566925 and parameters: {'eps': 0.39792424434108586, 'min_samples': 16}. Best is trial 1 with value: 0.9532589912414551.


26
{0: 1556, 1: 1189, 2: 1068, 3: 689, 4: 91, 5: 326, 6: 6, 7: 213, 8: 8, 9: 46, 10: 10, 11: 7, 12: 69, 13: 4, 14: 19, 15: 12, 16: 21, 17: 3, 18: 2, 19: 16, 20: 4, 21: 11, 22: 4, 23: 3, 24: 2, 25: 2, -1: 19}


[I 2025-03-23 17:15:52,674] Trial 9 finished with value: 0.9398307800292969 and parameters: {'eps': 0.4742994125237059, 'min_samples': 2}. Best is trial 1 with value: 0.9532589912414551.


In [18]:
import optuna
from plotly.io import show

# Objective value per trial for the DBSCAN study. There is nothing to plot when
# DBSCAN tuning was overridden with fixed parameters.
if not override_dbscan_tuning:
    fig = optuna.visualization.plot_optimization_history(dbscan_study)
    show(fig)

In [19]:
# Empirical distribution of the DBSCAN objective values. There is nothing to
# plot when DBSCAN tuning was overridden with fixed parameters.
if not override_dbscan_tuning:
    fig = optuna.visualization.plot_edf([dbscan_study])
    show(fig)

In [20]:
import pprint

# Pick the DBSCAN settings used for the rest of the notebook: either the fixed
# override values or the best trial of the Optuna study.
if override_dbscan_tuning:
    eps = dbscan_override_params["eps"]
    min_samples = dbscan_override_params["min_samples"]
    n_clusters = dbscan_override_params["n_clusters"]
    cluster_data_points = dbscan_override_params["cluster_data_points"]
else:
    eps = dbscan_study.best_params["eps"]
    min_samples = dbscan_study.best_params["min_samples"]

    # The cluster count and per-cluster sizes are read from the user attributes
    # stored on the best trial.
    best_trial_dbscan = dbscan_study.best_trial
    best_trial_dbscan_user_attrs = best_trial_dbscan.user_attrs
    n_clusters = best_trial_dbscan_user_attrs["n_clusters"]
    cluster_data_points = best_trial_dbscan_user_attrs["cluster_data_points"]


print(f"eps = {eps}")
print(f"min_samples = {min_samples}")
print(f"n_clusters = {n_clusters}")
print("cluster_data_points")
pprint.pprint(cluster_data_points)

eps = 0.19511251087684434
min_samples = 9
n_clusters = 15
cluster_data_points
{-1: 87,
 0: 1555,
 1: 1186,
 2: 1056,
 3: 689,
 4: 91,
 5: 326,
 6: 212,
 7: 46,
 8: 69,
 9: 19,
 10: 20,
 11: 16,
 12: 9,
 13: 10,
 14: 9}


fit the DBSCAN

In [21]:
from db_ocsvm_04 import DBOCSVM

# Create DB-OC-SVM model with default ocsvm parameters

# The DBSCAN distance metric follows whichever source supplied eps / min_samples.
dbscan_distance_metric = (
    dbscan_override_params["distance_metric"]
    if override_dbscan_tuning
    else dbscan_tuning_parameters["distance_metric"]
)

# The OC-SVM arguments here are starting values; they are tuned further down.
dbocsvm = DBOCSVM(
    eps=eps,
    min_samples=min_samples,
    dbscan_distance_metric=dbscan_distance_metric,
    tree_algorithm=dbocsvm_tree_algorithm,
    kernel="rbf",
    gamma="auto",
    nu=0.2,
)

In [22]:
# Run the clustering stage of the DB-OC-SVM on the encoded training features.
# With verbose=True the call reports the cluster labels it found and their sizes.
dbocsvm.fit_cluster(X_encoded, verbose=True)

Fitting DBSCAN...
DBSCAN Fitted...
Unique Clusters: [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Cluster Sizes: {-1: 87, 0: 1555, 1: 1186, 2: 1056, 3: 689, 4: 91, 5: 326, 6: 212, 7: 46, 8: 69, 9: 19, 10: 20, 11: 16, 12: 9, 13: 10, 14: 9}


<db_ocsvm_04.DBOCSVM at 0x7bcdd111da60>

importing test set

In [23]:
# Load the labelled evaluation CSV; it is split into test and validation halves
# in the next section.
test_dataset = pd.read_csv(test_set_path)
print(f"test set count: {test_dataset.shape[0]:,}")
# Despite the "unique values" label, this prints the row count per attack category.
print(f"unique values: {test_dataset['attack_categorical'].value_counts()}")
test_dataset.head(3)

test set count: 20,000
unique values: attack_categorical
benign        11226
dos            5564
portScan       1730
bruteForce      804
pingScan        676
Name: count, dtype: int64


Unnamed: 0,duration,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,attack_id,proto_ICMP,proto_IGMP,proto_TCP,proto_UDP,attack_binary,attack_categorical
0,0.00064,2.1e-05,2e-06,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,1.0,0.0,1,benign
1,0.011037,1.1e-05,1e-06,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,benign
2,0.011396,9.1e-05,6e-06,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,benign


Splitting into test and validation set

In [24]:
from sklearn.model_selection import train_test_split

# Split the labelled CSV 50/50 into a held-out test half and a validation half.
# Stratifying on the attack category keeps the class mix the same in both halves,
# and the fixed random_state makes the split repeatable.
test_df, val_df = train_test_split(
    test_dataset,
    test_size=0.5,
    random_state=42,
    stratify=test_dataset["attack_categorical"],
)

test set

In [25]:
# Split the test half into the feature matrix, the binary label
# (1 = benign, -1 = attack in the rows shown) and the attack-category label.
X_test, y_test, y_test_class = (
    test_df.drop(columns=["attack_binary", "attack_categorical"]).values,
    test_df["attack_binary"].values,
    test_df["attack_categorical"].values,
)

print(f"test set count: {test_df.shape[0]:,}")
print(f"unique values: {test_df['attack_categorical'].value_counts()}")
test_df.head(3)

test set count: 10,000
unique values: attack_categorical
benign        5613
dos           2782
portScan       865
bruteForce     402
pingScan       338
Name: count, dtype: int64


Unnamed: 0,duration,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,attack_id,proto_ICMP,proto_IGMP,proto_TCP,proto_UDP,attack_binary,attack_categorical
12406,0.000829,7.5e-05,4.657671e-06,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,66.0,0.0,0.0,1.0,0.0,-1,pingScan
15620,0.002278,1.6e-05,4.326256e-07,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,53.0,0.0,0.0,1.0,0.0,-1,dos
17951,0.000202,2.1e-05,1.103394e-06,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,39.0,0.0,0.0,1.0,0.0,-1,bruteForce


validation set

In [26]:
# Splitting into X and y
# Features are every column except the two label columns; the binary label and
# the attack-category label are kept as separate arrays.
X_val = val_df.drop(columns=["attack_binary", "attack_categorical"]).values
y_val = val_df["attack_binary"].values
y_val_class = val_df["attack_categorical"].values

# This cell reports on the validation half, so label the count accordingly
# (the message used to say "test set", copied from the test-set cell).
print(f"validation set count: {val_df.shape[0]:,}")
print(f"unique values: {val_df['attack_categorical'].value_counts()}")
val_df.head(3)

test set count: 10,000
unique values: attack_categorical
benign        5613
dos           2782
portScan       865
bruteForce     402
pingScan       338
Name: count, dtype: int64


Unnamed: 0,duration,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,attack_id,proto_ICMP,proto_IGMP,proto_TCP,proto_UDP,attack_binary,attack_categorical
9482,0.00558,5.4e-05,3.099822e-06,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,benign
12490,0.000561,1.6e-05,4.564398e-07,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,60.0,0.0,0.0,1.0,0.0,-1,dos
18001,0.002043,4.8e-05,1.697559e-05,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,53.0,0.0,0.0,1.0,0.0,-1,dos


Applying SMOTE

In [27]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Class distribution of the validation half before oversampling.
print("Before SMOTE:")
print(f"Val set count: {X_val.shape[0]:,}")
before_counts = pd.Series(y_val_class).value_counts()
print(before_counts)

# Oversample the attack classes of the VALIDATION half (not the training data)
# up to these absolute per-class row counts; "benign" is not listed.
sampling_strategy = dict(
    dos=4000,
    portScan=1500,
    bruteForce=1500,
    pingScan=500,
)
smote = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=3, random_state=42)
X_val_resampled, y_val_resampled_class = smote.fit_resample(X_val, y_val_class)

# Class distribution after oversampling.
print("\nAfter SMOTE:")
print(f"Val set count: {X_val_resampled.shape[0]:,}")
after_counts = pd.Series(y_val_resampled_class).value_counts()
print(after_counts)

# Collapse the category labels back to the binary encoding: benign -> 1, any attack -> -1.
y_val_resampled = np.where(y_val_resampled_class == "benign", 1, -1)

Before SMOTE:
Val set count: 10,000
benign        5613
dos           2782
portScan       865
bruteForce     402
pingScan       338
Name: count, dtype: int64

After SMOTE:
Val set count: 13,113
benign        5613
dos           4000
portScan      1500
bruteForce    1500
pingScan       500
Name: count, dtype: int64


reconstruction error inspection

In [28]:
# Split the test features by ground-truth label: 1 marks normal traffic, -1 anomalies.
X_test_normal, X_test_anomaly = X_test[y_test == 1], X_test[y_test == -1]

print(f"Normal test samples: {X_test_normal.shape[0]}")
print(f"Anomaly test samples: {X_test_anomaly.shape[0]}")

# Normal traffic: tensor on `device`, wrapped in an ordered (unshuffled) loader.
X_test_normal_tensor = torch.FloatTensor(X_test_normal).to(device)
normal_test_dataset = TensorDataset(X_test_normal_tensor)
normal_test_loader = DataLoader(normal_test_dataset, shuffle=False, batch_size=256)

# Anomalous traffic: same treatment.
X_test_anomaly_tensor = torch.FloatTensor(X_test_anomaly).to(device)
anomaly_test_dataset = TensorDataset(X_test_anomaly_tensor)
anomaly_test_loader = DataLoader(anomaly_test_dataset, shuffle=False, batch_size=256)


def calculate_reconstruction_error(model, loader):
    """Return the mean per-sample reconstruction MSE of `model` over `loader`.

    Args:
        model: module whose forward pass maps a batch to its reconstruction.
            It is switched to eval mode as a side effect.
        loader: iterable of batches; the first element of each batch is the
            input tensor, with samples along dimension 0.

    Returns:
        float: sum of each sample's feature-averaged squared error divided by
        the number of samples seen.

    Raises:
        ValueError: if `loader` yields no samples, since the mean is undefined.
    """
    model.eval()
    criterion = nn.MSELoss(reduction="none")

    # Feed every batch on the device the model's parameters live on, so a
    # loader built on a different device does not cause a device mismatch.
    # Models without parameters leave the batch where it is.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for batch in loader:
            x = batch[0]
            if model_device is not None:
                x = x.to(model_device)
            outputs = model(x)
            # Element-wise squared error, averaged over the features of each sample.
            per_sample_loss = criterion(outputs, x).mean(dim=1)
            total_loss += per_sample_loss.sum().item()
            total_samples += x.size(0)

    if total_samples == 0:
        raise ValueError("loader yielded no samples; reconstruction error is undefined")

    return total_loss / total_samples


# Function to evaluate a model's reconstruction performance
def evaluate_model(model):
    """Compare reconstruction error on the normal and anomalous test loaders.

    Uses the module-level `normal_test_loader` and `anomaly_test_loader`.

    Returns:
        dict with "normal_loss", "anomaly_loss" and "loss_difference"
        (anomaly loss minus normal loss), in that order.
    """
    losses = {
        "normal_loss": calculate_reconstruction_error(model, normal_test_loader),
        "anomaly_loss": calculate_reconstruction_error(model, anomaly_test_loader),
    }
    # A larger gap means anomalies are reconstructed worse than normal traffic.
    losses["loss_difference"] = losses["anomaly_loss"] - losses["normal_loss"]
    return losses


# Reconstruction losses of the loaded autoencoder on normal vs. anomalous test
# samples; the dict is displayed here and reused in the summary cell.
reconstruction_error = evaluate_model(autoencoder)
reconstruction_error

Normal test samples: 5613
Anomaly test samples: 4387


{'normal_loss': 2.2778223763883204e-06,
 'anomaly_loss': 122.1120784664741,
 'loss_difference': 122.11207618865173}

extract features from validation data

In [29]:
# Oversampled validation features as a float tensor on `device`.
X_val_tensor = torch.FloatTensor(X_val_resampled).to(device)

# The second tensor is an all-zero placeholder target; only the features are used.
X_val_dataset_tensor = TensorDataset(X_val_tensor, torch.zeros(len(X_val_tensor)))
X_val_loader = DataLoader(X_val_dataset_tensor, batch_size=128)

# Encode batch by batch without gradients and stack the results on the CPU.
with torch.no_grad():
    X_val_encoded = np.vstack(
        [autoencoder.encode(features).cpu().numpy() for features, _ in X_val_loader]
    )

print(X_val_encoded.shape)

(13113, 10)


extract features from test data

In [30]:
# Test features as a float tensor on `device`.
X_test_tensor = torch.FloatTensor(X_test).to(device)

# NOTE(review): `test_dataset` held the test DataFrame up to this point and is
# rebound to a TensorDataset here, so re-running the split cell without
# re-reading the CSV would operate on the wrong object.
# The second tensor is an all-zero placeholder target; only the features are used.
test_dataset = TensorDataset(X_test_tensor, torch.zeros(len(X_test_tensor)))
test_loader = DataLoader(test_dataset, batch_size=128)

# Encode batch by batch without gradients and stack the results on the CPU.
with torch.no_grad():
    X_test_encoded = np.vstack(
        [autoencoder.encode(features).cpu().numpy() for features, _ in test_loader]
    )

print(X_test_encoded.shape)

(10000, 10)


tuning the ocsvms

In [31]:
from utils import objective_dbocsvm_fit_ocsvm

# Optuna study for the OC-SVM hyper-parameters of the DB-OC-SVM model.
def dbocsvm_fit_ocsvm_objective_lambda(trial):
    """Optuna objective: forwards the trial to `objective_dbocsvm_fit_ocsvm`.

    The model, the encoded train / validation / test features, the labels and
    the cluster count are passed through with metric="f1".
    NOTE(review): the test split is handed to the objective as well. In the
    recorded run the trial value equals the validation f1 - confirm the test
    data is only used for reporting.
    """
    return objective_dbocsvm_fit_ocsvm(
        trial,
        model=dbocsvm,
        X_encoded_train=X_encoded,
        X_encoded_validation=X_val_encoded,
        y_validation=y_val_resampled,
        X_encoded_test=X_test_encoded,
        y_test=y_test,
        cluster_count=n_clusters,
        metric="f1",
    )


dbocsvm_study_kwargs = {"direction": "maximize"}
if with_storage_dbocsvm:
    # Persist the study in SQLite and resume it if it already exists.
    # NOTE(review): a resumed study keeps trials from earlier runs; verify their
    # parameters still match the current cluster count.
    dbocsvm_study_kwargs.update(
        storage=ocsvm_optuna_storage_path,
        study_name="dbocsvm_study",
        load_if_exists=True,
    )

dbocsvm_study = optuna.create_study(**dbocsvm_study_kwargs)
dbocsvm_study.optimize(
    dbocsvm_fit_ocsvm_objective_lambda,
    n_trials=ocsvm_trials,
)

[I 2025-03-23 17:15:54,086] A new study created in memory with name: no-name-b6bd89dc-7950-443d-921c-69c6f8e56540
[I 2025-03-23 17:15:56,894] Trial 0 finished with value: 0.895843287147635 and parameters: {'gamma_0': 0.3022565295099909, 'nu_0': 0.45562472699017664, 'gamma_1': 0.9481470819531959, 'nu_1': 0.12534348360412176, 'gamma_2': 0.7321861002344294, 'nu_2': 0.19605596208009532, 'gamma_3': 0.2962806701732386, 'nu_3': 0.4486875922662252, 'gamma_4': 0.39324528746285714, 'nu_4': 0.37872831761166953, 'gamma_5': 0.6564901964797778, 'nu_5': 0.4401984345593006, 'gamma_6': 0.4337525165826453, 'nu_6': 0.032273544011922714, 'gamma_7': 0.25406128351819174, 'nu_7': 0.4564626336657586, 'gamma_8': 0.8404468565527354, 'nu_8': 0.43328732065166276, 'gamma_9': 0.7533243208205446, 'nu_9': 0.14191695235556315, 'gamma_10': 0.03605700027692492, 'nu_10': 0.12868586614331556, 'gamma_11': 0.5471550712342759, 'nu_11': 0.4795455067782017, 'gamma_12': 0.4689734945580381, 'nu_12': 0.41414975910358587, 'gamma_1

Validation Results:
{'accuracy': '86.70', 'f1': '89.58', 'precision': '81.13', 'recall': '100.00'}

Test Results:
{'accuracy': '82.46', 'f1': '83.34', 'precision': '71.44', 'recall': '100.00'}


[I 2025-03-23 17:16:00,487] Trial 1 finished with value: 0.8997660608241858 and parameters: {'gamma_0': 0.7494423895393946, 'nu_0': 0.46393626526782966, 'gamma_1': 0.25720898624784544, 'nu_1': 0.11198172390845917, 'gamma_2': 0.6988899343691743, 'nu_2': 0.23785281274394615, 'gamma_3': 0.4357293011292819, 'nu_3': 0.4355138009874627, 'gamma_4': 0.5091393590525338, 'nu_4': 0.3247757609929658, 'gamma_5': 0.5928268021318711, 'nu_5': 0.1267168913482833, 'gamma_6': 0.7137514205700942, 'nu_6': 0.4324691151578541, 'gamma_7': 0.19471893851322328, 'nu_7': 0.4150642505791425, 'gamma_8': 0.09918293057797885, 'nu_8': 0.3646476359690805, 'gamma_9': 0.04304467340054076, 'nu_9': 0.07532664356391737, 'gamma_10': 0.9496159603638564, 'nu_10': 0.034525854788238296, 'gamma_11': 0.990556994644294, 'nu_11': 0.15496755699136866, 'gamma_12': 0.5585970916305931, 'nu_12': 0.4056389265234785, 'gamma_13': 0.32832093594553435, 'nu_13': 0.08191488440027572, 'gamma_14': 0.927620058584125, 'nu_14': 0.4187816472365474}. 

Validation Results:
{'accuracy': '87.26', 'f1': '89.98', 'precision': '81.78', 'recall': '100.00'}

Test Results:
{'accuracy': '82.58', 'f1': '83.43', 'precision': '71.58', 'recall': '100.00'}


[I 2025-03-23 17:16:03,623] Trial 2 finished with value: 0.8955223880597015 and parameters: {'gamma_0': 0.9832887538354625, 'nu_0': 0.3809544652144057, 'gamma_1': 0.6012621295151823, 'nu_1': 0.1799601283645402, 'gamma_2': 0.2145153714208462, 'nu_2': 0.2782957085525044, 'gamma_3': 0.7451824670408879, 'nu_3': 0.48856463528916755, 'gamma_4': 0.8918961152591217, 'nu_4': 0.2578447698970574, 'gamma_5': 0.662854149720816, 'nu_5': 0.1241363399539701, 'gamma_6': 0.04968969553027785, 'nu_6': 0.43860286459203934, 'gamma_7': 0.001085849462727517, 'nu_7': 0.29261557886451983, 'gamma_8': 0.06023092771724046, 'nu_8': 0.011260002467539925, 'gamma_9': 0.9709371180501986, 'nu_9': 0.3104751546787806, 'gamma_10': 0.9627915743658894, 'nu_10': 0.3801058009011189, 'gamma_11': 0.16389007161335928, 'nu_11': 0.37166271047431865, 'gamma_12': 0.5158830720220057, 'nu_12': 0.24350997213398057, 'gamma_13': 0.3487203620214816, 'nu_13': 0.06788823638169376, 'gamma_14': 0.08417338928905307, 'nu_14': 0.3650144748568997}

Validation Results:
{'accuracy': '86.65', 'f1': '89.55', 'precision': '81.08', 'recall': '100.00'}

Test Results:
{'accuracy': '82.32', 'f1': '83.23', 'precision': '71.28', 'recall': '100.00'}


[I 2025-03-23 17:16:06,637] Trial 3 finished with value: 0.8772955901275002 and parameters: {'gamma_0': 0.03644992684431349, 'nu_0': 0.281310043640185, 'gamma_1': 0.17295306248889067, 'nu_1': 0.3815861046119608, 'gamma_2': 0.5236221192683603, 'nu_2': 0.49376824080549464, 'gamma_3': 0.6190352894629599, 'nu_3': 0.45812651991813336, 'gamma_4': 0.7159009453776839, 'nu_4': 0.3826139693981117, 'gamma_5': 0.6617424762850775, 'nu_5': 0.05545719680844441, 'gamma_6': 0.8688873400921544, 'nu_6': 0.22850377548373965, 'gamma_7': 0.6258273610115933, 'nu_7': 0.40676577144136283, 'gamma_8': 0.9008038723463636, 'nu_8': 0.44899829833855326, 'gamma_9': 0.6549697107860363, 'nu_9': 0.2995139836337696, 'gamma_10': 0.8761889962135833, 'nu_10': 0.477481746420287, 'gamma_11': 0.5594701898893907, 'nu_11': 0.3347718942369531, 'gamma_12': 0.598383747277164, 'nu_12': 0.3540520496710268, 'gamma_13': 0.7249056441734836, 'nu_13': 0.18351058489099456, 'gamma_14': 0.8050294729033303, 'nu_14': 0.023238399235734085}. Bes

Validation Results:
{'accuracy': '84.00', 'f1': '87.73', 'precision': '78.14', 'recall': '100.00'}

Test Results:
{'accuracy': '79.33', 'f1': '80.93', 'precision': '67.97', 'recall': '100.00'}


[I 2025-03-23 17:16:09,417] Trial 4 finished with value: 0.9255260072808046 and parameters: {'gamma_0': 0.5669154936864093, 'nu_0': 0.12921817484541642, 'gamma_1': 0.058994155303764464, 'nu_1': 0.43704826479050446, 'gamma_2': 0.14648506976966313, 'nu_2': 0.05961200970976328, 'gamma_3': 0.15575171098598728, 'nu_3': 0.24905138912154026, 'gamma_4': 0.8047025132965189, 'nu_4': 0.32529090786306625, 'gamma_5': 0.763163727684511, 'nu_5': 0.017900105243288986, 'gamma_6': 0.12927885379108484, 'nu_6': 0.3022025633702206, 'gamma_7': 0.4189652209923801, 'nu_7': 0.4281875805781366, 'gamma_8': 0.12561222641639283, 'nu_8': 0.08281499508360768, 'gamma_9': 0.9719570072488274, 'nu_9': 0.439451163876606, 'gamma_10': 0.8845027841429421, 'nu_10': 0.3325377075270317, 'gamma_11': 0.8555967249916172, 'nu_11': 0.3173104182765947, 'gamma_12': 0.05968467783777435, 'nu_12': 0.03448327824375494, 'gamma_13': 0.9876180983016957, 'nu_13': 0.3200910864983726, 'gamma_14': 0.03145520779596738, 'nu_14': 0.204443580375505

Validation Results:
{'accuracy': '90.80', 'f1': '92.55', 'precision': '86.14', 'recall': '100.00'}

Test Results:
{'accuracy': '87.62', 'f1': '87.63', 'precision': '77.99', 'recall': '100.00'}


[I 2025-03-23 17:16:11,994] Trial 5 finished with value: 0.916198387490838 and parameters: {'gamma_0': 0.2734035759452197, 'nu_0': 0.02589378271261529, 'gamma_1': 0.7221512038282002, 'nu_1': 0.44798891030070714, 'gamma_2': 0.27617269223815094, 'nu_2': 0.29314022899896647, 'gamma_3': 0.2111165805395112, 'nu_3': 0.3406275701400363, 'gamma_4': 0.28115441565581606, 'nu_4': 0.2736204710232699, 'gamma_5': 0.41153921087273027, 'nu_5': 0.13278238887461236, 'gamma_6': 0.22977629648046907, 'nu_6': 0.4240509942760464, 'gamma_7': 0.8383955571940247, 'nu_7': 0.41342990167317845, 'gamma_8': 0.7781316255975664, 'nu_8': 0.19989560528643946, 'gamma_9': 0.5221662172895151, 'nu_9': 0.2021723344811848, 'gamma_10': 0.611588398377882, 'nu_10': 0.46975031215863766, 'gamma_11': 0.23723815826171468, 'nu_11': 0.047273507462255296, 'gamma_12': 0.10262168369052817, 'nu_12': 0.09826070612421638, 'gamma_13': 0.44117716627593306, 'nu_13': 0.3417772556505521, 'gamma_14': 0.20077928867535244, 'nu_14': 0.07056407469167

Validation Results:
{'accuracy': '89.54', 'f1': '91.62', 'precision': '84.54', 'recall': '100.00'}

Test Results:
{'accuracy': '85.66', 'f1': '85.95', 'precision': '75.37', 'recall': '100.00'}


[I 2025-03-23 17:16:14,600] Trial 6 finished with value: 0.9113554893978978 and parameters: {'gamma_0': 0.15908487554666845, 'nu_0': 0.1775463163996632, 'gamma_1': 0.8808008589191347, 'nu_1': 0.13669261432729263, 'gamma_2': 0.20673483010796373, 'nu_2': 0.4893550745661552, 'gamma_3': 0.9524799395821554, 'nu_3': 0.04552507575078269, 'gamma_4': 0.7109064428669034, 'nu_4': 0.0860588201399553, 'gamma_5': 0.23692222232455745, 'nu_5': 0.3468327250700627, 'gamma_6': 0.7463556290660461, 'nu_6': 0.29027398097637924, 'gamma_7': 0.7067221526597534, 'nu_7': 0.23048143914516592, 'gamma_8': 0.808784397905914, 'nu_8': 0.34959849222746864, 'gamma_9': 0.8978885163596545, 'nu_9': 0.441156111325116, 'gamma_10': 0.6058466557183015, 'nu_10': 0.39903798324474976, 'gamma_11': 0.9750024723442264, 'nu_11': 0.4305189208950752, 'gamma_12': 0.19859542095757488, 'nu_12': 0.05290349965586281, 'gamma_13': 0.27144982907218634, 'nu_13': 0.30182403380733563, 'gamma_14': 0.9620300808385422, 'nu_14': 0.11784351608818164}.

Validation Results:
{'accuracy': '88.87', 'f1': '91.14', 'precision': '83.71', 'recall': '100.00'}

Test Results:
{'accuracy': '85.69', 'f1': '85.98', 'precision': '75.40', 'recall': '100.00'}


[I 2025-03-23 17:16:17,769] Trial 7 finished with value: 0.9008467959882289 and parameters: {'gamma_0': 0.9801132999151941, 'nu_0': 0.42189712269633173, 'gamma_1': 0.08282642621643382, 'nu_1': 0.12049748880372743, 'gamma_2': 0.4037860098916045, 'nu_2': 0.13654239792804437, 'gamma_3': 0.004235222120988094, 'nu_3': 0.4354517634041983, 'gamma_4': 0.4945048883917035, 'nu_4': 0.05441019606579194, 'gamma_5': 0.48028616127621265, 'nu_5': 0.42123439901951404, 'gamma_6': 0.08512786810198976, 'nu_6': 0.4479785600951971, 'gamma_7': 0.9294487004616346, 'nu_7': 0.3042668180534234, 'gamma_8': 0.39191333884410384, 'nu_8': 0.02151514480739976, 'gamma_9': 0.055080592101232444, 'nu_9': 0.4295143665675078, 'gamma_10': 0.29672106667634157, 'nu_10': 0.4579922356747768, 'gamma_11': 0.019711045057503193, 'nu_11': 0.4785987813321709, 'gamma_12': 0.7638766310905619, 'nu_12': 0.1131168081287981, 'gamma_13': 0.917666840024555, 'nu_13': 0.07620532786508671, 'gamma_14': 0.2884706015729046, 'nu_14': 0.4180397550853

Validation Results:
{'accuracy': '87.41', 'f1': '90.08', 'precision': '81.96', 'recall': '100.00'}

Test Results:
{'accuracy': '83.28', 'f1': '83.99', 'precision': '72.40', 'recall': '100.00'}


[I 2025-03-23 17:16:22,607] Trial 8 finished with value: 0.8870490833826138 and parameters: {'gamma_0': 0.6073961655675137, 'nu_0': 0.22913670690728224, 'gamma_1': 0.36236180139847257, 'nu_1': 0.4146704427242079, 'gamma_2': 0.5949159742984201, 'nu_2': 0.35293390108646255, 'gamma_3': 0.1829130809510132, 'nu_3': 0.3383527595065068, 'gamma_4': 0.6700303841304416, 'nu_4': 0.397442387957175, 'gamma_5': 0.6593957876356328, 'nu_5': 0.43994181661381915, 'gamma_6': 0.8436955931896641, 'nu_6': 0.38403412510558005, 'gamma_7': 0.8967689026524674, 'nu_7': 0.41660344083372297, 'gamma_8': 0.5365970818146175, 'nu_8': 0.39689628725066095, 'gamma_9': 0.7959576167471973, 'nu_9': 0.08831104304978017, 'gamma_10': 0.1403494011806155, 'nu_10': 0.020003466312256017, 'gamma_11': 0.846460128258641, 'nu_11': 0.02437931091973282, 'gamma_12': 0.6319263395314921, 'nu_12': 0.26422074619441044, 'gamma_13': 0.8241457105444242, 'nu_13': 0.4842093410534231, 'gamma_14': 0.14689174426861418, 'nu_14': 0.24241539872055134}.

Validation Results:
{'accuracy': '85.43', 'f1': '88.70', 'precision': '79.70', 'recall': '100.00'}

Test Results:
{'accuracy': '80.82', 'f1': '82.06', 'precision': '69.58', 'recall': '100.00'}


[I 2025-03-23 17:16:25,439] Trial 9 finished with value: 0.918498561018921 and parameters: {'gamma_0': 0.12902505087746488, 'nu_0': 0.09981037700854342, 'gamma_1': 0.7774780035846625, 'nu_1': 0.12232995580700454, 'gamma_2': 0.6727536551709004, 'nu_2': 0.4622139764495268, 'gamma_3': 0.5443036635433685, 'nu_3': 0.4100935604635576, 'gamma_4': 0.4242358057226022, 'nu_4': 0.037619684144891245, 'gamma_5': 0.7443326707610354, 'nu_5': 0.0911986772380252, 'gamma_6': 0.7437521764440741, 'nu_6': 0.4581869440297834, 'gamma_7': 0.5590558918868506, 'nu_7': 0.4372582937515891, 'gamma_8': 0.23299483932142684, 'nu_8': 0.07172103769267855, 'gamma_9': 0.9559278681371495, 'nu_9': 0.06276496251637831, 'gamma_10': 0.43185900319369036, 'nu_10': 0.15595927331841825, 'gamma_11': 0.6860626281247586, 'nu_11': 0.3607785724914142, 'gamma_12': 0.3461616027186046, 'nu_12': 0.06441251435233655, 'gamma_13': 0.9248214818015166, 'nu_13': 0.2705236885500987, 'gamma_14': 0.9500479370951481, 'nu_14': 0.06777708383223977}. 

Validation Results:
{'accuracy': '89.85', 'f1': '91.85', 'precision': '84.93', 'recall': '100.00'}

Test Results:
{'accuracy': '86.52', 'f1': '86.68', 'precision': '76.50', 'recall': '100.00'}


In [32]:
import optuna
from plotly.io import show

# Optimization history of the DB-OCSVM study, rendered through plotly.
fig = optuna.visualization.plot_optimization_history(study=dbocsvm_study)
show(fig)

In [33]:
# Empirical distribution function of the objective values of the DB-OCSVM study.
# The study is passed as a one-element list, exactly as before.
fig = optuna.visualization.plot_edf(study=[dbocsvm_study])
show(fig)

In [34]:
# Collect the tuned OC-SVM hyperparameters of the best trial, grouped per cluster.
# best_params is read once here instead of on every loop iteration.
best_params = dbocsvm_study.best_params

# Parameter names have the form "gamma_<cluster>" / "nu_<cluster>", so each
# cluster id appears twice. dict.fromkeys de-duplicates the ids while keeping
# their first-seen order, so every cluster entry is built exactly once.
cluster_ids = dict.fromkeys(int(name.split("_")[1]) for name in best_params)

# Maps cluster id (int) -> keyword settings for that cluster's OC-SVM.
parameter_list = {
    cluster: {
        "kernel": "rbf",
        "gamma": best_params[f"gamma_{cluster}"],
        "nu": best_params[f"nu_{cluster}"],
    }
    for cluster in cluster_ids
}

Best parameters and values

In [35]:
# Summary of the pre-trained autoencoder: the architecture settings from the
# configuration cell plus the validation loss stored in the loaded checkpoint.
autoencoder_architecture = {
    **{
        field: existing_model_architecture[field]
        for field in (
            "input_dim",
            "hidden_dims",
            "latent_dim",
            "activation_type",
            "negative_slope",
            "output_activation_type",
        )
    },
    "val_loss": checkpoint["val_loss"],
}

print("Best autoencoder model:")
pprint.pprint(autoencoder_architecture, sort_dicts=False)
print("")

print("Reconstruction error:")
pprint.pprint(reconstruction_error, sort_dicts=False)
print("")

# Summary of the DBSCAN configuration used for clustering.
# NOTE(review): best_trial_dbscan is presumably only defined when DBSCAN tuning
# actually ran; confirm this cell still works with override_dbscan_tuning = True.
best_dbscan_parameters = {
    "eps": eps,
    "min_samples": min_samples,
    "distance_metric": dbscan_tuning_parameters["distance_metric"],
    "evaluation_metric": dbscan_tuning_parameters["evaluation_metric"],
    "score": best_trial_dbscan.value,
    "n_clusters": n_clusters,
    "cluster_data_points": cluster_data_points,
}

print("Best dbscan parameters")
pprint.pprint(best_dbscan_parameters, sort_dicts=False)
print("")

print("Best ocsvm parameters")
print(f"Tree algorithm: {dbocsvm_tree_algorithm}")
# best_value is the study's objective value. In the trial log it lines up with
# the validation F1 (e.g. 0.9185 vs. f1 '91.85'), not with the accuracy, so it
# is reported under a neutral label rather than "Accuracy".
print(f"Best score: {dbocsvm_study.best_value}")
pprint.pprint(parameter_list, sort_dicts=False)

Best autoencoder model:
{'input_dim': 16,
 'hidden_dims': [14, 12],
 'latent_dim': 10,
 'activation_type': 'LeakyReLU',
 'negative_slope': 0.2,
 'output_activation_type': 'Sigmoid',
 'val_loss': 3.415589755640219e-06}

Reconstruction error:
{'normal_loss': 2.2778223763883204e-06,
 'anomaly_loss': 122.1120784664741,
 'loss_difference': 122.11207618865173}

Best dbscan parameters
{'eps': 0.19511251087684434,
 'min_samples': 9,
 'distance_metric': 'manhattan',
 'evaluation_metric': 'silhouette',
 'score': 0.9532589912414551,
 'n_clusters': 15,
 'cluster_data_points': {0: 1555,
                         1: 1186,
                         2: 1056,
                         3: 689,
                         4: 91,
                         5: 326,
                         6: 212,
                         7: 46,
                         8: 69,
                         9: 19,
                         10: 20,
                         11: 16,
                         12: 9,
                         1

In [36]:
import json


def _write_json_atomically(path, payload):
    """Write `payload` as JSON to `path` without risking the existing file.

    The payload is serialized completely before any file is opened, so a
    serialization error (e.g. a value json cannot encode) leaves `path`
    untouched. The text is then written to a sibling temporary file and moved
    over `path` with os.replace.
    """
    serialized = json.dumps(payload)
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        f.write(serialized)
    os.replace(tmp_path, path)


best_score = dbocsvm_study.best_value

# Record describing this tuning run.
tuning_result = {
    "dbscan": best_dbscan_parameters,
    "ocsvm": {
        "tree_algorithm": dbocsvm_tree_algorithm,
        # The key name "accuracy" is kept so existing result files keep the
        # same schema; the value stored is the study's best objective value.
        "accuracy": best_score,
        "parameters": parameter_list,
    },
}

# Skeleton of a fresh results file, used when none exists yet.
results = {
    "max_score": 0,
    "autoencoder_architecture": autoencoder_architecture,
    "reconstruction_error": reconstruction_error,
    "tuning_results": {},
}

# Create the directory that results_path points into (falls back to the current
# directory when results_path has no directory component).
os.makedirs(os.path.dirname(results_path) or ".", exist_ok=True)

if os.path.exists(results_path):
    # Finish reading (and close the file) before deciding whether to rewrite it.
    with open(results_path, "r") as file:
        existing_results = json.load(file)

    # Only runs that beat the stored best score are appended.
    if existing_results["max_score"] < best_score:
        tuning_result["score"] = best_score
        # Keys loaded from JSON are strings, so the new id is stored as a
        # string too; the serialized file is the same as with an int key.
        tuning_result_id = str(len(existing_results["tuning_results"]))
        existing_results["max_score"] = best_score
        existing_results["tuning_results"][tuning_result_id] = tuning_result
        _write_json_atomically(results_path, existing_results)
else:
    # First run for this results file: store this run as entry "0".
    tuning_result["score"] = best_score
    results["max_score"] = best_score
    results["tuning_results"]["0"] = tuning_result
    _write_json_atomically(results_path, results)