In [1]:
import torch
import os

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Optuna storage URLs; only used when the matching with_storage_* flag is True.
dbscan_optuna_storage_path = "sqlite:///optuna_storage/dbscan_study.db"
ocsvm_optuna_storage_path = "sqlite:///optuna_storage/dbocsvm_study.db"

# All three pre-processed NSL-KDD splits live in the same directory.
processed_data_dir = "/home/jbct/Projects/thesis/db-ocsvm/data/processed/NSL-KDD/minamax_scaler_new"
train_set_path = f"{processed_data_dir}/train_set.csv"
val_set_path = f"{processed_data_dir}/validation_set.csv"
test_set_path = f"{processed_data_dir}/test_set.csv"

# JSON file that collects the best tuning results across runs.
results_path = "tuning_results/results_dbocsvm_01.json"

# True  -> quick smoke run: 10% sample, in-memory studies, few OCSVM trials.
# False -> full run: whole training set, persistent studies, many OCSVM trials.
test_run = True

if test_run:
    with_storage_dbscan = with_storage_dbocsvm = False
    use_sample, sample_size = True, 0.1
    ocsvm_trials = 10
else:
    # The SQLite files referenced by the storage URLs above live in this directory.
    os.makedirs("optuna_storage", exist_ok=True)
    with_storage_dbscan = with_storage_dbocsvm = True
    use_sample, sample_size = False, 1.0
    ocsvm_trials = 1000


dbscan_tuning_parameters = dict(
    evaluation_metric="silhouette",  # silhouette, calinski_harabasz, davies_bouldin
    distance_metric="manhattan",  # manhattan, euclidean, cosine
    trials=10,
)
dbocsvm_tree_algorithm = "ball_tree"  # "ball_tree" or "kd_tree"

# Checkpoint of the already-trained autoencoder that provides the encoder.
existing_model_path = (
    "best_models/config_2/"
    "autoencoder_Model_1_hidden[64, 32]_latent8_lr0.01_bs128_optadam_actLeakyReLU_slp0.2_dr0.3_wd0.pth"
)

# Constructor arguments used to rebuild the network before loading the checkpoint;
# they must describe the architecture the checkpoint was trained with.
existing_model_architecture = dict(
    input_dim=122,
    hidden_dims=[64, 32],
    latent_dim=8,
    activation_type="LeakyReLU",
    negative_slope=0.2,
    dropout_rate=0.3,
    output_activation_type="Sigmoid",
)

In [2]:
# CHANGE
# Set override_dbscan_tuning to True to use the fixed DBSCAN settings below
# instead of the best trial found by the Optuna DBSCAN study.
override_dbscan_tuning = False
dbscan_override_params = {
    "eps": 0.5,
    "min_samples": 5,
    "distance_metric": "manhattan",
    # NOTE(review): n_clusters is 5 but cluster_data_points only lists clusters
    # "0" and "1" (plus "-1", presumably the DBSCAN noise label) — confirm which is right.
    "n_clusters": 5,
    # Cluster label -> number of points; keys are strings here.
    "cluster_data_points": {"-1": 293, "0": 66992, "1": 57},
}

import dataset

In [3]:
import pandas as pd

# Load the training split from the path set in the configuration cell.
train_df = pd.read_csv(train_set_path)

# Optionally keep only a fraction (sample_size) of the rows; the fixed
# random_state makes the sample reproducible across runs.
if use_sample:
    train_df = train_df.sample(frac=sample_size, random_state=42).reset_index(drop=True)

print(train_df.shape)
# Last expression of the cell: display the first row as a quick sanity check.
train_df.head(1)

(6734, 122)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,9e-06,4.8e-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
from sklearn.model_selection import train_test_split

# Every column of train_df is used as a feature; the whole frame becomes the
# training matrix as a NumPy array.
X_train = train_df.values

print(X_train.shape)

(6734, 122)


In [5]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert the training matrix to a float32 PyTorch tensor
X_train_tensor = torch.FloatTensor(X_train)

# Wrap the tensor in a dataset (features only; no DataLoader is created in this cell)
train_dataset = TensorDataset(X_train_tensor)

# Number of feature columns in the training matrix.
input_dim = X_train.shape[1]
print(f"Input dimension: {input_dim}")

Input dimension: 122


use existing autoencoder

In [6]:
from torch import nn
from autoencoder import BatchNormAutoencoderV2

# Rebuild the network with exactly the hyper-parameters recorded in
# existing_model_architecture so the checkpoint's weights fit its layers.
_architecture_keys = (
    "input_dim",
    "hidden_dims",
    "latent_dim",
    "activation_type",
    "negative_slope",
    "dropout_rate",
    "output_activation_type",
)
autoencoder = BatchNormAutoencoderV2(
    **{key: existing_model_architecture[key] for key in _architecture_keys}
)

In [7]:
# Load best model
# map_location makes the checkpoint loadable regardless of the device it was
# saved from (e.g. a GPU checkpoint opened on a CPU-only machine).
checkpoint = torch.load(existing_model_path, map_location=device)
autoencoder.load_state_dict(checkpoint["model_state_dict"])

# Later cells move their input tensors to `device` before calling the model,
# so the model itself must live on the same device.
autoencoder.to(device)

# Inference mode: disables dropout and uses the BatchNorm running statistics.
# Kept as the last expression so the cell still displays the architecture.
autoencoder.eval()

BatchNormAutoencoderV2(
  (encoder): Sequential(
    (0): Linear(in_features=122, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): LeakyReLU(negative_slope=0.2)
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=32, out_features=8, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=8, out_features=32, bias=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=32, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): LeakyReLU(negative_slope

dbscan tuning

In [8]:
import numpy as np

# Encode the training set batch by batch and stack the latent vectors into a
# single NumPy array.
X_train_tensor = torch.FloatTensor(X_train)
X_train_dataset = TensorDataset(X_train_tensor)
X_train_loader = DataLoader(X_train_dataset, batch_size=256)

# No gradients are needed for feature extraction.
with torch.no_grad():
    train_latent_batches = [
        # batch is a one-element sequence because the dataset holds features only
        autoencoder.encode(batch[0].to(device)).cpu().numpy()
        for batch in X_train_loader
    ]
X_encoded = np.vstack(train_latent_batches)

In [9]:
from utils import find_eps_range_with_elbow_method

# Width of the latent representation produced by the encoder.
input_dim_encoded = X_encoded.shape[1]

# k passed to the elbow helper: (20 + 2 * latent_dim) / 2, as an integer.
k_for_elbow = (20 + 2 * input_dim_encoded) // 2
# CHANGE
# The eps search range is only needed when DBSCAN is actually tuned.
if not override_dbscan_tuning:
    eps_bounds = find_eps_range_with_elbow_method(X_encoded, k=k_for_elbow, plot=False)
    min_eps, max_eps = eps_bounds
    print(min_eps, max_eps)

12.041840557008982 48.16736222803593


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from utils import objective_dbscan
import optuna

def dbscan_objective_lambda(trial):
    """Optuna objective: score one DBSCAN configuration on the encoded training set.

    eps is searched inside the elbow-derived (min_eps, max_eps) range and
    min_samples between 1 and twice the latent dimension.
    """
    return objective_dbscan(
        trial,
        X_encoded=X_encoded,
        evaluation_metric=dbscan_tuning_parameters["evaluation_metric"],
        eps_range=(min_eps, max_eps),
        min_samples_range=(1, input_dim_encoded * 2),
        distance_metric=dbscan_tuning_parameters["distance_metric"],
        n_jobs=-1,
    )


# When the DBSCAN parameters are overridden there is nothing to tune, and
# min_eps / max_eps are not defined, so the study is skipped and dbscan_study
# stays None.
dbscan_study = None
if not override_dbscan_tuning:
    dbscan_study_kwargs = {"direction": "maximize"}
    if with_storage_dbscan:
        # Persist the study so an interrupted run can be resumed under the same name.
        dbscan_study_kwargs.update(
            storage=dbscan_optuna_storage_path,
            study_name="dbscan_study",
            load_if_exists=True,
        )

    dbscan_study = optuna.create_study(**dbscan_study_kwargs)
    dbscan_study.optimize(
        dbscan_objective_lambda,
        n_trials=dbscan_tuning_parameters["trials"],
    )

[I 2025-03-22 15:00:21,816] A new study created in memory with name: no-name-840cbbd0-4f8f-41c1-a3df-14f9c87dfa61


20
{0: 4930, 1: 289, 2: 66, 3: 26, 4: 834, 5: 94, 6: 42, 7: 30, 8: 91, 9: 63, 10: 46, 11: 13, 12: 28, 13: 67, 14: 13, 15: 13, 16: 6, 17: 9, 18: 7, 19: 6, -1: 61}


[I 2025-03-22 15:00:23,978] Trial 0 finished with value: 0.5079969763755798 and parameters: {'eps': 46.89610723861345, 'min_samples': 6}. Best is trial 0 with value: 0.5079969763755798.


25
{0: 383, 1: 3413, 2: 285, 3: 61, 4: 800, 5: 211, 6: 86, 7: 93, 8: 17, 9: 313, 10: 40, 11: 30, 12: 26, 13: 19, 14: 62, 15: 135, 16: 52, 17: 17, 18: 50, 19: 43, 20: 51, 21: 27, 22: 20, 23: 39, 24: 16, -1: 445}


[I 2025-03-22 15:00:25,344] Trial 1 finished with value: 0.5664207935333252 and parameters: {'eps': 24.262438595319438, 'min_samples': 12}. Best is trial 1 with value: 0.5664207935333252.


35
{0: 349, 1: 3271, 2: 270, 3: 55, 4: 718, 5: 80, 6: 191, 7: 84, 8: 14, 9: 293, 10: 44, 11: 18, 12: 48, 13: 39, 14: 33, 15: 120, 16: 9, 17: 24, 18: 44, 19: 22, 20: 24, 21: 25, 22: 21, 23: 13, 24: 20, 25: 17, 26: 15, 27: 10, 28: 8, 29: 18, 30: 9, 31: 10, 32: 9, 33: 10, 34: 9, -1: 790}


[I 2025-03-22 15:00:26,083] Trial 2 finished with value: 0.43399566411972046 and parameters: {'eps': 13.101446893693169, 'min_samples': 9}. Best is trial 1 with value: 0.5664207935333252.


23
{0: 351, 1: 3328, 2: 275, 3: 36, 4: 716, 5: 82, 6: 195, 7: 91, 8: 299, 9: 22, 10: 48, 11: 39, 12: 33, 13: 125, 14: 24, 15: 46, 16: 21, 17: 23, 18: 32, 19: 15, 20: 26, 21: 19, 22: 19, -1: 869}


[I 2025-03-22 15:00:26,896] Trial 3 finished with value: 0.5952938199043274 and parameters: {'eps': 16.030210554032887, 'min_samples': 15}. Best is trial 3 with value: 0.5952938199043274.


53
{0: 384, 1: 3419, 2: 286, 3: 4, 4: 66, 5: 25, 6: 830, 7: 236, 8: 280, 9: 93, 10: 4, 11: 321, 12: 41, 13: 42, 14: 30, 15: 26, 16: 63, 17: 28, 18: 58, 19: 50, 20: 4, 21: 4, 22: 3, 23: 18, 24: 64, 25: 54, 26: 8, 27: 6, 28: 18, 29: 12, 30: 6, 31: 4, 32: 8, 33: 11, 34: 10, 35: 4, 36: 5, 37: 3, 38: 13, 39: 3, 40: 5, 41: 3, 42: 3, 43: 3, 44: 6, 45: 3, 46: 4, 47: 4, 48: 4, 49: 3, 50: 3, 51: 3, 52: 3, -1: 143}


[I 2025-03-22 15:00:28,061] Trial 4 finished with value: 0.42953962087631226 and parameters: {'eps': 28.029589275983646, 'min_samples': 3}. Best is trial 3 with value: 0.5952938199043274.


107
{0: 385, 1: 3821, 2: 287, 3: 5, 4: 66, 5: 25, 6: 2, 7: 835, 8: 298, 9: 93, 10: 4, 11: 326, 12: 2, 13: 42, 14: 30, 15: 26, 16: 63, 17: 8, 18: 1, 19: 28, 20: 64, 21: 8, 22: 1, 23: 4, 24: 4, 25: 26, 26: 67, 27: 12, 28: 12, 29: 6, 30: 1, 31: 18, 32: 2, 33: 4, 34: 12, 35: 7, 36: 2, 37: 11, 38: 4, 39: 2, 40: 2, 41: 1, 42: 15, 43: 3, 44: 3, 45: 1, 46: 2, 47: 4, 48: 2, 49: 1, 50: 2, 51: 1, 52: 3, 53: 1, 54: 1, 55: 2, 56: 2, 57: 1, 58: 1, 59: 3, 60: 4, 61: 7, 62: 1, 63: 1, 64: 2, 65: 2, 66: 1, 67: 1, 68: 2, 69: 2, 70: 1, 71: 3, 72: 2, 73: 1, 74: 3, 75: 2, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 2, 83: 1, 84: 2, 85: 1, 86: 1, 87: 1, 88: 1, 89: 1, 90: 1, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 96: 1, 97: 1, 98: 1, 99: 2, 100: 1, 101: 1, 102: 1, 103: 1, 104: 1, 105: 1, 106: 1}


[I 2025-03-22 15:00:29,283] Trial 5 finished with value: 0.38646483421325684 and parameters: {'eps': 36.23156846985679, 'min_samples': 1}. Best is trial 3 with value: 0.5952938199043274.


19
{0: 4946, 1: 289, 2: 66, 3: 26, 4: 834, 5: 94, 6: 42, 7: 30, 8: 92, 9: 63, 10: 46, 11: 9, 12: 28, 13: 67, 14: 13, 15: 13, 16: 6, 17: 7, 18: 6, -1: 57}


[I 2025-03-22 15:00:30,492] Trial 6 finished with value: 0.5058961510658264 and parameters: {'eps': 47.99440511712989, 'min_samples': 6}. Best is trial 3 with value: 0.5952938199043274.


33
{0: 384, 1: 3418, 2: 286, 3: 66, 4: 24, 5: 830, 6: 233, 7: 280, 8: 93, 9: 321, 10: 42, 11: 42, 12: 30, 13: 26, 14: 63, 15: 28, 16: 58, 17: 50, 18: 18, 19: 65, 20: 54, 21: 12, 22: 6, 23: 18, 24: 11, 25: 6, 26: 8, 27: 11, 28: 10, 29: 5, 30: 13, 31: 6, 32: 6, -1: 211}


[I 2025-03-22 15:00:31,569] Trial 7 finished with value: 0.4702093005180359 and parameters: {'eps': 29.29171412808236, 'min_samples': 5}. Best is trial 3 with value: 0.5952938199043274.


16
{0: 4891, 1: 289, 2: 66, 3: 26, 4: 833, 5: 94, 6: 42, 7: 30, 8: 87, 9: 63, 10: 46, 11: 28, 12: 66, 13: 12, 14: 23, 15: 12, -1: 126}


[I 2025-03-22 15:00:32,689] Trial 8 finished with value: 0.5001127123832703 and parameters: {'eps': 47.52527257891838, 'min_samples': 12}. Best is trial 3 with value: 0.5952938199043274.


19
{0: 384, 1: 3772, 2: 287, 3: 66, 4: 830, 5: 239, 6: 93, 7: 324, 8: 44, 9: 42, 10: 30, 11: 26, 12: 63, 13: 59, 14: 65, 15: 55, 16: 17, 17: 20, 18: 16, -1: 302}


[I 2025-03-22 15:00:33,916] Trial 9 finished with value: 0.5180031061172485 and parameters: {'eps': 37.094580584649634, 'min_samples': 16}. Best is trial 3 with value: 0.5952938199043274.


In [27]:
import optuna
from plotly.io import show

# Plot the optimization history of the DBSCAN study with Optuna's built-in
# visualization and render the figure through plotly.
fig = optuna.visualization.plot_optimization_history(dbscan_study)
show(fig)

In [28]:
# Plot the empirical distribution function of the DBSCAN study's objective values.
fig = optuna.visualization.plot_edf([dbscan_study])
show(fig)

In [11]:
import pprint

# Choose the DBSCAN configuration: either the manual override or the best
# trial of the tuning study (whose cluster statistics are stored as user attrs).
if override_dbscan_tuning:
    eps, min_samples, n_clusters, cluster_data_points = (
        dbscan_override_params[key]
        for key in ("eps", "min_samples", "n_clusters", "cluster_data_points")
    )
else:
    eps = dbscan_study.best_params["eps"]
    min_samples = dbscan_study.best_params["min_samples"]

    best_trial_dbscan = dbscan_study.best_trial
    best_trial_dbscan_user_attrs = best_trial_dbscan.user_attrs
    n_clusters = best_trial_dbscan_user_attrs["n_clusters"]
    cluster_data_points = best_trial_dbscan_user_attrs["cluster_data_points"]


for label, chosen in (
    ("eps", eps),
    ("min_samples", min_samples),
    ("n_clusters", n_clusters),
):
    print(f"{label} = {chosen}")
print("cluster_data_points")
pprint.pprint(cluster_data_points)

eps = 16.030210554032887
min_samples = 15
n_clusters = 23
cluster_data_points
{-1: 869,
 0: 351,
 1: 3328,
 2: 275,
 3: 36,
 4: 716,
 5: 82,
 6: 195,
 7: 91,
 8: 299,
 9: 22,
 10: 48,
 11: 39,
 12: 33,
 13: 125,
 14: 24,
 15: 46,
 16: 21,
 17: 23,
 18: 32,
 19: 15,
 20: 26,
 21: 19,
 22: 19}


fit the DBSCAN

In [12]:
from db_ocsvm_04 import DBOCSVM

# Build the DB-OC-SVM model. The DBSCAN stage uses the eps / min_samples chosen
# above; kernel, gamma and nu are initial OCSVM settings.

# The distance metric comes from the same source as eps / min_samples.
metric_source = (
    dbscan_override_params if override_dbscan_tuning else dbscan_tuning_parameters
)
dbscan_distance_metric = metric_source["distance_metric"]

dbocsvm = DBOCSVM(
    # OCSVM stage
    kernel="rbf",
    gamma="auto",
    nu=0.2,
    # DBSCAN stage
    eps=eps,
    min_samples=min_samples,
    dbscan_distance_metric=dbscan_distance_metric,
    tree_algorithm=dbocsvm_tree_algorithm,
)

In [13]:
# Fit only the DBSCAN clustering stage on the encoded training data; with
# verbose=True the model prints the cluster labels and sizes it found.
dbocsvm.fit_cluster(X_encoded, verbose=True)

Fitting DBSCAN...
DBSCAN Fitted...
Unique Clusters: [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
Cluster Sizes: {-1: 869, 0: 351, 1: 3328, 2: 275, 3: 36, 4: 716, 5: 82, 6: 195, 7: 91, 8: 299, 9: 22, 10: 48, 11: 39, 12: 33, 13: 125, 14: 24, 15: 46, 16: 21, 17: 23, 18: 32, 19: 15, 20: 26, 21: 19, 22: 19}


<db_ocsvm_04.DBOCSVM at 0x7a38d712f8f0>

Importing validation set

In [14]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Label columns that must not be fed to the model as features.
label_columns = ["attack_binary", "attack_categorical", "attack_class"]

print("Before SMOTE:")
val_df = pd.read_csv(val_set_path)
X_val = val_df.drop(columns=label_columns).values
y_val = val_df["attack_binary"].values
y_val_class = val_df["attack_class"].values
print(val_df.shape)
print(val_df["attack_class"].value_counts())

# Oversample the attack classes of the VALIDATION set with SMOTE, using the
# multi-class labels. Each entry is the target count for that class after
# resampling; "normal" is not listed.
sampling_strategy = dict(DoS=4000, R2L=2000, Probe=2000, U2R=500)
smote = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=3, random_state=42)
X_val_resampled, y_val_resampled = smote.fit_resample(X_val, y_val_class)

# Class distribution after oversampling.
print("\nAfter SMOTE:")
print(f"Val set count: {X_val_resampled.shape[0]:,}")
after_counts = pd.Series(y_val_resampled).value_counts()
print(after_counts)

# Collapse the class labels back to binary targets: 1 for "normal", -1 for any attack.
y_val_resampled = np.where(y_val_resampled == "normal", 1, -1)
print(f"\n{X_val_resampled.shape}, {y_val_resampled.shape}")

Before SMOTE:
(11272, 125)
attack_class
normal    4856
DoS       3729
R2L       1377
Probe     1210
U2R        100
Name: count, dtype: int64

After SMOTE:
Val set count: 13,356
normal    4856
DoS       4000
R2L       2000
Probe     2000
U2R        500
Name: count, dtype: int64

(13356, 122), (13356,)


importing test set

In [15]:
# Load the held-out test split and show its size and class distribution.
test_df = pd.read_csv(test_set_path)
print(test_df.shape)
print(test_df["attack_class"].value_counts())
# Last expression of the cell: display the first row as a quick sanity check.
test_df.head(1)

(11271, 125)
attack_class
normal    4855
DoS       3728
R2L       1377
Probe     1211
U2R        100
Name: count, dtype: int64


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_binary,attack_categorical,attack_class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,neptune,DoS


In [16]:
# Splitting into X and y: drop the three label columns to obtain the features.
X_test = test_df.drop(
    columns=["attack_binary", "attack_categorical", "attack_class"]
).values
# Binary target used for evaluation.
y_test = test_df["attack_binary"].values
# Multi-class label; kept as a pandas Series (no .values), unlike y_test.
y_test_class = test_df["attack_class"]

print(X_test.shape, y_test.shape)

(11271, 122) (11271,)


reconstruction error inspection

In [17]:
# Split the test features by binary label: 1 marks normal traffic, -1 anomalies.
normal_mask = y_test == 1
anomaly_mask = y_test == -1
X_test_normal = X_test[normal_mask]
X_test_anomaly = X_test[anomaly_mask]

print(f"Normal test samples: {X_test_normal.shape[0]}")
print(f"Anomaly test samples: {X_test_anomaly.shape[0]}")

# Both groups go to the compute device as float tensors ...
X_test_normal_tensor = torch.FloatTensor(X_test_normal).to(device)
X_test_anomaly_tensor = torch.FloatTensor(X_test_anomaly).to(device)

# ... and are wrapped in unshuffled loaders for batched evaluation.
normal_test_dataset = TensorDataset(X_test_normal_tensor)
anomaly_test_dataset = TensorDataset(X_test_anomaly_tensor)
normal_test_loader = DataLoader(normal_test_dataset, shuffle=False, batch_size=256)
anomaly_test_loader = DataLoader(anomaly_test_dataset, shuffle=False, batch_size=256)


def calculate_reconstruction_error(model, loader):
    """Return the mean per-sample reconstruction MSE of `model` over `loader`.

    The model is switched to eval mode and run without gradients. Each batch
    is expected to be a sequence whose first element is the input tensor; the
    squared error is averaged over dim 1 for every sample and then averaged
    over all samples seen.

    Raises:
        ZeroDivisionError: if the loader yields no samples.
    """
    model.eval()
    batch_error_sums = []
    sample_count = 0

    with torch.no_grad():
        for batch in loader:
            inputs = batch[0]
            squared_error = nn.functional.mse_loss(model(inputs), inputs, reduction="none")
            # One MSE value per sample, summed over the batch.
            batch_error_sums.append(squared_error.mean(dim=1).sum().item())
            sample_count += inputs.size(0)

    return sum(batch_error_sums) / sample_count


# Summarise how differently a model reconstructs normal vs. anomalous test data.
def evaluate_model(model):
    """Return the normal and anomaly reconstruction errors and their gap.

    Uses the module-level normal_test_loader and anomaly_test_loader. The
    result has the keys "normal_loss", "anomaly_loss" and "loss_difference"
    (anomaly minus normal).
    """
    report = {
        "normal_loss": calculate_reconstruction_error(model, normal_test_loader),
        "anomaly_loss": calculate_reconstruction_error(model, anomaly_test_loader),
    }
    report["loss_difference"] = report["anomaly_loss"] - report["normal_loss"]
    return report


# Reconstruction errors of the loaded autoencoder on the test set.
reconstruction_error = evaluate_model(autoencoder)
# Last expression of the cell: display the resulting dictionary.
reconstruction_error

Normal test samples: 4855
Anomaly test samples: 6416


{'normal_loss': 0.0014022980549062932,
 'anomaly_loss': 0.03032843417434621,
 'loss_difference': 0.028926136119439914}

extract features from validation data

In [18]:
# Encode the (resampled) validation features with the autoencoder's encoder.
X_val_tensor = torch.FloatTensor(X_val_resampled).to(device)

# The second tensor is a placeholder target of zeros; it is ignored when iterating.
X_val_dataset_tensor = TensorDataset(X_val_tensor, torch.zeros(len(X_val_tensor)))
X_val_loader = DataLoader(X_val_dataset_tensor, batch_size=128)

with torch.no_grad():
    val_latent_batches = [
        autoencoder.encode(features).cpu().numpy() for features, _ in X_val_loader
    ]

X_val_encoded = np.vstack(val_latent_batches)
print(X_val_encoded.shape)

(13356, 8)


extract features from test data

In [19]:
# Encode the test features with the autoencoder's encoder.
X_test_tensor = torch.FloatTensor(X_test).to(device)

# The second tensor is a placeholder target of zeros; it is ignored when iterating.
test_dataset = TensorDataset(X_test_tensor, torch.zeros(len(X_test_tensor)))
test_loader = DataLoader(test_dataset, batch_size=128)

with torch.no_grad():
    test_latent_batches = [
        autoencoder.encode(features).cpu().numpy() for features, _ in test_loader
    ]

X_test_encoded = np.vstack(test_latent_batches)
print(X_test_encoded.shape)

(11271, 8)


tuning the ocsvms

In [20]:
from utils import objective_dbocsvm_fit_ocsvm

# Optuna study that tunes the per-cluster OCSVM hyper-parameters of the
# already-clustered DB-OC-SVM model.
def dbocsvm_fit_ocsvm_objective_lambda(trial):
    """Optuna objective for the OCSVM stage, scored with the "f1" metric.

    NOTE(review): the test split is passed in as well — presumably only so the
    objective can report test metrics next to the validation ones; confirm it
    does not influence the returned score.
    """
    return objective_dbocsvm_fit_ocsvm(
        trial,
        model=dbocsvm,
        X_encoded_train=X_encoded,
        X_encoded_validation=X_val_encoded,
        y_validation=y_val_resampled,
        X_encoded_test=X_test_encoded,
        y_test=y_test,
        cluster_count=n_clusters,
        metric="f1",
    )


dbocsvm_study_kwargs = {"direction": "maximize"}
if with_storage_dbocsvm:
    # Persist the study so an interrupted run can be resumed under the same name.
    dbocsvm_study_kwargs.update(
        storage=ocsvm_optuna_storage_path,
        study_name="dbocsvm_study",
        load_if_exists=True,
    )

dbocsvm_study = optuna.create_study(**dbocsvm_study_kwargs)
dbocsvm_study.optimize(
    dbocsvm_fit_ocsvm_objective_lambda,
    n_trials=ocsvm_trials,
)

[I 2025-03-22 15:00:35,306] A new study created in memory with name: no-name-f199d9b8-bedc-41cc-a210-5b56b92e6ed2
[I 2025-03-22 15:00:39,248] Trial 0 finished with value: 0.8676523994811932 and parameters: {'gamma_0': 0.25623113805785297, 'nu_0': 0.13873082213276486, 'gamma_1': 0.9528062194301627, 'nu_1': 0.2464891845645179, 'gamma_2': 0.6428562870278463, 'nu_2': 0.3025285254888912, 'gamma_3': 0.7119787880052952, 'nu_3': 0.4924801172120277, 'gamma_4': 0.9824760521081745, 'nu_4': 0.3704434375756312, 'gamma_5': 0.6058394290264879, 'nu_5': 0.01416225658713383, 'gamma_6': 0.1795385537017633, 'nu_6': 0.10137290555071118, 'gamma_7': 0.8019390501691726, 'nu_7': 0.17786576168309734, 'gamma_8': 0.16808569636004925, 'nu_8': 0.28854358217032167, 'gamma_9': 0.38659504843467785, 'nu_9': 0.395553279400038, 'gamma_10': 0.45759720529285974, 'nu_10': 0.28155277312637633, 'gamma_11': 0.3377663867654113, 'nu_11': 0.21302351674580042, 'gamma_12': 0.8564380731047843, 'nu_12': 0.2939522133173486, 'gamma_13'

Validation Results:
{'accuracy': '80.90', 'f1': '86.77', 'precision': '77.61', 'recall': '98.38'}

Test Results:
{'accuracy': '78.74', 'f1': '84.04', 'precision': '73.38', 'recall': '98.32'}


[I 2025-03-22 15:00:43,028] Trial 1 finished with value: 0.8797083839611178 and parameters: {'gamma_0': 0.10410322417231911, 'nu_0': 0.19232538877204144, 'gamma_1': 0.6106616232448593, 'nu_1': 0.06931042224527346, 'gamma_2': 0.8730159494930958, 'nu_2': 0.10302459992986295, 'gamma_3': 0.8246039902828104, 'nu_3': 0.30353881859998366, 'gamma_4': 0.6289380950499356, 'nu_4': 0.47607516132955724, 'gamma_5': 0.5034143942419885, 'nu_5': 0.3909535111038408, 'gamma_6': 0.2876601272098942, 'nu_6': 0.1294259288489986, 'gamma_7': 0.6811806199919828, 'nu_7': 0.4392928109906282, 'gamma_8': 0.5986586193361947, 'nu_8': 0.41008283113036487, 'gamma_9': 0.3422318924409581, 'nu_9': 0.06825540804999107, 'gamma_10': 0.07773275533374371, 'nu_10': 0.22866383058935766, 'gamma_11': 0.8611982685509222, 'nu_11': 0.045743135404684165, 'gamma_12': 0.3932781116169794, 'nu_12': 0.3652298001291156, 'gamma_13': 0.1079282208376915, 'nu_13': 0.2042157675542284, 'gamma_14': 0.3268170443968526, 'nu_14': 0.34822438607985845,

Validation Results:
{'accuracy': '82.95', 'f1': '87.97', 'precision': '79.84', 'recall': '97.95'}

Test Results:
{'accuracy': '80.76', 'f1': '85.27', 'precision': '75.57', 'recall': '97.83'}


[I 2025-03-22 15:00:46,685] Trial 2 finished with value: 0.8748952221290863 and parameters: {'gamma_0': 0.45801987919105414, 'nu_0': 0.11678484733356802, 'gamma_1': 0.9491436578255616, 'nu_1': 0.15907917486337672, 'gamma_2': 0.536776733694983, 'nu_2': 0.24572511554562812, 'gamma_3': 0.5480088947218986, 'nu_3': 0.4119312850693259, 'gamma_4': 0.572364954733424, 'nu_4': 0.0807735989607746, 'gamma_5': 0.80969147172905, 'nu_5': 0.28335182022961264, 'gamma_6': 0.3228306108898019, 'nu_6': 0.3217697176849564, 'gamma_7': 0.9987577298389, 'nu_7': 0.49808080052319004, 'gamma_8': 0.13863259301985184, 'nu_8': 0.4106894533757948, 'gamma_9': 0.7825848019099976, 'nu_9': 0.3377846594094824, 'gamma_10': 0.37141009214020704, 'nu_10': 0.30033974407163094, 'gamma_11': 0.5795725977722586, 'nu_11': 0.02547306249539768, 'gamma_12': 0.7034354917258744, 'nu_12': 0.24622015993209012, 'gamma_13': 0.05408029921434303, 'nu_13': 0.35533069000164674, 'gamma_14': 0.29137785785827786, 'nu_14': 0.3075315942990233, 'gamm

Validation Results:
{'accuracy': '82.12', 'f1': '87.49', 'precision': '78.86', 'recall': '98.24'}

Test Results:
{'accuracy': '80.09', 'f1': '84.89', 'precision': '74.72', 'recall': '98.29'}


[I 2025-03-22 15:00:49,850] Trial 3 finished with value: 0.8800252578404546 and parameters: {'gamma_0': 0.8250467688891865, 'nu_0': 0.24400939034490074, 'gamma_1': 0.4645327049251971, 'nu_1': 0.1866813465051962, 'gamma_2': 0.2824314829986835, 'nu_2': 0.15298955732761482, 'gamma_3': 0.3427851778018676, 'nu_3': 0.41754069557951806, 'gamma_4': 0.8705971345031307, 'nu_4': 0.08905746176478929, 'gamma_5': 0.4453260594536206, 'nu_5': 0.44272755530109315, 'gamma_6': 0.6450631097510566, 'nu_6': 0.21921207726905217, 'gamma_7': 0.30080046442821057, 'nu_7': 0.10233047628702886, 'gamma_8': 0.2770673351184267, 'nu_8': 0.016826398621519113, 'gamma_9': 0.9970150664731702, 'nu_9': 0.030050167494213534, 'gamma_10': 0.41918816056347347, 'nu_10': 0.0866948639930445, 'gamma_11': 0.6895138963343518, 'nu_11': 0.47405102189193543, 'gamma_12': 0.024793937004186358, 'nu_12': 0.1408629884619456, 'gamma_13': 0.60561627514641, 'nu_13': 0.47565164420771494, 'gamma_14': 0.8702601086869612, 'nu_14': 0.179421702648586

Validation Results:
{'accuracy': '82.93', 'f1': '88.00', 'precision': '79.61', 'recall': '98.38'}

Test Results:
{'accuracy': '80.68', 'f1': '85.27', 'precision': '75.31', 'recall': '98.27'}


[I 2025-03-22 15:00:53,675] Trial 4 finished with value: 0.8552101621398327 and parameters: {'gamma_0': 0.051468454328699585, 'nu_0': 0.03092785216679903, 'gamma_1': 0.5853189696655466, 'nu_1': 0.40547713280676934, 'gamma_2': 0.004503509283913137, 'nu_2': 0.2812049071719025, 'gamma_3': 0.21391817556027518, 'nu_3': 0.04950495332427521, 'gamma_4': 0.8901655609993594, 'nu_4': 0.46744368254397023, 'gamma_5': 0.09225295139125586, 'nu_5': 0.2051899319759697, 'gamma_6': 0.28274509559441735, 'nu_6': 0.1873011507151988, 'gamma_7': 0.22028288980842514, 'nu_7': 0.2597373325284134, 'gamma_8': 0.5252587155611405, 'nu_8': 0.18004479761718137, 'gamma_9': 0.8655472142017638, 'nu_9': 0.1606445952220797, 'gamma_10': 0.8004496449998035, 'nu_10': 0.42348985067088224, 'gamma_11': 0.52367988078645, 'nu_11': 0.07200343309859138, 'gamma_12': 0.47844573906396676, 'nu_12': 0.1683727480255828, 'gamma_13': 0.3480385517647494, 'nu_13': 0.48612440912230265, 'gamma_14': 0.08243246035200606, 'nu_14': 0.28772643975814

Validation Results:
{'accuracy': '79.01', 'f1': '85.52', 'precision': '76.21', 'recall': '97.42'}

Test Results:
{'accuracy': '76.03', 'f1': '82.24', 'precision': '71.12', 'recall': '97.48'}


[I 2025-03-22 15:00:57,011] Trial 5 finished with value: 0.8812539582013933 and parameters: {'gamma_0': 0.5757517763551621, 'nu_0': 0.03283100931585545, 'gamma_1': 0.4647903380921226, 'nu_1': 0.2055246904334975, 'gamma_2': 0.11384006571080982, 'nu_2': 0.15686146312022403, 'gamma_3': 0.3005580356555729, 'nu_3': 0.24652460807720647, 'gamma_4': 0.16546085720459136, 'nu_4': 0.36400701774630434, 'gamma_5': 0.027453242980593446, 'nu_5': 0.1742656209286344, 'gamma_6': 0.868475161569277, 'nu_6': 0.33535773986810413, 'gamma_7': 0.35989191995623543, 'nu_7': 0.43170717622363347, 'gamma_8': 0.3670099105610485, 'nu_8': 0.46619165451925243, 'gamma_9': 0.42685744204224196, 'nu_9': 0.32728596496100687, 'gamma_10': 0.00950222063950123, 'nu_10': 0.21091471290480504, 'gamma_11': 0.9894742279761547, 'nu_11': 0.25242195114517935, 'gamma_12': 0.7049871973556343, 'nu_12': 0.08383075599329053, 'gamma_13': 0.9746740601576385, 'nu_13': 0.24200331145726517, 'gamma_14': 0.16656177794816104, 'nu_14': 0.31856489646

Validation Results:
{'accuracy': '83.15', 'f1': '88.13', 'precision': '79.91', 'recall': '98.22'}

Test Results:
{'accuracy': '80.80', 'f1': '85.32', 'precision': '75.55', 'recall': '97.99'}


[I 2025-03-22 15:01:00,443] Trial 6 finished with value: 0.8765872599433309 and parameters: {'gamma_0': 0.14702170130273443, 'nu_0': 0.4645580596911664, 'gamma_1': 0.24455421141172567, 'nu_1': 0.3207079258493952, 'gamma_2': 0.006820280511246862, 'nu_2': 0.011008126591359899, 'gamma_3': 0.8497696240165304, 'nu_3': 0.17887781841577938, 'gamma_4': 0.2573684753300145, 'nu_4': 0.0727931866920603, 'gamma_5': 0.2963181518160783, 'nu_5': 0.2256748020861992, 'gamma_6': 0.6643310891175991, 'nu_6': 0.4175200279377218, 'gamma_7': 0.8755544833165999, 'nu_7': 0.46338379394527146, 'gamma_8': 0.9630889707796232, 'nu_8': 0.13162933502779503, 'gamma_9': 0.18868170548755622, 'nu_9': 0.23813987386008292, 'gamma_10': 0.8576740406971972, 'nu_10': 0.11605325067555544, 'gamma_11': 0.88063491916363, 'nu_11': 0.43338245763778754, 'gamma_12': 0.5849107153135833, 'nu_12': 0.17982510455902226, 'gamma_13': 0.9590066259905683, 'nu_13': 0.27443065739460054, 'gamma_14': 0.41337155750336535, 'nu_14': 0.4666829854555465

Validation Results:
{'accuracy': '82.39', 'f1': '87.66', 'precision': '79.12', 'recall': '98.27'}

Test Results:
{'accuracy': '79.60', 'f1': '84.57', 'precision': '74.26', 'recall': '98.21'}


[I 2025-03-22 15:01:04,074] Trial 7 finished with value: 0.8641593148635403 and parameters: {'gamma_0': 0.9458125205142646, 'nu_0': 0.23646841321263504, 'gamma_1': 0.012966837499919946, 'nu_1': 0.37605741604579224, 'gamma_2': 0.195945411306497, 'nu_2': 0.42483414131132946, 'gamma_3': 0.20345744478597536, 'nu_3': 0.18147572707496284, 'gamma_4': 0.6795314595175077, 'nu_4': 0.4013689757525263, 'gamma_5': 0.24618924100290093, 'nu_5': 0.47174368303067926, 'gamma_6': 0.1681464272139919, 'nu_6': 0.4338778720161217, 'gamma_7': 0.7967722192107901, 'nu_7': 0.18874106713941521, 'gamma_8': 0.6003125353330113, 'nu_8': 0.354249559628762, 'gamma_9': 0.5576782034140547, 'nu_9': 0.4719627885150287, 'gamma_10': 0.8041885750971405, 'nu_10': 0.0897101186086356, 'gamma_11': 0.03886267726060677, 'nu_11': 0.292909743341908, 'gamma_12': 0.6153184288547611, 'nu_12': 0.050859044722192236, 'gamma_13': 0.8685264397653899, 'nu_13': 0.498960239810536, 'gamma_14': 0.9972611900923661, 'nu_14': 0.47418253951975525, 'g

Validation Results:
{'accuracy': '80.29', 'f1': '86.42', 'precision': '76.95', 'recall': '98.53'}

Test Results:
{'accuracy': '76.86', 'f1': '82.90', 'precision': '71.55', 'recall': '98.53'}


[I 2025-03-22 15:01:07,660] Trial 8 finished with value: 0.8606334377085365 and parameters: {'gamma_0': 0.5064803820474238, 'nu_0': 0.49298972959659904, 'gamma_1': 0.9545818666707051, 'nu_1': 0.3919938966954595, 'gamma_2': 0.02691104463791596, 'nu_2': 0.1836409124653621, 'gamma_3': 0.7959957813328502, 'nu_3': 0.4309589493951602, 'gamma_4': 0.4391563596475496, 'nu_4': 0.05317448509691278, 'gamma_5': 0.22767852239946593, 'nu_5': 0.01470457897063163, 'gamma_6': 0.9065518962183124, 'nu_6': 0.3419629361327847, 'gamma_7': 0.3209876682259564, 'nu_7': 0.3194188709941471, 'gamma_8': 0.9882675713464741, 'nu_8': 0.2266254836156347, 'gamma_9': 0.5975639491040831, 'nu_9': 0.027421061178241793, 'gamma_10': 0.5581457608672201, 'nu_10': 0.21944797291115128, 'gamma_11': 0.7585575588350612, 'nu_11': 0.4603693454174291, 'gamma_12': 0.9811471580457214, 'nu_12': 0.014175781201616094, 'gamma_13': 0.6638892709323817, 'nu_13': 0.2983686034030399, 'gamma_14': 0.32919290130176787, 'nu_14': 0.38889096983795607, 

Validation Results:
{'accuracy': '79.67', 'f1': '86.06', 'precision': '76.34', 'recall': '98.62'}

Test Results:
{'accuracy': '77.14', 'f1': '83.09', 'precision': '71.75', 'recall': '98.71'}


[I 2025-03-22 15:01:11,132] Trial 9 finished with value: 0.872579637726421 and parameters: {'gamma_0': 0.9202773717136662, 'nu_0': 0.3878934589717985, 'gamma_1': 0.8453957533367764, 'nu_1': 0.17930024021086977, 'gamma_2': 0.2133783586116206, 'nu_2': 0.4724865460087344, 'gamma_3': 0.40072177605005904, 'nu_3': 0.05229702236307669, 'gamma_4': 0.11608616409825864, 'nu_4': 0.49326796491677, 'gamma_5': 0.9191850969370347, 'nu_5': 0.26507292526029613, 'gamma_6': 0.6218022594943851, 'nu_6': 0.3760659433490513, 'gamma_7': 0.9695587756613566, 'nu_7': 0.07085974061525631, 'gamma_8': 0.03094107348204534, 'nu_8': 0.3480616981600322, 'gamma_9': 0.15261648194285132, 'nu_9': 0.3838166377532291, 'gamma_10': 0.9036237483760211, 'nu_10': 0.22090524743148318, 'gamma_11': 0.41676479026941, 'nu_11': 0.3745448385439501, 'gamma_12': 0.6362987872437144, 'nu_12': 0.0784648953719209, 'gamma_13': 0.48988895450317527, 'nu_13': 0.3981087939257058, 'gamma_14': 0.28906062248913006, 'nu_14': 0.34220809135518476, 'gamm

Validation Results:
{'accuracy': '81.67', 'f1': '87.26', 'precision': '78.25', 'recall': '98.61'}

Test Results:
{'accuracy': '79.35', 'f1': '84.46', 'precision': '73.87', 'recall': '98.58'}


In [24]:
import optuna
from plotly.io import show

# Plot the optimization history of the OCSVM study with Optuna's built-in
# visualization and render the figure through plotly.
fig = optuna.visualization.plot_optimization_history(dbocsvm_study)
show(fig)

In [25]:
# Plot the empirical distribution function of the OCSVM study's objective values.
fig = optuna.visualization.plot_edf([dbocsvm_study])
show(fig)

In [21]:
# Regroup the flat best-trial parameters ("gamma_<i>", "nu_<i>") into one
# OCSVM configuration per cluster index.
# best_params is read once instead of on every lookup.
best_ocsvm_params = dbocsvm_study.best_params

# Every cluster appears under two keys (gamma_<i> and nu_<i>); dict.fromkeys
# de-duplicates the indices while keeping their first-seen order.
cluster_ids = dict.fromkeys(
    int(param_name.rsplit("_", 1)[1]) for param_name in best_ocsvm_params
)

parameter_list = {
    cluster: {
        "kernel": "rbf",
        "gamma": best_ocsvm_params[f"gamma_{cluster}"],
        "nu": best_ocsvm_params[f"nu_{cluster}"],
    }
    for cluster in cluster_ids
}

best parameters and values

In [22]:
# Architecture of the loaded autoencoder plus the validation loss stored in
# its checkpoint.
autoencoder_architecture = {
    "input_dim": existing_model_architecture["input_dim"],
    "hidden_dims": existing_model_architecture["hidden_dims"],
    "latent_dim": existing_model_architecture["latent_dim"],
    "activation_type": existing_model_architecture["activation_type"],
    "negative_slope": existing_model_architecture["negative_slope"],
    "dropout_rate": existing_model_architecture["dropout_rate"],
    "output_activation_type": existing_model_architecture["output_activation_type"],
    "val_loss": checkpoint["val_loss"],
}

print("Best autoencoder model:")
pprint.pprint(autoencoder_architecture, sort_dicts=False)
print("")

print("Reconstruction error:")
pprint.pprint(reconstruction_error, sort_dicts=False)
print("")

# DBSCAN settings actually used by the model.
best_dbscan_parameters = {
    "eps": eps,
    "min_samples": min_samples,
    # Report the metric that was really passed to the model: it comes from the
    # override parameters when override_dbscan_tuning is set.
    "distance_metric": dbscan_distance_metric,
    "evaluation_metric": dbscan_tuning_parameters["evaluation_metric"],
    # No tuning trial exists when the parameters are overridden, so there is
    # no score to report (best_trial_dbscan is undefined on that path).
    "score": None if override_dbscan_tuning else best_trial_dbscan.value,
    "n_clusters": n_clusters,
    "cluster_data_points": cluster_data_points,
}

print("Best dbscan parameters")
pprint.pprint(best_dbscan_parameters, sort_dicts=False)
print("")

print("Best ocsvm parameters")
print(f"Tree algorithm: {dbocsvm_tree_algorithm}")
# NOTE(review): the OCSVM study is optimised with metric="f1", so this value is
# the best F1 objective value even though the label says "Accuracy".
print(f"Accuracy: {dbocsvm_study.best_value}")
pprint.pprint(parameter_list, sort_dicts=False)

Best autoencoder model:
{'input_dim': 122,
 'hidden_dims': [64, 32],
 'latent_dim': 8,
 'activation_type': 'LeakyReLU',
 'negative_slope': 0.2,
 'dropout_rate': 0.3,
 'output_activation_type': 'Sigmoid',
 'val_loss': 0.0009392629441242876}

Reconstruction error:
{'normal_loss': 0.0014022980549062932,
 'anomaly_loss': 0.03032843417434621,
 'loss_difference': 0.028926136119439914}

Best dbscan parameters
{'eps': 16.030210554032887,
 'min_samples': 15,
 'distance_metric': 'manhattan',
 'evaluation_metric': 'silhouette',
 'score': 0.5952938199043274,
 'n_clusters': 23,
 'cluster_data_points': {0: 351,
                         1: 3328,
                         2: 275,
                         3: 36,
                         4: 716,
                         5: 82,
                         6: 195,
                         7: 91,
                         8: 299,
                         9: 22,
                         10: 48,
                         11: 39,
                         12: 33,
  

In [23]:
import json

# Best objective value of the OCSVM study; read once and reused below.
best_score = dbocsvm_study.best_value

# Record for this run.
tuning_result = {
    "dbscan": best_dbscan_parameters,
    "ocsvm": {
        "tree_algorithm": dbocsvm_tree_algorithm,
        # Key name kept as "accuracy" for compatibility with existing result
        # files; the value is the study's best objective value.
        "accuracy": best_score,
        "parameters": parameter_list,
    },
    "score": best_score,
}

# Create the directory that results_path points into (derived from the path so
# that changing results_path in the configuration keeps working).
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

if os.path.exists(results_path):
    # Read and close the existing file before deciding whether to rewrite it.
    with open(results_path, "r") as file:
        existing_results = json.load(file)

    # Only append this run when it beats the best score stored so far.
    should_write = existing_results["max_score"] < best_score
    if should_write:
        existing_results["max_score"] = best_score
        # Keys loaded from JSON are strings, so the new id is a string as well.
        tuning_result_id = str(len(existing_results["tuning_results"]))
        existing_results["tuning_results"][tuning_result_id] = tuning_result
    results = existing_results
else:
    # First run: start a fresh results file.
    results = {
        "max_score": best_score,
        "autoencoder_architecture": autoencoder_architecture,
        "reconstruction_error": reconstruction_error,
        "tuning_results": {"0": tuning_result},
    }
    should_write = True

if should_write:
    # Serialise first: if this raises (e.g. a non-JSON-serialisable value), the
    # file on disk has not been truncated yet and earlier results survive.
    serialized_results = json.dumps(results)
    with open(results_path, "w") as f:
        f.write(serialized_results)