In [1]:
import torch
import os

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Optuna persistence: when a flag is True the matching study is stored in (and
# resumed from) the SQLite database below; otherwise the study lives in memory.
with_storage_dbscan = False
with_storage_dbocsvm = False
os.makedirs("optuna_storage", exist_ok=True)
dbscan_optuna_storage_path = "sqlite:///optuna_storage/dbscan_study.db"
ocsvm_optuna_storage_path = "sqlite:///optuna_storage/dbocsvm_study.db"

# Directory holding the processed NSL-KDD CSV files. Set the DBOCSVM_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the original author's local checkout, so existing runs behave as before.
data_dir = os.environ.get(
    "DBOCSVM_DATA_DIR",
    "/home/jbct/Projects/thesis/db-ocsvm/data/processed/NSL-KDD",
)
train_set_path = os.path.join(data_dir, "train_set_full.csv")
test_set_path = os.path.join(data_dir, "test_set.csv")

# JSON file that accumulates the best tuning results across runs.
results_path = "tuning_results/results_dbocsvm_01.json"

# Quick smoke-test switch: a test run samples 3% of the training rows and runs
# only a handful of OCSVM trials; a real run uses everything.
test_run = True

if test_run:
    sample_size = 0.03
    use_sample = True
    ocsvm_trials = 5
else:
    sample_size = 1.0
    use_sample = False
    ocsvm_trials = 1000


# True: encode the whole (possibly sampled) training frame for clustering.
# False: encode only the training part of the train/validation split.
use_full_train_set = True

dbscan_tuning_parameters = {
    "evaluation_metric": "silhouette",  # silhouette, calinski_harabasz, davies_bouldin
    "distance_metric": "manhattan",  # manhattan, euclidean
    "trials": 10,
}
dbocsvm_tree_algorithm = "kd_tree"  # "ball_tree" or "kd_tree"

# Checkpoint of the pre-trained autoencoder; the architecture dict below must
# describe the same network the checkpoint was saved from.
existing_model_path = "best_models/config 5/autoencoder_Model_1_hidden[96, 64]_latent55_lr0.001_bs128_optadamw_actELU_dr0.1_wd0.pth"

existing_model_architecture = {
    "input_dim": 122,
    "hidden_dims": [96, 64],
    "latent_dim": 55,
    "activation_type": "ELU",
    "negative_slope": 0.02,
    "dropout_rate": 0.1,
    "output_activation_type": "Sigmoid",
}

In [2]:
# Manual DBSCAN settings. When `override_dbscan_tuning` is True, the parameter
# selection cell reads eps / min_samples / n_clusters / cluster_data_points from
# this dict instead of from the Optuna study.
# NOTE(review): `n_clusters` is 5 while `cluster_data_points` lists only two
# non-noise labels ("0" and "1") - confirm which value is intended.
override_dbscan_tuning = False
dbscan_override_params = dict(
    eps=0.5,
    min_samples=5,
    distance_metric="manhattan",
    n_clusters=5,
    cluster_data_points={"-1": 293, "0": 66992, "1": 57},
)

import dataset

In [3]:
import pandas as pd

# Read the processed training set; for test runs keep only a reproducible
# random fraction of the rows (fixed seed) and renumber the index.
train_df = pd.read_csv(train_set_path)
train_df = (
    train_df.sample(frac=sample_size, random_state=42).reset_index(drop=True)
    if use_sample
    else train_df
)

print(train_df.shape)
train_df.head(1)

(2020, 122)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,5.833486e-07,2.572642e-07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
from sklearn.model_selection import train_test_split

# Full training matrix, plus an 80/20 train/validation split of the same frame
# (fixed seed). All three are plain NumPy arrays.
X_train_full = train_df.values

X_train, X_val = (
    split.values
    for split in train_test_split(train_df, test_size=0.2, random_state=42)
)

print(X_train.shape, X_val.shape, X_train_full.shape)

(1616, 122) (404, 122) (2020, 122)


In [5]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Float tensors for the train / validation splits.
X_train_tensor, X_val_tensor = (
    torch.FloatTensor(split) for split in (X_train, X_val)
)

# Wrap each tensor in a TensorDataset (no DataLoader is created in this cell).
train_dataset, val_dataset = (
    TensorDataset(tensor) for tensor in (X_train_tensor, X_val_tensor)
)

# Number of feature columns in the training matrix.
input_dim = X_train_full.shape[1]

Train the autoencoder or load an existing one (this notebook loads the existing checkpoint)

In [6]:
from torch import nn
from models import BatchNormAutoencoderV2

# Build the network from the configured architecture. Each constructor argument
# is looked up by the same name in `existing_model_architecture`.
autoencoder = BatchNormAutoencoderV2(
    **{
        argument: existing_model_architecture[argument]
        for argument in (
            "input_dim",
            "hidden_dims",
            "latent_dim",
            "activation_type",
            "negative_slope",
            "dropout_rate",
            "output_activation_type",
        )
    }
)

In [7]:
# Load the saved checkpoint. `map_location` remaps the stored tensors onto the
# device selected in the config cell, so a checkpoint saved on a GPU still loads
# on a CPU-only machine.
checkpoint = torch.load(existing_model_path, map_location=device)
autoencoder.load_state_dict(checkpoint["model_state_dict"])

# Later cells move their input batches to `device`; the model has to live on the
# same device or the forward pass fails with a device-mismatch error.
autoencoder.to(device)

# Inference mode (affects the Dropout / BatchNorm layers); as the last
# expression this also displays the module summary.
autoencoder.eval()

BatchNormAutoencoderV2(
  (encoder): Sequential(
    (0): Linear(in_features=122, out_features=96, bias=True)
    (1): BatchNorm1d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ELU(alpha=1.0)
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=96, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ELU(alpha=1.0)
    (7): Dropout(p=0.1, inplace=False)
    (8): Linear(in_features=64, out_features=55, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=55, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ELU(alpha=1.0)
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=64, out_features=96, bias=True)
    (5): BatchNorm1d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ELU(alpha=1.0)
    (7): Dropout(p=0.1, inplace=False)
    (8): Line

dbscan tuning

In [8]:
import numpy as np

# Loaders over the full training matrix and over the training split. Neither is
# shuffled, so the encoded rows keep the order of the source arrays.
X_train_full_tensor = torch.FloatTensor(X_train_full)
X_train_full_dataset = TensorDataset(X_train_full_tensor)
X_train_full_loader = DataLoader(X_train_full_dataset, batch_size=256)
# `train_loader` used to be referenced without ever being created, which raised
# a NameError whenever `use_full_train_set` was False.
train_loader = DataLoader(train_dataset, batch_size=256)

encoding_loader = X_train_full_loader if use_full_train_set else train_loader

# Send each batch to wherever the model's weights actually are, so the encoder
# never sees inputs on a different device than its parameters.
encoder_device = next(autoencoder.parameters()).device

# Encode in batches to keep memory bounded, then stack into one
# (n_samples, latent_dim) array.
X_encoded = []
with torch.no_grad():
    for data in encoding_loader:
        data_x = data[0].to(encoder_device)
        encoded = autoencoder.encode(data_x)
        X_encoded.append(encoded.cpu().numpy())
X_encoded = np.vstack(X_encoded)

In [9]:
from utils import find_eps_range_with_elbow_method

# Width of the encoded feature space.
input_dim_encoded = X_encoded.shape[1]

# `k` handed to the elbow helper: (20 + 2 * latent_dim) / 2.
k_for_elbow = int((20 + input_dim_encoded * 2) / 2)

# The eps search range is only needed when DBSCAN is actually tuned; with the
# manual override the fixed eps from `dbscan_override_params` is used instead.
if not override_dbscan_tuning:
    min_eps, max_eps = find_eps_range_with_elbow_method(
        X_encoded,
        k=k_for_elbow,
        plot=False,
    )
    # A bare expression inside an `if` block is not displayed by Jupyter, so
    # print the range explicitly.
    print(f"eps search range: ({min_eps}, {max_eps})")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from utils import objective_dbscan
import optuna

def dbscan_objective_lambda(trial):
    """Optuna objective: score one DBSCAN configuration on the encoded features.

    Forwards the trial to ``objective_dbscan`` together with the configured
    evaluation metric, distance metric and search ranges, and returns its score.
    """
    return objective_dbscan(
        trial,
        X_encoded=X_encoded,
        evaluation_metric=dbscan_tuning_parameters["evaluation_metric"],
        eps_range=(min_eps, max_eps),
        min_samples_range=(1, input_dim_encoded * 2),
        distance_metric=dbscan_tuning_parameters["distance_metric"],
        n_jobs=-1,
    )


if override_dbscan_tuning:
    # Manual parameters are in use: `min_eps` / `max_eps` were never computed,
    # so running the study would fail with a NameError. Skip tuning entirely.
    dbscan_study = None
else:
    dbscan_study_kwargs = {"direction": "maximize"}
    if with_storage_dbscan:
        # Persist the study in SQLite and resume it if it already exists.
        dbscan_study_kwargs.update(
            storage=dbscan_optuna_storage_path,
            study_name="dbscan_study",
            load_if_exists=True,
        )

    dbscan_study = optuna.create_study(**dbscan_study_kwargs)
    dbscan_study.optimize(
        dbscan_objective_lambda,
        n_trials=dbscan_tuning_parameters["trials"],
    )

[I 2025-03-12 05:09:53,563] A new study created in memory with name: no-name-d55daba5-4607-45c0-a2d3-24b27bfa4056
[I 2025-03-12 05:09:53,875] Trial 0 finished with value: 0.5637109875679016 and parameters: {'eps': 19.269543744356973, 'min_samples': 89}. Best is trial 0 with value: 0.5637109875679016.


2
{0: 997, 1: 206, -1: 817}


[I 2025-03-12 05:09:54,158] Trial 1 finished with value: 0.5789753794670105 and parameters: {'eps': 15.682731990668852, 'min_samples': 109}. Best is trial 1 with value: 0.5789753794670105.


2
{0: 969, 1: 194, -1: 857}


[I 2025-03-12 05:09:54,492] Trial 2 finished with value: 0.29372191429138184 and parameters: {'eps': 10.110531299468047, 'min_samples': 3}. Best is trial 1 with value: 0.5789753794670105.


44
{0: 84, 1: 985, 2: 5, 3: 3, 4: 30, 5: 210, 6: 68, 7: 32, 8: 73, 9: 8, 10: 4, 11: 13, 12: 17, 13: 4, 14: 10, 15: 8, 16: 10, 17: 14, 18: 17, 19: 6, 20: 9, 21: 3, 22: 3, 23: 7, 24: 3, 25: 6, 26: 9, 27: 5, 28: 4, 29: 5, 30: 3, 31: 4, 32: 3, 33: 9, 34: 3, 35: 4, 36: 3, 37: 6, 38: 3, 39: 5, 40: 3, 41: 3, 42: 3, 43: 3, -1: 310}


[I 2025-03-12 05:09:54,732] Trial 3 finished with value: 0.582797110080719 and parameters: {'eps': 12.030707220608225, 'min_samples': 38}. Best is trial 3 with value: 0.582797110080719.


2
{0: 956, 1: 190, -1: 874}


[I 2025-03-12 05:09:54,963] Trial 4 finished with value: 0.593856692314148 and parameters: {'eps': 10.70385898524255, 'min_samples': 101}. Best is trial 4 with value: 0.593856692314148.


2
{0: 917, 1: 121, -1: 982}


[I 2025-03-12 05:09:55,196] Trial 5 finished with value: 0.5871599316596985 and parameters: {'eps': 7.258377560212691, 'min_samples': 12}. Best is trial 4 with value: 0.593856692314148.


4
{0: 920, 1: 186, 2: 25, 3: 15, -1: 874}


[I 2025-03-12 05:09:55,460] Trial 6 finished with value: 0.5608515739440918 and parameters: {'eps': 19.965497747791662, 'min_samples': 80}. Best is trial 4 with value: 0.593856692314148.


2
{0: 1000, 1: 211, -1: 809}


[I 2025-03-12 05:09:55,679] Trial 7 finished with value: 0.652356743812561 and parameters: {'eps': 6.2558061395139255, 'min_samples': 45}. Best is trial 7 with value: 0.652356743812561.


3
{0: 400, 1: 457, 2: 85, -1: 1078}


[I 2025-03-12 05:09:55,959] Trial 8 finished with value: 0.4493907690048218 and parameters: {'eps': 16.960051140036814, 'min_samples': 6}. Best is trial 7 with value: 0.652356743812561.


21
{0: 100, 1: 1005, 2: 82, 3: 16, 4: 53, 5: 32, 6: 6, 7: 216, 8: 93, 9: 12, 10: 8, 11: 9, 12: 13, 13: 31, 14: 10, 15: 8, 16: 12, 17: 20, 18: 7, 19: 13, 20: 6, -1: 268}


[I 2025-03-12 05:09:56,277] Trial 9 finished with value: -0.05791078880429268 and parameters: {'eps': 8.415041377064306, 'min_samples': 1}. Best is trial 7 with value: 0.652356743812561.


382
{0: 80, 1: 945, 2: 2, 3: 2, 4: 1, 5: 1, 6: 1, 7: 202, 8: 1, 9: 5, 10: 2, 11: 29, 12: 1, 13: 2, 14: 1, 15: 1, 16: 63, 17: 4, 18: 1, 19: 1, 20: 29, 21: 1, 22: 61, 23: 1, 24: 2, 25: 1, 26: 1, 27: 5, 28: 2, 29: 8, 30: 2, 31: 4, 32: 1, 33: 3, 34: 1, 35: 7, 36: 13, 37: 1, 38: 1, 39: 1, 40: 4, 41: 10, 42: 8, 43: 1, 44: 2, 45: 1, 46: 1, 47: 1, 48: 10, 49: 2, 50: 1, 51: 2, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 2, 63: 2, 64: 11, 65: 1, 66: 1, 67: 1, 68: 6, 69: 7, 70: 20, 71: 1, 72: 1, 73: 5, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 2, 83: 1, 84: 2, 85: 1, 86: 3, 87: 1, 88: 1, 89: 1, 90: 1, 91: 1, 92: 2, 93: 1, 94: 1, 95: 1, 96: 2, 97: 2, 98: 1, 99: 1, 100: 1, 101: 7, 102: 1, 103: 1, 104: 3, 105: 1, 106: 6, 107: 1, 108: 1, 109: 1, 110: 1, 111: 1, 112: 1, 113: 1, 114: 1, 115: 1, 116: 1, 117: 1, 118: 1, 119: 6, 120: 1, 121: 1, 122: 2, 123: 1, 124: 1, 125: 1, 126: 1, 127: 4, 128: 1, 129: 2, 130: 1, 131: 1, 132: 1, 133: 4, 134: 1, 135: 1, 136

In [11]:
import pprint

# Pick the DBSCAN settings used for the rest of the notebook: either the manual
# override values or the best trial found by the Optuna study.
if override_dbscan_tuning:
    eps, min_samples, n_clusters, cluster_data_points = (
        dbscan_override_params[name]
        for name in ("eps", "min_samples", "n_clusters", "cluster_data_points")
    )
else:
    eps = dbscan_study.best_params["eps"]
    min_samples = dbscan_study.best_params["min_samples"]

    # Cluster statistics are read from the best trial's user attributes.
    best_trial_dbscan = dbscan_study.best_trial
    best_trial_dbscan_user_attrs = best_trial_dbscan.user_attrs
    n_clusters = best_trial_dbscan_user_attrs["n_clusters"]
    cluster_data_points = best_trial_dbscan_user_attrs["cluster_data_points"]

for label, chosen in (
    ("eps", eps),
    ("min_samples", min_samples),
    ("n_clusters", n_clusters),
):
    print(f"{label} = {chosen}")
print("cluster_data_points")
pprint.pprint(cluster_data_points)

eps = 6.2558061395139255
min_samples = 45
n_clusters = 3
cluster_data_points
{-1: 1078, 0: 400, 1: 457, 2: 85}


fit the DBSCAN

In [12]:
from models import DBOCSVM_V2

# Instantiate the DB-OC-SVM model. The OCSVM settings here (rbf kernel,
# gamma="auto", nu=0.2) are placeholder defaults; the DBSCAN settings come from
# the parameter selection above.
dbocsvm_init_kwargs = {
    "kernel": "rbf",
    "gamma": "auto",
    "nu": 0.2,
    "eps": eps,
    "min_samples": min_samples,
    "dbscan_metric": dbscan_tuning_parameters["distance_metric"],
    "algorithm": dbocsvm_tree_algorithm,  # ball_tree, kd_tree
}
dbocsvm = DBOCSVM_V2(**dbocsvm_init_kwargs)

In [13]:
# Run the model's clustering step on the encoded training features.
# verbose=True presumably enables the progress / cluster-size printout shown
# below - confirm against DBOCSVM_V2.fit_cluster.
dbocsvm.fit_cluster(X_encoded, verbose=True)

Fitting DBSCAN...
DBSCAN Fitted...
Unique Clusters: [-1  0  1  2]
Cluster Sizes: {-1: 1078, 0: 400, 1: 457, 2: 85}


importing test set

In [14]:
# Read the processed test set from the configured path, then show its shape and
# first row as a quick sanity check.
test_df = pd.read_csv(test_set_path)
print(test_df.shape)
test_df.head(1)

(22543, 125)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_binary,attack_categorical,attack_class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,neptune,DoS


In [15]:
# Separate the feature matrix from the three label columns.
label_columns = ["attack_binary", "attack_categorical", "attack_class"]
X_test = test_df.drop(columns=label_columns).values

# Binary labels as a NumPy array; the attack class is kept as a pandas Series.
y_test = test_df["attack_binary"].values
y_test_class = test_df["attack_class"]

print(X_test.shape, y_test.shape)

(22543, 122) (22543,)


reconstruction error inspection

In [16]:
# Split the test features by binary label: 1 selects the normal rows,
# -1 selects the anomaly rows.
normal_mask = y_test == 1
anomaly_mask = y_test == -1
X_test_normal, X_test_anomaly = X_test[normal_mask], X_test[anomaly_mask]

print(f"Normal test samples: {X_test_normal.shape[0]}")
print(f"Anomaly test samples: {X_test_anomaly.shape[0]}")

# Float tensors placed on the configured device.
X_test_normal_tensor = torch.FloatTensor(X_test_normal).to(device)
X_test_anomaly_tensor = torch.FloatTensor(X_test_anomaly).to(device)

# One unshuffled loader per group for the reconstruction-error evaluation.
normal_test_dataset = TensorDataset(X_test_normal_tensor)
anomaly_test_dataset = TensorDataset(X_test_anomaly_tensor)
normal_test_loader = DataLoader(
    dataset=normal_test_dataset, batch_size=256, shuffle=False
)
anomaly_test_loader = DataLoader(
    dataset=anomaly_test_dataset, batch_size=256, shuffle=False
)


def calculate_reconstruction_error(model, loader):
    """Return the mean per-sample reconstruction MSE of ``model`` over ``loader``.

    Args:
        model: A ``torch.nn.Module`` whose forward pass reconstructs its input.
            It is switched to eval mode.
        loader: Iterable of batches; the first element of each batch is the
            input tensor, expected to be 2-D (samples x features) because the
            squared error is averaged over ``dim=1``.

    Returns:
        float: Sum of the per-sample MSE values divided by the number of samples.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    criterion = nn.MSELoss(reduction="none")

    # Run each batch on the device that holds the model's weights, so inputs
    # created on another device do not cause a device-mismatch error.
    # Parameter-free models leave the batch where it is.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in loader:
            x = batch[0]
            if model_device is not None:
                x = x.to(model_device)
            outputs = model(x)
            # Element-wise squared error, averaged over features -> one value per sample.
            per_sample_loss = criterion(outputs, x).mean(dim=1)
            total_loss += per_sample_loss.sum().item()
            total_samples += x.size(0)

    if total_samples == 0:
        raise ValueError("loader yielded no samples; cannot compute a mean error")

    return total_loss / total_samples


def evaluate_model(model):
    """Summarise how well ``model`` reconstructs normal vs. anomalous test data.

    Returns a dict with the mean reconstruction error on the normal loader
    (``normal_loss``), on the anomaly loader (``anomaly_loss``), and the gap
    ``anomaly_loss - normal_loss`` (``loss_difference``).
    """
    summary = {
        "normal_loss": calculate_reconstruction_error(model, normal_test_loader),
        "anomaly_loss": calculate_reconstruction_error(model, anomaly_test_loader),
    }
    summary["loss_difference"] = summary["anomaly_loss"] - summary["normal_loss"]
    return summary


# Evaluate the loaded autoencoder on the normal and anomaly test loaders; the
# bare name on the last line displays the resulting dict.
reconstruction_error = evaluate_model(autoencoder)
reconstruction_error

Normal test samples: 9711
Anomaly test samples: 12832


{'normal_loss': 0.000964479972240596,
 'anomaly_loss': 0.010680428181224482,
 'loss_difference': 0.009715948208983886}

extract features from test data

In [17]:
# Whole test feature matrix as a float tensor on the configured device.
X_test_tensor = torch.FloatTensor(X_test).to(device)

# The second tensor is a dummy all-zeros target; it is ignored in the loop below.
test_dataset = TensorDataset(X_test_tensor, torch.zeros(len(X_test_tensor)))
test_loader = DataLoader(test_dataset, batch_size=128)

# Encode on the device that holds the model's weights, so a model left on a
# different device than `device` does not trigger a device-mismatch error.
encoder_device = next(autoencoder.parameters()).device

X_test_encoded = []
with torch.no_grad():
    for data, _ in test_loader:
        encoded = autoencoder.encode(data.to(encoder_device))
        X_test_encoded.append(encoded.cpu().numpy())

# Stack the per-batch arrays into one (n_samples, latent_dim) array.
X_test_encoded = np.vstack(X_test_encoded)
print(X_test_encoded.shape)

(22543, 55)


tuning the ocsvms

In [18]:
from utils import objective_dbocsvm_fit_ocsvm

def dbocsvm_fit_ocsvm_objective_lambda(trial):
    """Optuna objective for the per-cluster OCSVM hyper-parameters.

    Forwards the trial to ``objective_dbocsvm_fit_ocsvm`` with the clustered
    model, the encoded train / test features, the test labels and the number of
    clusters, and returns its score.
    """
    # NOTE(review): the objective scores against the test set (X_test_encoded,
    # y_test), so the reported best value is tuned on test data - confirm this
    # is intended.
    return objective_dbocsvm_fit_ocsvm(
        trial,
        model=dbocsvm,
        X_encoded_train=X_encoded,
        X_encoded_test=X_test_encoded,
        y_test=y_test,
        cluster_count=n_clusters,
    )


# Optuna study for the OCSVM stage; persisted in SQLite (and resumed if it
# already exists) only when storage is enabled.
dbocsvm_study_kwargs = {"direction": "maximize"}
if with_storage_dbocsvm:
    dbocsvm_study_kwargs.update(
        storage=ocsvm_optuna_storage_path,
        study_name="dbocsvm_study",
        load_if_exists=True,
    )

dbocsvm_study = optuna.create_study(**dbocsvm_study_kwargs)
dbocsvm_study.optimize(
    dbocsvm_fit_ocsvm_objective_lambda,
    n_trials=ocsvm_trials,
)

[I 2025-03-12 05:09:57,623] A new study created in memory with name: no-name-a545d241-fdfa-4a73-9802-8c24d6076529
[I 2025-03-12 05:10:00,236] Trial 0 finished with value: 0.7404072217539813 and parameters: {'gamma_0': 0.604611269094318, 'nu_0': 0.4050757831678222, 'gamma_1': 0.8543166422512686, 'nu_1': 0.20280665296236203, 'gamma_2': 0.31813037818859724, 'nu_2': 0.0560313380356262}. Best is trial 0 with value: 0.7404072217539813.


{'accuracy': '74.04', 'f1': '81.42', 'precision': '68.70', 'recall': '99.91'}


[I 2025-03-12 05:10:02,992] Trial 1 finished with value: 0.7129042274763785 and parameters: {'gamma_0': 0.2331303744264942, 'nu_0': 0.4215780824416861, 'gamma_1': 0.9519999663309572, 'nu_1': 0.42104527654723367, 'gamma_2': 0.9916322831895353, 'nu_2': 0.23610281851733605}. Best is trial 0 with value: 0.7404072217539813.


{'accuracy': '71.29', 'f1': '79.82', 'precision': '66.53', 'recall': '99.73'}


[I 2025-03-12 05:10:05,686] Trial 2 finished with value: 0.7440447145455352 and parameters: {'gamma_0': 0.7107213333113137, 'nu_0': 0.3564683317310839, 'gamma_1': 0.24375856858725567, 'nu_1': 0.27076367814019553, 'gamma_2': 0.34296136339917155, 'nu_2': 0.13007508827654315}. Best is trial 2 with value: 0.7440447145455352.


{'accuracy': '74.40', 'f1': '81.63', 'precision': '69.00', 'recall': '99.92'}


[I 2025-03-12 05:10:08,447] Trial 3 finished with value: 0.7411613361132059 and parameters: {'gamma_0': 0.053377347279043835, 'nu_0': 0.29327745908634534, 'gamma_1': 0.9104357264409647, 'nu_1': 0.258967558224491, 'gamma_2': 0.24557585236909718, 'nu_2': 0.09545761092347047}. Best is trial 2 with value: 0.7440447145455352.


{'accuracy': '74.12', 'f1': '81.37', 'precision': '68.93', 'recall': '99.28'}


[I 2025-03-12 05:10:11,335] Trial 4 finished with value: 0.7451537062502772 and parameters: {'gamma_0': 0.9062955629765135, 'nu_0': 0.29591401455812694, 'gamma_1': 0.4884336671489512, 'nu_1': 0.3006595450044127, 'gamma_2': 0.8766677433631176, 'nu_2': 0.2983960648583581}. Best is trial 4 with value: 0.7451537062502772.


{'accuracy': '74.52', 'f1': '81.69', 'precision': '69.10', 'recall': '99.90'}


In [19]:
# Regroup the flat best parameters ("gamma_<i>", "nu_<i>") into one dict per
# cluster id. dict.fromkeys de-duplicates the ids while keeping their
# first-seen order.
best_ocsvm_params = dbocsvm_study.best_params
cluster_ids = dict.fromkeys(
    int(param_name.split("_")[1]) for param_name in best_ocsvm_params
)

parameter_list = {
    cluster_id: {
        "kernel": "rbf",
        "gamma": best_ocsvm_params[f"gamma_{cluster_id}"],
        "nu": best_ocsvm_params[f"nu_{cluster_id}"],
    }
    for cluster_id in cluster_ids
}

best parameters and values

In [20]:
# Architecture of the loaded autoencoder plus the validation loss stored in its
# checkpoint.
autoencoder_architecture = {
    key: existing_model_architecture[key]
    for key in (
        "input_dim",
        "hidden_dims",
        "latent_dim",
        "activation_type",
        "negative_slope",
        "dropout_rate",
        "output_activation_type",
    )
}
autoencoder_architecture["val_loss"] = checkpoint["val_loss"]

print("Best autoencoder model:")
pprint.pprint(autoencoder_architecture, sort_dicts=False)
print("")

print("Reconstruction error:")
pprint.pprint(reconstruction_error, sort_dicts=False)
print("")

# With the manual override there is no Optuna trial, so `best_trial_dbscan` does
# not exist and there is no tuning score to report; record None instead of
# failing with a NameError.
dbscan_score = None if override_dbscan_tuning else best_trial_dbscan.value

best_dbscan_parameters = {
    "eps": eps,
    "min_samples": min_samples,
    "distance_metric": dbscan_tuning_parameters["distance_metric"],
    "evaluation_metric": dbscan_tuning_parameters["evaluation_metric"],
    "score": dbscan_score,
    "n_clusters": n_clusters,
    "cluster_data_points": cluster_data_points,
}

print("Best dbscan parameters")
pprint.pprint(best_dbscan_parameters, sort_dicts=False)
print("")

print("Best ocsvm parameters")
print(f"Tree algorithm: {dbocsvm_tree_algorithm}")
print(f"Accuracy: {dbocsvm_study.best_value}")
pprint.pprint(parameter_list, sort_dicts=False)

Best autoencoder model:
{'input_dim': 122,
 'hidden_dims': [96, 64],
 'latent_dim': 55,
 'activation_type': 'ELU',
 'negative_slope': 0.02,
 'dropout_rate': 0.1,
 'output_activation_type': 'Sigmoid',
 'val_loss': 0.00011326836558407006}

Reconstruction error:
{'normal_loss': 0.000964479972240596,
 'anomaly_loss': 0.010680428181224482,
 'loss_difference': 0.009715948208983886}

Best dbscan parameters
{'eps': 6.2558061395139255,
 'min_samples': 45,
 'distance_metric': 'manhattan',
 'evaluation_metric': 'silhouette',
 'score': 0.652356743812561,
 'n_clusters': 3,
 'cluster_data_points': {0: 400, 1: 457, 2: 85, -1: 1078}}

Best ocsvm parameters
Tree algorithm: kd_tree
Accuracy: 0.7451537062502772
{0: {'kernel': 'rbf', 'gamma': 0.9062955629765135, 'nu': 0.29591401455812694},
 1: {'kernel': 'rbf', 'gamma': 0.4884336671489512, 'nu': 0.3006595450044127},
 2: {'kernel': 'rbf', 'gamma': 0.8766677433631176, 'nu': 0.2983960648583581}}


In [21]:
import json

best_score = dbocsvm_study.best_value

# Outcome of this run: chosen DBSCAN settings plus the best per-cluster OCSVM
# parameters and their score.
tuning_result = {
    "dbscan": best_dbscan_parameters,
    "ocsvm": {
        "tree_algorithm": dbocsvm_tree_algorithm,
        "accuracy": best_score,
        "parameters": parameter_list,
    },
}

# Skeleton written when no results file exists yet.
results = {
    "max_score": 0,
    "autoencoder_architecture": autoencoder_architecture,
    "reconstruction_error": reconstruction_error,
    "tuning_results": {},
}

# Create the directory that actually contains `results_path` rather than a
# hard-coded folder name.
os.makedirs(os.path.dirname(results_path) or ".", exist_ok=True)

results_to_write = None
if os.path.exists(results_path):
    # Read and close the file before deciding whether to rewrite it.
    with open(results_path, "r") as file:
        existing_results = json.load(file)
    # Only a run that beats the stored best score is appended.
    if existing_results["max_score"] < best_score:
        existing_results["max_score"] = best_score
        tuning_result_id = len(existing_results["tuning_results"])
        tuning_result["score"] = best_score
        existing_results["tuning_results"][tuning_result_id] = tuning_result
        results_to_write = existing_results
else:
    results["max_score"] = best_score
    tuning_result["score"] = best_score
    results["tuning_results"][0] = tuning_result
    results_to_write = results

if results_to_write is not None:
    # Serialize first: if a value is not JSON-serializable this raises before
    # the existing file has been truncated, so earlier results are not lost.
    serialized_results = json.dumps(results_to_write)
    with open(results_path, "w") as f:
        f.write(serialized_results)