In [None]:
import torch
import os

# Run tensors on the GPU when one is visible to torch, otherwise on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Optuna storage URLs (SQLite files under ./optuna_storage).
dbscan_optuna_storage_path = "sqlite:///optuna_storage/dbscan_study.db"
ocsvm_optuna_storage_path = "sqlite:///optuna_storage/dbocsvm_study.db"

# Input data.
train_set_path = "../../../datasets/train_set.csv"
test_set_path = "../../../datasets/test_set.csv"

# Where the tuning summary is written as JSON.
results_path = "tuning_results/results_dbocsvm_config.json"

# Master switch: True = quick smoke run (tiny sample, few trials, no Optuna
# storage); False = full run (larger sample, persistent Optuna studies).
test_run = True

# The training set is sub-sampled in both modes; only the fraction differs.
use_sample = True
with_storage_dbscan = with_storage_dbocsvm = not test_run
sample_size = 0.01 if test_run else 0.3
ocsvm_trials = 10 if test_run else 100
if not test_run:
    # The SQLite storage files need their parent directory to exist.
    os.makedirs("optuna_storage", exist_ok=True)


dbscan_tuning_parameters = dict(
    evaluation_metric="silhouette",  # silhouette, calinski_harabasz, davies_bouldin
    distance_metric="manhattan",  # manhattan, euclidean, cosine
    trials=0,
)
dbocsvm_tree_algorithm = "ball_tree"  # "ball_tree" or "kd_tree"

# Checkpoint of the already-trained autoencoder and the architecture needed to
# rebuild it before loading the weights.
existing_model_path = "../../../autoencoder/autoencoder.pth"

existing_model_architecture = dict(
    input_dim=15,
    hidden_dims=[13, 11],
    latent_dim=9,
    activation_type="ReLU",
    negative_slope=1,
    output_activation_type="Sigmoid",
)

In [None]:
# When True, the DBSCAN Optuna search is skipped and the hand-entered
# parameters below are used instead.
override_dbscan_tuning = True

# Result of an earlier DBSCAN search, pasted in by hand.
# NOTE(review): the cluster sizes below sum to 54,000 points, so they
# presumably describe a larger training sample than a `test_run` uses —
# confirm `n_clusters` still matches what DBSCAN finds on the current sample.
dbscan_override_params = {
    "eps": 0.10361559446127108,
    "min_samples": 12,
    "distance_metric": "manhattan",
    "score": 0.9616295695304871,
    "n_clusters": 29,
    "cluster_data_points": {
        # Sizes of clusters "0" .. "28", in label order.
        **{
            str(label): size
            for label, size in enumerate(
                [
                    15666, 11398, 10639, 6686, 858, 3568, 51, 2269, 85, 478,
                    87, 85, 693, 158, 129, 158, 34, 24, 201, 28,
                    161, 17, 79, 31, 31, 34, 31, 14, 24,
                ]
            )
        },
        # Label "-1" is listed last, after the numbered clusters.
        "-1": 283,
    },
}

import dataset

In [None]:
import pandas as pd

# Read the training split and, when sampling is enabled, keep only a
# reproducible random fraction of its rows (fresh 0..n-1 index).
train_df = pd.read_csv(train_set_path)
train_df = (
    train_df.sample(frac=sample_size, random_state=42).reset_index(drop=True)
    if use_sample
    else train_df
)

print(train_df.shape)
train_df.head(1)

In [None]:
# Target: the binary label column. Features: every column except the two
# label columns.
y_train = train_df["attack_binary"].values
X_train = train_df.drop(["attack_binary", "attack_categorical"], axis=1).values

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Wrap the training features as a FloatTensor-backed dataset.
X_train_tensor = torch.FloatTensor(X_train)
train_dataset = TensorDataset(X_train_tensor)

# Width of the feature matrix (number of columns of X_train).
_, input_dim = X_train.shape
print(f"Input dimension: {input_dim}")

Use the existing (pre-trained) autoencoder

In [None]:
from torch import nn
from autoencoder import BatchNormAutoencoder

# Rebuild the network with the same hyper-parameters as the saved model so
# that its state dict can be loaded into it. Only the six constructor
# arguments are forwarded, by keyword.
autoencoder = BatchNormAutoencoder(
    **{
        argument: existing_model_architecture[argument]
        for argument in (
            "input_dim",
            "hidden_dims",
            "latent_dim",
            "activation_type",
            "negative_slope",
            "output_activation_type",
        )
    }
)

In [None]:
# Load best model.
# `map_location` remaps the stored tensors onto the device chosen in the
# config cell, so a checkpoint saved on a GPU can still be opened on a
# CPU-only machine.
checkpoint = torch.load(existing_model_path, map_location=device)
autoencoder.load_state_dict(checkpoint["model_state_dict"])

# Later cells move their input batches to `device` before calling the model;
# the model's weights must live on that same device or the forward pass fails.
autoencoder.to(device)

# Inference mode; left as the last expression so the notebook displays the
# module.
autoencoder.eval()

DBSCAN tuning

In [None]:
import numpy as np

# Encode the training features with the autoencoder's encoder, 256 rows at a
# time, and stack the per-batch results into one NumPy array.
X_train_tensor = torch.FloatTensor(X_train)
X_train_dataset = TensorDataset(X_train_tensor)
X_train_loader = DataLoader(X_train_dataset, batch_size=256)

# No gradients are needed for feature extraction.
with torch.no_grad():
    X_encoded = np.vstack(
        [
            # Each batch is a 1-tuple; move it to `device`, encode, bring the
            # result back to host memory.
            autoencoder.encode(features.to(device)).cpu().numpy()
            for (features,) in X_train_loader
        ]
    )

In [None]:
from utils_final import find_eps_range_with_elbow_method

# Number of columns of the encoded training matrix.
input_dim_encoded = X_encoded.shape[1]

# k passed to the elbow helper: the midpoint of 20 and 2 * encoded width.
k_for_elbow = (20 + input_dim_encoded * 2) // 2

# The eps search range is only needed when DBSCAN is actually tuned.
if not override_dbscan_tuning:
    min_eps, max_eps = find_eps_range_with_elbow_method(
        X_encoded, k=k_for_elbow, plot=False
    )
    print(min_eps, max_eps)

In [None]:
from utils_final import objective_dbscan
import optuna

def dbscan_objective_lambda(trial):
    """Optuna objective for DBSCAN: forwards the trial and the fixed settings.

    `min_eps` / `max_eps` are read when the objective is called, so defining
    this function is safe even when the elbow search was skipped.
    """
    return objective_dbscan(
        trial,
        X_encoded=X_encoded,
        evaluation_metric=dbscan_tuning_parameters["evaluation_metric"],
        eps_range=(min_eps, max_eps),
        min_samples_range=(1, input_dim_encoded * 2),
        distance_metric=dbscan_tuning_parameters["distance_metric"],
        n_jobs=-1,
    )


# NOTE(review): the study always maximizes, but "davies_bouldin" (listed as
# an option in the config cell) is a lower-is-better score — confirm that
# `objective_dbscan` accounts for that before selecting it.
if not override_dbscan_tuning:
    print("Starting DBSCAN tuning...")
    # Persistent (resumable) study when storage is enabled, in-memory otherwise.
    dbscan_study = (
        optuna.create_study(
            direction="maximize",
            storage=dbscan_optuna_storage_path,
            study_name="dbscan_study",
            load_if_exists=True,
        )
        if with_storage_dbscan
        else optuna.create_study(direction="maximize")
    )
    dbscan_study.optimize(
        dbscan_objective_lambda,
        n_trials=dbscan_tuning_parameters["trials"],
    )

In [None]:
import optuna
from plotly.io import show

# There is a DBSCAN study to plot only when tuning was not overridden.
if not override_dbscan_tuning:
    fig = optuna.visualization.plot_optimization_history(study=dbscan_study)
    show(fig)

In [None]:
# Empirical distribution of the DBSCAN study's objective values; skipped
# when tuning was overridden because no study exists then.
if not override_dbscan_tuning:
    fig = optuna.visualization.plot_edf(study=[dbscan_study])
    show(fig)

In [None]:
import pprint

# Resolve the DBSCAN settings used from here on: either the hand-entered
# override or the best trial of the Optuna study.
if override_dbscan_tuning:
    eps = dbscan_override_params["eps"]
    min_samples = dbscan_override_params["min_samples"]
    n_clusters = dbscan_override_params["n_clusters"]
    cluster_data_points = dbscan_override_params["cluster_data_points"]
else:
    eps = dbscan_study.best_params["eps"]
    min_samples = dbscan_study.best_params["min_samples"]

    # Cluster statistics are read from the best trial's user attributes.
    best_trial_dbscan = dbscan_study.best_trial
    best_trial_dbscan_user_attrs = best_trial_dbscan.user_attrs
    n_clusters = best_trial_dbscan_user_attrs["n_clusters"]
    cluster_data_points = best_trial_dbscan_user_attrs["cluster_data_points"]

print(f"eps = {eps}")
print(f"min_samples = {min_samples}")
print(f"n_clusters = {n_clusters}")
print("cluster_data_points")
pprint.pprint(cluster_data_points)

Fit the DBSCAN clustering stage

In [None]:
from db_ocsvm import DBOCSVM

# The distance metric must match the one the chosen eps/min_samples were
# found with: the override's metric when overriding, the tuning metric
# otherwise.
dbscan_distance_metric = (
    dbscan_override_params["distance_metric"]
    if override_dbscan_tuning
    else dbscan_tuning_parameters["distance_metric"]
)

# DB-OC-SVM model with the selected DBSCAN settings and placeholder OC-SVM
# hyper-parameters (rbf kernel, gamma="auto", nu=0.2).
dbocsvm = DBOCSVM(
    eps=eps,
    min_samples=min_samples,
    dbscan_distance_metric=dbscan_distance_metric,
    tree_algorithm=dbocsvm_tree_algorithm,
    kernel="rbf",
    gamma="auto",
    nu=0.2,
)

In [None]:
# Run the model's clustering step on the encoded training features, with
# progress output enabled.
# NOTE(review): presumably this fits only the DBSCAN stage and leaves the
# per-cluster OC-SVMs to the tuning objective below — confirm in db_ocsvm.
dbocsvm.fit_cluster(X_encoded, verbose=True)

importing test set

In [None]:
# Read the held-out CSV; in a test run keep only a reproducible random
# fraction (twice the training fraction) with a fresh 0..n-1 index.
test_dataset = pd.read_csv(test_set_path)
test_dataset = (
    test_dataset.sample(frac=sample_size * 2, random_state=42).reset_index(drop=True)
    if test_run
    else test_dataset
)

print(f"test set count: {test_dataset.shape[0]:,}")
print(f"unique values: {test_dataset['attack_categorical'].value_counts()}")

test_dataset.head(3)

Splitting into test and validation set

In [None]:
from sklearn.model_selection import train_test_split

# Halve the held-out data into a test part and a validation part, keeping
# the per-class proportions of `attack_categorical` equal in both halves.
test_df, val_df = train_test_split(
    test_dataset,
    stratify=test_dataset["attack_categorical"],
    test_size=0.5,
    random_state=42,
)

test set

In [None]:
# Labels first (binary target and multi-class name), then the feature matrix
# made of every remaining column.
y_test_class = test_df["attack_categorical"].values
y_test = test_df["attack_binary"].values
X_test = test_df.drop(["attack_binary", "attack_categorical"], axis=1).values

print(f"test set count: {test_df.shape[0]:,}")
print(f"unique values: {test_df['attack_categorical'].value_counts()}")
test_df.head(3)

validation set

In [None]:
# Labels first (binary target and multi-class name), then the feature matrix
# made of every remaining column.
y_val_class = val_df["attack_categorical"].values
y_val = val_df["attack_binary"].values
X_val = val_df.drop(["attack_binary", "attack_categorical"], axis=1).values

print(f"val set count: {val_df.shape[0]:,}")
print(f"unique values: {val_df['attack_categorical'].value_counts()}")
val_df.head(3)

Applying SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Class distribution of the validation split before oversampling.
print("Before SMOTE:")
print(f"Val set count: {X_val.shape[0]:,}")
before_counts = pd.Series(y_val_class).value_counts()
print(before_counts)

# Oversample the attack classes of the VALIDATION split (not the training
# data) towards a common per-class size; "benign" is left untouched.
smote_target_per_class = 100 if test_run else 4000
smote_attack_classes = ("dos", "portScan", "bruteForce", "pingScan")

# SMOTE can only add samples. With a dict strategy imbalanced-learn raises
# when a requested size is below the class's current size, and when a
# requested class does not occur in the labels at all. Clamp each target to
# at least the current size and skip classes missing from this split so the
# cell works for any sample fraction. When every class is present and below
# the target this is exactly the fixed {class: target} mapping.
sampling_strategy = {
    attack_class: max(smote_target_per_class, int(before_counts[attack_class]))
    for attack_class in smote_attack_classes
    if attack_class in before_counts.index
}

# NOTE(review): with k_neighbors=3 each oversampled class presumably needs
# more than 3 samples; very small test-run samples may still fail here.
smote = SMOTE(random_state=42, k_neighbors=3, sampling_strategy=sampling_strategy)
X_val_resampled, y_val_resampled = smote.fit_resample(X_val, y_val_class)

# Display the distribution after SMOTE
print("\nAfter SMOTE:")
print(f"Val set count: {X_val_resampled.shape[0]:,}")
after_counts = pd.Series(y_val_resampled).value_counts()
print(after_counts)

# Collapse the class names back to the binary convention used elsewhere in
# the notebook: 1 for "benign", -1 for every attack class.
y_val_resampled = np.where(y_val_resampled == "benign", 1, -1)

reconstruction error inspection

In [None]:
# Split the test features by binary label: 1 = normal, -1 = anomaly.
X_test_normal = X_test[y_test == 1]
X_test_anomaly = X_test[y_test == -1]

print(f"Normal test samples: {X_test_normal.shape[0]}")
print(f"Anomaly test samples: {X_test_anomaly.shape[0]}")

# Normal samples: tensor on `device` -> dataset -> ordered loader.
X_test_normal_tensor = torch.FloatTensor(X_test_normal).to(device)
normal_test_dataset = TensorDataset(X_test_normal_tensor)
normal_test_loader = DataLoader(normal_test_dataset, shuffle=False, batch_size=256)

# Anomalous samples: same pipeline.
X_test_anomaly_tensor = torch.FloatTensor(X_test_anomaly).to(device)
anomaly_test_dataset = TensorDataset(X_test_anomaly_tensor)
anomaly_test_loader = DataLoader(anomaly_test_dataset, shuffle=False, batch_size=256)


def calculate_reconstruction_error(model, loader):
    """Return the mean per-sample reconstruction MSE of ``model`` over ``loader``.

    Args:
        model: Module whose forward pass returns a reconstruction with the
            same shape as its input. It is switched to eval mode.
        loader: Iterable of batches; the first element of each batch is the
            input tensor. The squared error is averaged over dim 1, so
            inputs are expected to be 2-D (batch, features).

    Returns:
        float: Sum of the per-sample MSEs divided by the number of samples.

    Raises:
        ValueError: If ``loader`` yields no samples (the mean is undefined).
    """
    model.eval()
    criterion = nn.MSELoss(reduction="none")

    # Run every batch on the device the model's weights live on, so the
    # function works whether or not the caller moved the data beforehand.
    # Parameter-free models give no hint; their batches are used as-is.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for batch in loader:
            x = batch[0]
            if model_device is not None:
                x = x.to(model_device)
            outputs = model(x)
            # Element-wise squared error -> one MSE value per sample.
            loss = criterion(outputs, x).mean(dim=1)
            total_loss += torch.sum(loss).item()
            total_samples += x.size(0)

    if total_samples == 0:
        raise ValueError(
            "Cannot compute a reconstruction error: the loader yielded no samples."
        )

    return total_loss / total_samples


def evaluate_model(model):
    """Summarise how well ``model`` reconstructs normal vs. anomalous test data.

    Uses the notebook-level ``normal_test_loader`` and ``anomaly_test_loader``.

    Returns:
        dict with the mean reconstruction error on each loader
        (``normal_loss``, ``anomaly_loss``) and their gap
        (``loss_difference`` = anomaly minus normal).
    """
    normal_loss, anomaly_loss = (
        calculate_reconstruction_error(model, loader)
        for loader in (normal_test_loader, anomaly_test_loader)
    )
    return {
        "normal_loss": normal_loss,
        "anomaly_loss": anomaly_loss,
        "loss_difference": anomaly_loss - normal_loss,
    }


# Evaluate the loaded autoencoder; the bare name displays the result dict.
reconstruction_error = evaluate_model(autoencoder)
reconstruction_error

extract features from validation data

In [None]:
# Encode the resampled validation features in batches of 128. The all-zero
# second tensor is only a placeholder label so every batch is an (x, _) pair.
X_val_tensor = torch.FloatTensor(X_val_resampled).to(device)
X_val_dataset_tensor = TensorDataset(X_val_tensor, torch.zeros(len(X_val_tensor)))
X_val_loader = DataLoader(X_val_dataset_tensor, batch_size=128)

with torch.no_grad():
    X_val_encoded = np.vstack(
        [autoencoder.encode(features).cpu().numpy() for features, _ in X_val_loader]
    )

print(X_val_encoded.shape)

extract features from test data

In [None]:
# Encode the test features in batches of 128. The all-zero second tensor is
# only a placeholder label so every batch is an (x, _) pair.
X_test_tensor = torch.FloatTensor(X_test).to(device)

# Named like its validation counterpart (`X_val_dataset_tensor`) so that the
# `test_dataset` DataFrame read from the CSV is not overwritten: rebinding
# that name to a TensorDataset made the earlier split cell fail when re-run.
X_test_dataset_tensor = TensorDataset(X_test_tensor, torch.zeros(len(X_test_tensor)))
test_loader = DataLoader(X_test_dataset_tensor, batch_size=128)

X_test_encoded = []
with torch.no_grad():
    for data, _ in test_loader:
        encoded = autoencoder.encode(data)
        # Collect on the host so the batches can be stacked with NumPy.
        X_test_encoded.append(encoded.cpu().numpy())

X_test_encoded = np.vstack(X_test_encoded)
print(X_test_encoded.shape)

Tuning the per-cluster OC-SVMs

In [None]:
from utils_final import objective_dbocsvm_fit_ocsvm

def dbocsvm_fit_ocsvm_objective_lambda(trial):
    """Optuna objective for the OC-SVM stage of the DB-OC-SVM model.

    Forwards the trial together with the clustered model, the encoded
    train/validation/test data and the number of DBSCAN clusters; the score
    is requested with metric="f1".

    NOTE(review): the test split (X_encoded_test / y_test) is passed to the
    objective as well — confirm it is used for reporting only and does not
    influence which hyper-parameters are selected.
    """
    return objective_dbocsvm_fit_ocsvm(
        trial,
        model=dbocsvm,
        X_encoded_train=X_encoded,
        X_encoded_validation=X_val_encoded,
        y_validation=y_val_resampled,
        X_encoded_test=X_test_encoded,
        y_test=y_test,
        cluster_count=n_clusters,
        metric="f1",
    )


# Persistent (resumable) study when storage is enabled, in-memory otherwise.
dbocsvm_study = (
    optuna.create_study(
        direction="maximize",
        storage=ocsvm_optuna_storage_path,
        study_name="dbocsvm_study",
        load_if_exists=True,
    )
    if with_storage_dbocsvm
    else optuna.create_study(direction="maximize")
)
dbocsvm_study.optimize(
    dbocsvm_fit_ocsvm_objective_lambda,
    n_trials=ocsvm_trials,
)

In [None]:
import optuna
from plotly.io import show

# Objective value per trial of the OC-SVM study.
fig = optuna.visualization.plot_optimization_history(study=dbocsvm_study)
show(fig)

In [None]:
# Empirical distribution of the OC-SVM study's objective values.
fig = optuna.visualization.plot_edf(study=[dbocsvm_study])
show(fig)

In [None]:
# Read the best parameters once instead of re-reading the study property on
# every loop iteration.
best_ocsvm_params = dbocsvm_study.best_params

# Regroup the flat "<name>_<cluster>" parameters (gamma_<i>, nu_<i>) into one
# OC-SVM configuration per cluster index.
parameter_list = {}
for param_name in best_ocsvm_params:
    cluster = int(param_name.split("_")[1])
    if cluster in parameter_list:
        # Already built when the cluster's other parameter was visited.
        continue

    parameter_list[cluster] = {
        "kernel": "rbf",
        "gamma": best_ocsvm_params[f"gamma_{cluster}"],
        "nu": best_ocsvm_params[f"nu_{cluster}"],
    }

best parameters and values

In [None]:
# Architecture of the autoencoder that produced the encodings, plus the
# validation loss stored in its checkpoint.
autoencoder_architecture = {
    "input_dim": existing_model_architecture["input_dim"],
    "hidden_dims": existing_model_architecture["hidden_dims"],
    "latent_dim": existing_model_architecture["latent_dim"],
    "activation_type": existing_model_architecture["activation_type"],
    "negative_slope": existing_model_architecture["negative_slope"],
    "output_activation_type": existing_model_architecture["output_activation_type"],
    "val_loss": checkpoint["val_loss"],
}

print("Best autoencoder model:")
pprint.pprint(autoencoder_architecture, sort_dicts=False)
print("")

print("Reconstruction error:")
pprint.pprint(reconstruction_error, sort_dicts=False)
print("")

best_dbscan_parameters = {
    "eps": eps,
    "min_samples": min_samples,
    # Report the metric DBSCAN was actually fitted with: when the tuning is
    # overridden this is the override's metric, which need not equal the one
    # in `dbscan_tuning_parameters`.
    "distance_metric": dbscan_distance_metric,
    "evaluation_metric": dbscan_tuning_parameters["evaluation_metric"],
    "score": (
        dbscan_override_params["score"]
        if override_dbscan_tuning
        else best_trial_dbscan.value
    ),
    "n_clusters": n_clusters,
    "cluster_data_points": cluster_data_points,
}

print("Best dbscan parameters")
pprint.pprint(best_dbscan_parameters, sort_dicts=False)
print("")

print("Best ocsvm parameters")
print(f"Tree algorithm: {dbocsvm_tree_algorithm}")
# NOTE(review): the OC-SVM study is run with metric="f1", so the value
# labelled "Accuracy" is presumably an F1 score — confirm before relabelling.
print(f"Accuracy: {dbocsvm_study.best_value}")
pprint.pprint(parameter_list, sort_dicts=False)

In [None]:
import json

# Best value of the OC-SVM study; used as this run's score.
best_score = dbocsvm_study.best_value

# NOTE(review): the "accuracy" key holds the study's best value, which is
# requested with metric="f1" — the key is kept as-is for existing readers
# of the results file.
tuning_result = {
    "dbscan": best_dbscan_parameters,
    "ocsvm": {
        "tree_algorithm": dbocsvm_tree_algorithm,
        "accuracy": best_score,
        "parameters": parameter_list,
    },
}

# Skeleton of a brand-new results file.
results = {
    "max_score": 0,
    "autoencoder_architecture": autoencoder_architecture,
    "reconstruction_error": reconstruction_error,
    "tuning_results": {},
}

# Create whatever directory `results_path` points into (not a hard-coded one).
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

results_to_write = None
if os.path.exists(results_path):
    # Read and close the existing file before deciding whether to rewrite it.
    with open(results_path, "r") as file:
        existing_results = json.load(file)

    # Only a run that beats the stored best score is appended.
    if existing_results["max_score"] < best_score:
        existing_results["max_score"] = best_score
        tuning_result_id = len(existing_results["tuning_results"])
        tuning_result["score"] = best_score
        existing_results["tuning_results"][tuning_result_id] = tuning_result
        results_to_write = existing_results
else:
    results["max_score"] = best_score
    tuning_result["score"] = best_score
    results["tuning_results"][0] = tuning_result
    results_to_write = results

if results_to_write is not None:
    # Serialise first: if a value is not JSON-serialisable the error is raised
    # here, before the existing file has been truncated, so earlier results
    # are not lost.
    serialized_results = json.dumps(results_to_write)
    with open(results_path, "w") as f:
        f.write(serialized_results)