In [None]:
import os
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Locations of the train/test CSV files, relative to the working directory.
train_set_path = "../../../datasets/train_set.csv"
test_set_path = "../../../datasets/test_set.csv"

# Checkpoint of the pretrained autoencoder used as the feature extractor.
existing_model_path = "../../../autoencoder/autoencoder.pth"

# Metric returned by the Optuna objective; any other value makes it raise ValueError.
tuning_metric = "f1"  # f1 or accuracy
# Smoke-test switch: smaller training sample, subsampled test set, fewer
# trials and smaller SMOTE targets.
test_run = True

if test_run:
    use_sample = True
    train_frac = 0.01
    with_storage = False
    trials = 10
else:
    # NOTE(review): storage_path is only defined in this branch, refers to a
    # DBSCAN study, and the study cell assigns its own storage_path when
    # with_storage is True — this value looks unused; confirm before relying on it.
    os.makedirs("optuna_storage", exist_ok=True)
    storage_path = "sqlite:///optuna_storage/dbscan_study.db"
    use_sample = True
    train_frac = 0.3
    with_storage = False
    trials = 100

# Autoencoder Setup

In [None]:
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


def extract_encoded_features(X_data, autoencoder, device, batch_size=256):
    """Encode ``X_data`` with the autoencoder and return the latent vectors.

    Args:
        X_data: Numeric feature rows accepted by ``torch.FloatTensor``.
        autoencoder: Object exposing ``encode(tensor)``; called once per batch.
        device: Device every batch is moved to before it is encoded.
        batch_size: Number of rows fed to the encoder at a time.

    Returns:
        NumPy array holding the per-batch encodings stacked row-wise.
    """
    batches = DataLoader(
        TensorDataset(torch.FloatTensor(X_data)), batch_size=batch_size
    )

    # Pure inference: no gradients are needed, and each result is moved back
    # to the CPU so it can be converted to NumPy.
    with torch.no_grad():
        latent_chunks = [
            autoencoder.encode(features.to(device)).cpu().numpy()
            for (features,) in batches
        ]

    return np.vstack(latent_chunks)

In [None]:
from autoencoder import BatchNormAutoencoder


# Architecture of the pretrained autoencoder. The keys are exactly the keyword
# arguments of BatchNormAutoencoder, so the dict can be unpacked into the call.
existing_model_architecture = {
    "input_dim": 15,
    "hidden_dims": [13, 11],
    "latent_dim": 9,
    "activation_type": "ReLU",
    "negative_slope": 0,
    "output_activation_type": "Sigmoid",
}

# Rebuild the network and place it on the selected device.
autoencoder = BatchNormAutoencoder(**existing_model_architecture).to(device)

# Load best model. map_location remaps the stored tensors onto `device`, so a
# checkpoint saved on a GPU still loads when the notebook falls back to the CPU.
checkpoint = torch.load(existing_model_path, map_location=device)
autoencoder.load_state_dict(checkpoint["model_state_dict"])

# Switch to evaluation mode before the model is used for feature extraction.
autoencoder.eval()

# Dataset setup

Import train dataset

In [None]:
import pandas as pd

# Load the training split; when sampling is enabled keep a reproducible
# random fraction of it.
train_dataset = pd.read_csv(train_set_path)
if use_sample:
    train_dataset = train_dataset.sample(frac=train_frac, random_state=42)

# Separate the two label columns from the feature matrix.
label_columns = ["attack_binary", "attack_categorical"]
y_train = train_dataset["attack_binary"].values
X_train = train_dataset.drop(columns=label_columns).values

train_row_count = train_dataset.shape[0]
print(f"train set count: {train_row_count:,}")
print(f"unique values: {train_dataset['attack_categorical'].unique()}")
train_dataset.head(3)

Import test set

In [None]:
# Load the test split; a smoke-test run keeps only a reproducible random
# fraction (twice the training fraction) and renumbers the rows.
test_dataset = pd.read_csv(test_set_path)
if test_run:
    test_sample_frac = train_frac * 2
    test_dataset = test_dataset.sample(frac=test_sample_frac, random_state=42)
    test_dataset = test_dataset.reset_index(drop=True)

test_row_count = test_dataset.shape[0]
print(f"test set count: {test_row_count:,}")
print(f"unique values: {test_dataset['attack_categorical'].value_counts()}")
test_dataset.head(3)

Splitting into validation and test sets

In [None]:
from sklearn.model_selection import train_test_split

# train_test_split returns (first part, held-out part). With test_size=0.5 each
# part receives half of the rows: the first is kept as the final test set and
# the second as the validation set used for tuning. The split is stratified on
# the attack class column.
test_df, val_df = train_test_split(
    test_dataset,
    test_size=0.5,
    random_state=42,
    stratify=test_dataset["attack_categorical"],
)

test set

In [None]:
# Split the test frame into the feature matrix (X) and the binary labels (y).
X_test = test_df.drop(columns=["attack_binary", "attack_categorical"]).values
y_test = test_df["attack_binary"].values
# Per-sample attack class names, used by the per-class evaluation cells.
y_test_class = test_df["attack_categorical"].values

# Report the size and class distribution of the test split.
print(f"test set count: {test_df.shape[0]:,}")
print(f"unique values: {test_df['attack_categorical'].value_counts()}")
test_df.head(3)

validation set

In [None]:
# Split the validation frame into the feature matrix (X), the binary labels (y)
# and the per-sample attack class names (needed by SMOTE).
X_val = val_df.drop(columns=["attack_binary", "attack_categorical"]).values
y_val = val_df["attack_binary"].values
y_val_class = val_df["attack_categorical"].values

# Report the size and class distribution of the validation split.
print(f"validation set count: {val_df.shape[0]:,}")
print(f"unique values: {val_df['attack_categorical'].value_counts()}")
val_df.head(3)

Applying SMOTE to the validation set

In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Class distribution of the validation split before oversampling.
print("Before SMOTE:")
print(f"Val set count: {X_val.shape[0]:,}")
before_counts = pd.Series(y_val_class).value_counts()
print(before_counts)

# Target number of samples per attack class after oversampling; the
# smoke-test run uses a much smaller target than the full run.
samples_per_attack_class = 100 if test_run else 4000
sampling_strategy = {
    attack_class: samples_per_attack_class
    for attack_class in ("dos", "portScan", "bruteForce", "pingScan")
}

# Oversample the validation set on the class labels so that every listed
# attack class reaches its target count.
smote = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=3, random_state=42)
X_val_resampled, y_val_resampled = smote.fit_resample(X_val, y_val_class)

# Class distribution after oversampling.
print("\nAfter SMOTE:")
print(f"Val set count: {X_val_resampled.shape[0]:,}")
after_counts = pd.Series(y_val_resampled).value_counts()
print(after_counts)

# Collapse the class names into the binary encoding used for scoring:
# "benign" -> 1, every other class -> -1.
y_val_resampled = np.where(y_val_resampled == "benign", 1, -1)

# Hyperparameter tuning

Objective function, maximizing the configured tuning metric (F1-score by default)

In [None]:
# Project every split through the autoencoder's encoder; the One-Class SVM is
# fitted and scored on these encodings. The validation features are the
# SMOTE-resampled ones, so they line up with y_val_resampled.
X_train_encoded = extract_encoded_features(X_train, autoencoder, device)
X_val_encoded = extract_encoded_features(X_val_resampled, autoencoder, device)
X_test_encoded = extract_encoded_features(X_test, autoencoder, device)
# Shapes of the encoded train / validation / test matrices.
print(X_train_encoded.shape)
print(X_val_encoded.shape)
print(X_test_encoded.shape)

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import optuna


def _binary_metrics(y_true, y_pred):
    """Return accuracy, F1, precision and recall, treating -1 (anomaly) as the positive class."""
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, pos_label=-1),
        "precision": precision_score(y_true, y_pred, pos_label=-1),
        "recall": recall_score(y_true, y_pred, pos_label=-1),
    }


def _print_metrics(metrics):
    """Print a metrics dict with every value formatted as a percentage with two decimals."""
    print({name: f"{value * 100:.2f}" for name, value in metrics.items()})


def objective(trial):
    """Optuna objective: fit a One-Class SVM and score it on the validation set.

    Samples ``nu`` and ``gamma``, fits an RBF One-Class SVM on the encoded
    training data and prints validation and test metrics. Test metrics are
    printed for monitoring only; the returned score comes from the validation
    set alone.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation F1 score or accuracy, depending on ``tuning_metric``.

    Raises:
        ValueError: If ``tuning_metric`` is neither "f1" nor "accuracy".
    """
    # Fail fast: reject a bad configuration before paying for a model fit.
    if tuning_metric not in ("f1", "accuracy"):
        raise ValueError("Invalid tuning metric")

    nu = trial.suggest_float("nu", 0.01, 0.5)
    gamma = trial.suggest_float("gamma", 0.01, 1.0)

    ocsvm = OneClassSVM(kernel="rbf", nu=nu, gamma=gamma)
    ocsvm.fit(X_train_encoded)

    val_metrics = _binary_metrics(y_val_resampled, ocsvm.predict(X_val_encoded))
    print("Validation Results:")
    _print_metrics(val_metrics)

    print("\nTest Results:")
    test_metrics = _binary_metrics(y_test, ocsvm.predict(X_test_encoded))
    _print_metrics(test_metrics)

    return val_metrics[tuning_metric]


# Seed the sampler (TPE is Optuna's default algorithm) so a rerun is
# reproducible, in line with the fixed random_state=42 used elsewhere in this
# notebook.
sampler = optuna.samplers.TPESampler(seed=42)

if with_storage:
    # Persist the study in SQLite so an interrupted search can be resumed.
    # NOTE(review): hard-coded absolute location; it replaces any storage_path
    # set in the configuration cell — confirm this is the intended database.
    storage_path = "sqlite:////work/base_ocsvm_study.db"
    study = optuna.create_study(
        direction="maximize",
        sampler=sampler,
        storage=storage_path,
        study_name="base_ocsvm_study",
        load_if_exists=True,
    )
else:
    # In-memory study: results are lost when the kernel stops.
    study = optuna.create_study(direction="maximize", sampler=sampler)

study.optimize(objective, n_trials=trials)


print(f"Best score: {study.best_value:.3f}")
print(f"Best parameters: {study.best_params}")

In [None]:
import optuna
from plotly.io import show

# Objective value of every trial, to see how the search progressed.
fig = optuna.visualization.plot_optimization_history(study)
show(fig)

In [None]:
# Empirical distribution function of the objective values reached by the study.
fig = optuna.visualization.plot_edf([study])
show(fig)

# Train the final model

In [None]:
# Hyperparameters of the best trial found by the study.
nu = study.best_params["nu"]
gamma = study.best_params["gamma"]

# Refit a One-Class SVM with those hyperparameters on the encoded training data.
best_ocsvm = OneClassSVM(kernel="rbf", nu=nu, gamma=gamma)

best_ocsvm.fit(X_train_encoded)

# Evaluating the model

In [None]:
# Predictions of the final model on the encoded test set (scored against y_test below with -1 as the anomaly label).
y_pred = best_ocsvm.predict(X_test_encoded)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Binary confusion matrix; rows are true labels and columns are predictions, both ordered as [-1, 1].
cm = confusion_matrix(y_test, y_pred, labels=[-1, 1])


def plot_confusion_matrix(cm, labels, title):
    """Show a confusion matrix as an annotated heatmap.

    Args:
        cm: Matrix of integer counts to draw.
        labels: Tick labels used for both the predicted and the actual axis.
        title: Figure title.
    """
    _, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("Actual Label")
    ax.set_title(title)
    plt.show()


# Tick labels follow the [-1, 1] label order of `cm`: anomaly first, normal second.
plot_confusion_matrix(cm, ["Anomaly", "Normal"], "Confusion Matrix (Anomaly vs Normal)")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# plot_confusion_matrix is already defined in the previous cell, which also
# computes `cm`; reuse it here instead of redefining an identical copy that
# would silently shadow the first definition.
plot_confusion_matrix(cm, ["Anomaly", "Normal"], "Confusion Matrix (Anomaly vs Normal)")

In [None]:
from sklearn.metrics import (
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
)

# Per-label report. `labels` pins the order to [-1, 1] so that target_names
# always map to the right label (and the call still works if one label is
# missing from the data), matching the label order of the confusion matrix.
print("Classification Report:")
print(
    classification_report(
        y_test, y_pred, labels=[-1, 1], target_names=["Anomaly", "Normal"]
    )
)

# Headline metrics with the anomaly label (-1) as the positive class.
precision = precision_score(y_test, y_pred, pos_label=-1)
recall = recall_score(y_test, y_pred, pos_label=-1)
f1 = f1_score(y_test, y_pred, pos_label=-1)
acc = accuracy_score(y_test, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {acc}")

In [None]:
import numpy as np


def create_multiclass_cm(y_true_class, y_pred_binary):
    """
    Count, for every true class, how many samples were predicted as anomaly or normal.

    Args:
        y_true_class: 1-D array-like with the true class name of each sample.
        y_pred_binary: 1-D array-like of predictions aligned with
            ``y_true_class``; -1 means anomaly and 1 means normal.

    Returns:
        Tuple ``(cm, classes)``. ``classes`` holds the sorted unique class
        names. ``cm`` is a float array of shape ``(len(classes), 2)`` whose
        row ``i`` describes ``classes[i]``: column 0 is the number of samples
        predicted -1 (anomaly) and column 1 the number predicted 1 (normal).
        Column 0 is therefore the correct outcome for attack classes and
        column 1 the correct outcome for the benign class.
    """
    # Accept lists or pandas objects as well as NumPy arrays.
    y_true_class = np.asarray(y_true_class)
    y_pred_binary = np.asarray(y_pred_binary)

    classes = np.unique(y_true_class)
    cm = np.zeros((len(classes), 2))

    for i, cls in enumerate(classes):
        preds = y_pred_binary[y_true_class == cls]
        # The counting is the same for every class; only the interpretation
        # of the two columns differs between benign and attack rows.
        cm[i, 0] = np.sum(preds == -1)
        cm[i, 1] = np.sum(preds == 1)

    return cm, classes


# Build the per-class matrix (one row per true class, one column per
# predicted label) and draw it as an annotated heatmap.
cm_multi, classes = create_multiclass_cm(y_test_class, y_pred)

fig_multi, ax_multi = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm_multi,
    annot=True,
    fmt="g",
    cmap="Blues",
    xticklabels=["Detected as Anomaly", "Detected as Normal"],
    yticklabels=classes,
    ax=ax_multi,
)
ax_multi.set_ylabel("True Attack Class")
ax_multi.set_title("Confusion Matrix by Attack Class")
fig_multi.tight_layout()
plt.show()

In [None]:
# Calculate detection rates for each class
print("Detection rates by class:")
class_metrics = {}
for cls in np.unique(y_test_class):
    # Ground truth and predictions restricted to this class.
    class_indices = y_test_class == cls
    y_true_cls = y_test[class_indices]
    y_pred_cls = y_pred[class_indices]

    # Take the expected prediction from the binary ground truth rather than
    # from the class name, so the result does not depend on how the benign
    # class is spelled: normal traffic (all labels 1) should be predicted 1,
    # attack traffic should be predicted -1.
    target_label = 1 if np.all(y_true_cls == 1) else -1

    correct = np.sum(y_pred_cls == target_label)
    precision = precision_score(
        y_true_cls, y_pred_cls, pos_label=target_label, zero_division=0
    )
    recall = recall_score(
        y_true_cls, y_pred_cls, pos_label=target_label, zero_division=0
    )
    f1 = f1_score(y_true_cls, y_pred_cls, pos_label=target_label, zero_division=0)

    # Share of this class's samples that received the expected prediction.
    total = len(y_pred_cls)
    detection_rate = correct / total

    class_metrics[cls] = {
        "detection_rate": detection_rate,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "count": total,
        "correctly_detected": correct,
    }

    print(f"{cls}: {detection_rate:.4f} ({correct}/{total})")