# Tuning with PCA

In [None]:
import pandas as pd

In [None]:
# Toggle for the Optuna study created further down in this notebook:
#   True  -> the study is created with a storage URL and a fixed study name
#            (load_if_exists=True, so an existing study is reused)
#   False -> the study is created without a storage argument
with_storage = False

In [None]:
# Location of the processed NSL-KDD training split (absolute, machine-specific path).
train_csv_path = (
    "/home/jbct/Projects/thesis/db-ocsvm/data/processed/NSL-KDD/train_set_full.csv"
)

train_set = pd.read_csv(train_csv_path)
print(train_set.shape)  # shape of the full training set, before subsampling

# Keep a reproducible 10% random sample (fixed seed) for the rest of the notebook.
train_set = train_set.sample(frac=0.1, random_state=42)
train_set.head()

In [None]:
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to retain 99.998% of the
# variance (a float n_components between 0 and 1 is treated by scikit-learn
# as the fraction of variance to preserve).
variance_to_keep = 0.99998
pca = PCA(n_components=variance_to_keep)

# Fit on the training data and replace it with its projection onto the
# selected components. NOTE: this overwrites train_set, so re-running the
# cell applies PCA to already-transformed data.
train_set = pca.fit_transform(train_set)

# Report how many components were kept and how much variance they explain.
total_variance = sum(pca.explained_variance_ratio_)
print(f"Number of components selected: {pca.n_components_}")
print(f"Total variance explained: {total_variance:.4f}")

In [None]:
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
)


def get_score(X, labels, metric_name, mask=None):
    """Return a clustering quality score where larger always means better.

    Args:
        X: Feature matrix of the clustered samples.
        labels: Cluster label of each sample in ``X``.
        metric_name: One of ``"silhouette"``, ``"davies_bouldin"`` or
            ``"calinski_harabasz"``.
        mask: Optional index/boolean selector applied to both ``X`` and
            ``labels`` before scoring (e.g. to drop noise points).

    Returns:
        The score computed by the matching ``sklearn.metrics`` function. The
        Davies-Bouldin index is negated so that all three metrics can be
        maximized.

    Raises:
        ValueError: If ``metric_name`` is not one of the supported metrics.
    """
    # Restrict both arrays to the selected samples first.
    if mask is not None:
        X, labels = X[mask], labels[mask]

    if metric_name == "davies_bouldin":
        # Lower Davies-Bouldin is better, so flip the sign for maximization.
        return -davies_bouldin_score(X, labels)

    if metric_name == "calinski_harabasz":
        return calinski_harabasz_score(X, labels)

    if metric_name != "silhouette":
        raise ValueError(f"Unknown metric: {metric_name}")

    return silhouette_score(X, labels)

In [None]:
from sklearn.cluster import DBSCAN
import optuna

def objective(trial, metric_name="silhouette"):
    """Optuna objective: cluster ``train_set`` with DBSCAN and score the result.

    Samples ``eps`` and ``min_samples`` from the trial, fits DBSCAN (euclidean
    distance) on the module-level ``train_set`` and evaluates the clustering
    with ``get_score``.

    Args:
        trial: The Optuna trial used to sample the hyper-parameters.
        metric_name: Name of the metric forwarded to ``get_score``
            (``"silhouette"``, ``"davies_bouldin"`` or ``"calinski_harabasz"``).

    Returns:
        The score to maximize, or ``-inf`` when fewer than two clusters
        (noise excluded) were found.

    Raises:
        ValueError: Propagated from ``get_score`` for an unknown metric name.
    """
    # Define the parameter search space
    eps = trial.suggest_float("eps", 0.1, 15.0)
    min_samples = trial.suggest_int("min_samples", 20, 50)
    metric = "euclidean"

    # Create and fit DBSCAN
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=-1)
    cluster_labels = dbscan.fit_predict(train_set)

    # DBSCAN marks noise with the label -1; noise is not a cluster, so it is
    # excluded both from the cluster count and from the scored samples.
    clustered = cluster_labels != -1
    n_clusters = len(set(cluster_labels[clustered]))

    if n_clusters < 2:
        return -float("inf")  # Penalize solutions with too few clusters

    # Having at least two distinct cluster labels implies at least two
    # non-noise samples, so no separate sample-count check is required.
    # The noise mask is applied for every metric: otherwise the -1 label
    # would be scored as if it were a genuine cluster.
    return get_score(train_set, cluster_labels, metric_name, clustered)

In [None]:
from pathlib import Path

# Metric forwarded to objective(); also used when reporting the best score.
metric_name = "silhouette"
# Directory (relative to the working directory) holding the SQLite study database.
storage_dir = Path("optuna_storage")
storage_path = f"sqlite:///{storage_dir.as_posix()}/01_dbscan.db"
# Number of trials to run in this invocation.
trials = 10

if with_storage:
    # SQLite creates the database file on demand but not its parent
    # directory, so make sure the directory exists before opening the study.
    storage_dir.mkdir(parents=True, exist_ok=True)

    # Study with storage: reuse the stored study if it already exists.
    study = optuna.create_study(
        direction="maximize",
        storage=storage_path,
        study_name="01_dbscan",
        load_if_exists=True,
    )
else:
    # Study without a storage argument.
    study = optuna.create_study(direction="maximize")

# The optimization call is the same for both kinds of study.
study.optimize(lambda trial: objective(trial, metric_name), n_trials=trials)

In [None]:
# Report the best trial of the study: its hyper-parameters and objective value.
best_params = study.best_params
best_value = study.best_value
print("Best parameters:", best_params)
print(f"Best {metric_name} score:", best_value)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Re-fit DBSCAN on the training data with the best hyper-parameters found by
# the study; `clusters` holds one label per sample.
best_params = study.best_params
eps, min_samples = best_params["eps"], best_params["min_samples"]
dbscan = DBSCAN(
    eps=eps,
    min_samples=min_samples,
    metric="euclidean",
    n_jobs=-1,
)
clusters = dbscan.fit_predict(train_set)

In [None]:
# Count the distinct cluster labels, leaving out the noise label (-1).
cluster_ids = [label for label in np.unique(clusters) if label != -1]
n_clusters = len(cluster_ids)
print(f"Number of clusters (excluding noise): {n_clusters}")

In [None]:
# Tally how many samples received each label and print one line per label;
# the label -1 is reported as noise rather than as a cluster.
unique, counts = np.unique(clusters, return_counts=True)
print("\nCluster distribution:")
for label, count in zip(unique, counts):
    line = (
        f"Noise points: {count}"
        if label == -1
        else f"Cluster {label}: {count} points"
    )
    print(line)

In [None]:
from sklearn.manifold import TSNE

# Dimensionality reduction using t-SNE (2-D embedding, fixed seed)
tsne = TSNE(n_components=2, random_state=42)
reduced_data = tsne.fit_transform(train_set)

# Resample the "Paired" colormap to one colour per distinct label (the noise
# label -1 counts as a label here). plt.get_cmap is used because
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9.
cmap_colors = plt.get_cmap("Paired", len(set(clusters)))

# Visualize the clusters: each point is coloured by its DBSCAN label
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    reduced_data[:, 0],
    reduced_data[:, 1],
    c=clusters,
    cmap=cmap_colors,
    alpha=0.6,
)
plt.title("DBSCAN Clusters Visualized after t-SNE")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.colorbar(scatter, label="Cluster Label")
plt.show()