In [1]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import silhouette_score
import itertools
import hdbscan

from aencoder import Autoencoder, train_autoencoder
from utils import *

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Choose which dataset to run; the results file name embeds the same number
dataset_number = 5
output_file = f"/Users/leo/Programming/PLR/Leo/main/final/grid_results/aencoder_grid_{dataset_number}.txt"

# Read the CSV and discard its "Unnamed: 0" column
# (presumably a row index saved into the file by an earlier export -- confirm)
DATA = (
    pd.read_csv(f"/Users/leo/Programming/PLR/Leo/data/dataset_{dataset_number}.csv")
    .drop(columns=["Unnamed: 0"])
)

In [3]:
# Hyper-parameter grid: one list of candidate values per tunable setting
skew_thresholds = [0.75, 0.8, 0.9]
corr_thresholds = [0.5, 0.6, 0.9]
hidden_sizes = [32, 64]
latent_dims = [2, 8, 16, 32, 64]
learning_rates = [0.001, 0.01]
epoch_options = [500]

# Every (min_cluster_size, min_samples) pairing, stored as keyword-argument
# dicts; min_cluster_size varies slowest, min_samples fastest.
hdbscan_params = [
    {'min_cluster_size': cluster_size, 'min_samples': n_samples}
    for cluster_size, n_samples in itertools.product([5, 10, 15, 20], [10, 15, 25])
]

# Preprocessing functions
def preprocess_data(df, skew_threshold, corr_threshold):
    """Apply the two feature-dropping helpers to ``df`` in sequence.

    ``drop_skewed_features`` runs first with ``skew_threshold``; its result is
    then handed to ``drop_correlated_features`` with ``corr_threshold``.
    Both helpers are defined outside this cell (presumably supplied by the
    star import from ``utils`` -- confirm), so their exact dropping rules are
    not visible here.

    Returns:
        Whatever ``drop_correlated_features`` returns for the filtered input.
    """
    return drop_correlated_features(
        drop_skewed_features(df, skew_threshold),
        corr_threshold,
    )

# Modified function to perform clustering and calculate silhouette score
def cluster_and_evaluate_with_autoencoder(data, skew_threshold, corr_threshold, hidden_size, latent_dim, learning_rate, epochs, hdbscan_param):
    """Preprocess ``data``, encode it with an autoencoder, cluster, and score.

    Pipeline:
      1. ``preprocess_data`` filters the columns of ``data``.
      2. A fresh ``Autoencoder`` is built and passed to ``train_autoencoder``.
      3. The encoder output is clustered with ``hdbscan.HDBSCAN``.
      4. If the number of clusters (noise excluded) is between 5 and 10
         inclusive, a silhouette score is computed on the clustered points.

    Args:
        data: DataFrame passed to ``preprocess_data``.
        skew_threshold: Forwarded to ``preprocess_data``.
        corr_threshold: Forwarded to ``preprocess_data``.
        hidden_size: Second positional argument of ``Autoencoder``.
        latent_dim: Third positional argument of ``Autoencoder``.
        learning_rate: Forwarded to ``train_autoencoder``.
        epochs: Forwarded to ``train_autoencoder``.
        hdbscan_param: Dict unpacked as keyword arguments to ``hdbscan.HDBSCAN``.

    Returns:
        A 4-tuple ``(score, n_clusters, preprocessed_data, data_encoded)``.
        ``score`` is ``-1`` and the last two items are ``None`` when the
        preprocessed frame is empty (``n_clusters`` is then ``0``) or when the
        cluster count falls outside the 5..10 range.
    """
    preprocessed_data = preprocess_data(data, skew_threshold, corr_threshold)
    if preprocessed_data.empty:
        print("Preprocessed data is empty. Skipping this iteration.")
        return -1, 0, None, None

    input_size = preprocessed_data.shape[1]
    model = Autoencoder(input_size, hidden_size, latent_dim)
    data_tensor = torch.tensor(preprocessed_data.values, dtype=torch.float32)
    # NOTE(review): assumes train_autoencoder trains `model` in place; the
    # encoding below reads from `model`, not from the call's return value,
    # so that return value is intentionally not bound. TODO confirm.
    train_autoencoder(model, data_tensor, learning_rate, epochs)

    model.eval()
    with torch.no_grad():
        data_encoded = model.encoder(data_tensor).numpy()

    clusterer = hdbscan.HDBSCAN(**hdbscan_param)
    cluster_labels = clusterer.fit_predict(data_encoded)

    # HDBSCAN marks noise points with the label -1; they are not a cluster.
    noise_mask = cluster_labels == -1
    n_clusters = len(set(cluster_labels)) - (1 if noise_mask.any() else 0)
    if 5 <= n_clusters <= 10:
        # Score only the points that were assigned to a cluster. Passing the
        # noise points through would make silhouette_score treat label -1 as
        # one more cluster, which contradicts how n_clusters is counted above.
        clustered = ~noise_mask
        score = silhouette_score(data_encoded[clustered], cluster_labels[clustered])
        return score, n_clusters, preprocessed_data, data_encoded

    return -1, n_clusters, None, None

# Run the full grid search, logging one line per combination to both stdout and
# the results file, then append a summary of the best-scoring combination.
with open(output_file, 'w') as file:
    # Grid search with autoencoder
    # -1 is the score cluster_and_evaluate_with_autoencoder returns for every
    # rejected combination, so it doubles as the "nothing usable yet" sentinel.
    best_score_ae = -1
    best_params_ae = None
    best_cluster_number_ae = None
    best_preprocessed_data_ae = None
    best_encoded_data = None

    for skew_threshold, corr_threshold, hidden_size, latent_dim, learning_rate, epochs, hdbscan_param in itertools.product(skew_thresholds, corr_thresholds, hidden_sizes, latent_dims, learning_rates, epoch_options, hdbscan_params):
        score, n_clusters, preprocessed_data, encoded_data = cluster_and_evaluate_with_autoencoder(DATA, skew_threshold, corr_threshold, hidden_size, latent_dim, learning_rate, epochs, hdbscan_param)
        output = f"Silhouette Score: {score:.4f}, Number of Clusters: {n_clusters}, Skew Threshold: {skew_threshold}, Correlation Threshold: {corr_threshold}, Hidden Size: {hidden_size}, Latent Dim: {latent_dim}, Learning Rate: {learning_rate}, Epochs: {epochs}, HDBSCAN Params: {hdbscan_param}\n"
        print(output)
        file.write(output)
        # Flush after every combination so partial results survive an interrupted run
        file.flush()

        # Remember the best combination seen so far. Without this update the
        # summary below can never fire, because best_score_ae stays at -1.
        # Strict '>' keeps the earliest combination on ties and never records
        # a rejected (-1) run.
        if score > best_score_ae:
            best_score_ae = score
            # Tuple order matches the indices used in the summary below
            best_params_ae = (skew_threshold, corr_threshold, hidden_size, latent_dim, learning_rate, epochs, hdbscan_param)
            best_cluster_number_ae = n_clusters
            best_preprocessed_data_ae = preprocessed_data
            best_encoded_data = encoded_data

    # Write the best parameters and score
    if best_score_ae != -1:
        best_params_output = f"\nBest Silhouette Score: {best_score_ae}\nBest Skew Threshold: {best_params_ae[0]}\nBest Correlation Threshold: {best_params_ae[1]}\nBest Hidden Size: {best_params_ae[2]}\nBest Latent Dim: {best_params_ae[3]}\nBest Learning Rate: {best_params_ae[4]}\nBest Epochs: {best_params_ae[5]}\nBest HDBSCAN Parameters: {best_params_ae[6]}\nNumber of Clusters: {best_cluster_number_ae}\n"
        print(best_params_output)
        file.write(best_params_output)
        file.flush()

Silhouette Score: -1.0000, Number of Clusters: 82, Skew Threshold: 0.75, Correlation Threshold: 0.5, Hidden Size: 32, Latent Dim: 2, Learning Rate: 0.001, Epochs: 500, HDBSCAN Params: {'min_cluster_size': 5, 'min_samples': 10}

Silhouette Score: -1.0000, Number of Clusters: 15, Skew Threshold: 0.75, Correlation Threshold: 0.5, Hidden Size: 32, Latent Dim: 2, Learning Rate: 0.001, Epochs: 500, HDBSCAN Params: {'min_cluster_size': 5, 'min_samples': 15}

Silhouette Score: -1.0000, Number of Clusters: 57, Skew Threshold: 0.75, Correlation Threshold: 0.5, Hidden Size: 32, Latent Dim: 2, Learning Rate: 0.001, Epochs: 500, HDBSCAN Params: {'min_cluster_size': 5, 'min_samples': 25}

Silhouette Score: -1.0000, Number of Clusters: 76, Skew Threshold: 0.75, Correlation Threshold: 0.5, Hidden Size: 32, Latent Dim: 2, Learning Rate: 0.001, Epochs: 500, HDBSCAN Params: {'min_cluster_size': 10, 'min_samples': 10}

Silhouette Score: -1.0000, Number of Clusters: 54, Skew Threshold: 0.75, Correlation Th