In [1]:
from sklearn.metrics import silhouette_score
import itertools
# Custom Helper Functions
from helper_functions import print_boxed_text, save_boxed_text_to_file, phi_coefficient, one_hot_encode, standardize_numeric

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import hdbscan
from sklearn.metrics import silhouette_score

In [2]:
# Load the dataset and discard its "Unnamed: 0" column
# (presumably an index column written out by an earlier to_csv call — TODO confirm).
DATA = (
    pd.read_csv("/Users/leo/Programming/PLR/Leo/data/dataset_2.csv")
    .drop(columns=["Unnamed: 0"])
)

In [3]:
# Search space
#
# Every list below is one axis of the grid search: the two preprocessing
# thresholds, the t-SNE keyword arguments and the HDBSCAN keyword arguments.

# Candidate thresholds for the skew filter and the correlation filter
skew_thresholds = [0.7, 0.8, 0.9, 1.0]
corr_thresholds = [0.5, 0.6, 0.7, 0.8]

# One keyword-argument dict per t-SNE configuration
# (n_components varies slowest, learning_rate fastest).
tsne_params = [
    {'n_components': dims, 'perplexity': perplexity, 'learning_rate': rate}
    for dims, perplexity, rate in itertools.product(
        [2, 3],
        [30, 40, 50, 60],
        [200, 500, 1000],
    )
]

# One keyword-argument dict per HDBSCAN configuration
# (min_cluster_size varies slowest, min_samples fastest).
hdbscan_params = [
    {'min_cluster_size': cluster_size, 'min_samples': samples}
    for cluster_size, samples in itertools.product([5, 10, 15, 20], [1, 5, 10, 15])
]

In [4]:
#Drop skewed features
def drop_skewed_features(df, threshold=0.5):
    """
    Drop features whose values are concentrated on one side.

    The "skewness" of a column is measured as its mean; for a column that
    only holds 0s and 1s this is the proportion of 1s.

    Parameters:
        df (DataFrame): The input DataFrame with binary features.
        threshold (float): A column is dropped when its mean is greater than
            ``threshold`` or less than ``1 - threshold``. Values of 0.5 or
            below make the two ranges meet or overlap, so with the default
            of 0.5 only columns whose mean is exactly 0.5 survive, and below
            0.5 every column with a defined mean is dropped.

    Returns:
        DataFrame: A new DataFrame with skewed features removed. ``df`` itself
        is left unchanged.
    """
    upper_bound = threshold
    lower_bound = 1 - threshold

    def _is_skewed(col):
        # Comparisons with NaN are False, so a column whose mean is NaN is kept.
        mean_value = df[col].mean()
        return mean_value > upper_bound or mean_value < lower_bound

    dropped_features = [col for col in df.columns if _is_skewed(col)]

    return df.drop(columns=dropped_features)


In [5]:
def drop_correlated_features(df, threshold=0.4):
    """
    Drop highly correlated features based on phi coefficient.

    For every pair of columns a contingency table is built with
    ``pd.crosstab`` and passed to ``phi_coefficient``. Columns are then
    scanned in their original order; a column is dropped as soon as it has an
    absolute phi value above ``threshold`` with any other column that has not
    itself been dropped. For a correlated pair this removes the column that
    appears first and keeps the later one.

    Parameters:
        df (DataFrame): The input DataFrame with binary features.
        threshold (float): The phi coefficient threshold. A column is dropped
            when its absolute phi coefficient with a surviving column is
            above this value.

    Returns:
        DataFrame: A new DataFrame with highly correlated features removed.
        ``df`` itself is left unchanged.
    """
    columns = list(df.columns)

    # Square table of pairwise phi values; the diagonal is never filled and stays NaN.
    phi_values = pd.DataFrame(index=df.columns, columns=df.columns)

    # Visit each unordered pair once by position. Comparing the labels
    # themselves (col1 >= col2) would raise TypeError for labels that cannot
    # be ordered against each other, e.g. a mix of int and str names.
    # NOTE(review): relies on phi_coefficient giving the same value for a
    # table and its transpose, as the phi coefficient does by definition.
    for col1, col2 in itertools.combinations(columns, 2):
        contingency_table = pd.crosstab(df[col1], df[col2])
        phi = phi_coefficient(contingency_table)
        phi_values.loc[col1, col2] = phi
        phi_values.loc[col2, col1] = phi  # Symmetric matrix

    # The frame starts out with object dtype; convert to float
    phi_values = phi_values.astype(float)

    # Identify columns to drop
    dropped = set()
    for col1 in columns:
        for col2 in columns:
            if col2 == col1 or col2 in dropped:
                continue
            # A NaN phi value compares False and therefore never triggers a drop.
            if abs(phi_values.loc[col1, col2]) > threshold:
                dropped.add(col1)
                break  # col1 is decided; no need to look at further partners

    # Pass an ordered list rather than the set so the call is deterministic.
    to_drop = [col for col in columns if col in dropped]
    return df.drop(columns=to_drop)


In [6]:
# Function to preprocess data
def preprocess_data(df, skew_threshold, corr_threshold):
    """
    Apply both feature filters in sequence.

    Skewed features are removed first with ``drop_skewed_features``; the
    result is then passed to ``drop_correlated_features``.

    Parameters:
        df (DataFrame): The input DataFrame.
        skew_threshold (float): Threshold forwarded to ``drop_skewed_features``.
        corr_threshold (float): Threshold forwarded to ``drop_correlated_features``.

    Returns:
        DataFrame: Whatever ``drop_correlated_features`` returns for the
        skew-filtered data.
    """
    return drop_correlated_features(
        drop_skewed_features(df, skew_threshold),
        corr_threshold,
    )

# Modified cluster_and_evaluate function for t-SNE
def _embed_with_tsne(preprocessed_data, tsne_params):
    """
    Project preprocessed features with t-SNE.

    Parameters:
        preprocessed_data (DataFrame): Output of ``preprocess_data``.
        tsne_params (dict): Keyword arguments for ``TSNE``. ``random_state``
            defaults to 42 unless the dict supplies its own value.

    Returns:
        The embedding returned by ``TSNE.fit_transform``, or ``None`` (after
        printing the reason) when ``preprocessed_data`` is empty or
        ``fit_transform`` raises ``ValueError``.
    """
    if preprocessed_data.empty:
        print("Preprocessed data is empty. Skipping this iteration.")
        return None

    # Merge instead of passing random_state as a second keyword so that a
    # caller-supplied random_state overrides the default rather than raising
    # "got multiple values for keyword argument".
    tsne_model = TSNE(**{"random_state": 42, **tsne_params})
    try:
        return tsne_model.fit_transform(preprocessed_data)
    except ValueError as e:
        print(f"Error during t-SNE transformation: {e}")
        return None


def _cluster_and_score(data_tsne, hdbscan_params):
    """
    Cluster an embedding with HDBSCAN and score the result.

    Parameters:
        data_tsne: Embedding from ``_embed_with_tsne``, or ``None``.
        hdbscan_params (dict): Keyword arguments for ``hdbscan.HDBSCAN``.

    Returns:
        tuple: ``(score, n_clusters)``. ``score`` is the silhouette score when
        between 5 and 10 clusters (inclusive) were found and ``None``
        otherwise. ``n_clusters`` is 0 when ``data_tsne`` is ``None``.
    """
    if data_tsne is None:
        return None, 0

    cluster_labels = hdbscan.HDBSCAN(**hdbscan_params).fit_predict(data_tsne)

    # The label -1 is not counted as a cluster.
    n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    if not 5 <= n_clusters <= 10:
        return None, n_clusters

    # NOTE(review): points labelled -1 are still passed to silhouette_score,
    # where they are scored like any other label — confirm this is intended.
    return silhouette_score(data_tsne, cluster_labels), n_clusters


def cluster_and_evaluate(data, skew_threshold, corr_threshold, tsne_params, hdbscan_params):
    """
    Run the full pipeline once: preprocess, embed with t-SNE, cluster, score.

    Parameters:
        data (DataFrame): Raw input features.
        skew_threshold (float): Threshold forwarded to ``preprocess_data``.
        corr_threshold (float): Threshold forwarded to ``preprocess_data``.
        tsne_params (dict): Keyword arguments for ``TSNE``.
        hdbscan_params (dict): Keyword arguments for ``hdbscan.HDBSCAN``.

    Returns:
        tuple: ``(score, n_clusters, preprocessed_data, data_tsne)`` when
        between 5 and 10 clusters were found. Otherwise
        ``(-1, n_clusters, None, None)``, with ``n_clusters`` equal to 0 when
        preprocessing left no data or t-SNE raised ``ValueError``.
    """
    preprocessed_data = preprocess_data(data, skew_threshold, corr_threshold)
    data_tsne = _embed_with_tsne(preprocessed_data, tsne_params)
    score, n_clusters = _cluster_and_score(data_tsne, hdbscan_params)
    if score is None:
        return -1, n_clusters, None, None
    return score, n_clusters, preprocessed_data, data_tsne


# Grid search with t-SNE parameters
best_score = -1
best_params = None
best_cluster_number = None
best_preprocessed_data = None
best_data_tsne = None

for skew_threshold, corr_threshold in itertools.product(skew_thresholds, corr_thresholds):
    # Preprocessing depends only on the two thresholds, so run it once per
    # pair instead of once per t-SNE/HDBSCAN combination.
    preprocessed_data = preprocess_data(DATA, skew_threshold, corr_threshold)

    for tsne_param_values in tsne_params:
        # The embedding does not depend on the HDBSCAN settings and the t-SNE
        # seed is fixed, so fit once and reuse it for every HDBSCAN setting.
        # The iteration order (t-SNE outer, HDBSCAN inner) matches
        # itertools.product(tsne_params, hdbscan_params).
        data_tsne = _embed_with_tsne(preprocessed_data, tsne_param_values)

        for hdbscan_param_values in hdbscan_params:
            score, n_clusters = _cluster_and_score(data_tsne, hdbscan_param_values)
            if score is None:
                score = -1  # sentinel: never beats best_score
            print(f"Silhouette Score: {score:.4f}, Number of Clusters: {n_clusters}, Skew Threshold: {skew_threshold}, Correlation Threshold: {corr_threshold}, t-SNE Parameters: {tsne_param_values}, HDBSCAN Parameters: {hdbscan_param_values}")
            if score > best_score:
                best_score = score
                best_params = (skew_threshold, corr_threshold, tsne_param_values, hdbscan_param_values)
                best_cluster_number = n_clusters
                best_preprocessed_data = preprocessed_data
                best_data_tsne = data_tsne

# Output the best parameters and score
if best_params is None:
    # Without this guard best_params[0] raises TypeError after the whole
    # search has already run.
    print("No parameter combination produced between 5 and 10 clusters.")
else:
    print(f"Best Silhouette Score: {best_score}")
    print(f"Best Skew Threshold: {best_params[0]}")
    print(f"Best Correlation Threshold: {best_params[1]}")
    print(f"Best t-SNE Parameters: {best_params[2]}")
    print(f"Best HDBSCAN Parameters: {best_params[3]}")
    print(f"Number of Clusters: {best_cluster_number}")

Silhouette Score: -1.0000, Number of Clusters: 566, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 1}
Silhouette Score: -1.0000, Number of Clusters: 259, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 5}
Silhouette Score: -1.0000, Number of Clusters: 4, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 10}
Silhouette Score: -1.0000, Number of Clusters: 2, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 15}
Silhouette Score: -1.0000, Number 