In [1]:
from sklearn.metrics import silhouette_score
import itertools
# Custom Helper Functions
from helper_functions import print_boxed_text, save_boxed_text_to_file, phi_coefficient, one_hot_encode, standardize_numeric

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import hdbscan
from sklearn.metrics import silhouette_score

In [2]:
DATA = pd.read_csv("/Users/leo/Programming/PLR/Leo/data/dataset_5.csv").drop(columns=["Unnamed: 0"])

In [3]:
# Search space

# Define thresholds and t-SNE parameters
skew_thresholds = [0.7, 0.8, 0.9, 1.0]
corr_thresholds = [0.5, 0.6, 0.7, 0.8]
tsne_params = [
    {'n_components': n, 'perplexity': p, 'learning_rate': lr}
    for n in [2, 3]
    for p in [30, 40, 50, 60]
    for lr in [200, 500, 1000]
]
hdbscan_params = [{'min_cluster_size': s, 'min_samples': m} for s in [5, 10, 15, 20] for m in [1, 5, 10, 15]]

In [4]:
#Drop skewed features
def drop_skewed_features(df, threshold=0.5):
    """
    Drop features that are skewed towards 0 or 1.
    
    Parameters:
        df (DataFrame): The input DataFrame with binary features.
        threshold (float): The skewness threshold. Features with skewness above this value will be dropped.
        
    Returns:
        DataFrame: A new DataFrame with skewed features removed.
    """
    lines = ["Dropped Skewed Features"]
    is_title = [True]
    
    dropped_features = []
    
    for col in df.columns:
        # Calculate the skewness for each feature
        skewness = df[col].mean()
        
        # Check if the feature is skewed towards 0 or 1
        if skewness > threshold or skewness < (1 - threshold):
            line = f"Dropping {col} with skewness {skewness:.4f}"
            lines.append(line)
            is_title.append(False)
            dropped_features.append(col)
            
    # Drop the skewed features
    df_dropped = df.drop(columns=dropped_features)
    
    return df_dropped


In [5]:
def drop_correlated_features(df, threshold=0.4):
    """
    Drop highly correlated features based on phi coefficient.

    Parameters:
        df (DataFrame): The input DataFrame with binary features.
        threshold (float): The phi coefficient threshold. Pairs of features with a phi coefficient above this value will be considered for dropping.

    Returns:
        DataFrame: A new DataFrame with highly correlated features removed.
    """
    # Initialize an empty dataframe to store phi values
    phi_values = pd.DataFrame(index=df.columns, columns=df.columns)

    # Calculate phi values
    for col1 in df.columns:
        for col2 in df.columns:
            if col1 >= col2:  # Avoid duplicate calculations
                continue
            contingency_table = pd.crosstab(df[col1], df[col2])
            phi = phi_coefficient(contingency_table)
            phi_values.loc[col1, col2] = phi
            phi_values.loc[col2, col1] = phi  # Symmetric matrix

    # Convert to float
    phi_values = phi_values.astype(float)

    # Identify columns to drop
    to_drop = set()
    for col1 in df.columns:
        for col2 in df.columns:
            if col1 == col2 or col1 in to_drop or col2 in to_drop:
                continue
            phi_value = abs(phi_values.loc[col1, col2])
            if phi_value > threshold:
                to_drop.add(col1)  # Choose one column from the pair to drop

    # Drop the highly correlated columns
    df_dropped = df.drop(columns=to_drop)

    return df_dropped


In [6]:
# Function to preprocess data
def preprocess_data(df, skew_threshold, corr_threshold):
    df_unskewed = drop_skewed_features(df, skew_threshold)
    df_drop_corr = drop_correlated_features(df_unskewed, corr_threshold)
    return df_drop_corr

# Modified cluster_and_evaluate function for t-SNE
def cluster_and_evaluate(data, skew_threshold, corr_threshold, tsne_params, hdbscan_params):
    preprocessed_data = preprocess_data(data, skew_threshold, corr_threshold)
    if preprocessed_data.empty:
        print("Preprocessed data is empty. Skipping this iteration.")
        return -1, 0, None, None

    tsne_model = TSNE(**tsne_params, random_state=42)
    try:
        data_tsne = tsne_model.fit_transform(preprocessed_data)
    except ValueError as e:
        print(f"Error during t-SNE transformation: {e}")
        return -1, 0, None, None

    clusterer = hdbscan.HDBSCAN(**hdbscan_params)
    cluster_labels = clusterer.fit_predict(data_tsne)
    
    n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    if 5 <= n_clusters <= 10:
        score = silhouette_score(data_tsne, cluster_labels)
        return score, n_clusters, preprocessed_data, data_tsne
    else:
        return -1, n_clusters, None, None

# Grid search with t-SNE parameters
best_score = -1
best_params = None
best_cluster_number = None
best_preprocessed_data = None
best_data_tsne = None

for skew_threshold, corr_threshold in itertools.product(skew_thresholds, corr_thresholds):
    for tsne_param_values, hdbscan_param_values in itertools.product(tsne_params, hdbscan_params):
        score, n_clusters, preprocessed_data, data_tsne = cluster_and_evaluate(DATA, skew_threshold, corr_threshold, tsne_param_values, hdbscan_param_values)
        print(f"Silhouette Score: {score:.4f}, Number of Clusters: {n_clusters}, Skew Threshold: {skew_threshold}, Correlation Threshold: {corr_threshold}, t-SNE Parameters: {tsne_param_values}, HDBSCAN Parameters: {hdbscan_param_values}")
        if score > best_score:
            best_score = score
            best_params = (skew_threshold, corr_threshold, tsne_param_values, hdbscan_param_values)
            best_cluster_number = n_clusters
            best_preprocessed_data = preprocessed_data
            best_data_tsne = data_tsne

# Output the best parameters and score
print(f"Best Silhouette Score: {best_score}")
print(f"Best Skew Threshold: {best_params[0]}")
print(f"Best Correlation Threshold: {best_params[1]}")
print(f"Best t-SNE Parameters: {best_params[2]}")
print(f"Best HDBSCAN Parameters: {best_params[3]}")
print(f"Number of Clusters: {best_cluster_number}")


Silhouette Score: -1.0000, Number of Clusters: 595, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 1}
Silhouette Score: -1.0000, Number of Clusters: 259, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 5}
Silhouette Score: -1.0000, Number of Clusters: 2, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 10}
Silhouette Score: -1.0000, Number of Clusters: 3, Skew Threshold: 0.7, Correlation Threshold: 0.5, t-SNE Parameters: {'n_components': 2, 'perplexity': 30, 'learning_rate': 200}, HDBSCAN Parameters: {'min_cluster_size': 5, 'min_samples': 15}
Silhouette Score: -1.0000, Number 

In [None]:
import pandas as pd

# Load the groupings
groupings_path = '/Users/leo/Programming/PLR/Leo/data/symptom_groups.csv'
symptom_groups = pd.read_csv(groupings_path)

# Create a dictionary from the symptom groups
group_dict = dict(zip(symptom_groups['symptom'], symptom_groups['group']))

def aggregate_columns(data, group_dict):
    aggregated_data = pd.DataFrame()

    # Iterate over each column in the DataFrame
    for col in data.columns:
        if col.startswith('Grouped'):
            # Keep 'Grouped' columns as they are
            aggregated_data[col] = data[col]
        elif col in group_dict:
            # Aggregate columns based on their group
            group_name = group_dict[col]
            if group_name not in aggregated_data:
                aggregated_data[group_name] = data[col]
            else:
                aggregated_data[group_name] += data[col]
        else:
            # For columns not in the group_dict, add them as is
            aggregated_data[col] = data[col]

    # Compute the average for each group
    for group in set(group_dict.values()):
        if group in aggregated_data:
            aggregated_data[group] /= len([col for col in group_dict if group_dict[col] == group])

    return aggregated_data

# Aggregate the columns
dataset_aggregated = aggregate_columns(dataset_4_drop_corr, group_dict)

dataset_aggregated

NameError: name 'dataset_4_drop_corr' is not defined

In [None]:
import seaborn as sns

def plot_cluster_averages(data, cluster_labels):
    cluster_averages = pd.DataFrame()

    for label in np.unique(cluster_labels):
        if label == -1:  # Skip noise points
            continue

        cluster_data = data[data['cluster'] == label]

        # Exclude 'Grouped' columns
        filtered_data = cluster_data[[col for col in cluster_data.columns if not col.startswith('Grouped')]]

        # Exclude the 'Cluster' column
        filtered_data = filtered_data.drop('cluster', axis=1)

        cluster_avg = filtered_data.mean()
        cluster_averages[f'Cluster {label}'] = cluster_avg

    # Plotting the heatmap
    plt.figure(figsize=(len(cluster_averages.columns) * 2, len(cluster_averages.index) * 0.5))
    sns.heatmap(cluster_averages, annot=True, fmt=".2f", cmap="YlGnBu")
    plt.title("Average Feature Values per Cluster (Excluding 'Grouped')")
    plt.ylabel("Feature")
    plt.xlabel("Cluster")
    plt.show()

# Example usage of the function
plot_cluster_averages(dataset_aggregated, dataset_4_drop_corr['cluster'])

NameError: name 'dataset_aggregated' is not defined

In [None]:
import pandas as pd
import numpy as np

def cluster_averages_table(data, cluster_labels):
    unique_labels = np.unique(cluster_labels)
    unique_labels = unique_labels[unique_labels != -1]  # Exclude noise points

    # Prepare a DataFrame to store the averages
    cluster_averages = pd.DataFrame()

    for label in unique_labels:
        # Filter the data for each cluster
        cluster_data = data[data['cluster'] == label]

        # Filter out 'Grouped' columns and the 'cluster' column
        filtered_columns = [col for col in cluster_data.columns if not col.startswith('Grouped') and col != 'cluster']
        cluster_data = cluster_data[filtered_columns]

        # Calculate the mean and add it to the DataFrame
        cluster_avg = cluster_data.mean()
        cluster_averages[f'Cluster {label}'] = cluster_avg

    # Transpose the DataFrame for better readability
    cluster_averages = cluster_averages.transpose()

    # Optionally, sort the columns if needed
    # cluster_averages = cluster_averages.sort_values(by='some_column', ascending=False)

    return cluster_averages

# Example usage of the function
cluster_avg_table = cluster_averages_table(dataset_aggregated, dataset_4_drop_corr['cluster'])
print(cluster_avg_table)


               Memory  Cognitive  Auditory  Headaches  Olfaction   Sensory  \
Cluster 0    0.152000   0.178000  0.097778   0.438000   0.072500  0.180000   
Cluster 1    0.120000   0.150000  0.111111   0.470000   0.137500  0.155556   
Cluster 2    0.007407   0.025926  0.049383   0.070370   0.046296  0.255144   
Cluster 3    0.097436   0.146154  0.074074   0.397436   0.073718  0.170940   
Cluster 4    0.111111   0.151852  0.086420   0.307407   0.055556  0.213992   
...               ...        ...       ...        ...        ...       ...   
Cluster 107  0.043478   0.134783  0.048309   0.113043   0.054348  0.115942   
Cluster 108  0.028571   0.028571  0.010582   0.064286   0.062500  0.074074   
Cluster 109  0.000000   0.013636  0.015152   0.109091   0.056818  0.050505   
Cluster 110  0.018182   0.018182  0.010101   0.136364   0.090909  0.060606   
Cluster 111  0.009677   0.011290  0.012545   0.035484   0.024194  0.041219   

                Sleep  Temperature     Motor   Cardiac  ...  \


  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg
  cluster_averages[f'Cluster {label}'] = cluster_avg


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_cluster_averages(data, cluster_labels):
    unique_labels = np.unique(cluster_labels)
    unique_labels = unique_labels[unique_labels != -1]  # Exclude noise points

    num_clusters = len(unique_labels)
    num_columns = 3  # Set the number of columns for the subplot grid
    num_rows = np.ceil(num_clusters / num_columns).astype(int)  # Calculate the number of rows needed

    plt.figure(figsize=(25, num_rows * 4))  # Adjust figure size as needed

    for i, label in enumerate(unique_labels):
        # Filter out 'Grouped' columns and the 'cluster' column
        filtered_columns = [col for col in data.columns if not col.startswith('Grouped') and col != 'cluster']
        cluster_data = data[data['cluster'] == label][filtered_columns]

        cluster_avg = cluster_data.mean().sort_values(ascending=False)
        cluster_averages = pd.DataFrame(cluster_avg).transpose()

        # Create a subplot for each cluster in a grid of num_rows x num_columns
        plt.subplot(num_rows, num_columns, i + 1)
        sns.heatmap(cluster_averages, annot=True, fmt=".2f", cmap="YlGnBu")
        plt.title(f"Cluster {label} Feature Values (Sorted)")
        plt.ylabel("Feature")
        plt.xlabel("Average Value")

    plt.tight_layout()
    plt.show()

# Example usage of the function
plot_cluster_averages(dataset_aggregated, dataset_4_drop_corr['cluster'])


NameError: name 'dataset_aggregated' is not defined