# Clustering Execution Runner
Run one dataset through preprocessing, clustering evaluation, and basic visuals.

In [1]:
import pandas as pd
import numpy as np
from preprocess import load_volcorr, preprocess_volcorr, load_total_df, preprocess_total_df, load_globalminds, preprocess_globalminds
from cluster_eval import evaluate_all
from visualize import visualize_by_config, cluster_heatmap

## Pick dataset key
Options: `'volcorr'`, `'total_df'`, `'globalminds'`

In [2]:
# Selects which dataset the rest of the notebook runs on; it is used as the
# lookup key into the DATASETS registry in the next cell.
dataset_key = 'total_df'

## Load and preprocess

In [3]:
# Registry of supported datasets. Each entry is
# (loader, preprocessor, name passed to the visualizers, identifier column).
DATASETS = {
    'volcorr': (load_volcorr, preprocess_volcorr, 'volcorr', 'exchange'),
    'total_df': (load_total_df, preprocess_total_df, 'total_df', 'exchange'),
    'globalminds': (load_globalminds, preprocess_globalminds, 'globalminds', 'country'),
}
load_fn, pre_fn, data_name, id_col = DATASETS[dataset_key]

# Load the raw frame, then derive the preprocessed feature frame from it.
df = load_fn()
X_df = pre_fn(df)

# Take row identifiers from the raw frame's id column when it exists;
# otherwise fall back to the index of the preprocessed frame.
if id_col in df.columns:
    ids = df[id_col]
else:
    ids = X_df.index

# Feature matrix as a NumPy array; print its shape as a quick check.
X = X_df.to_numpy()
print(X.shape)


(124, 14)


In [None]:
# Display the preprocessed feature matrix as the cell output.
X

## Cluster evaluation

In [None]:
# Run evaluate_all on the feature matrix X, passing the row identifiers as a
# NumPy array, and unpack its three results.
# NOTE(review): ids is taken from the raw frame while X comes from the
# preprocessed frame; this assumes preprocessing neither drops nor reorders
# rows -- confirm against the preprocess_* functions.
summary, stats_df, all_members = evaluate_all(X, ids.to_numpy())
summary  # last expression in the cell, so the notebook displays it

## Inspect top configurations

In [None]:
# Preview the first rows of stats_df (head() shows five rows by default).
stats_df.head()

## Screen `stats_df`

In [None]:
# Screen the evaluated configurations before plotting.
# Every known dataset keeps the same three methods. 'total_df' and
# 'globalminds' additionally require the 'k' and 'sil' columns to be strictly
# greater than the bounds below; 'volcorr' has no extra thresholds.
# Any other key leaves the copy of stats_df unfiltered.
_kept_methods = ['pca_kmeans', 'tsne_dbscan', 'umap_hdbscan']
_lower_bounds = {
    'total_df': {'k': 4, 'sil': 0.3},
    'globalminds': {'k': 4, 'sil': 0.4},
    'volcorr': {},
}

screened_stats_df = stats_df.copy()

if dataset_key in _lower_bounds:
    # Filter by method first, then apply each threshold in turn to the
    # already-reduced frame (order: 'k', then 'sil').
    method_mask = screened_stats_df['method'].isin(_kept_methods)
    screened_stats_df = screened_stats_df[method_mask]
    for _column, _bound in _lower_bounds[dataset_key].items():
        screened_stats_df = screened_stats_df[screened_stats_df[_column] > _bound]

## Generate visuals

In [None]:
# Generate the per-configuration visuals for the screened configurations.
# The raw frame and the preprocessing function are both handed over, together
# with the dataset name and the identifier column.
visualize_by_config(df, screened_stats_df, pre_fn, data_name, id_col=id_col)

In [None]:
# Call cluster_heatmap on the screened configurations, with the same
# arguments as the visualize_by_config call.
cluster_heatmap(
    df,
    screened_stats_df,
    pre_fn,
    data_name,
    id_col=id_col,
)

Outputs are saved under `plots/<data_name>`.