In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import jaccard_score

def max_jaccard_per_cluster(orig_labels, boot_labels, n_clusters):
    """Return each original cluster's best Jaccard match among bootstrap clusters.

    For every original cluster ``i`` the Jaccard index
    ``|A_i & B_j| / |A_i | B_j|`` is computed against every bootstrap
    cluster ``j`` and the maximum over ``j`` is kept.  A pair whose union is
    empty contributes 0.

    Parameters
    ----------
    orig_labels, boot_labels : array-like of int
        Cluster label of each observation under the original and the
        bootstrap clustering.  Both must be 1-D, of equal length, and hold
        labels in ``range(n_clusters)``.
    n_clusters : int
        Number of clusters in each partition.

    Returns
    -------
    numpy.ndarray of shape (n_clusters,)
        Maximum Jaccard index per original cluster (float).

    Raises
    ------
    ValueError
        If the label arrays differ in shape or contain a label outside
        ``range(n_clusters)``.
    """
    orig = np.asarray(orig_labels, dtype=np.intp)
    boot = np.asarray(boot_labels, dtype=np.intp)
    if orig.shape != boot.shape:
        raise ValueError("orig_labels and boot_labels must have the same shape")
    for labels in (orig, boot):
        if labels.size and (labels.min() < 0 or labels.max() >= n_clusters):
            raise ValueError("cluster labels must lie in range(n_clusters)")

    # Contingency table: overlap[i, j] = #observations in original cluster i
    # and bootstrap cluster j.  One pass replaces n_clusters**2 mask builds.
    overlap = np.bincount(
        orig * n_clusters + boot, minlength=n_clusters ** 2
    ).reshape(n_clusters, n_clusters)
    orig_sizes = overlap.sum(axis=1, keepdims=True)
    boot_sizes = overlap.sum(axis=0, keepdims=True)
    union = orig_sizes + boot_sizes - overlap

    # Pairs with an empty union keep the pre-filled 0 instead of dividing by 0.
    jaccard = np.divide(
        overlap,
        union,
        out=np.zeros(overlap.shape, dtype=float),
        where=union > 0,
    )
    return jaccard.max(axis=1)


# `__name__` is "__main__" both under `python file.py` and inside a
# Jupyter/IPython kernel, so the analysis still runs there; the guard only
# keeps it from running when this code is imported.
if __name__ == "__main__":
    # Load data
    path = "/Users/ksanaka/Library/CloudStorage/OneDrive-Emory/NHANES CKM Cascade/working/new diabetes/knn clusters.csv"
    df = pd.read_csv(path)

    # Variables to use for clustering; rows missing any of them are dropped.
    # reset_index makes index labels equal to row positions, which the
    # positional indexing in the bootstrap loop relies on.
    cluster_vars = ['bmi', 'dm_age', 'glycohemoglobin', 'homa2b', 'homa2ir']
    df = df.dropna(subset=cluster_vars).reset_index(drop=True)

    # Scale the data (scaler is fitted once, on the full sample)
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(df[cluster_vars])

    # Run original KMeans
    n_clusters = 4
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    df['orig_cluster'] = kmeans.fit_predict(data_scaled)
    orig_labels = df['orig_cluster'].to_numpy()

    # For reproducibility of the bootstrap draws
    np.random.seed(42)

    # Run bootstrapping
    n_bootstraps = 100
    jaccard_scores = np.zeros((n_bootstraps, n_clusters))

    for b in range(n_bootstraps):
        # Sample row positions with replacement
        sample_indices = np.random.choice(df.index, size=len(df), replace=True)

        # The bootstrap rows are scaled with the ORIGINAL scaler (no refit),
        # so taking rows of the already-scaled matrix yields the same values
        # as transforming the resampled frame again.
        boot_scaled = data_scaled[sample_indices]

        # Run KMeans on bootstrap sample
        boot_kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        boot_labels = boot_kmeans.fit_predict(boot_scaled)

        # Every drawn position is compared, duplicates included: the former
        # "shared indices" filter matched every drawn position, so it
        # selected all of them.
        jaccard_scores[b] = max_jaccard_per_cluster(
            orig_labels[sample_indices], boot_labels, n_clusters
        )

    # Summarize (np.std default: population standard deviation, ddof=0)
    summary = pd.DataFrame({
        'Cluster': [f'Cluster {i}' for i in range(n_clusters)],
        'Mean_Jaccard': np.mean(jaccard_scores, axis=0),
        'Std_Jaccard': np.std(jaccard_scores, axis=0)
    })

    print("=== Cluster Stability (Jaccard Index) ===")
    print(summary)


  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist

=== Cluster Stability (Jaccard Index) ===
     Cluster  Mean_Jaccard  Std_Jaccard
0  Cluster 0      0.813922     0.122386
1  Cluster 1      0.681733     0.177840
2  Cluster 2      0.932968     0.035562
3  Cluster 3      0.639506     0.199867
