In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score

# Input CSV holding the analytic dataset
path = '/Users/krishnasanaka/Library/CloudStorage/OneDrive-Emory/NHANES CKM Cascade/working/new diabetes/knn clusters.csv'

# Columns retained for the analysis; a row is dropped if ANY of them is missing
selected_variables = ['year', 'bmi', 'glycohemoglobin', 'dm_age', 'homa2b', 'homa2ir', 'triglyceride', 'ldl', 'sbp', 'dbp', 'hdl', 'respondentid', 'race', 'gender']

# The five features that are actually fed to the clustering
var_5 = ['bmi', 'glycohemoglobin', 'dm_age', 'homa2b', 'homa2ir']

# Read the file, keep only the selected columns, then keep complete cases
analytic_dataset = pd.read_csv(path)[selected_variables].dropna()

# Distinct values of `year`; each one is held out in turn further down
year_sites = analytic_dataset['year'].unique()

# Standardize the clustering features over the entire (complete-case) dataset
scaler = StandardScaler()
cluster_v5 = scaler.fit_transform(analytic_dataset[var_5])

# Reference clustering on the entire dataset; `fit` returns the estimator
# itself, so the fitted model can be bound in a single statement.
kmeans = KMeans(init="random", n_clusters=4, n_init=10, max_iter=300, random_state=57).fit(cluster_v5)
original_labels = kmeans.labels_

# Leave-one-out analysis: hold out each distinct `year` value in turn,
# re-cluster the remaining rows, and measure agreement with the labels those
# same rows received in the full-data clustering.
results = []

# KMeans settings shared by every leave-one-out fit (kept in one place so the
# runs cannot drift apart).
kmeans_params = dict(init="random", n_clusters=4, n_init=10, max_iter=300, random_state=57)

for year_site in year_sites:
    # Rows that remain once `year_site` is held out. Computed once and reused
    # for the data subset, the held-out sample size and the label subset.
    keep_mask = analytic_dataset['year'] != year_site
    excluded_dataset = analytic_dataset[keep_mask]

    # Number of rows that were held out (those whose `year` equals `year_site`)
    excluded_sample_size = int((~keep_mask).sum())

    # Standardize the remaining rows with a FRESH scaler. Refitting the shared
    # `scaler` here would overwrite its full-dataset mean/variance, leaving it
    # fitted on the last subset once the loop finishes.
    excluded_scaler = StandardScaler()
    excluded_cluster_v5 = excluded_scaler.fit_transform(excluded_dataset[var_5])

    # Re-cluster the remaining rows
    kmeans_excluded = KMeans(**kmeans_params)
    kmeans_excluded.fit(excluded_cluster_v5)
    excluded_labels = kmeans_excluded.labels_

    # Full-data labels restricted to the same remaining rows (same row order)
    original_labels_excluded = original_labels[keep_mask]

    # Adjusted Rand Index (ARI) between the two labelings of the same rows
    ARI = adjusted_rand_score(original_labels_excluded, excluded_labels)

    # Record one row per held-out `year` value
    results.append({'Year Site Removed': year_site, 'Sample Size': excluded_sample_size, 'ARI': ARI})

# Collect the per-iteration dictionaries into a table
results_df = pd.DataFrame(results)

# Turn the held-out identifier into a categorical column and order rows by it
sort_column = 'Year Site Removed'
results_df[sort_column] = results_df[sort_column].astype('category')
results_df = results_df.sort_values(by=sort_column)

# Show the table, then persist it as CSV (without the index column)
print(results_df)
output_path = '/Users/krishnasanaka/Library/CloudStorage/OneDrive-Emory/NHANES CKM Cascade/working/new diabetes/dec_an10a_sensitivity_analysis_results_ari_clean.csv'
results_df.to_csv(output_path, index=False)





  Year Site Removed  Sample Size       ARI
0          19992000          174  0.987238
1          20012002          188  0.946703
2          20032004          185  0.970880
3          20052006          175  0.466276
4          20072008          318  0.485049
5          20092010          318  0.972631
6          20112012          271  0.947596
7          20132014          233  0.943403
8          20152016          294  0.982960
9          20172018          284  0.939378
