In [None]:

# Number of participants in each subphenotype (from ANDIS).
# Key order (SIDD, SIRD, MOD, MARD) is significant: the values are later
# extracted positionally via n.values() and paired with the per-group arrays.
n = dict(SIDD=1575, SIRD=1373, MOD=1942, MARD=3513)

# Pooled mean and SD calculation
def pooled_mean_sd(means, sds, sample_sizes):
    """Combine per-group summary statistics into one overall mean and SD.

    The overall sum of squares is rebuilt from two parts: the spread inside
    each group, ``(n_i - 1) * sd_i**2``, and the spread of the group means
    around the overall mean, ``n_i * (mean_i - pooled_mean)**2``. Dividing by
    ``N - 1`` yields the sample SD of the merged groups.

    Parameters
    ----------
    means, sds, sample_sizes : array-like
        One entry per group, all in the same group order. Lists, tuples and
        numpy arrays are accepted.

    Returns
    -------
    tuple
        ``(pooled_mean, pooled_sd)``.
    """
    # Coerce up front so plain Python sequences work with the element-wise
    # arithmetic below (list * list would otherwise raise TypeError).
    means = np.asarray(means, dtype=float)
    sds = np.asarray(sds, dtype=float)
    sample_sizes = np.asarray(sample_sizes, dtype=float)

    total_n = np.sum(sample_sizes)
    pooled_mean = np.sum(sample_sizes * means) / total_n
    variance_within = np.sum((sample_sizes - 1) * sds**2)
    variance_between = np.sum(sample_sizes * (means - pooled_mean)**2)
    pooled_sd = np.sqrt((variance_within + variance_between) / (total_n - 1))
    return pooled_mean, pooled_sd

# ANDIS means and SDs for each variable (from the provided image)
# Each array holds one value per subphenotype. They are later passed to
# pooled_mean_sd together with the values of `n`, so the element order must
# follow n's key order: SIDD, SIRD, MOD, MARD.
hba1c_means = np.array([101.85, 54.07, 57.70, 50.08])  # mmol/mol
hba1c_sds = np.array([19.26, 15.46, 16.07, 9.85])      # mmol/mol
bmi_means = np.array([28.86, 33.85, 35.71, 27.94])      # kg/m²
bmi_sds = np.array([4.77, 5.24, 5.43, 3.44])
age_means = np.array([56.74, 65.25, 48.96, 67.37])      # years
# NOTE(review): the third age SD below (5.43) is identical to the third BMI SD
# above — possible transcription slip; confirm against the source table.
age_sds = np.array([11.14, 9.34, 5.43, 8.55])
homa2b_means = np.array([47.64, 150.47, 95.03, 86.59])
homa2b_sds = np.array([28.93, 47.20, 32.45, 26.37])
homa2ir_means = np.array([3.18, 5.54, 3.35, 2.55])
homa2ir_sds = np.array([1.73, 2.74, 1.21, 0.84])

# Convert HbA1c to % from mmol/mol
def convert_hba1c(mmol):
    """Convert HbA1c from IFCC units (mmol/mol) to NGSP units (%).

    Uses the linear relationship ``mmol/mol = 10.929 * (% - 2.15)``, i.e.
    ``% = mmol/mol / 10.929 + 2.15``. The 2.15 offset is expressed in
    percentage points, so it must be added after dividing by the slope
    (48 mmol/mol corresponds to roughly 6.5 %).

    Works element-wise on scalars and numpy arrays.
    """
    return mmol / 10.929 + 2.15

# Express the HbA1c summaries in % before pooling. The SDs are only divided by
# the slope of the conversion.
hba1c_sds_percent = hba1c_sds / 10.929
hba1c_means_percent = convert_hba1c(hba1c_means)

# Group sizes in the key order of `n`, built once and shared by every call below
group_sizes = np.array(list(n.values()))

# Calculate the pooled means and SDs for each variable
pooled_mean_hba1c, pooled_sd_hba1c = pooled_mean_sd(
    hba1c_means_percent, hba1c_sds_percent, group_sizes
)
pooled_mean_bmi, pooled_sd_bmi = pooled_mean_sd(bmi_means, bmi_sds, group_sizes)
pooled_mean_age, pooled_sd_age = pooled_mean_sd(age_means, age_sds, group_sizes)
pooled_mean_homa2b, pooled_sd_homa2b = pooled_mean_sd(
    homa2b_means, homa2b_sds, group_sizes
)
pooled_mean_homa2ir, pooled_sd_homa2ir = pooled_mean_sd(
    homa2ir_means, homa2ir_sds, group_sizes
)

# Report the pooled mean and SD of every variable
print(f"Pooled mean and SD for HbA1c (%): {pooled_mean_hba1c}, {pooled_sd_hba1c}")
print(f"Pooled mean and SD for BMI: {pooled_mean_bmi}, {pooled_sd_bmi}")
print(f"Pooled mean and SD for Age: {pooled_mean_age}, {pooled_sd_age}")
print(f"Pooled mean and SD for HOMA2-B: {pooled_mean_homa2b}, {pooled_sd_homa2b}")
print(f"Pooled mean and SD for HOMA2-IR: {pooled_mean_homa2ir}, {pooled_sd_homa2ir}")

# Standardize the dataset using pooled means and SDs
def standardize_data(df, pooled_means, pooled_sds, var_names):
    """Return a copy of ``df`` with the listed columns turned into z-scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    pooled_means, pooled_sds : sequence
        Reference mean and SD for each column, indexed by the column's
        position in ``var_names``.
    var_names : sequence
        Names of the columns to rescale. Columns not listed are copied as-is.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where each listed column holds ``(value - mean) / sd``.
    """
    scaled = df.copy()
    for position, column in enumerate(var_names):
        centre = pooled_means[position]
        spread = pooled_sds[position]
        scaled[column] = (df[column] - centre) / spread
    return scaled

# Reference means and SDs, ordered HbA1c, BMI, age, HOMA2-B, HOMA2-IR.
# standardize_data pairs them with the columns by position, so this assumes
# var_5 lists its columns in the same order — TODO confirm.
pooled_means = [pooled_mean_hba1c, pooled_mean_bmi, pooled_mean_age, pooled_mean_homa2b, pooled_mean_homa2ir]
pooled_sds = [pooled_sd_hba1c, pooled_sd_bmi, pooled_sd_age, pooled_sd_homa2b, pooled_sd_homa2ir]


# apply the standardization to the dataset
analytic_dataset_standardized = standardize_data(analytic_dataset, pooled_means, pooled_sds, var_5)

# Split dataset into female and male.
# .copy() gives each subset its own data: a column is added to these frames
# later, and assigning into a filtered slice raises SettingWithCopyWarning.
data_female_standardized = analytic_dataset_standardized[analytic_dataset_standardized['female'] == 1].copy()
data_male_standardized = analytic_dataset_standardized[analytic_dataset_standardized['female'] == 0].copy()

# ANDIS centroids (for female and male); one row per subphenotype, one column per variable
published_centroids_female = np.array([
    [1.8702613, -0.2415449, -0.1929637, -0.97446899, 0.056469],  # SIDD
    [-0.254848, 0.5189057, 0.3214557, 1.35581907, 1.1801933],   # SIRD
    [-0.3003478, 0.6683606, -0.9388278, -0.03556857, -0.1405151],  # MOD
    [-0.4582762, -0.5854255, 0.5980858, -0.14552652, -0.4254893]  # MARD
])
published_centroids_male = np.array([
    [1.52185804, -0.4284673, -0.4017103, -0.98397328, -0.1630751],  # SIDD
    [-0.39080167, 0.5396294, 0.4235841, 1.29059153, 1.1801031],   # SIRD
    [-0.06915764, 1.0305317, -1.0157681, 0.15742215, 0.1343923],  # MOD
    [-0.5367578, -0.4776681, 0.5031031, -0.09004338, -0.4233873]  # MARD
])

# The centroid coordinates above contain negative values for every variable,
# so they cannot be in raw units (HbA1c, BMI and age are never negative); they
# are already on a standardized scale. Subtracting the raw-unit pooled means
# and dividing by the pooled SDs a second time would push all four centroids
# far outside the range of the standardized data and squeeze them together,
# so the centroids are used unchanged. The "_standardized" names are kept
# because the distance calculation that follows refers to them.
published_centroids_female_standardized = published_centroids_female.copy()
published_centroids_male_standardized = published_centroids_male.copy()

# Calculate RMSE between each case and the standardized centroids
def calculate_rmse(X, centroids):
    """Return the root-mean-square difference between every row and every centroid.

    Parameters
    ----------
    X : numpy.ndarray
        Observations, one row per case and one column per variable.
    centroids : numpy.ndarray
        Centroids, one row per centroid, with the same columns as ``X``.

    Returns
    -------
    numpy.ndarray
        Matrix whose entry ``[i, j]`` is the RMSE between case ``i`` and
        centroid ``j``.
    """
    # Insert a middle axis so each case broadcasts against all centroids.
    deltas = X[:, None] - centroids
    mean_squared = np.square(deltas).mean(axis=2)
    return np.sqrt(mean_squared)

# Calculate RMSE and assign clusters based on RMSE for females.
# NOTE(review): np.argmin returns the position of the first NaN, so a row with
# a missing value in var_5 would silently receive cluster 0 — confirm the data
# contain no missing values in these columns.
rmse_female = calculate_rmse(data_female_standardized[var_5].values, published_centroids_female_standardized)
cluster_assignments_female = np.argmin(rmse_female, axis=1)

# Calculate RMSE and assign clusters based on RMSE for males
rmse_male = calculate_rmse(data_male_standardized[var_5].values, published_centroids_male_standardized)
cluster_assignments_male = np.argmin(rmse_male, axis=1)

# Attach the cluster assignments. assign() builds a new DataFrame instead of
# writing into a frame that was produced by filtering another one, which
# avoids pandas' SettingWithCopyWarning.
data_female_standardized = data_female_standardized.assign(published_cluster=cluster_assignments_female)
data_male_standardized = data_male_standardized.assign(published_cluster=cluster_assignments_male)

# Combine female and male datasets back into one DataFrame
data_combined = pd.concat([data_female_standardized, data_male_standardized])

# Output the combined dataset with assigned clusters
data_combined.head()

# Relabel the cluster labels: the integer codes are centroid row positions,
# and the centroid rows are ordered SIDD, SIRD, MOD, MARD.
data_combined['published_cluster'] = data_combined['published_cluster'].map({0: 'SIDD', 1: 'SIRD', 2: 'MOD', 3: 'MARD'})

# check the sample size by the published cluster
data_combined['published_cluster'].value_counts()

In [None]:
##### Kmeans clustering by gender #####

# Fit a separate 4-cluster k-means model for each sex (random_state=57 for both);
# fit() returns the fitted estimator, so construction and fitting are chained.
kmeans_female = KMeans(n_clusters=4, random_state=57).fit(X_female_normalized)
kmeans_male = KMeans(n_clusters=4, random_state=57).fit(X_male_normalized)

# Store each case's k-means label next to its data
data_female['kmeans_cluster'] = kmeans_female.labels_
data_male['kmeans_cluster'] = kmeans_male.labels_

# Fitted cluster centres
kmeans_centroids_female = kmeans_female.cluster_centers_
kmeans_centroids_male = kmeans_male.cluster_centers_

# Show the fitted centres alongside the published ones, females first
print(f"K-means Centroids (female):\n{kmeans_centroids_female}")
print(f"Published Centroids (female):\n{published_centroids_female}")
print(f"K-means Centroids (male):\n{kmeans_centroids_male}")
print(f"Published Centroids (male):\n{published_centroids_male}")

# Stack the two sexes back into a single frame
data_combined = pd.concat([data_female, data_male])

# Mean of each var_5 variable within every k-means cluster
data_combined[var_5].groupby(data_combined['kmeans_cluster']).mean()



In [None]:
############## Scale Method 1: Z-score normalization by our study sample and centroids normalization by our sample ##############

# Convert HbA1c in your dataset from % to mmol/mol
def convert_hba1c_to_mmol(hba1c_percent):
    """Convert HbA1c from NGSP units (%) to IFCC units (mmol/mol).

    Uses ``mmol/mol = 10.929 * (% - 2.15)``. The 2.15 offset is in percentage
    points and has to be removed before scaling by the slope (6.5 %
    corresponds to roughly 48 mmol/mol).

    Works element-wise on scalars, numpy arrays and pandas Series.
    """
    return 10.929 * (hba1c_percent - 2.15)

# Convert HbA1c in your dataset for females and males
data_female['hba1c_mmol'] = convert_hba1c_to_mmol(data_female['hba1c'])
data_male['hba1c_mmol'] = convert_hba1c_to_mmol(data_male['hba1c'])

# Update the variable list to include HbA1c in mmol/mol
var_5_mmol = ['hba1c_mmol', 'bmi', 'dmagediag', 'homa2b', 'homa2ir']

# Standardize the dataset using HbA1c in mmol/mol and the rest of the variables
# (a separate scaler is fitted for each sex)
scaler_female_mmol = StandardScaler()
X_female_normalized_mmol = scaler_female_mmol.fit_transform(data_female[var_5_mmol])

scaler_male_mmol = StandardScaler()
X_male_normalized_mmol = scaler_male_mmol.fit_transform(data_male[var_5_mmol])

# Get means and SDs for each gender's dataset (for HbA1c in mmol/mol and the other variables)
female_means_mmol = scaler_female_mmol.mean_
female_sds_mmol = np.sqrt(scaler_female_mmol.var_)

male_means_mmol = scaler_male_mmol.mean_
male_sds_mmol = np.sqrt(scaler_male_mmol.var_)

# The published centroids are already on a standardized scale: their
# coordinates are negative for variables that are never negative in raw units.
# Subtracting the raw-unit sample means and dividing by the sample SDs would
# standardize them a second time and move every centroid far away from the
# z-scored cases, so — despite what the heading of this cell says — they are
# compared with the z-scored data unchanged. The "_standardized" names are
# kept for the distance calculation below.
# NOTE(review): if the intent is to re-express the centroids on this sample's
# scale, they first need to be mapped back to raw units with the reference
# cohort's means and SDs — confirm the intended method.
published_centroids_female_standardized = published_centroids_female.copy()
published_centroids_male_standardized = published_centroids_male.copy()

# Calculate Euclidean distance and assign clusters for females (standardized centroids)
distances_female_standardized = cdist(X_female_normalized_mmol, published_centroids_female_standardized, metric='euclidean')
cluster_assignments_female_standardized = np.argmin(distances_female_standardized, axis=1)

# Calculate Euclidean distance and assign clusters for males (standardized centroids)
distances_male_standardized = cdist(X_male_normalized_mmol, published_centroids_male_standardized, metric='euclidean')
cluster_assignments_male_standardized = np.argmin(distances_male_standardized, axis=1)

# Add cluster assignments back to the original dataset
data_female['published_cluster_standardized'] = cluster_assignments_female_standardized
data_male['published_cluster_standardized'] = cluster_assignments_male_standardized

# Combine female and male datasets back into one DataFrame
data_combined_standardized = pd.concat([data_female, data_male])

# Output the combined dataset with standardized centroids and assigned clusters
data_combined_standardized.head()

# Relabel the cluster labels for the standardized method: the integer codes
# are centroid row positions, ordered SIDD, SIRD, MOD, MARD.
data_combined_standardized['published_cluster_standardized'] = data_combined_standardized['published_cluster_standardized'].replace({0:'SIDD', 1:'SIRD', 2:'MOD', 3:'MARD'})

# Check the cluster sample size for the standardized published clusters
data_combined_standardized['published_cluster_standardized'].value_counts()

