In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from ast import literal_eval
import csv

First, we define a helper that converts each embedding from the string form stored in the CSV into a numeric NumPy vector.

In [None]:
def convert_to_array(embedding_str):
    """Parse an embedding stored as text in the CSV into a 1-D float array.

    Parameters
    ----------
    embedding_str : str
        Bracketed list of numbers, e.g. ``"[0.1 0.2 0.3]"``. Values may be
        separated by spaces, newlines and/or commas, and the string may carry
        leading or trailing whitespace.

    Returns
    -------
    numpy.ndarray
        1-D ``float64`` array with one entry per number in the string.

    Raises
    ------
    ValueError
        If a token between the brackets is not a valid number.
    """
    # Strip surrounding whitespace first so a trailing newline/space cannot
    # leave a stray bracket behind, then drop the brackets themselves.
    inner = embedding_str.strip().strip('[]')
    # Treat commas like whitespace so both "[1 2]" and "[1, 2]" parse.
    tokens = inner.replace(',', ' ').split()
    return np.array(tokens, dtype=float)

# CSV holding the embedding library. Later cells load it with pd.read_csv and
# parse its 'embedding' column with convert_to_array.
# NOTE(review): absolute, user-specific path — presumably only valid on the
# original author's machine; confirm before running elsewhere.
file_path = "/home/imadejski/ctds-search-model/data/mimic/embedding_libraries/mimic_validate_embedding_library.csv"

Then, we run k-means clustering directly on the original (full-dimensional) MIMIC embeddings.

In [None]:
# Load the embedding library and turn every stored string into a numeric vector.
embeddings_df = pd.read_csv(file_path)
embeddings_df['embedding'] = embeddings_df['embedding'].map(convert_to_array)

# Stack the per-row vectors into a single 2-D matrix (one row per sample).
embeddings_stack = np.stack(embeddings_df['embedding'].to_numpy())

# Cluster the original embeddings; random_state is fixed so labels are repeatable.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(embeddings_stack)

# Keep each row's cluster id next to its embedding for the later label analysis.
embeddings_df['cluster'] = kmeans.labels_

Next, we use PCA to reduce the embeddings to two dimensions and plot them, coloring each point by the k-means cluster it was assigned on the original embeddings.

In [None]:
# Load a separate copy of the embedding library for the PCA view.
embeddings_pca_df = pd.read_csv(file_path)
embeddings_pca_df['embedding'] = embeddings_pca_df['embedding'].map(convert_to_array)
embeddings_pca_stack = np.stack(embeddings_pca_df['embedding'].to_numpy())

# Project the embeddings onto their first two principal components.
# (For a 3D view instead: PCA(n_components=3), an axis created with
#  projection='3d', a third scatter coordinate and ax.set_zlabel.)
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings_pca_stack)

# 2D scatter of the projection, colored by the clusters found on the
# original (unreduced) embeddings.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=kmeans.labels_,
    cmap='viridis',
    s=50,
    alpha=0.6,
)
ax.set(
    title='2D Visualization of KMeans Clustering with Original Embeddings',
    xlabel='PCA Component 1',
    ylabel='PCA Component 2',
)

plt.show()

Then we run k-means on the PCA-reduced embeddings themselves and color the same PCA plot by these new clusters.

In [None]:
# Cluster the 2-D PCA projection (rather than the original embeddings).
# The cluster count for this run is n_clusters_pca; it is deliberately the
# value passed to KMeans so changing it here actually changes the clustering.
n_clusters_pca = 10
kmeans_pca = KMeans(n_clusters=n_clusters_pca, random_state=0).fit(reduced_embeddings)

# Store the PCA-space cluster id of every row alongside its embedding.
embeddings_pca_df['cluster'] = kmeans_pca.labels_

# 2D scatter of the PCA projection, colored by the PCA-space cluster label.
# (For a 3D view: create the axis with projection='3d', pass a third
#  coordinate to scatter and set a z-label.)
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=embeddings_pca_df['cluster'],
    cmap='viridis',
    s=50,
    alpha=0.6,
)

ax.set_title('2D Visualization of KMeans Clustering with Reduced Embeddings')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')

plt.show()

In [None]:
# Load the CheXpert label table and attach it to the clustered embeddings.
chexpert_file_path = "/opt/gpudata/mimic-cxr/mimic-cxr-2.0.0-chexpert.csv.gz"
chexpert_df = pd.read_csv(chexpert_file_path)

# Inner join on (subject_id, study_id): embedding rows without a matching
# label row are dropped, so merged_df can be smaller than embeddings_df.
merged_df = pd.merge(embeddings_df, chexpert_df, on=['subject_id', 'study_id'])

label_columns = ["Atelectasis", "Cardiomegaly", "Consolidation", "Edema", "Enlarged Cardiomediastinum", "Fracture", "Lung Lesion",
"Lung Opacity", "No Finding", "Pleural Effusion", "Pleural Other", "Pneumonia", "Pneumothorax", "Support Devices"]

# A label counts as positive only when its value is exactly 1; every other
# value (including missing entries) counts as not positive.
is_positive = merged_df[label_columns].eq(1)

# Per-cluster prevalence = fraction of rows in the cluster with the label == 1.
# groupby returns the clusters sorted by cluster id.
prevalence_df = is_positive.groupby(merged_df['cluster']).mean()

# Add total number of samples in each cluster (aligned on the cluster id).
prevalence_df['Total_Samples'] = merged_df.groupby('cluster').size()

# Sanity-check row: the sample-weighted average of the per-cluster prevalences,
# which is the overall fraction of positive rows for each label. Its
# Total_Samples is the number of merged rows (the sum of all cluster sizes).
weighted_averages = is_positive.mean().to_dict()
weighted_averages['Total_Samples'] = len(merged_df)
weighted_average_row = pd.DataFrame([weighted_averages], index=['Weighted_Avg'])
prevalence_df = pd.concat([prevalence_df, weighted_average_row])

# Name the output after the cluster count actually used for embeddings_df's
# clustering, so the file name cannot disagree with its contents.
prevalence_df.to_csv(f'/home/imadejski/ctds-search-model/data/mimic/label_prevalence/mimic_label_prevalence_{n_clusters}_clusters.csv', index_label='Cluster')


Check that the number of samples in the prevalence table matches the number of rows in the embedding library.

In [None]:
# Rows that went into the prevalence table: sum the per-cluster counts,
# leaving out the 'Weighted_Avg' summary row, whose Total_Samples entry is
# itself a grand total rather than a cluster size.
total_samples_sum = prevalence_df['Total_Samples'].drop('Weighted_Avg').sum()
print("Total samples clustered: ", total_samples_sum)

embeddings_file_path = '/home/imadejski/ctds-search-model/data/mimic/embedding_libraries/mimic_validate_embedding_library.csv'
# Read into its own name: rebinding embeddings_df here would discard its
# 'cluster' column and break the prevalence cell if it is re-run afterwards.
embeddings_check_df = pd.read_csv(embeddings_file_path)
print("Total embeddings: ", embeddings_check_df.shape[0])

In [None]:
# Column-wise mean over every row of prevalence_df. Note that this averages
# the appended 'Weighted_Avg' row together with the per-cluster rows, and also
# covers the 'Total_Samples' column.
print(prevalence_df.mean(axis=0))