In [None]:
import os
import scanpy as sc
# Scanpy-wide figure defaults: 100 dpi for figures drawn in the notebook,
# 1000 dpi for figures written to disk, with vector_friendly=True.
sc.set_figure_params(dpi=100, dpi_save=1000, vector_friendly=True)
import matplotlib

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
# sns.set(style="ticks")

In [None]:
# Load the AnnData object from the Xenium output directory.
# The path is relative, so it resolves against the notebook's working directory.
adata = sc.read("../xenium_outs/adata_polygon_reduced_pseudotime_X26updated_with_quadrants.h5ad")

In [None]:
# Feature matrix used for the KMeans model selection below: the PCA embedding
# already stored in the loaded file (one row per observation).
X = adata.obsm["X_pca"]

In [None]:
# Use seaborn's "ticks" style for the matplotlib figures that follow.
# NOTE(review): sns.set also resets other matplotlib rcParams (palette, font),
# which may override the scanpy figure settings made earlier — confirm intended.
sns.set(style="ticks")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer

# Choose the number of KMeans clusters for X (defined in an earlier cell):
#   1. manual sweep over k recording inertia and silhouette score,
#   2. yellowbrick's elbow visualizer as a cross-check,
#   3. final fit with the chosen k and a 2-D scatter of the result.

# One set of KMeans settings shared by every fit in this cell, so the manual
# sweep, the yellowbrick cross-check and the final model are comparable and do
# not depend on the scikit-learn version's default for n_init.
kmeans_kwargs = dict(random_state=42, n_init=10)

inertia = []
silhouette_scores = []
k_range = range(2, 15)

for k in k_range:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

    # The silhouette score needs at least two clusters; k_range starts at 2,
    # so it can be computed for every k without a guard.
    labels = kmeans.labels_
    silhouette_scores.append(silhouette_score(X, labels))

# Both diagnostics share one 1x2 figure. The panels are added one at a time so
# that elbow.pdf is written while only the left panel exists.
fig = plt.figure(figsize=(12, 5))

# Plot Elbow Method
ax_elbow = fig.add_subplot(1, 2, 1)
ax_elbow.plot(k_range, inertia, 'bo-')
ax_elbow.set_xlabel('Number of clusters')
ax_elbow.set_ylabel('Inertia (WCSS)')
ax_elbow.set_title('Elbow Method')
ax_elbow.grid(False)
fig.savefig("elbow.pdf", bbox_inches="tight")

# Plot Silhouette Scores
ax_silhouette = fig.add_subplot(1, 2, 2)
ax_silhouette.plot(k_range, silhouette_scores, 'ro-')
ax_silhouette.set_xlabel('Number of clusters')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Analysis')
ax_silhouette.grid(False)
fig.tight_layout()
# NOTE: this file contains both panels (elbow + silhouette), not the silhouette alone.
fig.savefig("silhouette.pdf", bbox_inches="tight")
plt.show()

# Cross-check with yellowbrick over the same k values as the manual sweep
# (a (start, stop) tuple is treated like range(start, stop)).
visualizer = KElbowVisualizer(KMeans(**kmeans_kwargs), k=(k_range.start, k_range.stop))
visualizer.fit(X)
visualizer.finalize()
plt.savefig("elbow_yellowbrick.pdf", bbox_inches="tight")

# Number of clusters chosen after inspecting the diagnostics above.
optimal_k = 5
kmeans_final = KMeans(n_clusters=optimal_k, **kmeans_kwargs)
cluster_labels = kmeans_final.fit_predict(X)

for i in range(optimal_k):
    print(f"Cluster {i}: {np.sum(cluster_labels == i)} samples")

# Project X onto two components purely for visualisation of the clusters.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(10, 8))
for i in range(optimal_k):
    in_cluster = cluster_labels == i
    plt.scatter(X_pca[in_cluster, 0], X_pca[in_cluster, 1], label=f'Cluster {i}')
plt.legend()
plt.title(f'K-means Clustering with {optimal_k} clusters (PCA visualization)')
plt.show()

## Redo KMeans on unique polygons only (observations whose index contains no comma)

In [None]:
import os

import matplotlib
import matplotlib.pyplot as plt
import scanpy as sc
from matplotlib import font_manager

# Scanpy-wide figure defaults (screen dpi, saved-file dpi, vector-friendly output).
sc.set_figure_params(dpi=100, dpi_save=1000, vector_friendly=True)

# Register every font file found under the project font directory with
# matplotlib, then make Arial the default font family for all figures.
font_dirs = "/data/projects/robin/fonts"
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
for font_path in font_files:
    font_manager.fontManager.addfont(font_path)

plt.rcParams['font.family'] = "Arial"

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
# sns.set(style="ticks")

In [None]:
# Load the AnnData object from the Xenium output directory (path relative to
# the notebook's working directory). Presumably reloaded here so this section
# starts from the object as stored on disk — confirm.
adata = sc.read("../xenium_outs/adata_polygon_reduced_pseudotime_X26updated_with_quadrants.h5ad")

In [None]:
# Keep only the observations whose index contains no comma (the "unique"
# polygons). Subsetting an AnnData returns a view of the parent; the following
# cells add columns and run preprocessing on this object, so take an explicit
# copy up front instead of relying on AnnData's implicit view-to-copy
# conversion (which emits ImplicitModificationWarning).
adata_unique = adata[~adata.obs.index.str.contains(",")].copy()

In [None]:
# Sample_ID is the text after the last underscore of each observation name.
adata_unique.obs["Sample_ID"] = adata_unique.obs.index.str.split("_").str[-1].tolist()

In [None]:
# Select the 100 most variable genes among the unique polygons.
# NOTE(review): scanpy's default HVG flavor expects log-normalised values —
# confirm that adata_unique.X is on that scale.
sc.pp.highly_variable_genes(adata_unique, n_top_genes=100)
# PCA restricted to the highly variable genes. The embedding is read back from
# obsm["X_pca"] by the KMeans cell below; presumably this overwrites the
# X_pca that came with the loaded file — confirm that is intended.
# NOTE(review): `use_highly_variable` is deprecated in recent scanpy releases
# in favour of `mask_var`; check the installed version before changing it.
sc.tl.pca(adata_unique, use_highly_variable=True)
# Neighbour graph computed on the PCA representation.
sc.pp.neighbors(adata_unique, use_rep="X_pca")

In [None]:
from sklearn.cluster import KMeans

# Fit a 5-cluster KMeans model (fixed seed, 10 initialisations) on the PCA
# embedding of the unique polygons and store each observation's label in obs.
clf = KMeans(n_clusters=5, n_init=10, random_state=42)
clf.fit(adata_unique.obsm["X_pca"])
adata_unique.obs["KMeans_cluster"] = clf.labels_

In [None]:
# Sample-by-cluster presence table: 1 if the sample has at least one polygon
# in the cluster, 0 otherwise (counts above 1 are capped at 1).
tab = pd.crosstab(
    adata_unique.obs["Sample_ID"],
    adata_unique.obs["KMeans_cluster"],
).clip(upper=1)
# Number of distinct samples represented in each cluster.
tab.sum(axis=0)

In [None]:
# Dimensions of the presence table: (number of Sample_IDs, number of KMeans clusters).
tab.shape

In [None]:
# For every cluster, show how many of its polygons come from each sample.
cluster_column = adata_unique.obs["KMeans_cluster"]
for cluster in sorted(cluster_column.unique()):
    print(cluster)
    sub = adata_unique[cluster_column == cluster]
    sample_counts = sub.obs["Sample_ID"].value_counts()
    display(pd.DataFrame(sample_counts))

In [None]:
# Contingency table: Disease (rows) against KMeans cluster (columns), counting polygons.
pd.crosstab(adata_unique.obs["Disease"], adata_unique.obs["KMeans_cluster"])

In [None]:
# Store the cluster labels as a categorical column instead of plain values.
adata_unique.obs["KMeans_cluster"] = adata_unique.obs["KMeans_cluster"].astype("category")

In [None]:
# Rename the raw KMeans labels to C1..C4. Labels "4" and "2" are deliberately
# merged into C1, so five KMeans clusters become four named groups.
# The label numbers are specific to the KMeans fit above (fixed seed); revisit
# this mapping if the clustering inputs or parameters change.
cluster_names = {"4": "C1",
                 "2": "C1",
                 "1": "C2",
                 "0": "C3",
                 "3": "C4"}
cats = ["C1", "C2", "C3", "C4"]

# Rename on plain strings and assign the result back to the column. The
# previous in-place replace on a Series pulled out of the DataFrame is not
# propagated under pandas Copy-on-Write, and replace on a categorical that
# changes its categories is deprecated. Values that are already C1..C4 pass
# through unchanged, so the cell can be re-run safely.
renamed = adata_unique.obs["KMeans_cluster"].astype(str).replace(cluster_names)

# reorder_categories raises if the labels present are not exactly `cats`,
# which guards against an unexpected or missing cluster label.
adata_unique.obs["KMeans_cluster"] = renamed.astype("category").cat.reorder_categories(cats)

In [None]:
# PCA scatter of the unique polygons, one panel coloured by cluster and one by
# Disease (titled "Cluster" and "Condition"). show=False so the figure is not
# shown before it is saved.
sc.pl.pca(adata_unique, color=["KMeans_cluster", "Disease"], title=["Cluster", "Condition"], show=False)
# Write the current matplotlib figure to PDF with a tight bounding box.
plt.savefig("pcs_cluster_conditions.pdf", bbox_inches="tight")

In [None]:
# Save the unique-polygon object, including the Sample_ID and KMeans_cluster
# columns added above, under a new filename next to the input file.
adata_unique.write("../xenium_outs/adata_polygon_reduced_pseudotime_X26updated_with_quadrants_with_clusters.h5ad")