# 🧾 Notebook 4: Metadata Integration & Cluster Evaluation

This notebook integrates sample metadata (e.g., subtype labels, clinical traits) and compares them to discovered clusters.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score, confusion_matrix

# Load PCA + cluster labels.
# Column 0 of the CSV becomes the index and is used below as the sample ID;
# later cells read the PC1, PC2 and KMeans_Label columns from this table.
pca_df = pd.read_csv('../results/tables/pca_with_clusters.csv', index_col=0)

# Simulated metadata for demo purposes.
# Real datasets can load metadata using GEOparse or provided clinical files.
#
# The labels cycle through four subtypes and finish with a fixed two-label
# tail. They are generated to match the number of loaded samples, because a
# fixed-length list makes the DataFrame constructor raise ValueError for any
# other sample count. With 50 samples this yields 12 full cycles + the tail.
subtype_cycle = ["LuminalA", "LuminalB", "HER2", "Basal"]
subtype_tail = ["Normal", "LuminalA"]
n_samples = len(pca_df)
n_cycled = max(n_samples - len(subtype_tail), 0)
subtypes = [subtype_cycle[i % len(subtype_cycle)] for i in range(n_cycled)]
# Truncate so tables with fewer samples than the tail still line up.
subtypes = (subtypes + subtype_tail)[:n_samples]

# Chained set_index (rather than inplace=True) keeps this cell idempotent.
metadata = pd.DataFrame({
    "Sample": pca_df.index,
    "Subtype": subtypes,
}).set_index("Sample")

# Merge metadata with cluster labels (join aligns the two tables on index).
merged = pca_df.join(metadata)
merged.head()


## 🔍 Visualize Clusters vs. Known Subtypes

In [None]:
# Scatter the samples in PC1/PC2 space: colour encodes the known subtype,
# marker style encodes the KMeans cluster assignment.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=merged,
    x="PC1",
    y="PC2",
    hue="Subtype",
    style="KMeans_Label",
    palette="Set1",
    ax=ax,
)
ax.set_title("Discovered Clusters vs Known Subtypes")
ax.grid(True)
plt.show()


## 📊 Clustering Evaluation

In [None]:
# Encode subtypes numerically for ARI.
# Each distinct subtype string is mapped to an integer category code.
subtype_codes = merged["Subtype"].astype("category").cat.codes
cluster_labels = merged["KMeans_Label"]

# Adjusted Rand Index between the known subtypes and the KMeans clusters.
ari = adjusted_rand_score(subtype_codes, cluster_labels)
print("Adjusted Rand Index (KMeans vs Subtypes):", round(ari, 3))

# Confusion matrix: first argument (subtype codes) indexes the rows, second
# argument (cluster labels) indexes the columns.
# NOTE(review): cluster IDs are arbitrary, so this is a contingency table of
# subtype vs. cluster rather than a per-class accuracy table; with the default
# `labels`, both axes presumably cover every value seen in either input.
cm = confusion_matrix(subtype_codes, cluster_labels)
# The line break must be written as the escape "\n"; a literal line break
# inside a single-quoted string is a syntax error.
print("Confusion Matrix:\n", cm)


In [None]:
# Persist the merged table (PCA/cluster columns plus Subtype) as CSV.
merged_csv_path = "../results/tables/merged_with_metadata.csv"
merged.to_csv(merged_csv_path)
