# 🧩 Notebook 3: Clustering and Subtype Discovery

This notebook applies three clustering algorithms (K-Means, hierarchical clustering, and DBSCAN) to PCA-reduced gene expression data in order to discover candidate cancer subtypes.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Load previous results
def _read_optional_csv(path):
    """Return the CSV at ``path`` as a DataFrame, or None when the file is absent."""
    return pd.read_csv(path) if os.path.exists(path) else None


# Optional embeddings written by earlier steps; each is None when its file has
# not been generated, so the notebook still runs from a clean results folder.
tsne_df = _read_optional_csv('../results/tables/tsne_result.csv')
umap_df = _read_optional_csv('../results/tables/umap_result.csv')

# pca_result.csv is intentionally not loaded: the PCA embedding is recomputed
# below from the expression matrix, so a cached copy would only be overwritten.
expression_data = pd.read_csv('../data/processed/gene_expression_matrix.csv', index_col=0)

# Transpose so each row is one observation for PCA.
# NOTE(review): assumes the file stores genes as rows and samples as columns — TODO confirm.
data_t = expression_data.T

# random_state keeps the projection reproducible when scikit-learn selects a
# randomized SVD solver (which it may do for large matrices).
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(data_t)

# Keep the row labels of data_t as the index so that every cluster label
# assigned later can be traced back to the sample it belongs to.
pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2"], index=data_t.index)


## 🎯 K-Means Clustering

In [None]:
# K-Means on the two principal components.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset makes the labels (and the
# silhouette score) depend on the installed library version.
N_CLUSTERS = 3  # number of clusters to fit; tune as needed
pca_coords = pca_df[["PC1", "PC2"]]

kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
pca_df['KMeans_Label'] = kmeans.fit_predict(pca_coords)

# Silhouette is computed on the same 2-D coordinates the model was fitted on.
score = silhouette_score(pca_coords, pca_df["KMeans_Label"])
print("Silhouette Score (KMeans):", score)

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="KMeans_Label", palette="Set2", ax=ax)
ax.set_title("KMeans Clustering (PCA space)")
ax.grid(True)
plt.show()


## 🪜 Hierarchical Clustering (Dendrogram)

In [None]:
# Build the agglomerative merge tree over the 2-D PCA coordinates using
# Ward linkage, then draw it as a dendrogram.
pc_features = pca_df[["PC1", "PC2"]]
linkage_matrix = linkage(pc_features, method='ward')

# Plot dendrogram
fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(linkage_matrix, ax=ax)
ax.set_title("Hierarchical Clustering Dendrogram")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Distance")
plt.show()


## 🌀 DBSCAN Clustering

In [None]:
# Density-based clustering on the two principal components.
# A label of -1, if present in the legend, is DBSCAN's marker for noise points.
pc_features = pca_df[["PC1", "PC2"]]
db = DBSCAN(eps=1.5, min_samples=3)
dbscan_labels = db.fit_predict(pc_features)
pca_df['DBSCAN_Label'] = dbscan_labels

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    hue="DBSCAN_Label",
    palette="tab10",
    ax=ax,
)
ax.set_title("DBSCAN Clustering (PCA space)")
ax.grid(True)
plt.show()


In [None]:
# Save result
# Persist the PCA coordinates together with the cluster-label columns added by
# the clustering cells. The output directory is created first so the write does
# not fail when the results folder has not been generated yet.
output_path = '../results/tables/pca_with_clusters.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pca_df.to_csv(output_path)
