# 📉 Notebook 2: Dimensionality Reduction

This notebook performs PCA, t-SNE, and UMAP on the gene expression matrix to reduce dimensionality and visualize sample-level clustering.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

# Read the processed expression matrix from disk; the first CSV column
# is used as the row index of the resulting DataFrame.
expression_data = pd.read_csv(
    '../data/processed/gene_expression_matrix.csv',
    index_col=0,
)

# Swap the axes so that every sample occupies one row (the orientation
# the fit_transform calls in the cells below are given).
data_t = expression_data.transpose()
print("Shape:", data_t.shape)


## 🧪 Principal Component Analysis (PCA)

In [None]:
# Project the samples (rows of data_t) onto their first two principal
# components.
# random_state is pinned (same seed as the t-SNE and UMAP cells) so the
# embedding is reproducible: with the default svd_solver='auto',
# scikit-learn may select a randomized SVD solver for large matrices, and
# that solver is only deterministic when a seed is supplied.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(data_t)

# Create DataFrame: one row per sample, the two component scores plus the
# sample label taken from data_t's index (assigned positionally).
pca_df = pd.DataFrame(data=pca_result, columns=["PC1", "PC2"])
pca_df["Sample"] = data_t.index

# Plot the samples in PC space; the axis labels report the share of the
# total variance captured by each component.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2")
plt.title("PCA: First Two Principal Components")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
plt.grid(True)
plt.show()


## 🔁 t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
# Embed the samples (rows of data_t) in two dimensions with t-SNE.
# scikit-learn requires perplexity to be strictly smaller than the number
# of samples and raises ValueError otherwise, so the preferred value of 10
# is capped at n_samples - 1. For 11 or more samples this is exactly the
# original setting (perplexity=10).
# NOTE(review): with fewer than 2 samples the cap is not positive and TSNE
# will still reject it; an embedding is not meaningful in that case anyway.
n_samples = data_t.shape[0]
tsne_perplexity = min(10, n_samples - 1)

tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
tsne_result = tsne.fit_transform(data_t)

# Create DataFrame: one row per sample, the two embedding coordinates plus
# the sample label taken from data_t's index (assigned positionally).
tsne_df = pd.DataFrame(data=tsne_result, columns=["tSNE1", "tSNE2"])
tsne_df["Sample"] = data_t.index

# Plot
plt.figure(figsize=(8, 6))
sns.scatterplot(data=tsne_df, x="tSNE1", y="tSNE2")
plt.title("t-SNE Visualization of Samples")
plt.grid(True)
plt.show()


## 🔀 Uniform Manifold Approximation and Projection (UMAP)

In [None]:
# Fit UMAP with a fixed seed and embed the samples (rows of data_t);
# every other UMAP setting is left at the library default.
reducer = umap.UMAP(random_state=42)
umap_result = reducer.fit_transform(data_t)

# Tabulate the embedding: two coordinate columns, then the sample labels
# copied from data_t's index.
umap_columns = ["UMAP1", "UMAP2"]
umap_df = pd.DataFrame(umap_result, columns=umap_columns)
umap_df["Sample"] = data_t.index

# Draw the scatter on an explicitly created 8x6 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=umap_df, x="UMAP1", y="UMAP2", ax=ax)
ax.set_title("UMAP Visualization of Samples")
ax.grid(True)
plt.show()
