# Lecture 5: Quality Control, Normalization, and Preprocessing - SOLUTION

**Date:** December 19, 2025

---

## Setup

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Global scanpy configuration for the whole notebook.
# Verbosity level 3 makes scanpy log hints in addition to info, warnings and errors.
sc.settings.verbosity = 3

# Shared figure defaults: square 6x6 panels, modest resolution, no axes frame.
sc.settings.set_figure_params(
    figsize=(6, 6),
    frameon=False,
    dpi=80,
)

print("Setup complete!")

## Task 1: Quality Control Filtering (25 points)

In [None]:
# Load the PBMC 3k dataset through scanpy's dataset helper
adata = sc.datasets.pbmc3k()
print(f"Loaded: {adata.n_obs} cells × {adata.n_vars} genes")

# Calculate QC metrics; genes whose symbol starts with "MT-" are flagged as mitochondrial
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Visualize QC metrics before filtering.
# Each metric is drawn on its own pre-created axes: with multi_panel=True scanpy
# builds its own seaborn grid and does not draw on the axes passed in, which
# would leave this figure empty and put the title on a different figure.
qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
fig, axes = plt.subplots(1, len(qc_keys), figsize=(15, 4))
for ax, key in zip(axes, qc_keys):
    sc.pl.violin(adata, key, ax=ax, show=False)
fig.suptitle('QC Metrics Before Filtering', fontweight='bold', fontsize=14)
fig.tight_layout()
plt.show()

# Apply filters
print("\nApplying filters...")
n_before = adata.n_obs
g_before = adata.n_vars

# Lower bounds: drop cells with too few detected genes or too few counts
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_cells(adata, min_counts=500)

# Upper bounds: drop cells with very high total counts or a high mitochondrial
# fraction. Both conditions go into one mask and the result is copied, so that
# `adata` is a real AnnData (not a view) before it is modified in place below.
keep_cells = (adata.obs['total_counts'] < 10000) & (adata.obs['pct_counts_mt'] < 5)
adata = adata[keep_cells, :].copy()

# Drop genes detected in fewer than 3 of the remaining cells
sc.pp.filter_genes(adata, min_cells=3)

print(f"\nCells: {n_before} → {adata.n_obs} (removed {n_before - adata.n_obs})")
print(f"Genes: {g_before} → {adata.n_vars} (removed {g_before - adata.n_vars})")

## Task 2: Normalization (20 points)

In [None]:
def _flat_corner(matrix, size=100):
    """Return the top-left ``size`` x ``size`` corner of ``matrix`` as a flat dense array."""
    corner = matrix[:size, :size]
    if hasattr(matrix, 'toarray'):
        # Sparse input: densify only the small corner, never the full matrix
        corner = corner.toarray()
    return corner.flatten()


# Snapshot a small corner of the expression matrix before it is normalized
sample_expr = _flat_corner(adata.X)

# Store raw counts (must happen before the in-place normalization below)
adata.raw = adata

# Normalize every cell to 10,000 total counts, then log-transform
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Same corner of the matrix, now normalized and log-transformed
sample_expr_norm = _flat_corner(adata.X)

# Side-by-side histograms of the non-zero entries, before vs. after
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_before, ax_after = axes

ax_before.hist(sample_expr[sample_expr > 0], bins=50, color='steelblue', alpha=0.7)
ax_before.set(
    xlabel='Expression (raw counts)',
    ylabel='Frequency',
    title='Before Normalization',
    yscale='log',
)

ax_after.hist(sample_expr_norm[sample_expr_norm > 0], bins=50, color='coral', alpha=0.7)
ax_after.set(
    xlabel='Expression (log1p normalized)',
    ylabel='Frequency',
    title='After Normalization & Log Transform',
)

plt.tight_layout()
plt.show()

print("Normalization complete!")
print(f"Raw counts stored in adata.raw")

## Task 3: Feature Selection (25 points)

In [None]:
# Identify highly variable genes (dispersion-based 'seurat' flavor)
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor='seurat')

print(f"Highly variable genes: {adata.var['highly_variable'].sum()}")

# Plot dispersion vs mean
sc.pl.highly_variable_genes(adata)

# Top 20 HVGs by normalized dispersion.
# The 'seurat' flavor writes 'means', 'dispersions' and 'dispersions_norm' to
# adata.var; 'variances_norm' only exists for flavor='seurat_v3', so sorting on
# it here would raise a KeyError.
hvg_data = adata.var[adata.var['highly_variable']].sort_values('dispersions_norm', ascending=False)
print("\nTop 20 highly variable genes:")
print(hvg_data.head(20)[['means', 'dispersions', 'dispersions_norm']])

# Subset to HVGs; copy so later in-place steps work on a real AnnData, not a view
adata = adata[:, adata.var['highly_variable']].copy()
print(f"\nDataset after HVG selection: {adata.n_obs} cells × {adata.n_vars} genes")

## Task 4: Dimensionality Reduction with PCA (20 points)

In [None]:
# Scale data (values clipped at 10). Materialize a view first so that the
# in-place scaling does not trigger an implicit copy.
if adata.is_view:
    adata = adata.copy()
sc.pp.scale(adata, max_value=10)

# Perform PCA
sc.tl.pca(adata, svd_solver='arpack', n_comps=50)

# Per-component and cumulative explained variance, as stored by sc.tl.pca
var_ratio = np.asarray(adata.uns['pca']['variance_ratio'])
cumvar = np.cumsum(var_ratio)
pc_index = np.arange(1, len(var_ratio) + 1)  # 1-based PC numbers for plotting

# Plot variance ratio (elbow plot) and cumulative variance side by side.
# The elbow plot is drawn directly with matplotlib: sc.pl.pca_variance_ratio
# creates its own figure and has no `ax` argument, so it cannot fill a subplot.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(pc_index, var_ratio, 'o-')
axes[0].set_xlabel('PC')
axes[0].set_ylabel('Variance ratio')
axes[0].set_title('Variance Ratio per PC')
axes[0].grid(alpha=0.3)

# Cumulative variance
axes[1].plot(pc_index, cumvar, 'o-')
axes[1].axhline(y=0.9, color='r', linestyle='--', label='90% variance')
axes[1].set_xlabel('Number of PCs')
axes[1].set_ylabel('Cumulative Variance Explained')
axes[1].set_title('Cumulative Variance Explained')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Determine optimal PCs.
# The computed components may not reach 90% cumulative variance at all, so the
# "no PC reaches the threshold" case is handled instead of indexing an empty array.
pcs_reaching_90 = np.flatnonzero(cumvar >= 0.9)
if pcs_reaching_90.size > 0:
    n_pcs_90 = int(pcs_reaching_90[0]) + 1
    print(f"\nPCs needed for 90% variance: {n_pcs_90}")
else:
    n_pcs_90 = None
    print(f"\n90% variance is not reached: the first {len(cumvar)} PCs explain {cumvar[-1]:.1%}")
print("Recommendation: Use ~30 PCs for downstream analysis")

# Visualize PC space
sc.pl.pca(adata, color=['total_counts', 'n_genes_by_counts'], ncols=2)

## Task 5: Cell Cycle Scoring (10 points)

In [None]:
# Cell cycle marker genes, kept as two explicit lists so that editing one list
# can never shift genes into the other phase (the previous positional split at
# index 43 depended on the exact token count).
# Fixes relative to the previous list: the fused token "PBKAURKB" is split into
# PBK and AURKB, and the repeated DLGAP5, CDCA2 and CENPE entries are removed.
s_genes = """
MCM5 PCNA TYMS FEN1 MCM2 MCM4 RRM1 UNG GINS2 MCM6 CDCA7 DTL PRIM1 UHRF1 CENPU HELLS RFC2 POLR1B NASP RAD51AP1 GMNN WDR76 SLBP CCNE2 UBR7 POLD3 MSH2 ATAD2 RAD51 RRM2 CDC45 CDC6 EXO1 TIPIN DSCC1 BLM CASP8AP2 USP1 CLSPN POLA1 CHAF1B BRIP1 E2F8
""".split()
g2m_genes = """
HMGB2 CDK1 NUSAP1 UBE2C BIRC5 TPX2 TOP2A NDC80 CKS2 NUF2 CKS1B MKI67 TMPO CENPF TACC3 PIMREG DLGAP5 CDCA2 CDCA3 KIF11 PBK AURKB BUB1 KIF20B DIAPH3 CENPE KIF2C RANGAP1 NCAPD2 ANLN LBR CKAP5 CTCF NEK2 G2E3 GAS2L3 CBX5 CENPA
""".split()
cell_cycle_genes = s_genes + g2m_genes

# Score cell cycle.
# When adata.raw is set, scanpy's gene scoring reads expression from it by
# default. In this notebook adata.raw holds unnormalized counts (it was stored
# before normalize_total/log1p), so scoring it directly would mix cell cycle
# signal with sequencing depth. Score a log-normalized copy of the full gene
# set instead and copy the results back.
if adata.raw is not None:
    adata_cc = adata.raw.to_adata()
    # Drop the inherited "already log-transformed" marker: this copy holds raw counts
    adata_cc.uns.pop('log1p', None)
    sc.pp.normalize_total(adata_cc, target_sum=1e4)
    sc.pp.log1p(adata_cc)
else:
    adata_cc = adata

sc.tl.score_genes_cell_cycle(adata_cc, s_genes=s_genes, g2m_genes=g2m_genes)

if adata_cc is not adata:
    for score_col in ('S_score', 'G2M_score', 'phase'):
        adata.obs[score_col] = adata_cc.obs[score_col]

# Visualize
print("Cell cycle phase distribution:")
print(adata.obs['phase'].value_counts())

# Plot on PC space
sc.pl.pca(adata, color='phase', title='Cell Cycle Phase in PC Space')

print("\nCell cycle scoring complete!")
print("Note: If cell cycle dominates PC1/PC2, consider regressing it out")

---

## Summary

Preprocessing complete! Data is now ready for:
- ✓ Neighborhood graph construction
- ✓ Clustering
- ✓ UMAP visualization
- ✓ Marker gene identification