# Lecture 6: Downstream Analysis and Batch Correction - SOLUTION

**Date:** December 20, 2025

---

In [None]:
import scanpy as sc
import numpy as np

# Make scanpy's logging more talkative for the duration of the exercise
sc.settings.verbosity = 3

# Fetch the already-processed PBMC 3k dataset via scanpy
# (alternatively, keep working with the object produced in Lecture 5)
adata = sc.datasets.pbmc3k_processed()

load_summary = f"Loaded: {adata.n_obs} cells × {adata.n_vars} genes"
print(load_summary)

## Task 1: Neighborhood Graph and UMAP (20 points)

In [None]:
# Build the k-nearest-neighbour graph that the UMAP embedding is computed from
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

# Compute the UMAP embedding from that graph
sc.tl.umap(adata)

# Colour the embedding by per-cell QC metrics. The column names depend on how
# the object was produced ('total_counts' / 'n_genes_by_counts' vs the older
# 'n_counts' / 'n_genes'), so pick whichever spelling is actually present in
# adata.obs instead of failing with a KeyError on a missing column.
qc_alternatives = [
    ('total_counts', 'n_counts'),
    ('n_genes', 'n_genes_by_counts'),
]
qc_keys = []
for candidates in qc_alternatives:
    present = next((name for name in candidates if name in adata.obs), None)
    if present is not None:
        qc_keys.append(present)

# With no QC column at all, fall back to an uncoloured embedding
sc.pl.umap(adata, color=qc_keys or None, ncols=2, cmap='viridis')

print("UMAP embedding computed successfully!")

## Task 2: Clustering (25 points)

In [None]:
# Run Leiden at several resolutions, keeping each result in its own obs column
resolutions = [0.4, 0.8, 1.2]
for res in resolutions:
    obs_key = f'leiden_{res}'
    sc.tl.leiden(adata, resolution=res, key_added=obs_key)
    n_clusters = adata.obs[obs_key].nunique()
    print(f"Resolution {res}: {n_clusters} clusters")

# Show the three clusterings side by side on the UMAP
comparison_keys = [f'leiden_{value}' for value in resolutions]
sc.pl.umap(adata, color=comparison_keys, ncols=3)

# The resolution-0.8 labels become the canonical 'leiden' column for later tasks
adata.obs['leiden'] = adata.obs['leiden_0.8']

# Report how many cells fall into each cluster, ordered by cluster label
print("\nCluster sizes (resolution 0.8):")
cluster_sizes = adata.obs['leiden'].value_counts().sort_index()
print(cluster_sizes)

## Task 3: Marker Gene Identification (25 points)

In [None]:
# Rank genes for every Leiden cluster using the Wilcoxon method
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

# Top 20 ranked genes per cluster, each panel with its own y-axis
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False)

# Known marker genes to check against the clusters
marker_genes = [
    'CD3D', 'CD3E',
    'CD79A', 'MS4A1',
    'CD14', 'LYZ',
    'NKG7', 'GNLY',
]

# Dot plot per cluster (expression standardised per gene), then one UMAP per marker
sc.pl.dotplot(
    adata,
    marker_genes,
    groupby='leiden',
    standard_scale='var',
)
sc.pl.umap(
    adata,
    color=marker_genes,
    ncols=4,
    cmap='viridis',
)

print("Marker genes identified!")

## Task 4: Cell Type Annotation (20 points)

In [None]:
# Map each Leiden cluster label to a cell-type name chosen from the marker plots.
# NOTE(review): cluster numbering depends on the clustering run, so these
# assignments should be re-checked against the marker plots whenever the
# clustering changes.
cluster_names = {
    '0': 'CD4 T cells',
    '1': 'CD14 Monocytes',
    '2': 'B cells',
    '3': 'CD8 T cells',
    '4': 'NK cells',
    '5': 'FCGR3A Monocytes',
    '6': 'Dendritic cells',
    '7': 'Megakaryocytes'
}

# Map on plain string labels rather than on the categorical column: mapping a
# categorical yields a categorical when every cluster has a name, and
# fillna('Unknown') can then fail because 'Unknown' is not one of its
# categories. Going through str makes the fallback work for any number of
# clusters; the final cast gives a consistent categorical dtype either way.
cell_types = adata.obs['leiden'].astype(str).map(cluster_names).fillna('Unknown')
adata.obs['cell_type'] = cell_types.astype('category')

# Visualize annotations
sc.pl.umap(adata, color='cell_type', legend_loc='on data',
           title='Annotated Cell Types', frameon=False)

print("\nCell type distribution:")
print(adata.obs['cell_type'].value_counts())

## Task 5: Batch Correction with scVI (10 points)

In [None]:
# Batch correction with scVI. Only the import is guarded by the try, so that
# errors raised during setup or training are not hidden behind the
# ImportError handler.
try:
    import scvi
except ImportError:
    print("scvi-tools not installed. Skipping batch correction.")
    print("Install with: pip install scvi-tools")
else:
    if 'batch' not in adata.obs:
        # Without a batch column there is nothing to correct, and
        # setup_anndata would fail on the missing key.
        print("No 'batch' column in adata.obs. Skipping batch correction.")
    else:
        # scVI models raw counts. Use a 'counts' layer when the object has
        # one; otherwise fall back to adata.X and say so, because X may hold
        # normalized/scaled values at this point in the workflow.
        counts_layer = 'counts' if 'counts' in adata.layers else None
        if counts_layer is None:
            print("Warning: no 'counts' layer found; scVI will be trained on "
                  "adata.X, which should contain raw counts.")

        # Setup and train scVI
        scvi.model.SCVI.setup_anndata(adata, layer=counts_layer, batch_key='batch')
        model = scvi.model.SCVI(adata, n_latent=30)
        model.train()

        # Store the batch-corrected latent representation
        adata.obsm['X_scvi'] = model.get_latent_representation()

        # Recompute the neighbour graph and UMAP on the corrected representation
        # (this replaces the embedding computed earlier)
        sc.pp.neighbors(adata, use_rep='X_scvi')
        sc.tl.umap(adata)

        sc.pl.umap(adata, color=['batch', 'cell_type'], ncols=2)
        print("Batch correction complete!")

---

## Summary

Completed:
- ✓ UMAP visualization
- ✓ Leiden clustering
- ✓ Marker gene identification
- ✓ Cell type annotation
- ✓ Batch correction (scVI; skipped when scvi-tools is not installed)