# 04 - Clustering and Cell Type Annotation

**COVID-19 GSE171524 Single-Cell Analysis**

This notebook performs clustering and cell type annotation.

## Objectives
1. Leiden clustering at multiple resolutions
2. Identify major cell types using marker genes
3. Fine-grained annotation of subtypes
4. Automated marker-based scoring with scanpy (`sc.tl.score_genes`)
5. Save annotated data

## Cell Types Expected
Based on Melms et al. 2021:
- Epithelial (AT1, AT2, DATP, basal, club, ciliated)
- Myeloid (AM, MDM, monocytes, DC)
- Fibroblasts (alveolar, adventitial, pathological)
- Endothelial
- T cells (CD4, CD8, Treg)
- B cells / Plasma cells
- NK cells

In [None]:
# Import libraries
import os
import sys
import warnings
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

warnings.filterwarnings('ignore')

# Add scripts to path
sys.path.insert(0, '../scripts')
from markers import (
    MAJOR_CELL_TYPES, EPITHELIAL_SUBTYPES, MYELOID_SUBTYPES,
    FIBROBLAST_SUBTYPES, TCELL_SUBTYPES, BCELL_SUBTYPES,
    DATP_SIGNATURE, EXHAUSTION_SIGNATURE, FIBROSIS_SIGNATURE
)
from plotting import COVID_COLORS, CELL_TYPE_COLORS
from utils import score_gene_signature

# Settings
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, facecolor='white')

print(f"Scanpy: {sc.__version__}")

In [None]:
# Locations used throughout the notebook (relative to the notebook folder)
OUTPUT_DIR = Path('../data/processed_data')
INPUT_PATH = Path('../data/processed_data/adata_integrated.h5ad')
FIGURE_DIR = Path('../results/figures/clustering')

# Figures are written in later cells, so the folder must exist up front
FIGURE_DIR.mkdir(parents=True, exist_ok=True)

# Read the integrated AnnData object and report its dimensions
print(f"Loading: {INPUT_PATH}")
adata = sc.read_h5ad(INPUT_PATH)
print(
    f"Loaded: {adata.n_obs:,} cells, {adata.n_vars:,} genes"
)

## Leiden Clustering

In [None]:
# Run Leiden once per resolution; each result is stored in
# adata.obs['leiden_<resolution>'] so the granularities can be compared.
# Requires: pip install igraph leidenalg
resolutions = [0.3, 0.5, 0.8, 1.0, 1.5]

# Options shared by every run
leiden_options = dict(flavor='igraph', n_iterations=2)

for res in resolutions:
    key = f'leiden_{res}'
    sc.tl.leiden(adata, resolution=res, key_added=key, **leiden_options)
    print(f"Resolution {res}: {adata.obs[key].nunique()} clusters")

In [None]:
# Visualize clustering at different resolutions.
# The grid is sized from `resolutions`, so editing that list never drops a
# panel or leaves an empty frame behind.
n_cols = 3
n_rows = max(1, -(-len(resolutions) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(5 * n_cols, 5 * n_rows),
    squeeze=False  # always a 2-D array, whatever the grid shape
)
axes = axes.flatten()

for ax, res in zip(axes, resolutions):
    key = f'leiden_{res}'
    sc.pl.umap(
        adata,
        color=key,
        title=f'Resolution {res} ({adata.obs[key].nunique()} clusters)',
        ax=ax,
        show=False,
        legend_loc='on data',
        legend_fontsize=6
    )

# Hide every axis that did not receive a UMAP
for ax in axes[len(resolutions):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'leiden_resolutions.png', dpi=150)
plt.show()

In [None]:
# Promote the resolution-0.8 partition to the working 'leiden' column
adata.obs['leiden'] = adata.obs['leiden_0.8']
main_clusters = adata.obs['leiden']
print(f"Using leiden_0.8: {main_clusters.nunique()} clusters")

# Number of cells in each cluster, ordered by cluster label
cluster_sizes = main_clusters.value_counts().sort_index()
print("\nCluster sizes:")
print(cluster_sizes)

## Marker Gene Analysis

In [None]:
# Rank marker genes for every 'leiden' cluster with a Wilcoxon test,
# keeping the top 100 genes per cluster.
# NOTE(review): the test should run on log-normalized expression - confirm
# which matrix (adata.X or adata.raw) the integration step left in place.
marker_test_options = {
    'groupby': 'leiden',
    'method': 'wilcoxon',
    'n_genes': 100,
}
sc.tl.rank_genes_groups(adata, **marker_test_options)

print("Computed marker genes per cluster")

In [None]:
# Top 10 ranked genes per cluster; show=False keeps the figure open so it
# can be written to disk before it is displayed.
sc.pl.rank_genes_groups(
    adata,
    n_genes=10,
    sharey=False,
    show=False
)
cluster_markers_png = FIGURE_DIR / 'cluster_markers.png'
plt.savefig(cluster_markers_png, dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# List, per major cell type, the marker genes that exist in this dataset
print("\nMajor cell type markers:")
for cell_type, genes in MAJOR_CELL_TYPES.items():
    available = [gene for gene in genes if gene in adata.var_names]
    gene_list = ', '.join(available)
    print(f"  {cell_type}: {gene_list}")

In [None]:
# Plot major markers on UMAP (one canonical gene per lineage)
major_markers = ['EPCAM', 'CD68', 'COL1A1', 'PECAM1', 'CD3D', 'CD79A', 'NKG7', 'JCHAIN']
available_markers = [g for g in major_markers if g in adata.var_names]

# Report markers that cannot be plotted instead of dropping them silently
missing_markers = [g for g in major_markers if g not in adata.var_names]
if missing_markers:
    print(f"Markers not found in adata.var_names (skipped): {', '.join(missing_markers)}")

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

for ax, gene in zip(axes, available_markers):
    sc.pl.umap(
        adata,
        color=gene,
        cmap='viridis',
        ax=ax,
        show=False,
        title=gene
    )

# Hide panels left empty because a marker was unavailable
for ax in axes[len(available_markers):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'umap_major_markers.png', dpi=150)
plt.show()

In [None]:
# Dotplot of major markers per cluster.
# Keep only genes present in this dataset (the same filter the other cells
# apply); passing an absent gene to the dotplot makes it fail.
dotplot_markers = {}
for cell_type, genes in MAJOR_CELL_TYPES.items():
    present = [g for g in genes if g in adata.var_names]
    if present:  # drop groups with no usable gene
        dotplot_markers[cell_type] = present

if dotplot_markers:
    sc.pl.dotplot(
        adata,
        var_names=dotplot_markers,
        groupby='leiden',
        standard_scale='var',  # rescale each gene to [0, 1] across clusters
        show=False
    )
    plt.savefig(FIGURE_DIR / 'dotplot_major_markers.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("No major cell type markers found in adata.var_names; skipping dotplot")

## Cell Type Annotation

In [None]:
# Compute one '<cell type>_score' column in adata.obs per major cell type,
# using only the marker genes that are present in the dataset.
for cell_type, genes in MAJOR_CELL_TYPES.items():
    available = [gene for gene in genes if gene in adata.var_names]
    if not available:
        # Nothing to score for this cell type
        continue
    score_name = f'{cell_type}_score'
    sc.tl.score_genes(adata, available, score_name=score_name, ctrl_size=50)
    print(f"Scored: {cell_type} ({len(available)} genes)")

In [None]:
# Plot scores on UMAP
# Only score columns that were actually computed are plotted
score_cols = [f'{ct}_score' for ct in MAJOR_CELL_TYPES.keys() if f'{ct}_score' in adata.obs.columns]

# Size the grid from the number of scores so none is silently dropped when
# there are more than nine cell types
n_cols = 3
n_rows = max(1, -(-len(score_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(5 * n_cols, 4 * n_rows),
    squeeze=False  # always a 2-D array, whatever the grid shape
)
axes = axes.flatten()

for ax, col in zip(axes, score_cols):
    sc.pl.umap(
        adata,
        color=col,
        cmap='RdYlBu_r',
        ax=ax,
        show=False,
        title=col.replace('_score', '')
    )

# Hide the unused trailing panels
for j in range(len(score_cols), len(axes)):
    axes[j].set_visible(False)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'umap_celltype_scores.png', dpi=150)
plt.show()

In [None]:
# Manual annotation based on marker expression
# This mapping should be adjusted based on actual cluster marker patterns

# First, examine mean scores per cluster.
# observed=True restricts the result to clusters that actually contain
# cells; unused categorical levels would otherwise appear as all-NaN rows.
cluster_scores = adata.obs.groupby('leiden', observed=True)[score_cols].mean()

# Display as heatmap (diverging colormap centred on a score of 0)
plt.figure(figsize=(12, 8))
sns.heatmap(
    cluster_scores,
    cmap='RdYlBu_r',
    annot=True,
    fmt='.2f',
    center=0
)
plt.title('Cell Type Scores per Cluster')
plt.xlabel('Cell Type Score')
plt.ylabel('Cluster')
plt.tight_layout()
plt.savefig(FIGURE_DIR / 'cluster_score_heatmap.png', dpi=150)
plt.show()

In [None]:
# Assign major cell type based on highest score
# Get the cell type with maximum score for each cluster

def assign_celltype_by_score(row):
    """Return the cell type whose score is highest in ``row``.

    Parameters
    ----------
    row : pandas.Series
        One row of per-cluster mean scores. Entries whose label ends in
        ``'_score'`` are treated as cell type scores; any other entry is
        ignored.

    Returns
    -------
    str or None
        The label of the highest score with the trailing ``'_score'``
        removed. Ties resolve to the first such label in ``row``. ``None``
        is returned when ``row`` holds no non-missing score.
    """
    suffix = '_score'
    # Read the score labels from the row itself rather than from a notebook
    # global, so the function works on any score table.
    score_labels = [label for label in row.index if str(label).endswith(suffix)]
    # Missing scores are discarded: NaN compares False against everything and
    # could otherwise be returned as the "maximum".
    scores = row[score_labels].astype(float).dropna()
    if scores.empty:
        return None
    best_label = str(scores.idxmax())
    # Strip only the trailing suffix, not every '_score' inside the name
    return best_label[:-len(suffix)]

# Label each cluster (one row of cluster_scores) with its best-scoring cell
# type; the per-cell labels are derived from this mapping afterwards.
cluster_celltype = cluster_scores.apply(
    assign_celltype_by_score,
    axis='columns'
)
print("Cluster to cell type mapping:")
print(cluster_celltype)

In [None]:
# Give every cell the cell type assigned to its 'leiden' cluster
major_labels = adata.obs['leiden'].map(cluster_celltype)
adata.obs['cell_type_major'] = major_labels

# Number of cells per major cell type
print("\nMajor cell type distribution:")
print(adata.obs['cell_type_major'].value_counts())

In [None]:
# UMAP colored by major cell type, using the project color map
fig, ax = plt.subplots(figsize=(10, 8))
major_umap_options = {
    'color': 'cell_type_major',
    'palette': CELL_TYPE_COLORS,
    'title': 'Major Cell Types',
    'legend_loc': 'right margin',
}
sc.pl.umap(adata, ax=ax, show=False, **major_umap_options)

# bbox_inches='tight' so the legend in the right margin is not cropped
major_umap_png = FIGURE_DIR / 'umap_celltype_major.png'
plt.savefig(major_umap_png, dpi=150, bbox_inches='tight')
plt.show()

## Fine-grained Annotation

Score cells for subtypes within each major category.

In [None]:
# DATP (damage-associated transient progenitor) signature score,
# restricted to the signature genes present in the dataset
available_datp = [gene for gene in DATP_SIGNATURE if gene in adata.var_names]
if available_datp:
    datp_gene_list = ', '.join(available_datp)
    sc.tl.score_genes(adata, available_datp, score_name='DATP_score')
    print(f"DATP signature genes: {datp_gene_list}")

In [None]:
# Score every subtype signature, one '<subtype>_score' column per entry.
# Marker sets are processed in the order epithelial, myeloid, fibroblast,
# T cell.
# NOTE(review): BCELL_SUBTYPES is imported but not scored here - confirm
# whether that omission is intentional.
subtype_marker_sets = (
    EPITHELIAL_SUBTYPES,
    MYELOID_SUBTYPES,
    FIBROBLAST_SUBTYPES,
    TCELL_SUBTYPES,
)

for marker_set in subtype_marker_sets:
    for subtype, genes in marker_set.items():
        available = [gene for gene in genes if gene in adata.var_names]
        if not available:
            # No gene of this signature is present; nothing to score
            continue
        sc.tl.score_genes(adata, available, score_name=f'{subtype}_score')

print("Computed subtype scores")

In [None]:
# COVID-specific signatures: keep only the genes present in the dataset
available_exhaust = [gene for gene in EXHAUSTION_SIGNATURE if gene in adata.var_names]
available_fibrosis = [gene for gene in FIBROSIS_SIGNATURE if gene in adata.var_names]

# T cell exhaustion score
if available_exhaust:
    sc.tl.score_genes(adata, available_exhaust, score_name='exhaustion_score')

# Fibrosis score
if available_fibrosis:
    sc.tl.score_genes(adata, available_fibrosis, score_name='fibrosis_score')

# Report the genes that went into each score
exhaust_gene_list = ', '.join(available_exhaust)
fibrosis_gene_list = ', '.join(available_fibrosis)
print(f"Exhaustion genes: {exhaust_gene_list}")
print(f"Fibrosis genes: {fibrosis_gene_list}")

In [None]:
# Plot COVID-relevant signatures
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

signatures = ['DATP_score', 'fibrosis_score', 'exhaustion_score', 'pFB_score']
titles = ['DATP (KRT8+/CLDN4+)', 'Fibrosis', 'T Cell Exhaustion', 'Pathological Fibroblast']

for ax, sig, title in zip(axes.flatten(), signatures, titles):
    if sig not in adata.obs.columns:
        # Score was never computed: hide the panel instead of leaving an
        # empty frame in the saved figure
        ax.set_visible(False)
        continue
    sc.pl.umap(
        adata,
        color=sig,
        cmap='RdYlBu_r',
        ax=ax,
        show=False,
        title=title
    )

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'umap_covid_signatures.png', dpi=150)
plt.show()

## Compare COVID vs Control

In [None]:
# Cell type proportions by condition
# Rows are samples, columns are cell types; normalize='index' makes each
# sample's proportions sum to 1.
ct_props = pd.crosstab(
    adata.obs['sample_id'],
    adata.obs['cell_type_major'],
    normalize='index'
)

# When the cell type column is categorical the crosstab columns form a
# CategoricalIndex, and inserting a label outside its categories
# ('condition') raises on older pandas releases. Plain string labels avoid
# that.
ct_props.columns = ct_props.columns.astype(str)

# Add condition: one label per sample, aligned on the sample index.
# observed=True skips sample categories that contain no cells.
sample_condition = adata.obs.groupby('sample_id', observed=True)['condition'].first()
ct_props['condition'] = sample_condition

# Melt for plotting: one row per (sample, cell type) pair
ct_melt = ct_props.melt(
    id_vars='condition',
    var_name='cell_type',
    value_name='proportion'
)

# Plot: per-sample proportions, grouped by cell type and split by condition
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=ct_melt,
    x='cell_type',
    y='proportion',
    hue='condition',
    palette=COVID_COLORS,
    ax=ax
)
ax.tick_params(axis='x', rotation=45)
ax.set_xlabel('Cell Type')
ax.set_ylabel('Proportion')
ax.set_title('Cell Type Proportions: COVID vs Control')
ax.legend(title='Condition')
plt.tight_layout()
plt.savefig(FIGURE_DIR / 'celltype_proportions.png', dpi=150)
plt.show()

In [None]:
# Signature scores by condition
sig_cols = ['DATP_score', 'fibrosis_score', 'exhaustion_score']
sig_cols = [c for c in sig_cols if c in adata.obs.columns]

if not sig_cols:
    # plt.subplots(1, 0) would raise; there is nothing to draw
    print("No signature scores found in adata.obs; skipping condition comparison plot")
else:
    # squeeze=False always returns a 2-D array of axes, so the single-panel
    # case needs no special handling
    fig, axes = plt.subplots(
        1, len(sig_cols),
        figsize=(5 * len(sig_cols), 4),
        squeeze=False
    )

    for ax, sig in zip(axes.ravel(), sig_cols):
        # One violin per condition, drawn from the per-cell scores
        data = adata.obs[[sig, 'condition']]
        sns.violinplot(
            data=data,
            x='condition',
            y=sig,
            palette=COVID_COLORS,
            ax=ax
        )
        ax.set_title(sig.replace('_score', ' Score'))
        ax.set_xlabel('')

    plt.tight_layout()
    plt.savefig(FIGURE_DIR / 'signature_scores_condition.png', dpi=150)
    plt.show()

In [None]:
# Final cell type annotation.
# Simplified: the final label is a copy of the major cell type; the subtype
# scores computed earlier in the notebook are not used to refine it here.
final_labels = adata.obs['cell_type_major'].copy()
adata.obs['cell_type'] = final_labels

# Number of cells per final label
print("\nFinal cell type distribution:")
print(adata.obs['cell_type'].value_counts())

In [None]:
# Final side-by-side UMAPs: cell type (left) and condition (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Per-panel options, in left-to-right order
panel_options = [
    {
        'color': 'cell_type',
        'palette': CELL_TYPE_COLORS,
        'title': 'Cell Types',
        'legend_loc': 'right margin',
    },
    {
        'color': 'condition',
        'palette': COVID_COLORS,
        'title': 'Condition',
    },
]

for panel_ax, options in zip(axes, panel_options):
    sc.pl.umap(adata, ax=panel_ax, show=False, **options)

plt.tight_layout()
final_umap_png = FIGURE_DIR / 'umap_final.png'
plt.savefig(final_umap_png, dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Write the annotated AnnData object (gzip-compressed) and report its size
output_path = OUTPUT_DIR / 'adata_annotated.h5ad'

print(f"Saving to: {output_path}")
adata.write_h5ad(output_path, compression='gzip')

# Size on disk, converted from bytes to gigabytes (1e9 bytes)
size_gb = output_path.stat().st_size / 1e9
print(f"\nFile saved: {output_path}")
print(f"File size: {size_gb:.2f} GB")

## Summary

### Clustering
- Leiden clustering at resolution 0.8
- Multiple resolutions tested (0.3-1.5)

### Cell Type Annotation
- Major cell types identified using marker genes
- COVID-specific signatures scored:
  - DATP (damage-associated transient progenitors)
  - Fibrosis signature
  - T cell exhaustion

### Output
- `data/processed_data/adata_annotated.h5ad` - Annotated AnnData

### Key Observations
- Cell type proportions differ between COVID and Control
- COVID samples show elevated DATP and fibrosis signatures

### Next Steps
→ **05_differential_expression.ipynb**: COVID vs Control DE analysis

In [None]:
# Record the versions of the core libraries used for this run
print("\n=== Session Info ===")
for label, module in (("Scanpy", sc), ("NumPy", np), ("Pandas", pd)):
    print(f"{label}: {module.__version__}")