# Lecture 2: Introduction to Single-Cell Technology - SOLUTION

**Course:** Single-Cell Neurogenomics  
**Date:** December 6, 2025  
**Estimated Time:** 60 minutes  

---

## Setup

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix

# Configure scanpy once for the whole notebook: logging verbosity 3 and
# shared figure defaults (dpi, frame, size) for every scanpy plot.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, frameon=False, figsize=(6, 6))

# Confirm the environment and record the versions the notebook ran with
print("Libraries imported successfully!")
for _label, _module in (("Scanpy", sc), ("AnnData", ad)):
    print(f"{_label} version: {_module.__version__}")

---

## Task 1: Understanding Single-Cell Data Structure (20 points)

In [None]:
# Simulate a tiny count matrix so the AnnData layout is easy to inspect
np.random.seed(42)

n_cells, n_genes = 50, 20

# Draw Poisson(1.5) counts, then zero out a random ~70% of the entries.
# Poisson draws are already zero some of the time, so the overall fraction
# of zeros typically lands somewhat above 70%.
X = np.random.poisson(lam=1.5, size=(n_cells, n_genes))
mask = np.random.random(size=(n_cells, n_genes)) < 0.7
X[mask] = 0

# Store the counts in CSR form, a compact container for mostly-zero matrices
X_sparse = csr_matrix(X)

# Per-cell annotations: one row per cell, indexed by a synthetic cell name
cell_types = np.random.choice(['T cell', 'B cell', 'Monocyte'], size=n_cells)
batches = np.random.choice(['batch1', 'batch2'], size=n_cells)
obs_df = pd.DataFrame(
    {'cell_type': cell_types, 'batch': batches},
    index=['Cell_' + str(i) for i in range(n_cells)],
)

# Per-gene annotations: one row per gene, indexed by a synthetic gene name
gene_names = ['Gene_' + str(i) for i in range(n_genes)]
highly_variable = np.random.choice([True, False], size=n_genes, p=[0.3, 0.7])
var_df = pd.DataFrame(
    {'gene_name': gene_names, 'highly_variable': highly_variable},
    index=gene_names,
)

# Bundle matrix and annotations: rows of X pair with obs, columns with var
adata = ad.AnnData(X=X_sparse, obs=obs_df, var=var_df)

rule = "=" * 70

# The object's own summary
print(rule, "AnnData Object Structure:", rule, sep="\n")
print(adata)
print("\n")

# Headline numbers; sparsity is the share of entries not stored as non-zero
filled_fraction = adata.X.nnz / (adata.n_obs * adata.n_vars)
print(rule, "Dataset Summary:", rule, sep="\n")
print("\n".join([
    f"Number of cells: {adata.n_obs}",
    f"Number of genes: {adata.n_vars}",
    f"Data type: {type(adata.X)}",
    f"Matrix shape: {adata.X.shape}",
    f"Sparsity: {(1 - filled_fraction) * 100:.1f}%",
]))
print("\n")

# Peek at both annotation tables
print("Cell Metadata (first 5 rows):")
print(adata.obs.head())
print("\n")

print("Gene Metadata (first 5 rows):")
print(adata.var.head())

---

## Task 2: Exploring Data Sparsity (20 points)

In [None]:
# Work on a dense copy when the matrix is sparse; the simulated matrix is small
if hasattr(adata.X, 'toarray'):
    X_dense = adata.X.toarray()
else:
    X_dense = adata.X

# Sparsity = percentage of entries that are exactly zero
total_elements = X_dense.size
zero_elements = (X_dense == 0).sum()
sparsity = zero_elements / total_elements * 100

rule = "=" * 70
print(rule, "Data Sparsity Analysis:", rule, sep="\n")
print("\n".join([
    f"Total elements: {total_elements}",
    f"Zero elements: {zero_elements}",
    f"Sparsity: {sparsity:.2f}%",
    f"Non-zero elements: {total_elements - zero_elements}",
]))
print("\n")

# 2x2 overview figure: rows unpack into (histogram, bars) and (heatmap, pie)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_hist, ax_bar), (ax_heat, ax_pie) = axes

# Boolean "detected" mask shared by the two per-axis counts below
detected = X_dense > 0

# Panel 1: how many genes are detected in each cell
genes_per_cell = detected.sum(axis=1)
mean_genes = genes_per_cell.mean()
ax_hist.hist(genes_per_cell, bins=15, color='steelblue', edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Number of Genes Detected', fontsize=11)
ax_hist.set_ylabel('Number of Cells', fontsize=11)
ax_hist.set_title('Distribution of Genes per Cell', fontsize=12, fontweight='bold')
ax_hist.axvline(mean_genes, color='red', linestyle='--', label=f'Mean: {mean_genes:.1f}')
ax_hist.legend()
ax_hist.grid(alpha=0.3)

# Panel 2: how many cells each gene is detected in
cells_per_gene = detected.sum(axis=0)
mean_cells = cells_per_gene.mean()
ax_bar.bar(range(len(cells_per_gene)), cells_per_gene, color='coral', edgecolor='black', alpha=0.7)
ax_bar.set_xlabel('Gene Index', fontsize=11)
ax_bar.set_ylabel('Number of Cells', fontsize=11)
ax_bar.set_title('Cells Expressing Each Gene', fontsize=12, fontweight='bold')
ax_bar.axhline(mean_cells, color='red', linestyle='--', label=f'Mean: {mean_cells:.1f}')
ax_bar.legend()
ax_bar.grid(alpha=0.3, axis='y')

# Panel 3: raw values for the first 20 cells only, to keep the heatmap legible
sns.heatmap(X_dense[:20, :], cmap='viridis', ax=ax_heat, cbar_kws={'label': 'Expression'})
ax_heat.set_xlabel('Gene Index', fontsize=11)
ax_heat.set_ylabel('Cell Index', fontsize=11)
ax_heat.set_title('Expression Matrix Heatmap (First 20 cells)', fontsize=12, fontweight='bold')

# Panel 4: zero vs non-zero share of all matrix entries
sparsity_data = pd.DataFrame({
    'Category': ['Non-zero', 'Zero'],
    'Count': [total_elements - zero_elements, zero_elements],
})
colors_pie = ['#2ecc71', '#e74c3c']
ax_pie.pie(
    sparsity_data['Count'],
    labels=sparsity_data['Category'],
    autopct='%1.1f%%',
    colors=colors_pie,
    startangle=90,
)
ax_pie.set_title('Data Sparsity Distribution', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Text recap of the two per-axis detection counts
print("Key Observations:")
print("\n".join([
    f"- Average genes detected per cell: {mean_genes:.1f}",
    f"- Average cells per gene: {mean_cells:.1f}",
    f"- Most sparsely expressed gene: detected in {cells_per_gene.min()} cells",
    f"- Most widely expressed gene: detected in {cells_per_gene.max()} cells",
]))

---

## Task 3: Working with Real Single-Cell Data (25 points)

In [None]:
# Fetch the PBMC 3k example dataset via scanpy's dataset helper
adata_pbmc = sc.datasets.pbmc3k()

rule = "=" * 70
print(rule, "PBMC 3k Dataset Information:", rule, sep="\n")
print(adata_pbmc)
print("\n")

# Matrix dimensions
print("Dataset Dimensions:")
print(f"Number of cells: {adata_pbmc.n_obs}")
print(f"Number of genes: {adata_pbmc.n_vars}")
print("\n")

# First rows of each annotation table
for heading, table in (("Cell Metadata (obs):", adata_pbmc.obs),
                       ("Gene Metadata (var):", adata_pbmc.var)):
    print(heading)
    print(table.head())
    print("\n")

# Per-cell summaries, flattened to 1-D:
#   total_counts   - sum of all counts in the cell
#   genes_per_cell - number of genes with a non-zero count in the cell
total_counts = np.array(adata_pbmc.X.sum(axis=1)).flatten()
genes_per_cell = np.array((adata_pbmc.X > 0).sum(axis=1)).flatten()

print(rule, "Basic Statistics:", rule, sep="\n")
for label, values in (("total counts", total_counts), ("genes", genes_per_cell)):
    print(f"Mean {label} per cell: {values.mean():.0f}")
    print(f"Median {label} per cell: {np.median(values):.0f}")
print("\n")

# Rank genes by their summed counts over all cells and keep the ten largest
total_counts_per_gene = np.array(adata_pbmc.X.sum(axis=0)).flatten()
top_genes_idx = np.argsort(total_counts_per_gene)[::-1][:10]
top_genes = adata_pbmc.var_names[top_genes_idx]
top_counts = total_counts_per_gene[top_genes_idx]

print("Top 10 Most Highly Expressed Genes:")
print(rule)
for rank, (gene, count) in enumerate(zip(top_genes, top_counts), start=1):
    print(f"{rank:2d}. {gene:15s} - Total counts: {count:>10.0f}")

# Side-by-side histograms of the two per-cell summaries, each with its mean marked
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
histogram_panels = [
    (total_counts, 'steelblue', 'Total Counts per Cell', 'Distribution of Total Counts'),
    (genes_per_cell, 'coral', 'Genes Detected per Cell', 'Distribution of Genes per Cell'),
]
for ax, (values, color, xlabel, title) in zip(axes, histogram_panels):
    ax.hist(values, bins=50, color=color, edgecolor='black', alpha=0.7)
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel('Number of Cells', fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.axvline(values.mean(), color='red', linestyle='--', label=f'Mean: {values.mean():.0f}')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

---

## Task 4: Quality Control Metrics (20 points)

In [None]:
# Flag mitochondrial genes by their 'MT-' name prefix
adata_pbmc.var['mt'] = adata_pbmc.var_names.str.startswith('MT-')
mt_gene_names = adata_pbmc.var_names[adata_pbmc.var['mt']]
print(f"Number of mitochondrial genes: {adata_pbmc.var['mt'].sum()}")
print(f"Mitochondrial genes: {', '.join(mt_gene_names)}")
print("\n")

# Let scanpy add QC columns to obs/var in place, using the 'mt' flag as a QC variable
sc.pp.calculate_qc_metrics(adata_pbmc, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

rule = "=" * 70
print("QC Metrics Added to AnnData:")
print(rule)
print("Cell-level metrics (obs):")
print(adata_pbmc.obs.columns.tolist())
print("\nGene-level metrics (var):")
print(adata_pbmc.var.columns.tolist())
print("\n")

# Summary statistics for the three metrics used in the rest of this cell
print("QC Statistics Summary:")
print(rule)
print(adata_pbmc.obs[['n_genes_by_counts', 'total_counts', 'pct_counts_mt']].describe())
print("\n")

# One violin per QC metric
violin_panels = [
    ('total_counts', 'Total Counts per Cell'),
    ('n_genes_by_counts', 'Number of Genes per Cell'),
    ('pct_counts_mt', 'Mitochondrial Gene Percentage'),
]
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (metric, title) in zip(axes, violin_panels):
    sc.pl.violin(adata_pbmc, metric, ax=ax, show=False)
    ax.set_title(title, fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Total counts on the x-axis against genes detected and against mitochondrial %
scatter_panels = [
    ('n_genes_by_counts', 'steelblue', 'Number of Genes', 'Total Counts vs Genes Detected'),
    ('pct_counts_mt', 'coral', 'Mitochondrial Percentage', 'Total Counts vs Mitochondrial %'),
]
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (y_metric, color, ylabel, title) in zip(axes, scatter_panels):
    ax.scatter(adata_pbmc.obs['total_counts'], adata_pbmc.obs[y_metric],
               alpha=0.5, s=10, c=color)
    ax.set_xlabel('Total Counts', fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Recommended cut-offs: two fixed values plus two 98th-percentile upper bounds
print(rule, "Suggested Filtering Thresholds:", rule, sep="\n")
print("Based on the QC metrics, we recommend the following thresholds:\n")

min_genes = 200
max_genes = int(adata_pbmc.obs['n_genes_by_counts'].quantile(0.98))
max_counts = int(adata_pbmc.obs['total_counts'].quantile(0.98))
max_mt = 5

recommendations = [
    (f"Minimum genes per cell: {min_genes}", "Remove low-quality/empty droplets"),
    (f"Maximum genes per cell: {max_genes}", "Remove potential doublets (98th percentile)"),
    (f"Maximum total counts: {max_counts}", "Remove potential doublets (98th percentile)"),
    (f"Maximum mitochondrial percentage: {max_mt}%", "Remove dying/stressed cells"),
]
for number, (threshold, rationale) in enumerate(recommendations, start=1):
    print(f"{number}. {threshold}")
    print(f"   Rationale: {rationale}\n")

# Dry run: count the cells that pass every threshold, without subsetting adata_pbmc
n_cells_before = adata_pbmc.n_obs
filter_mask = (
    adata_pbmc.obs['n_genes_by_counts'].between(min_genes, max_genes)  # inclusive on both ends
    & adata_pbmc.obs['total_counts'].le(max_counts)
    & adata_pbmc.obs['pct_counts_mt'].le(max_mt)
)
n_cells_after = filter_mask.sum()
n_cells_removed = n_cells_before - n_cells_after
pct_removed = (n_cells_removed / n_cells_before) * 100

print("Impact of filtering:")
print(f"  Cells before filtering: {n_cells_before}")
print(f"  Cells after filtering: {n_cells_after}")
print(f"  Cells removed: {n_cells_removed} ({pct_removed:.1f}%)")

---

## Task 5: Comparing Bulk vs Single-Cell Data (15 points)

In [None]:
# Compare the "bulk" view of a gene (its mean over all cells) with the
# per-cell distribution, for three marker genes.
genes_of_interest = ['CD3D', 'CD79A', 'CST3']  # T cell, B cell, Monocyte markers

print("="*70)
print("Selected Marker Genes:")
print("="*70)
print("CD3D  - T cell marker (T cell receptor component)")
print("CD79A - B cell marker (B cell receptor component)")
print("CST3  - Monocyte marker (Cystatin C, highly expressed in myeloid cells)")
print("\n")

# One column per gene: histogram on the top row, box plot on the bottom row
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

for idx, gene in enumerate(genes_of_interest):
    if gene not in adata_pbmc.var_names:
        # Report the gap and hide the column rather than leaving blank axes
        print(f"{gene}: not found in dataset, skipping")
        print()
        for ax in axes[:, idx]:
            ax.axis('off')
        continue

    # Pull this gene's values as a flat 1-D array; X may be sparse or dense
    gene_matrix = adata_pbmc[:, gene].X
    if hasattr(gene_matrix, 'toarray'):
        gene_matrix = gene_matrix.toarray()
    expr = np.asarray(gene_matrix).flatten()

    # "Bulk" value: the average over every cell, expressing or not
    bulk_mean = expr.mean()

    # Histogram of the single-cell distribution with the bulk mean overlaid
    axes[0, idx].hist(expr, bins=50, color=colors[idx], edgecolor='black', alpha=0.7)
    axes[0, idx].axvline(bulk_mean, color='red', linestyle='--', linewidth=2,
                         label=f'Bulk mean: {bulk_mean:.2f}')
    axes[0, idx].set_xlabel('Expression Level', fontsize=11)
    axes[0, idx].set_ylabel('Number of Cells', fontsize=11)
    axes[0, idx].set_title(f'{gene} Expression Distribution', fontsize=12, fontweight='bold')
    axes[0, idx].legend()
    axes[0, idx].grid(alpha=0.3)

    # Split cells into those with zero and those with non-zero values
    expressing = expr[expr > 0]
    non_expressing = expr[expr == 0]

    # Tick labels are set on the axis instead of through boxplot's `labels`
    # keyword, which newer Matplotlib releases deprecate.
    bp = axes[1, idx].boxplot([non_expressing, expressing], patch_artist=True)
    axes[1, idx].set_xticks([1, 2])
    axes[1, idx].set_xticklabels(['Non-expressing', 'Expressing'])
    for patch in bp['boxes']:
        patch.set_facecolor(colors[idx])
        patch.set_alpha(0.7)

    axes[1, idx].set_ylabel('Expression Level', fontsize=11)
    axes[1, idx].set_title(f'{gene} Expression Groups', fontsize=12, fontweight='bold')
    axes[1, idx].grid(alpha=0.3, axis='y')

    # Per-gene statistics; the expressing-cell mean is undefined when no cell
    # has a non-zero value, so report "n/a" instead of nan plus a warning
    pct_expressing = (expr > 0).sum() / len(expr) * 100
    mean_expressing = f"{expressing.mean():.2f}" if expressing.size else "n/a"
    print(f"{gene}:")
    print(f"  Bulk (mean) value: {bulk_mean:.2f}")
    print(f"  Cells expressing: {pct_expressing:.1f}%")
    print(f"  Mean in expressing cells: {mean_expressing}")
    print(f"  Range: {expr.min():.2f} - {expr.max():.2f}")
    print(f"  Std deviation: {expr.std():.2f}")
    print()

plt.tight_layout()
plt.show()

# Discussion: print a banner followed by the written interpretation of the plots
rule = "=" * 70
discussion = """
1. CELLULAR HETEROGENEITY:
   - Bulk measurements (red dashed line) represent population averages
   - Single-cell data reveals bimodal distributions: cells either express or don't express
   - The bulk mean can be misleading when cell populations are heterogeneous

2. CELL-TYPE-SPECIFIC EXPRESSION:
   - CD3D is highly expressed in T cells but absent in other cell types
   - CD79A is specific to B cells
   - CST3 is enriched in monocytes
   - Bulk data would average across all cell types, losing this specificity

3. BIOLOGICAL INSIGHTS:
   - Identifies rare cell populations (<1% of total)
   - Reveals cell states and transitions
   - Enables cell-type-specific disease mechanisms
   - Critical for understanding developmental processes

4. CLINICAL APPLICATIONS:
   - Tumor heterogeneity and cancer resistance
   - Immune cell composition in disease
   - Drug response varies by cell type
   - Personalized medicine requires cell-level resolution
"""

print("\n" + rule)
print("IMPORTANCE OF SINGLE-CELL RESOLUTION:")
print(rule)
print(discussion)

---

## Reflection Questions (Bonus: 10 points)

### Question 1: Technical challenges in scRNA-seq vs bulk RNA-seq

**Answer:**
- **Lower input material:** Single cells contain ~10 pg RNA vs micrograms in bulk, requiring extensive amplification
- **Technical noise:** Amplification bias, dropout events, and batch effects are more pronounced
- **Higher cost:** Per-cell costs are significant, especially for large studies
- **Computational complexity:** Analyzing millions of cells requires specialized algorithms and infrastructure
- **Sparsity:** Dropout (false zeros) due to limited sampling of lowly expressed genes
- **Cell isolation artifacts:** Stress-induced gene expression during tissue dissociation

### Question 2: Why is sparsity a major characteristic of scRNA-seq?

**Answer:**
Sparsity in scRNA-seq arises from:
- **Biological factors:**
  - Genes are naturally not expressed in all cells (cell-type-specific expression)
  - Transcriptional bursting creates temporal variation
- **Technical factors (dropout):**
  - Limited mRNA molecules captured from single cells (~10-50% efficiency)
  - Stochastic sampling during library preparation
  - Low-abundance transcripts often missed entirely
  - Amplification bias favors highly expressed genes
- **Impact:** Distinguishing biological zeros from technical dropout is a key challenge

### Question 3: How do cell barcodes and UMIs work together?

**Answer:**
- **Cell Barcodes (10-16 bp):**
  - Unique sequence added to all molecules from one cell/droplet
  - Enables pooling of thousands of cells in one reaction
  - During analysis, reads with the same barcode are grouped as one cell
- **UMIs (8-12 bp):**
  - Short random sequence (effectively unique per molecule) attached to each individual mRNA molecule before PCR amplification
  - Distinguishes PCR duplicates from unique molecules
  - Enables accurate molecule counting by collapsing duplicates
- **Combined workflow:**
  1. mRNA captured with beads containing cell barcode + UMI
  2. After PCR, multiple reads may come from same molecule
  3. Cell barcode groups reads by cell
  4. UMI counts unique molecules: reads sharing the same cell barcode, gene, and UMI are collapsed into one count, removing amplification bias

### Question 4: Biological scenarios requiring single-cell analysis

**Answer:**
- **Heterogeneous tissues:**
  - Brain regions with diverse neuronal and glial subtypes
  - Tumors with multiple cancer cell clones and immune infiltrates
- **Rare populations:**
  - Stem cells (<1% of tissue)
  - Circulating tumor cells in blood
  - Specific T cell clones responding to pathogens
- **Dynamic processes:**
  - Cell differentiation and development
  - Immune response activation
  - Disease progression over time
- **Cell-cell interactions:**
  - Ligand-receptor communication
  - Spatial organization in tissues
- **Disease mechanisms:**
  - Cell-type-specific drug responses
  - Identifying disease-driving cell populations
  - Understanding cellular resistance mechanisms

---

## Summary

In this assignment, you've learned to:
- ✓ Create and manipulate AnnData objects
- ✓ Understand data sparsity in single-cell datasets
- ✓ Load and explore real single-cell data (PBMC 3k)
- ✓ Calculate and visualize quality control metrics
- ✓ Compare single-cell resolution with bulk measurements
- ✓ Appreciate the biological insights enabled by single-cell technology

These skills form the foundation for all downstream single-cell analyses!