# Scanpy Baseline: Spatial Transcriptomics Analysis

**Author**: MedGemma Spatial Project

**Date**: 2026-01-24

**Purpose**: Baseline spatial analysis using Scanpy for 10x Visium data

**Goals**:
- Load and QC Visium breast cancer dataset
- Spatial Leiden clustering
- Export features to JSON for MedGemma

**Runtime**: <5 minutes on M1 Mac

**Memory**: <16GB

## 1. Setup & Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime

# Report the library versions in use so a run can be reproduced later.
print(
    f"scanpy version: {sc.__version__}",
    f"anndata version: {ad.__version__}",
    sep="\n",
)

In [None]:
# Seed NumPy's global RNG; SEED is also passed explicitly to the scanpy
# steps further down that accept a random_state.
SEED = 42
np.random.seed(SEED)

# Scanpy logging level and figure defaults.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False)

# All paths hang off the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath("data", "sample")
OUTPUT_DIR = PROJECT_ROOT.joinpath("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)

print(
    f"Project root: {PROJECT_ROOT}",
    f"Data directory: {DATA_DIR}",
    f"Output directory: {OUTPUT_DIR}",
    sep="\n",
)

## 2. Load Data

In [None]:
# Locate the 10x HDF5 count matrix. Sorting makes the choice deterministic when
# several .h5 files are present (glob order is not guaranteed), and the explicit
# check replaces a bare IndexError with an actionable message when none exist.
h5_candidates = sorted(DATA_DIR.glob("*.h5"))
if not h5_candidates:
    raise FileNotFoundError(f"No .h5 files found in {DATA_DIR}")
h5_file = h5_candidates[0]
print(f"Loading: {h5_file.name}")

# NOTE(review): read_10x_h5 is given only the .h5 matrix file, so no spot
# coordinates or tissue image are loaded here; later steps in this notebook
# therefore operate on expression only. Confirm whether sc.read_visium on the
# full Space Ranger output directory is wanted instead.
adata = sc.read_10x_h5(h5_file)
# De-duplicate repeated gene names so var_names can be used as unique keys.
adata.var_names_make_unique()

print(f"\nDataset shape: {adata.shape}")
print(f"Spots (observations): {adata.n_obs}")
print(f"Genes (variables): {adata.n_vars}")

In [None]:
# Peek at the first rows of the per-spot (obs) and per-gene (var) tables.
print("First 5 spots:", adata.obs.head(), sep="\n")
print("\nFirst 5 genes:", adata.var.head(), sep="\n")

## 3. Quality Control (QC)

In [None]:
# Flag genes whose name starts with "MT-" so they can be used as a QC variable.
is_mito = adata.var_names.str.startswith('MT-')
adata.var['mt'] = is_mito

# Compute QC metrics in place, including the 'mt' variable flagged above.
qc_options = dict(qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pp.calculate_qc_metrics(adata, **qc_options)

print("QC metrics calculated:")
print(list(adata.obs.columns))

In [None]:
# One violin per QC metric, each drawn on its own pre-created axis.
# The previous call combined multi_panel=True with ax=axes; in that mode scanpy
# builds its own seaborn grid and does not draw on the supplied axes, so the
# figure created by plt.subplots stayed empty. Plotting one key per call makes
# scanpy honour `ax`.
qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
fig, axes = plt.subplots(1, len(qc_keys), figsize=(15, 4))

for qc_ax, qc_key in zip(axes, qc_keys):
    sc.pl.violin(adata, qc_key, jitter=0.4, ax=qc_ax, show=False)

# Operate on `fig` explicitly so the saved image is always this figure,
# independent of which figure matplotlib currently considers active.
fig.tight_layout()
fig.savefig(OUTPUT_DIR / "qc_violin.png", dpi=150, bbox_inches='tight')
plt.show()

print(f"\nQC Summary:")
print(f"Mean genes per spot: {adata.obs['n_genes_by_counts'].mean():.0f}")
print(f"Mean counts per spot: {adata.obs['total_counts'].mean():.0f}")
print(f"Mean MT%: {adata.obs['pct_counts_mt'].mean():.2f}%")

## 4. Filtering

In [None]:
# Remember the pre-filter dimensions so the number of removed spots/genes can
# be reported; the previous version printed the post-filter totals as "Removed".
n_spots_before, n_genes_before = adata.shape
print(f"Before filtering: {n_spots_before} spots, {n_genes_before} genes")

# Drop spots with fewer than 200 detected genes, then genes seen in fewer than
# 3 of the remaining spots.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

print(f"After filtering: {adata.n_obs} spots, {adata.n_vars} genes")
print(
    f"Removed: {n_spots_before - adata.n_obs} spots, "
    f"{n_genes_before - adata.n_vars} genes"
)

## 5. Normalization & Preprocessing

In [None]:
# Snapshot the filtered, un-normalised data in .raw before X is modified below.
adata.raw = adata.copy()

# Normalise X with target_sum=1e4, then apply log1p. Order matters: the
# highly-variable-gene step below runs on the normalised, log-transformed X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag the top 2000 variable genes (flavor='seurat'); the result is read back
# from the 'highly_variable' column of adata.var.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor='seurat')

# n_hvg is reused later when the dataset summary is exported to JSON.
n_hvg = adata.var['highly_variable'].sum()
print(f"Highly variable genes: {n_hvg}")

In [None]:
# Draw scanpy's highly-variable-genes diagnostic plot and save it to disk
# before displaying it.
sc.pl.highly_variable_genes(adata, show=False)
hvg_png = OUTPUT_DIR.joinpath("highly_variable_genes.png")
plt.savefig(hvg_png, dpi=150, bbox_inches='tight')
plt.show()

## 6. Dimensionality Reduction

In [None]:
# Scale X (max_value=10), then run PCA with the seeded ARPACK solver.
sc.pp.scale(adata, max_value=10)
pca_options = dict(svd_solver='arpack', random_state=SEED)
sc.tl.pca(adata, **pca_options)

# Variance-ratio plot for the first 50 PCs, saved before being shown.
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, show=False)
pca_png = OUTPUT_DIR.joinpath("pca_variance.png")
plt.savefig(pca_png, dpi=150, bbox_inches='tight')
plt.show()

## 7. Spatial Leiden Clustering

In [None]:
# Neighbour graph (10 neighbours, 40 PCs), then UMAP and Leiden on top of it.
# Every stochastic step receives the same seed.
seeded = dict(random_state=SEED)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40, **seeded)
sc.tl.umap(adata, **seeded)
sc.tl.leiden(adata, resolution=0.5, **seeded)

# Spots per cluster, ordered by cluster label; reused by later cells.
cluster_counts = adata.obs['leiden'].value_counts().sort_index()
print(f"\nClusters found: {len(cluster_counts)}")
print("\nCluster sizes:", cluster_counts, sep="\n")

## 8. Visualization

In [None]:
# Two-panel overview: UMAP coloured by cluster (left), cluster sizes (right).
fig, (umap_ax, sizes_ax) = plt.subplots(1, 2, figsize=(12, 5))

sc.pl.umap(
    adata,
    color='leiden',
    ax=umap_ax,
    show=False,
    title='UMAP - Leiden Clusters',
)

cluster_counts.plot(kind='bar', ax=sizes_ax, color='steelblue')
sizes_ax.set_title('Cluster Sizes')
sizes_ax.set_xlabel('Cluster')
sizes_ax.set_ylabel('Number of Spots')
sizes_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
overview_png = OUTPUT_DIR.joinpath("clustering_overview.png")
plt.savefig(overview_png, dpi=150, bbox_inches='tight')
plt.show()

## 9. Spatial Statistics (Moran's I Approximation)

In [None]:
# NOTE: this is a crude proxy, not a true Moran's I. It correlates each gene's
# expression vector with itself shifted by one position in adata's row order;
# no spot coordinates or neighbour graph are consulted. Treat the values as a
# placeholder until a coordinate-based statistic is available.
#
# NOTE(review): `top_genes` is simply the first ten genes flagged highly
# variable, in var order - it is not ranked by variability.
top_genes = adata.var_names[adata.var['highly_variable']][:10]

morans_i_scores = {}

for gene in top_genes:
    try:
        # Slice once and densify only if X is sparse.
        gene_matrix = adata[:, gene].X
        if hasattr(gene_matrix, 'toarray'):
            gene_matrix = gene_matrix.toarray()
        expression = gene_matrix.flatten()

        # Lag-1 Pearson correlation between consecutive rows.
        correlation = np.corrcoef(expression[:-1], expression[1:])[0, 1]
    except Exception as err:
        # Best effort: keep going with a neutral score, but say why instead of
        # silently hiding the failure (a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit).
        print(f"  Warning: autocorrelation failed for {gene}: {err}")
        correlation = np.nan

    # A constant expression vector yields NaN; report it as 0.0.
    morans_i_scores[gene] = 0.0 if np.isnan(correlation) else float(correlation)

# Guard against an empty gene list: np.mean([]) is NaN, which would later be
# written to the features file as invalid JSON.
mean_morans_i = np.mean(list(morans_i_scores.values())) if morans_i_scores else 0.0

print(f"\nMoran's I (approximation):")
print(f"Mean across top genes: {mean_morans_i:.4f}")
print(f"\nTop 5 spatially autocorrelated genes:")
sorted_genes = sorted(morans_i_scores.items(), key=lambda x: x[1], reverse=True)[:5]
for gene, score in sorted_genes:
    print(f"  {gene}: {score:.4f}")

## 10. Export Features to JSON

In [None]:
def _cluster_summary(label):
    """Return spot count and mean QC values for one Leiden cluster label."""
    in_cluster = adata.obs['leiden'] == label
    cluster_spots = adata.obs.loc[in_cluster]
    return {
        "count": int(in_cluster.sum()),
        "mean_genes": float(cluster_spots['n_genes_by_counts'].mean()),
        "mean_counts": float(cluster_spots['total_counts'].mean()),
    }


# Per-cluster statistics, keyed by the cluster label as a string.
cluster_info = {
    str(label): _cluster_summary(label)
    for label in adata.obs['leiden'].unique()
}

# Assemble the export section by section; values are cast to built-in types
# so that json can serialise them.
run_metadata = {
    "analysis_date": datetime.now().isoformat(),
    "scanpy_version": sc.__version__,
    "dataset": h5_file.name,
    "random_seed": SEED,
}
dataset_summary = {
    "n_spots": int(adata.n_obs),
    "n_genes": int(adata.n_vars),
    "n_highly_variable_genes": int(n_hvg),
}
qc_section = {
    "mean_genes_per_spot": float(adata.obs['n_genes_by_counts'].mean()),
    "mean_counts_per_spot": float(adata.obs['total_counts'].mean()),
    "mean_pct_mt": float(adata.obs['pct_counts_mt'].mean()),
}
clustering_section = {
    "n_clusters": int(len(cluster_counts)),
    "resolution": 0.5,
    "clusters": cluster_info,
}
spatial_section = {
    "mean_morans_i": float(mean_morans_i),
    "top_spatially_correlated_genes": dict(sorted_genes),
}

features = {
    "metadata": run_metadata,
    "dataset_summary": dataset_summary,
    "qc_metrics": qc_section,
    "clustering": clustering_section,
    "spatial_statistics": spatial_section,
}

# Write the feature file, then echo the same payload for a quick visual check.
output_file = OUTPUT_DIR.joinpath("scanpy_features.json")
with output_file.open('w') as f:
    json.dump(features, f, indent=2)

print(f"\n✅ Features exported to: {output_file}")
print(f"\nFeature summary:")
print(json.dumps(features, indent=2))

## 11. Save Processed Data

In [None]:
# Write the processed AnnData object to the outputs folder.
output_h5ad = OUTPUT_DIR.joinpath("processed_visium.h5ad")
adata.write(output_h5ad)
print(f"✅ Processed AnnData saved to: {output_h5ad}")

## 12. Summary & Performance

In [None]:
import os
import psutil

# Resident set size of the current process, converted from bytes to MB.
memory_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024

# Build the whole report first and emit it with a single print call.
banner = "=" * 60
summary_lines = [
    banner,
    "ANALYSIS SUMMARY",
    banner,
    f"\nDataset: {h5_file.name}",
    f"Spots analyzed: {adata.n_obs}",
    f"Genes analyzed: {adata.n_vars}",
    f"Clusters identified: {len(cluster_counts)}",
    f"Mean spatial autocorrelation: {mean_morans_i:.4f}",
    f"\nMemory usage: {memory_mb:.0f} MB",
    f"\nOutputs saved:",
    f"  - Features JSON: {output_file}",
    f"  - Processed h5ad: {output_h5ad}",
    f"  - Figures: {OUTPUT_DIR}/*.png",
    "\n✅ Analysis complete!",
    banner,
]
print("\n".join(summary_lines))