# Xenium Lung Preview — Full Analysis (Python, step by step)

We will:
1) Locate and read `cells.parquet` (robustly detect XY columns)  
2) Stream `transcripts.parquet` in two passes to build a sparse cell×gene matrix  
3) Add QC fields, run adaptive QC, normalize, HVGs, PCA, neighbors, UMAP, Leiden  
4) Make ≥12 figures (QC, UMAPs, spatial, markers, neighborhood enrichment)  
5) Save final object (`.h5ad`) and a short summary

> Dataset: 10x *Xenium Human Lung Preview (FFPE, non-diseased)*  
> Run inside your `xenium` env (`uv run jupyter lab` or `uv run python`).

In [1]:
# Core stdlib
import os, sys, gc, pathlib
from datetime import datetime

# Data science stack
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.dataset as ds
from scipy.sparse import coo_matrix, csr_matrix
import anndata as ad
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

# Optional dask setting (intended to quiet a warning some packages emit).
# Strictly best-effort: if dask is absent or rejects the key, carry on silently.
try:
    import dask
    dask.config.set({"dataframe.query-planning": True})
except Exception:
    pass

# Project layout: when launched from repo/notebooks, step up to the repo root
ROOT = pathlib.Path.cwd()
ROOT = ROOT.parent if ROOT.name == "notebooks" else ROOT
DATA = ROOT / "data"
RESULTS = ROOT / "results"
FIGS = RESULTS / "figures"
RESULTS.mkdir(exist_ok=True)
FIGS.mkdir(parents=True, exist_ok=True)

# Global plotting defaults (scanpy first, then the seaborn context on top)
sc.set_figure_params(dpi=120, frameon=False)
sns.set_context("talk", font_scale=0.9)

# Locate the unpacked dataset directory; the alphabetically first match wins
FOLDER_CANDS = sorted(DATA.glob("xenium_*lung*_ffpe"))
assert FOLDER_CANDS, "No unpacked Xenium folder found under data/."
FOLDER = FOLDER_CANDS[0]
print(f"[INFO] Using dataset folder: {FOLDER}")

[INFO] Using dataset folder: /home/juliors/Documents/SPATIAL-OMICS/xenium-lung-preview-tutorial/data/xenium_preview_human_non_diseased_lung_with_add_on_ffpe



Xenium output uses slightly different column names across versions.  
We normalize **x/y** names to `x_location`, `y_location` and keep `cell_id` as index.

In [2]:

# Read the cells table (arrow -> pandas)
cells_path = FOLDER / "cells.parquet"
assert cells_path.exists(), f"Missing {cells_path}"
cells_df = pq.read_table(cells_path).to_pandas()

# Decode byte values to strings in every object-dtype column; other values pass through
for col in cells_df.columns:
    if cells_df[col].dtype == object:
        cells_df[col] = cells_df[col].apply(
            lambda x: x.decode("utf-8", "ignore") if isinstance(x, (bytes, bytearray)) else x
        )

# Ensure we have cell IDs
assert "cell_id" in cells_df.columns, f"'cell_id' missing; saw: {cells_df.columns.tolist()}"

# Case-insensitive XY detection: the first candidate name present in the table wins
lower_to_orig = {c.lower(): c for c in cells_df.columns}
x_candidates = ["x_location", "x_centroid", "center_x", "x"]
y_candidates = ["y_location", "y_centroid", "center_y", "y"]

x_col = next((lower_to_orig[c] for c in x_candidates if c in lower_to_orig), None)
y_col = next((lower_to_orig[c] for c in y_candidates if c in lower_to_orig), None)
assert x_col and y_col, f"Could not find XY in cells.parquet. Columns: {cells_df.columns.tolist()}"

# Standardize names for downstream tools; unparseable coordinates become NaN
cells_df = cells_df.rename(columns={x_col: "x_location", y_col: "y_location"})
cells_df["x_location"] = pd.to_numeric(cells_df["x_location"], errors="coerce")
cells_df["y_location"] = pd.to_numeric(cells_df["y_location"], errors="coerce")

# Keep a compact obs table: the required columns plus any optional extras that exist
keep_cols = ["cell_id", "x_location", "y_location"]
for extra in ["fov_name", "area", "nucleus_area"]:
    if extra in cells_df.columns:
        keep_cols.append(extra)

# cell_id becomes the (sorted) index and is also kept as a regular column
cells_df = cells_df[keep_cols].set_index("cell_id", drop=False).sort_index()
print(f"[LOAD] cells.parquet → {len(cells_df):,} cells; coords = ({x_col}, {y_col})")

# Quick sanity figure (spatial density).
# The coercion above may have produced NaN coordinates, and hist2d cannot derive
# bin edges from a non-finite range, so only rows with finite x AND y are plotted.
# cells_df itself is left unchanged.
finite_xy = np.isfinite(cells_df["x_location"]) & np.isfinite(cells_df["y_location"])
n_bad_xy = int((~finite_xy).sum())
if n_bad_xy:
    print(f"[WARN] {n_bad_xy:,} cells have missing/non-numeric coordinates; excluded from the density plot")

plt.figure(figsize=(5.6, 5.6))
plt.hist2d(cells_df.loc[finite_xy, "x_location"], cells_df.loc[finite_xy, "y_location"], bins=300)
plt.gca().invert_yaxis(); plt.axis("equal"); plt.title("Spatial: cell density (all cells)")
plt.tight_layout(); plt.savefig(FIGS/"00_spatial_density_all_cells.png", dpi=200); plt.close()


[LOAD] cells.parquet → 295,883 cells; coords = (x_centroid, y_centroid)


We’ll stream **transcripts.parquet**. First detect the **gene column** name (varies: `feature_name`, `gene`, etc.).

In [3]:
# === Build a cell × gene matrix from transcripts.parquet (QV≥20, genes-only, in-cell) ===
# Keeps behavior consistent with 10x cell_feature_matrix.  (QV≥20; exclude controls)

import numpy as np, pandas as pd, gc, gzip, pathlib
import pyarrow.dataset as ds
from scipy.sparse import coo_matrix, csr_matrix

# ---- input locations ----
tx_path = FOLDER / "transcripts.parquet"
cfm_dir = FOLDER / "cell_feature_matrix"           # holds features.tsv.gz
feat_tsv = cfm_dir / "features.tsv.gz"
assert tx_path.exists(), f"Missing {tx_path}"
assert feat_tsv.exists(), f"Missing {feat_tsv} (needed to drop controls)."

# ---- names of 'Gene Expression' features ----
# features.tsv.gz is read as a headerless, tab-separated, three-column table;
# only rows typed "Gene Expression" contribute to the allow-list used for filtering.
with gzip.open(feat_tsv, "rt") as fh:
    feats = pd.read_csv(
        fh,
        sep="\t",
        header=None,
        names=["ensembl_id", "feature_name", "feature_type"],
    )
is_gene_row = feats["feature_type"] == "Gene Expression"
gene_names = pd.Index(feats.loc[is_gene_row, "feature_name"].astype(str).unique())
gene_name_set = set(gene_names)

# ---- open the transcripts dataset and resolve column names case-insensitively ----
dataset = ds.dataset(tx_path)
cols = {c.lower(): c for c in dataset.schema.names}


def _first_present(candidates):
    """Return the original-case column for the first candidate found in `cols`, else None."""
    for key in candidates:
        if key in cols:
            return cols[key]
    return None


GENE_COL = _first_present(["feature_name", "gene", "gene_name", "target"])
CELL_COL = cols.get("cell_id")
QV_COL = _first_present(["qv", "quality", "q"])
assert GENE_COL and CELL_COL and QV_COL, f"Missing required columns; saw: {list(cols.values())}"

def bytes_to_str(s):
    """Decode bytes/bytearray entries of an object-dtype Series to str.

    Entries are decoded as UTF-8 with undecodable bytes dropped; entries that
    are not bytes-like pass through untouched. A Series whose dtype is not
    ``object`` is returned as-is (same object, no copy).
    """
    if s.dtype != object:
        return s

    def _decode(value):
        if isinstance(value, (bytes, bytearray)):
            return value.decode("utf-8", "ignore")
        return value

    return s.apply(_decode)

# ---- Pass 1: gene inventory under filters ----
# Streams the transcripts in large batches and tallies, per gene, how many
# transcripts survive the filters. Only three columns are read from disk.
PASS1_BATCH = 5_000_000
gene_totals = {}

# Loop-invariant: string form of the known cell IDs, built once instead of per batch
known_cell_ids = cells_df.index.astype(str)

need_p1 = [GENE_COL, CELL_COL, QV_COL]
for i, batch in enumerate(dataset.to_batches(columns=need_p1, batch_size=PASS1_BATCH), start=1):
    df = batch.to_pandas()

    # normalize dtypes
    df[GENE_COL] = bytes_to_str(df[GENE_COL]).astype(str)
    # Cell IDs must compare equal to the (decoded) IDs in cells_df. Decode bytes
    # first: a bare astype(str) on bytes yields "b'...'" strings that match nothing.
    df[CELL_COL] = bytes_to_str(df[CELL_COL]).astype(str)

    # filters: QV≥20, in known cells, genes only (exclude controls)
    df = df[(df[QV_COL] >= 20) & (df[CELL_COL].isin(known_cell_ids)) & (df[GENE_COL].isin(gene_name_set))]
    if df.empty:
        del df; gc.collect(); continue

    # fold this batch's per-gene counts into the running totals
    vc = df[GENE_COL].value_counts()
    for g, n in vc.items():
        gene_totals[g] = gene_totals.get(g, 0) + int(n)

    if i % 4 == 0:
        print(f"[pass1] batches={i} (genes so far: {len(gene_totals):,})")
    del df, vc
    gc.collect()

# Genes ordered by descending total count; this order defines the matrix columns
genes = pd.Index(sorted(gene_totals, key=gene_totals.get, reverse=True), name="gene")
gene_to_col = {g: i for i, g in enumerate(genes)}
print(f"[pass1] unique genes: {len(genes):,}")

# ---- allocate target matrix ----
# Rows follow the order of cells_df (string cell IDs); columns follow `genes`.
ordered_cell_ids = cells_df.index.astype(str).tolist()  # ensure string
cell_to_row = {cid: i for i, cid in enumerate(ordered_cell_ids)}
n_cells, n_genes = len(ordered_cell_ids), len(genes)
X = csr_matrix((n_cells, n_genes), dtype=np.int32)
print(f"[SHAPE] target matrix: {n_cells:,} cells × {n_genes:,} genes")

# ---- Pass 2: aggregate (cell_id, gene) counts with same filters ----
PASS2_BATCH = 2_000_000
need_p2 = [CELL_COL, GENE_COL, QV_COL]

# Loop-invariant membership table for the per-batch cell filter
known_cells = pd.Index(ordered_cell_ids)

for i, batch in enumerate(dataset.to_batches(columns=need_p2, batch_size=PASS2_BATCH), start=1):
    df = batch.to_pandas()

    df[GENE_COL] = bytes_to_str(df[GENE_COL]).astype(str)
    # Decode bytes before casting so IDs match the decoded IDs from cells_df
    # (astype(str) alone would turn b"x" into the literal text "b'x'").
    df[CELL_COL] = bytes_to_str(df[CELL_COL]).astype(str)

    df = df[(df[QV_COL] >= 20) &
            (df[CELL_COL].isin(known_cells)) &
            (df[GENE_COL].isin(gene_name_set))]
    if df.empty:
        del df; gc.collect(); continue

    # One entry per (cell, gene) pair present in this batch
    grp = df.groupby([CELL_COL, GENE_COL], sort=False).size()
    del df

    # Vectorized label -> position lookup. Unmapped labels become NaN, hence the
    # float arrays; such pairs are dropped, matching the previous None checks.
    row_pos = grp.index.get_level_values(0).map(cell_to_row).to_numpy(dtype="float64")
    col_pos = grp.index.get_level_values(1).map(gene_to_col).to_numpy(dtype="float64")
    data = grp.to_numpy(dtype=np.int32)
    del grp

    valid = ~(np.isnan(row_pos) | np.isnan(col_pos))
    if valid.any():
        coo = coo_matrix(
            (data[valid], (row_pos[valid].astype(np.intp), col_pos[valid].astype(np.intp))),
            shape=(n_cells, n_genes),
            dtype=np.int32,
        ).tocsr()
        X += coo  # the same pair seen in several batches is summed
        del coo
    del row_pos, col_pos, data, valid

    if i % 4 == 0:
        print(f"[pass2] batches={i:>3}  nnz={X.nnz:,}")
    gc.collect()

print(f"[BUILD] sparse matrix complete: shape={X.shape}, nnz={X.nnz:,}")


[pass1] batches=4 (genes so far: 392)
[pass1] batches=8 (genes so far: 392)
[pass1] batches=12 (genes so far: 392)
[pass1] batches=16 (genes so far: 392)
[pass1] batches=20 (genes so far: 392)
[pass1] batches=24 (genes so far: 392)
[pass1] batches=28 (genes so far: 392)
[pass1] unique genes: 392
[SHAPE] target matrix: 295,883 cells × 392 genes
[pass2] batches=  4  nnz=2,054,020
[pass2] batches=  8  nnz=4,096,753
[pass2] batches= 12  nnz=6,115,699
[pass2] batches= 16  nnz=8,086,845
[pass2] batches= 20  nnz=10,110,691
[pass2] batches= 24  nnz=12,180,785
[pass2] batches= 28  nnz=14,240,040
[BUILD] sparse matrix complete: shape=(295883, 392), nnz=14,544,817


Pass 1 collects the **unique gene list** (and abundance) with tiny memory use.

In [45]:
# Assemble the AnnData object: counts in X, per-cell metadata in obs, genes in var.
# ordered_cell_ids holds *string* IDs, so the obs index is cast to str before the
# label lookup; looking strings up in a non-string (e.g. integer) index raises KeyError.
obs = cells_df.copy()
obs.index = obs.index.astype(str)
adata = ad.AnnData(
    X=X,                                       # sparse counts
    obs=obs.loc[ordered_cell_ids],             # row metadata (cells), in matrix row order
    var=pd.DataFrame(index=genes),             # col metadata (genes)
)

# Keep raw counts before normalization
adata.layers["raw_counts"] = adata.X.copy()

# Per-cell QC fields and the coordinate array
adata.obs["total_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs["n_genes_by_counts"] = np.asarray((adata.X > 0).sum(axis=1)).ravel()
adata.obsm["spatial"] = adata.obs[["x_location", "y_location"]].to_numpy()

print(f"[ANNData] {adata.n_obs:,} cells × {adata.n_vars:,} genes (nnz={adata.X.nnz:,})")

# Figures 1 and 2 — QC histograms (x tick labels every 50).
# Both figures share one layout, so they are produced from a small spec table.
for _qc_col, _qc_xlabel, _qc_title, _qc_file in [
    ("total_counts", "Total counts / cell", "QC: total counts", "01_qc_hist_total_counts.png"),
    ("n_genes_by_counts", "Detected genes / cell", "QC: genes per cell", "02_qc_hist_genes_per_cell.png"),
]:
    plt.figure(figsize=(8, 6))
    sns.histplot(adata.obs[_qc_col], bins=80)
    plt.xlabel(_qc_xlabel, fontsize=12)
    plt.ylabel("Cells", fontsize=12)
    plt.title(_qc_title, fontsize=14)

    max_val = int(adata.obs[_qc_col].max())
    plt.xticks(ticks=np.arange(0, max_val + 1, 50), rotation=45)

    plt.tight_layout()
    plt.savefig(FIGS / _qc_file, dpi=300)
    plt.close()

# Figure 3 — QC scatter: total counts vs detected genes
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x="total_counts", y="n_genes_by_counts", s=5, alpha=0.35, data=adata.obs
)
plt.xlabel("Total counts / cell", fontsize=12)
plt.ylabel("Detected genes / cell", fontsize=12)
plt.title("QC: counts vs genes", fontsize=14)

# Set x and y ticks every 50
max_x = int(adata.obs["total_counts"].max())
max_y = int(adata.obs["n_genes_by_counts"].max())
plt.xticks(np.arange(0, max_x + 1, 50), rotation=45)
plt.yticks(np.arange(0, max_y + 1, 50))

plt.tight_layout()
plt.savefig(FIGS / "03_qc_scatter_counts_vs_genes.png", dpi=300)
plt.close()

[ANNData] 295,883 cells × 392 genes (nnz=14,544,817)


We set **data-driven** cutoffs at the 1% quantile (with floors) and retry with a relaxed fallback if needed.


In [46]:
# Per-cell QC metrics used for thresholding
cnt = adata.obs["total_counts"]
ng = adata.obs["n_genes_by_counts"]

# Thresholds: the 1st percentile of each metric, but never below a fixed floor
min_counts = max(np.quantile(cnt, 0.01), 50)
min_genes = max(np.quantile(ng, 0.01), 15)

keep = (cnt >= min_counts) & (ng >= min_genes)
n_keep = int(keep.sum())
print(f"[QC] min_counts={min_counts:.1f}, min_genes={min_genes:.1f} → keep {n_keep:,}/{adata.n_obs}")

if not n_keep:
    # Nothing survived: retry once with fixed, more permissive cutoffs
    min_counts, min_genes = 10, 5
    keep = (cnt >= min_counts) & (ng >= min_genes)
    n_keep = int(keep.sum())
    print(f"[QC][fallback] min_counts={min_counts}, min_genes={min_genes} → keep {n_keep:,}/{adata.n_obs}")
    assert n_keep > 0, "QC still yielded 0 cells; inspect distributions."

# Materialize the filtered object (a copy, not a view)
adata = adata[keep].copy()
print(f"[QC] post-filter shape: {adata.shape}")

# Figure 4 — 2D histogram of the cells that passed QC
xy = adata.obsm["spatial"]
fig, ax = plt.subplots(figsize=(5.6, 5.6))
ax.hist2d(xy[:, 0], xy[:, 1], bins=300)
ax.invert_yaxis()
ax.axis("equal")
ax.set_title("Spatial density (post QC)")
fig.tight_layout()
fig.savefig(FIGS / "04_spatial_density_post_qc.png", dpi=200)
plt.close(fig)


[QC] min_counts=50.0, min_genes=15.0 → keep 212,528/295883
[QC] post-filter shape: (212528, 392)


Standard Scanpy pipeline. We keep most defaults, choosing sizes appropriate to a small targeted panel (392 genes here).


In [33]:
# Library-size normalization (each cell scaled to 1e4) followed by log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Highly variable genes: request up to 3000, capped at the number of genes present,
# and subset the object to the selected genes
n_top_genes = min(3000, adata.n_vars)
sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, subset=True)

# PCA: at most 30 components, at least 2, bounded by (n_vars - 1)
n_comps = min(30, max(2, adata.n_vars - 1))
sc.pp.pca(adata, n_comps=n_comps)

# Figure 5 — scree plot of the per-component explained variance ratio
var = adata.uns["pca"]["variance_ratio"]
pc_numbers = np.arange(1, len(var) + 1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pc_numbers, var, marker="o", markersize=6, lw=1.5)
ax.set_xlabel("Principal Component (PC)", fontsize=12)
ax.set_ylabel("Explained variance", fontsize=12)
ax.set_title("PCA variance ratio", fontsize=14)

# one x tick per component
plt.xticks(np.arange(1, len(var) + 1, 1), fontsize=10)

ax.grid(True, linestyle="--", alpha=0.5)
fig.tight_layout()
fig.savefig(FIGS / "05_pca_scree.png", dpi=300)
plt.close(fig)




In [34]:
# Graph construction, embedding and clustering.
# Use 12 principal components, or fewer if the PCA produced fewer columns.
n_available_pcs = adata.obsm["X_pca"].shape[1]
n_pcs = min(12, n_available_pcs)

neighbors_kwargs = {"n_neighbors": 15, "n_pcs": n_pcs}
umap_kwargs = {"min_dist": 0.35, "spread": 1.0, "random_state": 0}

sc.pp.neighbors(adata, **neighbors_kwargs)
sc.tl.umap(adata, **umap_kwargs)
sc.tl.leiden(adata, resolution=0.5)  # labels are stored under the key "leiden"

In [43]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# `import matplotlib as mpl` does not by itself load the patheffects submodule;
# import it explicitly instead of relying on another package having done so.
from matplotlib import patheffects

# --- global dark theme ---
# NOTE: rcParams are process-wide, so every figure created after this cell is affected.
mpl.rcParams.update({
    "figure.figsize": (8, 8),
    "figure.facecolor": "black",
    "savefig.facecolor": "black",
    "axes.facecolor": "black",
    "axes.edgecolor": "white",
    "axes.labelcolor": "white",
    "text.color": "white",       # titles/annotations default to this; black would vanish on the black background
    "xtick.color": "white",
    "ytick.color": "white",
    "grid.color": "#666666",
    "grid.alpha": 0.25,
    "grid.linestyle": "-",
    "axes.grid": True,           # light grid like the HTML report
})

XY = adata.obsm["X_umap"]
x, y = XY[:, 0], XY[:, 1]

# ---------- 1) UMAP by cluster (with legend) ----------
plt.figure(figsize=(9.5, 8))  # a bit wider to fit legend
cats = adata.obs["leiden"].astype("category")
palette = sc.plotting.palettes.default_102[:len(cats.cat.categories)]

# one scatter call per cluster so each cluster gets its own legend entry
ax = plt.gca()
for k, cat in enumerate(cats.cat.categories):
    idx = (cats.values == cat)
    ax.scatter(x[idx], y[idx], s=1.0, c=[palette[k]], lw=0, alpha=0.9,
               rasterized=True, label=f"Cluster {cat}")

# label each cluster at the median of its points
centroids = (pd.DataFrame({"x": x, "y": y, "c": cats.values})
             .groupby("c", observed=True)[["x", "y"]].median())
for lab, (cx, cy) in centroids.iterrows():
    ax.text(cx, cy, str(lab), ha="center", va="center",
            fontsize=16, fontweight="bold", color="white",
            path_effects=[patheffects.withStroke(linewidth=3, foreground="#111111")])

ax.set_xlabel("UMAP1"); ax.set_ylabel("UMAP2")
ax.set_aspect("equal", "box")

# legend outside on the right; markers enlarged relative to the size-1 points
leg = ax.legend(title="Clusters", loc="center left", bbox_to_anchor=(1.02, 0.5),
                frameon=True, fontsize=9, markerscale=6/1.0, borderaxespad=0.5)
leg.get_frame().set_facecolor("black")
leg.get_frame().set_edgecolor("white")
leg.get_title().set_color("white")
for t in leg.get_texts(): t.set_color("white")

plt.tight_layout()
plt.savefig(FIGS / "06_umap_leiden.png", dpi=600, bbox_inches="tight")
plt.close()

# ---------- 2) UMAP by QC covariates (dark + grid + white colorbar ticks) ----------
for col, fname in [
    ("total_counts", "07_umap_total_counts"),
    ("n_genes_by_counts", "07_umap_n_genes_by_counts"),
]:
    vals = adata.obs[col].to_numpy()
    # clip the colour range to the 1st–99th percentile so outliers do not flatten the scale
    vmin, vmax = np.quantile(vals, [0.01, 0.99])
    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)

    plt.figure()
    ax = plt.gca()
    sca = ax.scatter(x, y, c=vals, s=1.0, cmap="viridis", norm=norm,
                     lw=0, alpha=0.9, rasterized=True)
    ax.set_xlabel("UMAP1"); ax.set_ylabel("UMAP2")
    ax.set_aspect("equal", "box")
    ax.set_title(col, color="white", fontsize=16, pad=6)

    cb = plt.colorbar(sca, ax=ax, fraction=0.046, pad=0.04)
    cb.outline.set_edgecolor("white")
    cb.ax.tick_params(labelsize=10, colors="white")

    plt.tight_layout()
    plt.savefig(FIGS / f"{fname}.png", dpi=600)
    plt.close()


In [None]:
import matplotlib as mpl
from matplotlib.colors import LogNorm, PowerNorm

# Names this cell needs that its own imports did not provide:
#  - issparse is used for Figure 11 (it was only imported by a later cell)
#  - patheffects is a matplotlib submodule that `import matplotlib as mpl` does not load
from scipy.sparse import issparse
from matplotlib import patheffects

xy = adata.obsm["spatial"]
x, y = xy[:, 0], xy[:, 1]

# --------------------------
# Figure 9 — spatial: Leiden
# --------------------------
plt.figure(figsize=(9.5, 8))
cats = adata.obs["leiden"].astype("category")
palette = sc.plotting.palettes.default_102[:len(cats.cat.categories)]

# one scatter call per cluster so each cluster gets its own legend entry
ax = plt.gca()
for k, cat in enumerate(cats.cat.categories):
    idx = (cats.values == cat)
    ax.scatter(x[idx], y[idx], s=0.8, c=[palette[k]], lw=0, alpha=0.9,
               rasterized=True, label=f"Cluster {cat}")

# label each cluster at the median of its points
centroids = (pd.DataFrame({"x": x, "y": y, "c": cats.values})
             .groupby("c", observed=True)[["x", "y"]].median())
for lab, (cx, cy) in centroids.iterrows():
    ax.text(cx, cy, str(lab), ha="center", va="center",
            fontsize=16, fontweight="bold", color="white",
            path_effects=[patheffects.withStroke(linewidth=3, foreground="#111111")])

ax.set_xlabel("x (µm)"); ax.set_ylabel("y (µm)")
ax.invert_yaxis(); ax.set_aspect("equal", "box")

leg = ax.legend(title="Clusters", loc="center left", bbox_to_anchor=(1.02, 0.5),
                frameon=True, fontsize=9, markerscale=6/0.8)
leg.get_frame().set_facecolor("black"); leg.get_frame().set_edgecolor("white")
leg.get_title().set_color("white")
for t in leg.get_texts(): t.set_color("white")

plt.tight_layout()
plt.savefig(FIGS / "09_spatial_leiden.png", dpi=600, bbox_inches="tight")
plt.close()

# -------------------------------------------------------
# Figure 10 — spatial density (post-QC, log color scale)
# -------------------------------------------------------
plt.figure(figsize=(9, 8))
hb = plt.hexbin(x, y, gridsize=260, bins='log', mincnt=1, cmap="magma")
ax = plt.gca()
ax.invert_yaxis(); ax.set_aspect("equal", "box")
ax.set_xlabel("x (µm)"); ax.set_ylabel("y (µm)")
cb = plt.colorbar(hb, ax=ax, fraction=0.046, pad=0.04)
cb.set_label("cells / bin")
cb.outline.set_edgecolor("white"); cb.ax.tick_params(colors="white")
plt.title("Spatial: cell density (log)", color="white", fontsize=16, pad=6)
plt.tight_layout()
plt.savefig(FIGS / "10_spatial_density_fine.png", dpi=600, bbox_inches="tight")
plt.close()

# -------------------------------------------------------
# Figure 11 — spatial feature map (one gene)
# -------------------------------------------------------
# Use the first entry of a user-defined `preferred` list if one exists, else the first gene
g0 = preferred[0] if 'preferred' in globals() and len(preferred) else adata.var_names[0]

# single column of adata.X, flattened to a dense 1-D array with non-finite values zeroed
col_mat = adata[:, g0].X
vals = col_mat.toarray().ravel() if issparse(col_mat) else np.asarray(col_mat).ravel()
vals = np.nan_to_num(vals, nan=0.0, posinf=0.0, neginf=0.0)

# robust scaling (ignore zeros when setting vmax)
pos = vals[vals > 0]
vmax = np.percentile(pos, 99) if pos.size else 1.0
norm = PowerNorm(gamma=0.6, vmin=0.0, vmax=vmax)  # boosts low signals on dark bg

plt.figure(figsize=(9, 8))
ax = plt.gca()
scat = ax.scatter(x, y, c=vals, s=0.8, lw=0, cmap="viridis", norm=norm,
                  alpha=0.95, rasterized=True)
ax.invert_yaxis(); ax.set_aspect("equal", "box")
ax.set_xlabel("x (µm)"); ax.set_ylabel("y (µm)")
ax.set_title(f"Spatial: {g0}", color="white", fontsize=16, pad=6)

cb = plt.colorbar(scat, ax=ax, fraction=0.046, pad=0.04)
# adata.X holds normalize_total + log1p values once the normalization cell has run,
# so the colorbar is labelled as such rather than as raw counts.
cb.set_label("log1p(normalized counts)"); cb.outline.set_edgecolor("white"); cb.ax.tick_params(colors="white")

plt.tight_layout()
plt.savefig(FIGS / f"11_spatial_feature_{g0}.png", dpi=600, bbox_inches="tight")
plt.close()


[PCs] n_cum=31, n_bs=30, n_elbow=7 → using 7 PCs


In [44]:
from scipy.sparse import issparse

# NOTE: the figures below are saved under the same file names (09/10/11) as the
# dark-themed versions from the previous cell, so running both overwrites those files.
xy = adata.obsm["spatial"]

# --------------------------
# Figure 9 — spatial: Leiden
# --------------------------
plt.figure(figsize=(5.6, 5.6))
# integer category codes drive the colormap (one colour value per cluster)
c = adata.obs["leiden"].astype("category").cat.codes.to_numpy()
plt.scatter(xy[:, 0], xy[:, 1], c=c, s=1.2, alpha=0.85)
plt.gca().invert_yaxis()
plt.axis("equal")
plt.title("Spatial: Leiden clusters")
plt.tight_layout()
plt.savefig(FIGS / "09_spatial_leiden.png", dpi=220)
plt.close()

# -------------------------------------------------------
# Figure 10 — spatial density (post-QC, fine binning)
# -------------------------------------------------------
plt.figure(figsize=(5.6, 5.6))
plt.hist2d(xy[:, 0], xy[:, 1], bins=400)
plt.gca().invert_yaxis()
plt.axis("equal")
plt.title("Spatial: cell density (fine bins)")
plt.tight_layout()
plt.savefig(FIGS / "10_spatial_density_fine.png", dpi=220)
plt.close()

# -------------------------------------------------------
# Figure 11 — spatial feature map (one gene)
# -------------------------------------------------------
# Use the first entry of a user-defined `preferred` list if one exists, else the first gene
g0 = preferred[0] if 'preferred' in globals() and len(preferred) else adata.var_names[0]

col_mat = adata[:, g0].X  # (n_cells, 1)
# Convert safely to a flat numeric array (dense for this single column is fine)
if issparse(col_mat):
    vals = col_mat.toarray().ravel()
else:
    vals = np.asarray(col_mat).ravel()

# Ensure lengths match and NaNs handled
assert vals.shape[0] == xy.shape[0], f"Length mismatch: vals={vals.shape}, xy={xy.shape}"
vals = np.nan_to_num(vals, nan=0.0, posinf=0.0, neginf=0.0)

plt.figure(figsize=(5.6, 5.6))
scat = plt.scatter(xy[:, 0], xy[:, 1], c=vals, s=1.2, alpha=0.95)
plt.gca().invert_yaxis()
plt.axis("equal")
plt.title(f"Spatial: {g0}")
# adata.X holds normalize_total + log1p values once the normalization cell has run,
# so the colorbar is labelled as such rather than as raw counts.
plt.colorbar(scat, fraction=0.046, pad=0.04, label="log1p(normalized counts)")
plt.tight_layout()
plt.savefig(FIGS / f"11_spatial_feature_{g0}.png", dpi=220)
plt.close()


In [8]:
# Figure 11b — cells at their spatial positions, coloured by total_counts
vals_tc = adata.obs["total_counts"].to_numpy()
fig, ax = plt.subplots(figsize=(5.6, 5.6))
scat = ax.scatter(xy[:, 0], xy[:, 1], c=vals_tc, s=1.0, alpha=0.9)
ax.invert_yaxis()
ax.axis("equal")
ax.set_title("Spatial: total_counts")
fig.colorbar(scat, ax=ax, fraction=0.046, pad=0.04, label="total_counts")
fig.tight_layout()
fig.savefig(FIGS / "11b_spatial_total_counts.png", dpi=220)
plt.close(fig)

# Figure 11c — same layout, coloured by the number of detected genes
vals_ng = adata.obs["n_genes_by_counts"].to_numpy()
fig, ax = plt.subplots(figsize=(5.6, 5.6))
scat = ax.scatter(xy[:, 0], xy[:, 1], c=vals_ng, s=1.0, alpha=0.9)
ax.invert_yaxis()
ax.axis("equal")
ax.set_title("Spatial: n_genes_by_counts")
fig.colorbar(scat, ax=ax, fraction=0.046, pad=0.04, label="n_genes")
fig.tight_layout()
fig.savefig(FIGS / "11c_spatial_n_genes.png", dpi=220)
plt.close(fig)


We compute Wilcoxon markers and **neighborhood enrichment** (Squidpy) to assess micro-territories.


In [9]:
# Cluster markers: per-cluster differential expression (Wilcoxon rank-sum),
# grouped by the existing 'leiden' labels and computed on adata.X (use_raw=False).
de_options = {"groupby": "leiden", "method": "wilcoxon", "use_raw": False}
sc.tl.rank_genes_groups(adata, **de_options)

# Figure 12 — dotplot of the top 4 ranked genes per cluster.
# return_fig=True hands back the plot object so it can be saved directly.
dotplot_file = FIGS / "12_markers_dotplot.png"
fig = sc.pl.rank_genes_groups_dotplot(adata, n_genes=4, show=False, return_fig=True)
fig.savefig(dotplot_file, dpi=220, bbox_inches="tight")
plt.close()



In [10]:
# Figure 13 — heatmap of the top 4 ranked genes per cluster
heatmap_options = {
    "n_genes": 4,
    "show": False,
    "use_raw": False,
    "swap_axes": True,         # swapped axes, kept from the original layout choice
    "dendrogram": False,
    "standard_scale": "var",   # scale each gene (variable)
}
fig = sc.pl.rank_genes_groups_heatmap(adata, **heatmap_options)

# The plot is drawn on the current matplotlib figure; save that figure, then close it
heatmap_file = FIGS / "13_markers_heatmap.png"
plt.gcf().savefig(heatmap_file, dpi=220, bbox_inches="tight")
plt.close()


In [11]:
# Spatial neighbors + neighborhood enrichment (Squidpy)

In [12]:
import squidpy as sq

# Spatial graph over the coordinates stored in adata.obsm['spatial'].
# Cells are linked using a fixed radius of 30 (presumably microns, the unit of the
# Xenium coordinates; 25–35 is the range suggested for tuning).
graph_options = {
    "coord_type": "generic",   # plain x/y coordinates, not grid indices
    "spatial_key": "spatial",
    "radius": 30.0,
    "n_rings": 1,
    "set_diag": False,
}
sq.gr.spatial_neighbors(adata, **graph_options)

# Permutation-based neighborhood enrichment between Leiden clusters.
# n_perms must be positive; 100 keeps this reasonably quick and can be raised
# (e.g. to 1000) for final results.
enrichment_options = {
    "cluster_key": "leiden",
    "n_perms": 100,
    "show_progress_bar": False,
}
sq.gr.nhood_enrichment(adata, **enrichment_options)


  from pkg_resources import DistributionNotFound, get_distribution
  return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)


In [13]:
# Figure 14 — neighborhood enrichment heatmap for pairs of Leiden clusters.
# The colour range is clipped to [-5, 5] so extreme values do not dominate.
enrichment_plot_options = {
    "cluster_key": "leiden",
    "show": False,
    "cmap": "coolwarm",
    "vmin": -5,
    "vmax": 5,
}
fig = sq.pl.nhood_enrichment(adata, **enrichment_plot_options)

# Save whatever figure is current after plotting, then close it
enrichment_file = FIGS / "14_nhood_enrichment.png"
plt.gcf().savefig(enrichment_file, dpi=220, bbox_inches="tight")
plt.close()

In [14]:
# Figure 15 — number of cells in each Leiden cluster, in index order
counts = adata.obs["leiden"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 3))
counts.plot(kind="bar", ax=ax)
ax.set_ylabel("cells")
ax.set_title("Cluster sizes (Leiden)")
fig.tight_layout()
fig.savefig(FIGS / "15_cluster_sizes.png", dpi=220)
plt.close(fig)

In [15]:
# Figure 16 — Top markers per cluster (compact dotplot)
# Collect the `top` highest-scoring genes of every cluster and plot them together.
top = 3
marker_df = sc.get.rank_genes_groups_df(adata, group=None)

# Rank by test score (descending) rather than by adjusted p-value: a p-value is
# sign-agnostic, so strongly *depleted* genes could otherwise be picked as markers.
# observed=True avoids pandas' FutureWarning for grouping on a categorical column
# and does not change which rows head() returns.
top_markers = (
    marker_df
    .sort_values(["group", "scores"], ascending=[True, False])
    .groupby("group", observed=True)
    .head(top)["names"]
    .unique()
    .tolist()
)

fig = sc.pl.dotplot(
    adata,
    var_names=top_markers,
    groupby="leiden",
    show=False,
    standard_scale="var"
)
# the dotplot is drawn on the current figure; save and close it
plt.gcf().savefig(FIGS / "16_top_marker_dotplot.png", dpi=220, bbox_inches="tight")
plt.close()

  .groupby("group")


In [16]:
# Figure 18 — distribution of per-cell row sums of the spatial connectivity matrix,
# used as the number of neighbors per cell. A quick check that the chosen radius
# gives a graph that is neither too sparse nor too dense.
A = adata.obsp["spatial_connectivities"]
deg = np.asarray(A.sum(axis=1)).ravel()

fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(deg, bins=50)
ax.set_xlabel("neighbors per cell")
ax.set_ylabel("cells")
ax.set_title("Spatial graph degree")
fig.tight_layout()
fig.savefig(FIGS / "18_neighbor_degree_hist.png", dpi=220)
plt.close(fig)