In [None]:
# Notebook 1: scRNA-seq Preprocessing and Subsetting


In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
from scipy.sparse import csr_matrix

# Load the 10x Genomics filtered feature-barcode matrix (HDF5 format).
adata = sc.read_10x_h5(
    "data/10k_PBMC_Multiome_nextgem_Chromium_X_filtered_feature_bc_matrix.h5"
)
# Gene symbols can repeat in 10x references; make var_names unique so that
# name-based indexing is unambiguous.
adata.var_names_make_unique()

# Keep only the RNA modality (for multiome files).
# NOTE: read_10x_h5 defaults to gex_only=True, so this is normally a no-op
# safety net; it matters only if the loader is called with gex_only=False.
if 'feature_types' in adata.var.columns:
    rna_mask = adata.var['feature_types'] == 'Gene Expression'
    # Slicing an AnnData yields a view. Materialize it with .copy() so that
    # later assignments to .obs/.var modify a real object instead of forcing
    # an implicit copy (and an ImplicitModificationWarning).
    adata = adata[:, rna_mask].copy()

# Calculate per-cell QC metrics.
# Axis reductions return np.matrix for scipy sparse matrices but a plain
# ndarray for dense arrays and scipy sparse arrays. np.asarray(...).ravel()
# flattens either to 1-D, whereas the matrix-only `.A1` attribute raises
# AttributeError on an ndarray.

# Total UMI counts per cell
adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()

# Number of genes detected per cell (entries strictly greater than zero)
adata.obs['n_genes_by_counts'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Percent mitochondrial gene content.
# Mitochondrial genes are identified by a case-insensitive 'MT-' name prefix.
adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
mt_counts = np.asarray(adata[:, adata.var['mt']].X.sum(axis=1)).ravel()
# A cell with zero total counts gives 0/0 = NaN here.
adata.obs['pct_counts_mt'] = mt_counts / adata.obs['n_counts'] * 100

# Cell-level quality control.
# A cell is retained only if it passes all three thresholds:
#   - more than 1000 total counts
#   - less than 10% of its counts from mitochondrial genes
#   - more than 200 detected genes
enough_counts = adata.obs['n_counts'] > 1000
low_mito = adata.obs['pct_counts_mt'] < 10
enough_genes = adata.obs['n_genes_by_counts'] > 200
cell_filter = enough_counts & low_mito & enough_genes

# Copy so the filtered object is independent of `adata` rather than a view.
adata_filtered = adata[cell_filter].copy()

# Gene-level quality control: drop genes detected in fewer than 3 cells.
# The return value is unused; filter_genes updates adata_filtered in place.
sc.pp.filter_genes(adata_filtered, min_cells=3)

# Optional down-sampling: cap the dataset at 4000 cells.
# Datasets at or below the cap pass through unchanged (same object, no copy).
max_cells = 4000
adata_final = adata_filtered
if adata_filtered.n_obs > max_cells:
    # Draw cell names without replacement; the fixed seed keeps the subset
    # reproducible across runs.
    sampled_indices = adata_filtered.obs.sample(
        n=max_cells, random_state=42
    ).index
    adata_final = adata_filtered[sampled_indices].copy()

# Persist the preprocessed RNA subset as an .h5ad file and report its size.
output_path = "data/ag_filtered_rna_subset.h5ad"
adata_final.write(output_path)

# AnnData.shape is (number of cells, number of genes).
n_cells, n_genes = adata_final.shape
print(f"Saved filtered RNA subset with {n_cells} cells and {n_genes} genes.")
