In [None]:
from pathlib import Path
import os

# Run identity: the date stamp and version tag together name the output folder.
DAY = "20251201"
Version = "v1"

# Every location can be overridden through an environment variable; otherwise it
# is derived from the project root (the current working directory by default).
PROJECT_ROOT = Path(os.environ.get("LLMSC_ROOT", ".")).resolve()

DATA_DIR = Path(os.environ.get("LLMSC_DATA_DIR", PROJECT_ROOT.joinpath("input"))).resolve()
OUT_DIR = Path(
    os.environ.get("LLMSC_OUT_DIR", PROJECT_ROOT.joinpath("runs", f"{DAY}.{Version}"))
).resolve()

# Make sure the run folder exists before anything is written into it.
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Reproducibility setup: seed Python's and NumPy's global random generators so
# that stochastic steps later in the notebook are repeatable after a fresh
# Restart & Run All.
import os
import random

import numpy as np

RANDOM_SEED = 42

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# NOTE: PYTHONHASHSEED is read once when the interpreter starts, so assigning it
# here does not change str/bytes hashing in the running kernel; the value is
# only inherited by child processes launched afterwards. To make hashing
# deterministic for this kernel, export the variable before starting Jupyter.
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)

print(f"🔒 Random seed set to {RANDOM_SEED} for reproducibility.")

🔒 Random seed set to 42 for reproducibility.


In [None]:
import google.generativeai as genai
import scanpy as sc
import numpy as np
import pandas as pd
import scipy
import os
import gc
import time

import warnings
# Silence every Python warning for the rest of the session. This keeps cell
# output short, but it also hides deprecation and numerical warnings raised by
# the libraries used below — comment this out when debugging unexpected results.
warnings.filterwarnings("ignore")

In [None]:
# Read the annotated droplet dataset from the configured data directory.
file_path = DATA_DIR.joinpath(
    "tabula-muris-senis-droplet-processed-official-annotations.h5ad"
)
adata = sc.read_h5ad(file_path)

In [None]:
# Display the observation (per-row) annotation table of the loaded dataset to
# inspect which metadata columns are available.
adata.obs

Unnamed: 0_level_0,age,cell,cell_ontology_class,cell_ontology_id,free_annotation,method,mouse.id,n_genes,sex,subtissue,tissue,tissue_free_annotation,n_counts,louvain,leiden
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACCTGCAGGGTACA-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGCAGGGTACA,keratinocyte,,filiform,droplet,24-M-60,2107.0,male,,Tongue,Tongue,5482.0,5,8
AAACCTGCAGTAAGCG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGCAGTAAGCG,keratinocyte,,suprabasal,droplet,24-M-60,3481.0,male,,Tongue,Tongue,21855.0,19,15
AAACCTGTCATTATCC-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGTCATTATCC,keratinocyte,,suprabasal,droplet,24-M-60,2599.0,male,,Tongue,Tongue,10943.0,19,15
AAACGGGGTACAGTGG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTACAGTGG,keratinocyte,,suprabasal differentiating,droplet,24-M-60,3468.0,male,,Tongue,Tongue,20665.0,12,11
AAACGGGGTCTTCTCG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTCTTCTCG,keratinocyte,,suprabasal differentiating,droplet,24-M-60,3189.0,male,,Tongue,Tongue,12925.0,5,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10X_P8_15_TTTGTCAGTACATGTC-1,3m,10X_P8_15_TTTGTCAGTACATGTC,basal epithelial cell of tracheobronchial tree,CL:0000066,,droplet,3-M-7/8,,male,,Trachea,Trachea,5000.0,51,59
10X_P8_15_TTTGTCAGTGCGCTTG-1,3m,10X_P8_15_TTTGTCAGTGCGCTTG,mesenchymal progenitor cell,CL:0008019,,droplet,3-M-7/8,,male,,Trachea,Trachea,5984.0,11,33
10X_P8_15_TTTGTCAGTTGTCGCG-1,3m,10X_P8_15_TTTGTCAGTTGTCGCG,endothelial cell,CL:0000115,,droplet,3-M-7/8,,male,,Trachea,Trachea,6507.0,40,32
10X_P8_15_TTTGTCATCGGCTTGG-1,3m,10X_P8_15_TTTGTCATCGGCTTGG,endothelial cell,CL:0000115,,droplet,3-M-7/8,,male,,Trachea,Trachea,2589.0,40,32


In [None]:
# List the distinct labels found in the `cell_ontology_class` column.
set(adata.obs["cell_ontology_class"])

{'B cell',
 'CD4-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta T cell',
 'DN3 thymocyte',
 'DN4 thymocyte',
 'Kupffer cell',
 'Langerhans cell',
 'NK cell',
 'Schwann cell',
 'T cell',
 'adventitial cell',
 'alveolar macrophage',
 'basal cell',
 'basal cell of epidermis',
 'basal epithelial cell of tracheobronchial tree',
 'basophil',
 'bladder cell',
 'bladder urothelial cell',
 'blood cell',
 'bronchial smooth muscle cell',
 'brush cell',
 'cardiac neuron',
 'cardiomyocyte',
 'chondrocyte',
 'ciliated columnar cell of tracheobronchial tree',
 'classical monocyte',
 'club cell of bronchiole',
 'dendritic cell',
 'double negative T cell',
 'duct epithelial cell',
 'endocardial cell',
 'endothelial cell',
 'endothelial cell of coronary artery',
 'endothelial cell of hepatic sinusoid',
 'endothelial cell of lymphatic vessel',
 'enterocyte of epithelium of large intestine',
 'epidermal cell',
 'epithelial cell',
 'epithelial cell of large intestine',
 'epithelial cell of proxim

In [None]:
# Select every annotation that contains the standalone phrase "B cell"
# (missing labels count as non-matches) ...
mask_b = adata.obs["cell_ontology_class"].str.contains(r"\bB cell\b", regex=True, na=False)
# ... and then drop any label that mentions pancreatic B cells.
mask_b &= ~adata.obs["cell_ontology_class"].str.contains(r"pancreatic B cell", na=False)

# Materialize the subset as an independent object and show which labels remain.
adata_b = adata[mask_b].copy()
set(adata_b.obs["cell_ontology_class"])

{'B cell',
 'immature B cell',
 'late pro-B cell',
 'naive B cell',
 'precursor B cell'}

In [None]:

# ==========================================
# 2. Data Loading & Stratified Subsampling
# ==========================================
# Builds the B-cell benchmark set: draws up to N_PER_CLUSTER cells from every
# `cell_ontology_class` in `adata_b`, stores raw counts and log-normalized values
# as layers, and writes the result (plus its metadata table) to OUT_DIR.
N_PER_CLUSTER = 300

# Sampling draws from NumPy's global random generator, so the selection depends
# on that generator's state at the moment this cell runs.
sampled_indices = []
clusters = adata_b.obs["cell_ontology_class"].unique()
for cl in clusters:
    cells_in_cluster = adata_b.obs[adata_b.obs["cell_ontology_class"] == cl].index
    if len(cells_in_cluster) > N_PER_CLUSTER:
        sampled = np.random.choice(cells_in_cluster, N_PER_CLUSTER, replace=False)
    else:
        # Classes with at most N_PER_CLUSTER cells are kept whole.
        sampled = cells_in_cluster

    sampled_indices.extend(sampled)

adata_sub = adata_b[sampled_indices].copy()

print(f"  -> Original: {adata_b.n_obs} cells")
print(f"  -> Stratified: {adata_sub.n_obs} cells (Balanced)")

# Preprocessing
print("\nPreprocessing: Normalizing and Log-transforming...")
if adata_sub.raw is None:
    # Fail with an explicit message instead of an AttributeError on `.raw.X`.
    raise ValueError(
        "adata_b has no .raw attribute; raw counts are required to build the 'counts' layer."
    )

# NOTE(review): this treats `.raw.X` as the unnormalized count matrix, as the
# layer name implies — confirm against the dataset's documentation.
adata_sub.layers["counts"] = adata_sub.raw.X.copy()
adata_sub.raw = None

# Normalize starting from the counts. Without this assignment the two scanpy
# calls below would transform whatever matrix the source file shipped in `.X`,
# so "logcounts" would not be derived from "counts".
adata_sub.X = adata_sub.layers["counts"].copy()
sc.pp.normalize_total(adata_sub, target_sum=1e4)
sc.pp.log1p(adata_sub)

adata_sub.layers["logcounts"] = adata_sub.X.copy()
adata_sub.obs['Sample_ID'] = adata_sub.obs.index
print("✅ Data Preprocessing Complete.")

# Rename for consistency with other scripts
adata_sub.obs['meta.cluster'] = adata_sub.obs['cell_ontology_class'].copy()

save_path = OUT_DIR / "mouse_b_benchmark_data.h5ad"
print(f"\n💾 Saving Benchmark Data to: {save_path}")
adata_sub.write_h5ad(save_path)

meta_save_path = OUT_DIR / "mouse_b_benchmark_metadata.csv"
adata_sub.obs.to_csv(meta_save_path)
print(f"📄 Metadata saved to: {meta_save_path}")

  -> Original: 33391 cells
  -> Stratified: 1500 cells (Balanced)

Preprocessing: Normalizing and Log-transforming...
✅ Data Preprocessing Complete.

💾 Saving Benchmark Data to: /runs/20251201.v1/mouse_b_benchmark_data.h5ad
📄 Metadata saved to: /runs/20251201.v1/mouse_b_benchmark_metadata.csv


In [None]:
# Sanity check: reload the benchmark file that was just written and confirm the
# per-class cell counts survived the round trip to disk.
print(f"\n Loading data...: {save_path}")
adata_sub = sc.read_h5ad(save_path)

cluster_sizes = adata_sub.obs['meta.cluster'].value_counts()
# The subsampling step keeps at most N_PER_CLUSTER cells per class, so a larger
# count here would mean the file on disk is not the one produced above.
assert cluster_sizes.max() <= N_PER_CLUSTER, "a cluster exceeds the per-class sampling cap"

print(f"\nCell counts per cluster (capped at {N_PER_CLUSTER} by the stratified subsampling):")
print(cluster_sizes.head(10))


 Loading data...:  /runs/20251201.v1/mouse_b_benchmark_data.h5ad

Cell counts per cluster (Should be balanced approx ~300 if 3 datasets merged):
meta.cluster
B cell              300
immature B cell     300
late pro-B cell     300
naive B cell        300
precursor B cell    300
Name: count, dtype: int64
