In [None]:
from pathlib import Path
import os

# Run identifiers; joined as "<DAY>.<Version>" they name this run's output folder.
DAY = "20251201"
Version = "v1"

# Each location below can be overridden through its LLMSC_* environment variable.
PROJECT_ROOT = Path(os.environ.get("LLMSC_ROOT", ".")).resolve()

# Defaults used when the corresponding environment variable is not set.
_default_data_dir = PROJECT_ROOT / "input"
_default_out_dir = PROJECT_ROOT / "runs" / f"{DAY}.{Version}"

DATA_DIR = Path(os.environ.get("LLMSC_DATA_DIR", _default_data_dir)).resolve()
OUT_DIR = Path(os.environ.get("LLMSC_OUT_DIR", _default_out_dir)).resolve()

# Create the run folder (and any missing parents) up front; a no-op if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
import os
import random

import numpy as np

# Single seed shared by every random number source used in this notebook.
RANDOM_SEED = 42

# Seed the in-process generators: Python's `random` and NumPy's legacy global RNG.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Python reads PYTHONHASHSEED at interpreter start-up, so this assignment does
# not change hash randomization in the already-running kernel. It is inherited
# by child processes started later (e.g. the `!python ...` script cells).
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)

# \N{LOCK} is U+1F512; the named escape keeps the source ASCII-only so the
# emoji cannot be mangled again by a wrong file encoding.
print(f"\N{LOCK} Random seed set to {RANDOM_SEED} for reproducibility.")

🔒 Random seed set to 42 for reproducibility.


In [None]:
import gc
import json
import os
import subprocess
import sys
import time

import numpy as np
import pandas as pd
import scanpy as sc
import scipy

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Folder holding the benchmark .h5ad files.
# NOTE(review): this is an absolute path that repeats the DAY/Version run tag
# from the config cell — confirm whether it should be derived from OUT_DIR.
_BENCHMARK_DIR = "/runs/20251201.v1"

# Column used as the cluster label; identical for every dataset listed here.
_CLUSTER_COL = "meta.cluster"

# Dataset key -> file-name stem of its "<stem>_benchmark_data.h5ad" file.
_H5AD_STEMS = {
    "cd8": "cd8",
    "cd4": "cd4",
    "msc": "brca_msc",
    "mouse_b": "mouse_b",
}

# Registry of benchmark datasets: input file path and cluster column per key.
DATASETS = {
    dataset_key: {
        "adata_path": f"{_BENCHMARK_DIR}/{stem}_benchmark_data.h5ad",
        "cluster_col": _CLUSTER_COL,
    }
    for dataset_key, stem in _H5AD_STEMS.items()
}


In [None]:
# Notebook dataset key -> dataset_id used in the TSV and in exported file names.
IDMAP = {
    "cd8": "cd8",
    "cd4": "cd4",
    "msc": "brca_msc",
    "mouse_b": "mouse_b",
}

# Per-dataset value of the `cols` field: a comma-separated list of column names.
COLS = {
    "cd8": "Cancer_Type,meta.cluster",
    "cd4": "Cancer_Type,meta.cluster",
    "msc": "meta.cluster",
    "mouse_b": "meta.cluster",
}

# One TSV row per dataset registered in DATASETS.
rows = []
for key, spec in DATASETS.items():
    dataset_id = IDMAP.get(key, key)  # keys without a mapping keep their own name
    rows.append({
        "dataset_id": dataset_id,
        "adata_path": spec["adata_path"],
        "out_csv": f"paper/source_data/subsampled_ids/{dataset_id}_cells.csv",
        # Without an explicit COLS entry, fall back to the dataset's own
        # cluster column rather than a fixed name, so a dataset registered
        # with a different `cluster_col` still gets the right column.
        "cols": COLS.get(key, spec.get("cluster_col", "meta.cluster")),
    })

# Write the table as the tab-separated config passed to the export script
# (--datasets-tsv), then display it as the cell output.
df = pd.DataFrame(rows)
os.makedirs("paper/config", exist_ok=True)
df.to_csv("paper/config/datasets.tsv", sep="\t", index=False)
df


Unnamed: 0,dataset_id,adata_path,out_csv,cols
0,cd8,/runs/20251201.v1...,paper/source_data/subsampled_ids/cd8_cells.csv,"Cancer_Type,meta.cluster"
1,cd4,/runs/20251201.v1...,paper/source_data/subsampled_ids/cd4_cells.csv,"Cancer_Type,meta.cluster"
2,brca_msc,/runs/20251201.v1...,paper/source_data/subsampled_ids/brca_msc_cell...,meta.cluster
3,mouse_b,/runs/20251201.v1...,paper/source_data/subsampled_ids/mouse_b_cells...,meta.cluster


In [None]:
# Dry run: have the export script list the exports it plans for each row of datasets.tsv.
# NOTE(review): presumably --dry-run writes no files — confirm in export_subsampled_ids.py.
!python paper/scripts/export_subsampled_ids.py --datasets-tsv paper/config/datasets.tsv --dry-run

[DRY RUN] Planned exports:
  - dataset_id=cd8
    h5ad_path=/runs/20251201.v1/cd8_benchmark_data.h5ad
    out_csv=paper/source_data/subsampled_ids/cd8_cells.csv
    cols=Cancer_Type,meta.cluster
  - dataset_id=cd4
    h5ad_path=/runs/20251201.v1/cd4_benchmark_data.h5ad
    out_csv=paper/source_data/subsampled_ids/cd4_cells.csv
    cols=Cancer_Type,meta.cluster
  - dataset_id=brca_msc
    h5ad_path=/runs/20251201.v1/brca_msc_benchmark_data.h5ad
    out_csv=paper/source_data/subsampled_ids/brca_msc_cells.csv
    cols=meta.cluster
  - dataset_id=mouse_b
    h5ad_path=/runs/20251201.v1/mouse_b_benchmark_data.h5ad
    out_csv=paper/source_data/subsampled_ids/mouse_b_cells.csv
    cols=meta.cluster


In [None]:
# Real export: the same command without --dry-run; the script logs one "[OK] <dataset_id>: <n> rows -> <csv>" line per dataset.
# NOTE(review): a `!` shell command does not stop the notebook when the script fails — check the log before continuing.
!python paper/scripts/export_subsampled_ids.py --datasets-tsv paper/config/datasets.tsv

[OK] cd8: 4466 rows -> paper/source_data/subsampled_ids/cd8_cells.csv
  Top meta.cluster counts:
    CD8.c01.Tn.MAL: 300
    CD8.c02.Tm.IL7R: 300
    CD8.c03.Tm.RPS12: 300
    CD8.c04.Tm.CD52: 300
    CD8.c05.Tem.CXCR5: 300
    CD8.c06.Tem.GZMK: 300
    CD8.c07.Temra.CX3CR1: 300
    CD8.c11.Tex.PDCD1: 300
    CD8.c10.Trm.ZNF683: 300
    CD8.c12.Tex.CXCL13: 300
[OK] cd4: 6005 rows -> paper/source_data/subsampled_ids/cd4_cells.csv
  Top meta.cluster counts:
    CD4.c01.Tn.TCF7: 300
    CD4.c06.Tm.ANXA1: 300
    CD4.c07.Tm.ANXA2: 300
    CD4.c16.Tfh.CXCR5: 300
    CD4.c14.Th17.SLC4A10: 300
    CD4.c15.Th17.IL23R: 300
    CD4.c12.Tem.GZMK: 300
    CD4.c10.Tm.CAPG: 300
    CD4.c20.Treg.TNFRSF9: 300
    CD4.c17.TfhTh1.CXCL13: 300
[OK] brca_msc: 2150 rows -> paper/source_data/subsampled_ids/brca_msc_cells.csv
  Top meta.cluster counts:
    CAFs MSC iCAF-like: 300
    CAFs myCAF-like: 300
    Endothelial ACKR1: 300
    Endothelial CXCL12: 300
    PVL Differentiated: 300
    Endothelial RGS5: 3

In [None]:
# Spot check: pass a single cluster name (--text) through the CD8 label map and print the mapped label.
!python paper/scripts/apply_label_map.py \
  --label-map paper/config/label_maps/cd8_gt_map.yaml \
  --text "CD8.c02.Tm.IL7R"

CD8_EffectorMemory


In [None]:
# Apply each dataset's label map to its exported cell table: read
# <dataset_id>_cells.csv, map the `meta.cluster` column, and write
# <dataset_id>_cells.with_gt.csv with the result in a `Ground_Truth` column.
#
# One loop replaces four copy-pasted commands that differed only in the
# dataset id. The script runs through subprocess (with the kernel's own
# interpreter) instead of `!python`, so a failing run raises here instead of
# letting later cells read stale or missing CSVs.
for dataset_id in ("cd8", "cd4", "brca_msc", "mouse_b"):
    result = subprocess.run(
        [
            sys.executable,
            "paper/scripts/apply_label_map.py",
            "--label-map", f"paper/config/label_maps/{dataset_id}_gt_map.yaml",
            "--csv", f"paper/source_data/subsampled_ids/{dataset_id}_cells.csv",
            "--col", "meta.cluster",
            "--out", f"paper/source_data/subsampled_ids/{dataset_id}_cells.with_gt.csv",
            "--out-col", "Ground_Truth",
        ],
        capture_output=True,
        text=True,
    )
    # Echo the script's output into the cell, as the `!` form did.
    print(result.stdout, end="")
    if result.stderr:
        print(result.stderr, end="", file=sys.stderr)
    # Raises subprocess.CalledProcessError on a non-zero exit status.
    result.check_returncode()

[OK] wrote: paper/source_data/subsampled_ids/cd8_cells.with_gt.csv
[OK] wrote: paper/source_data/subsampled_ids/cd4_cells.with_gt.csv
[OK] wrote: paper/source_data/subsampled_ids/brca_msc_cells.with_gt.csv
[OK] wrote: paper/source_data/subsampled_ids/mouse_b_cells.with_gt.csv


In [None]:
# Sanity check on the mapped CD8 table: how many cells fall under each
# Ground_Truth label. Missing values are counted too (dropna=False).
df = pd.read_csv("paper/source_data/subsampled_ids/cd8_cells.with_gt.csv")
gt_counts = df["Ground_Truth"].value_counts(dropna=False)
print(gt_counts.head(20))

Ground_Truth
CD8_EffectorMemory    1983
CD8_Exhausted          928
CD8_Effector           753
CD8_Naive              300
CD8_MAIT               300
CD8_ISG                202
Name: count, dtype: int64
