We preprocess the data from Chen et al., Nature Immunology 2024

In [None]:
import pandas as pd

In [2]:
data = "../data/dt_All_bin_50_wCopyNumber_20220114.csv"
df = pd.read_csv(data)
df

Unnamed: 0,Window.ID,X,Y,Counts CD8+,Counts PD1+,Counts CD8+PD1+,Counts CD8+Ki67+,Counts CD8+TCF7+,Counts CD8+TCF7+Ki67+,Counts CD8+PD1+Ki67+,...,i.Copies CXCL10.11,i.Copies IFNG,i.Copies CD3E,i.Slide.id.Ab,i.Shuffled,Counts.CXCL10.11.bin,Xbin,Ybin,bin,nbins
0,315,25.744,16207.360,0,0,0,0,0,0,0,...,0,0,0,2e79_08,0,0,0,0,0,1
1,316,25.744,16257.360,0,0,0,0,0,0,0,...,0,0,0,2e79_08,0,0,0,0,0,1
2,317,25.744,16307.360,0,0,0,0,0,0,0,...,0,0,0,2e79_08,0,0,0,0,0,1
3,318,25.744,16357.360,0,0,0,0,0,0,0,...,0,0,0,2e79_08,0,0,0,0,0,1
4,319,25.744,16407.360,0,0,0,0,0,0,0,...,0,0,0,2e79_08,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865790,64778,24300.752,16268.008,0,0,0,0,0,0,0,...,0,0,0,2e80_02,0,0,0,0,0,1
865791,64781,24300.752,16418.008,0,0,0,0,0,0,0,...,0,0,0,2e80_02,0,0,0,0,0,1
865792,64782,24300.752,16468.008,0,0,0,0,0,0,0,...,0,0,0,2e80_02,0,0,0,0,0,1
865793,64783,24300.752,16518.008,0,0,0,0,0,0,0,...,0,0,0,2e80_02,0,0,0,0,0,1


#### Explorations

In [3]:
df.columns

Index(['Window.ID', 'X', 'Y', 'Counts CD8+', 'Counts PD1+', 'Counts CD8+PD1+',
       'Counts CD8+Ki67+', 'Counts CD8+TCF7+', 'Counts CD8+TCF7+Ki67+',
       'Counts CD8+PD1+Ki67+',
       ...
       'i.Copies CXCL10.11', 'i.Copies IFNG', 'i.Copies CD3E', 'i.Slide.id.Ab',
       'i.Shuffled', 'Counts.CXCL10.11.bin', 'Xbin', 'Ybin', 'bin', 'nbins'],
      dtype='object', length=127)

In [4]:
df["Response"].value_counts()

Response
NR    623461
R     242334
Name: count, dtype: int64

In [5]:
df["Slide.id.Ab"].nunique()

46

In [6]:
df["Slide.id.Ab"].value_counts()

Slide.id.Ab
2e80_09    95007
2e77_08    55811
2e79_06    53534
2e74_03    53387
2e81_03    50406
2e77_10    50219
2e75_01    50213
2e79_10    46291
2e83_08    42994
2e82_07    42004
2e79_11    34648
2e80_02    29617
2e79_08    28813
2e77_09    28646
2e79_03    24800
2e74_06    21883
2e82_03    17808
2e77_02    16866
2e74_02    12412
2e80_04    11234
2e75_11    10736
2e74_01     9748
2e77_11     8731
2e82_02     8064
2e82_01     7538
2e81_08     7036
2e77_16     6520
2e77_15     5397
2e82_04     5273
2e74_05     3293
2e83_07     3152
2e80_05     2569
2e77_14     2550
2e75_06     2419
2e77_07     2303
2e77_04     2148
2e77_01     2103
2e77_06     2030
2e79_07     1659
2e77_13     1470
2e83_06     1091
2e81_01     1025
2e79_02      781
2e75_09      715
2e75_07      502
2e82_08      349
Name: count, dtype: int64

In [7]:
df["Slide.id.Ab"].value_counts().sum()

np.int64(865795)

#### Processing into X

In [14]:
df["Counts CD8+"].describe()

count    865795.000000
mean          0.613406
std           1.576024
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          36.000000
Name: Counts CD8+, dtype: float64

In [15]:
# see if there are X, Y duplicates
df_duplicates = df[df.duplicated(subset=["X", "Y"], keep=False)]
df_duplicates.shape

(0, 127)

In [68]:
import numpy as np
import pandas as pd
from typing import Dict
from pathlib import Path

output_dir = Path("/Users/jamesbolepan/Documents/cancer-immune-simulations/data/nature_immune_processed")

def build_slide_grids_checked(
    df: pd.DataFrame,
    slide_col: str = "Slide.id.Ab",
    xbin_col: str = "X",
    ybin_col: str = "Y",
    save: bool = True,
) -> Dict[str, np.ndarray]:
    """
    For each unique slide, construct a [rows, cols, 3] array with channels [C, T, A].

    - T cell counts are summed from all columns whose names contain "Counts CD8+".
    - Cancer cell counts are summed from all columns whose names contain "Counts CK".
    - Missing spatial grids (X,Y) are filled with 0.
    - Saves each slide's grid as a .npy file in output_dir (no subfolders).
    - Performs consistency checks on grid counts and max values.
    """

    # ---- auto-detect relevant columns ----
    cd8_cols = [c for c in df.columns if "Counts CD8+" in c and "NormCounts" not in c and "i.Counts" not in c]
    ck_cols  = [c for c in df.columns if "Counts CK" in c and "NormCounts" not in c and "i.Counts" not in c]

    if not cd8_cols:
        raise ValueError("No columns found containing 'Counts CD8+'")
    if not ck_cols:
        raise ValueError("No columns found containing 'Counts CK'")

    print(f"[INFO] Using {len(cd8_cols)} CD8+ columns: {cd8_cols}")
    print(f"[INFO] Using {len(ck_cols)} CK columns: {ck_cols}")

    # ---- prepare output ----
    grids: Dict[str, np.ndarray] = {}
    total_grids = 0
    all_grid_t_max = -np.inf
    all_grid_c_max = -np.inf

    # ---- ensure output directory exists ----
    if save:
        output_dir.mkdir(parents=True, exist_ok=True)

    # ---- build grids per slide ----
    for slide_id, g in df.groupby(slide_col, dropna=False):
        g = g.copy()
        g["T_sum"] = g[cd8_cols].sum(axis=1, skipna=True)
        g["C_sum"] = g[ck_cols].sum(axis=1, skipna=True)

        # unique spatial coordinates
        xs = np.sort(g[xbin_col].unique())
        ys = np.sort(g[ybin_col].unique())
        rows, cols = len(ys), len(xs)

        # pivot and reindex to ensure full grid coverage
        T_mat = (
            g.pivot(index=ybin_col, columns=xbin_col, values="T_sum")
             .reindex(index=ys, columns=xs)
             .fillna(0.0)
        )
        C_mat = (
            g.pivot(index=ybin_col, columns=xbin_col, values="C_sum")
             .reindex(index=ys, columns=xs)
             .fillna(0.0)
        )

        # 3D array [rows, cols, 3]
        grid = np.zeros((rows, cols, 3), dtype=float)
        grid[:, :, 0] = C_mat.to_numpy()  # Cancer
        grid[:, :, 1] = T_mat.to_numpy()  # T cells
        grid[:, :, 2] = 0.0               # Drug placeholder

        grids[str(slide_id)] = grid

        # track stats
        total_grids += rows * cols
        all_grid_t_max = max(all_grid_t_max, grid[:, :, 1].max())
        all_grid_c_max = max(all_grid_c_max, grid[:, :, 0].max())

        # ---- save directly into output_dir ----
        if save:
            # sanitize filename
            safe_name = str(slide_id).replace("/", "_").replace(" ", "_")
            save_path = output_dir / f"{safe_name}.npy"
            np.save(save_path, grid)
            print(f"[SAVED] {slide_id}: grid shape={grid.shape} → {save_path}")

    # ---- global consistency checks ----
    n_unique_grids = len(df.drop_duplicates([slide_col, xbin_col, ybin_col]))
    df_t_max = pd.to_numeric(df[cd8_cols].sum(axis=1), errors="coerce").max()
    df_c_max = pd.to_numeric(df[ck_cols].sum(axis=1), errors="coerce").max()

    errors = []
    if total_grids < n_unique_grids:
        errors.append(
            f"Total expected grid positions ({total_grids}) < number of unique (slide,X,Y) = {n_unique_grids}. "
            "Some spatial bins may be missing or misaligned."
        )
    if not np.isclose(all_grid_t_max, df_t_max, rtol=1e-6, atol=1e-6):
        errors.append(f"Max T mismatch: grid={all_grid_t_max}, df={df_t_max}")
    if not np.isclose(all_grid_c_max, df_c_max, rtol=1e-6, atol=1e-6):
        errors.append(f"Max Cancer mismatch: grid={all_grid_c_max}, df={df_c_max}")

    if errors:
        raise AssertionError(" | ".join(errors))

    print(f"[CHECK PASSED] Grids built for {len(grids)} slides. "
          f"Total expected grids={total_grids}, unique (slide,X,Y)={n_unique_grids}, "
          f"max T={all_grid_t_max}, max C={all_grid_c_max}")

    return grids


In [69]:
grids = build_slide_grids_checked(df)

[INFO] Using 15 CD8+ columns: ['Counts CD8+', 'Counts CD8+PD1+', 'Counts CD8+Ki67+', 'Counts CD8+TCF7+', 'Counts CD8+TCF7+Ki67+', 'Counts CD8+PD1+Ki67+', 'Counts CD8+PD1+TCF7+', 'Counts CD8+PD1+Ki67+TCF7+', 'Counts CD8+PD1+Ki67-TCF7+', 'Counts CD8+PD1-Ki67+TCF7+', 'Counts CD8+PD1-Ki67-TCF7+', 'Counts CD8+PD1+Ki67+TCF7-', 'Counts CD8+PD1+Ki67-TCF7-', 'Counts CD8+PD1-Ki67+TCF7-', 'Counts CD8+PD1-Ki67-TCF7-']
[INFO] Using 11 CK columns: ['Counts CK.ab+', 'Counts CK.ab+PDL1+', 'Counts CK.ab+PDL1-', 'Counts CK.ab-PDL1-', 'Counts CK.ab-PDL1+', 'Counts CK.ab+TCF7+', 'Counts CK.ish+CXCL10.11+', 'Counts CK.ish+CXCL10.11-', 'Counts CK.ish-CXCL10.11+', 'Counts CK.ish-CXCL10.11-', 'Counts CK.ish+']
[SAVED] 2e74_01: grid shape=(156, 116, 3) → /Users/jamesbolepan/Documents/cancer-immune-simulations/data/nature_immune_processed/2e74_01.npy
[SAVED] 2e74_02: grid shape=(168, 136, 3) → /Users/jamesbolepan/Documents/cancer-immune-simulations/data/nature_immune_processed/2e74_02.npy
[SAVED] 2e74_03: grid 

In [70]:
grids

{'2e74_01': array([[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]], shape=(156, 116, 3)),
 '2e74_02': array([[[0.