In [1]:
# Reuse the inferred GRN and LR interactions, and compute niches:
# - The GRN from CO is already formatted (because cell types are shared).
# - LRs per cluster are inferred from COMMOT, but need to be reformatted per cell.
# - Run COVET to get the niche representations.

In [None]:
import scanpy as sc 
import pandas as pd 
import numpy as np 

import pickle 

import sys 
sys.path.append('../../src')

In [None]:
# Read the `mouse_kidney_13` AnnData file and display its summary.
# NOTE(review): `adata` is rebound by the next load cell, so this object is
# only inspected here — confirm this cell is still needed.
import scanpy as sc

kidney_13_path = '/ix/djishnu/shared/djishnu_kor11/training_data_2025/mouse_kidney_13.h5ad'
adata = sc.read_h5ad(kidney_13_path)
adata

In [None]:
%%time
# Read the mouse kidney Visium AnnData from shared storage, then show how many
# observations fall into each `cell_type`.
# Local alternative path used during development:
#   /Users/allywang/Desktop/work/S2025/SpaceOracle/data/mouse_kidney_visiumHD/mouse_kidney_visium.h5ad
visium_path = '/ix/djishnu/shared/djishnu_kor11/training_data_2025/mouse_kidney_visium.h5ad'
adata = sc.read_h5ad(visium_path)
adata.obs['cell_type'].value_counts()

In [None]:
# Display the AnnData summary (dimensions, obs columns, obsm keys, layers).
adata

AnnData object with n_obs × n_vars = 415538 × 3058
    obs: 'ct3', 'cell_type', 'cell_type_int'
    obsm: 'spatial', 'spatial_unscaled'
    layers: 'imputed_count', 'normalized_count', 'raw_count'

In [None]:
# Sanity-check the value range of the expression matrix stored in `adata.X`.
x_matrix = adata.X
np.min(x_matrix), np.max(x_matrix)

(0.0, 5.973809611869261)

### Reformat COMMOT links

In [None]:
# Load the pickled COMMOT communication results into `info`, a mapping keyed by
# ligand-receptor name, and report how many entries it holds.
# NOTE: `pickle.load` can execute arbitrary code — only open files you trust.
# NOTE(review): this is a user-local path, unlike the shared-storage paths used
# for the AnnData files — confirm it is reachable where this notebook runs.
commot_pickle_path = '/Users/allywang/Desktop/work/S2025/SpaceOracle/data/mouse_kidney_survey/kidney_communication.pkl'
with open(commot_pickle_path, 'rb') as f:
    info = pickle.load(f)

len(info)

326

In [6]:
# Split every key of `info` on '-' into a ligand / receptor table indexed by
# the original key.
# NOTE(review): `rsplit('-')` has no maxsplit, so it splits on *every* hyphen;
# a third piece, when present, lands in the 'drop' column and is discarded.
# Keys with an extra hyphen therefore get truncated ligand/receptor names —
# confirm this is intended (the per-cell loop later uses `rsplit('-', 1)`).
lr_names = list(info.keys())
lr_parts = [lr_name.rsplit('-') for lr_name in lr_names]

expanded = pd.DataFrame(
    lr_parts,
    index=lr_names,
    columns=['ligand', 'receptor', 'drop'],
).drop('drop', axis=1)

expanded.head(3)

Unnamed: 0,ligand,receptor
Tgfb2-Tgfbr1,Tgfb2,Tgfbr1
Tgfb3-Tgfbr1,Tgfb3,Tgfbr1
Bmp2-Bmpr1b,Bmp2,Bmpr1b


In [7]:
from tqdm import tqdm 

def get_sig_interactions(value_matrix, p_matrix, pval=0.3):
    """Zero out the entries of `value_matrix` that are not significant.

    Args:
        value_matrix: Matrix of communication values.
        p_matrix: Matrix of p-values, multiplied element-wise against
            `value_matrix` after thresholding.
        pval: Significance threshold; an entry is kept only when its
            p-value is strictly below this value.

    Returns:
        `value_matrix` multiplied element-wise by a 0/1 mask that is 1 where
        `p_matrix < pval` and 0 elsewhere.
    """
    keep_mask = np.where(p_matrix < pval, 1, 0)
    return value_matrix * keep_mask

# For every ligand-receptor pair in `expanded`, mask its communication matrix
# by p-value and keep the pair only if some signal survives.
interactions = {}
for lig, rec in tqdm(zip(expanded['ligand'], expanded['receptor'])):
    name = lig + '-' + rec

    # Pairs whose rebuilt name is not a key of `info` are skipped.
    if name not in info:
        continue

    value_matrix = info[name]['communication_matrix']
    p_matrix = info[name]['communication_pvalue']
    sig_matrix = get_sig_interactions(value_matrix, p_matrix)

    # Grand total over both axes of the masked matrix.
    total_signal = sig_matrix.sum().sum()
    if total_signal > 0:
        interactions[name] = sig_matrix

len(interactions)

326it [00:00, 3841.44it/s]


320

In [8]:
# create cell x gene matrix
genes = adata.var_names
ct_masks = {ct: adata.obs['cell_type'] == ct for ct in adata.obs['cell_type'].unique()}

df = pd.DataFrame(index=adata.obs_names, columns=genes, dtype=np.float32)
df = df.fillna(0)

for name in tqdm(interactions.keys(), total=len(interactions)):
    lig, rec = name.rsplit('-', 1)
    
    tmp = interactions[name].sum(axis=1)
    for ct, val in zip(interactions[name].index, tmp):
        df.loc[ct_masks[ct], lig] += tmp[ct]
    
    tmp = interactions[name].sum(axis=0)
    for ct, val in zip(interactions[name].columns, tmp):
        df.loc[ct_masks[ct], rec] += tmp[ct]

df.shape

100%|██████████| 320/320 [00:17<00:00, 17.88it/s]


(415538, 3058)

In [9]:
# Report the fraction of (cell, gene) entries in `df` that are strictly
# positive, i.e. the count of non-zero scores divided by the matrix size.
print('Number of LR filtered using celltype specificity:')
n_positive = (df > 0).to_numpy().sum()
n_positive / df.size


Number of LR filtered using celltype specificity:


0.051751274237527733

In [10]:
# Persist the per-cell ligand/receptor score matrix as parquet.
# NOTE(review): user-local output path — confirm it exists where this runs.
lr_parquet_path = '/Users/allywang/Desktop/work/S2025/SpaceOracle/data/mouse_kidney_visiumHD/kidney_visiumHD_LRs.parquet'
df.to_parquet(lr_parquet_path)


### Separate adata because it's large

In [None]:
# Export the `raw_count` layer as a cell x gene parquet file, then remove the
# layer from `adata`.
raw_count_path = '/ix/djishnu/shared/djishnu_kor11/raw_data/mouse_kidney_visiumHD/raw_count.parquet'
raw_count_frame = pd.DataFrame(
    data=adata.layers['raw_count'],
    index=adata.obs_names,
    columns=adata.var_names,
)
raw_count_frame.to_parquet(raw_count_path)

# Drop the temporary frame together with the layer so neither stays in memory.
del raw_count_frame, adata.layers['raw_count']

In [None]:
# Export the `normalized_count` layer as a cell x gene parquet file, then
# remove the layer from `adata`.
normalized_count_path = '/ix/djishnu/shared/djishnu_kor11/raw_data/mouse_kidney_visiumHD/normalized_count.parquet'
normalized_count_frame = pd.DataFrame(
    data=adata.layers['normalized_count'],
    index=adata.obs_names,
    columns=adata.var_names,
)
normalized_count_frame.to_parquet(normalized_count_path)

# Drop the temporary frame together with the layer so neither stays in memory.
del normalized_count_frame, adata.layers['normalized_count']

In [None]:
# Only the imputed counts are used during training, so make them the main
# matrix before saving.
# NOTE(review): the output path is the same file this notebook loads `adata`
# from, so the write overwrites the input; a later re-run would load a file
# without the layers deleted above — confirm this is intended.
training_h5ad_path = '/ix/djishnu/shared/djishnu_kor11/training_data_2025/mouse_kidney_visium.h5ad'
adata.X = adata.layers['imputed_count']
adata.write(training_h5ad_path)