In [1]:
import commot as ct
import scanpy as sc
import pandas as pd
import numpy as np

### Run Commot

In [2]:
adata = sc.read_h5ad('/ix/djishnu/shared/djishnu_kor11/training_data_2025/snrna_human_tonsil.h5ad')
adata

AnnData object with n_obs × n_vars = 5778 × 3549
    obs: 'cell_type', 'author_cell_type', 'cell_type_int', 'leiden', 'leiden_R', 'cell_type_2'
    uns: 'author_cell_type_colors', 'cell_type_2_colors', 'cell_type_colors', 'dendrogram_leiden', 'leiden', 'leiden_R', 'leiden_colors', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'ora_estimate', 'ora_pvals', 'spatial', 'spatial_unscaled'
    varm: 'PCs'
    layers: 'imputed_count', 'normalized_count'
    obsp: 'connectivities', 'distances'

In [3]:
# adata.X = adata.layers['imputed_count']
adata.X = adata.layers['normalized_count']


In [4]:
df_ligrec = ct.pp.ligand_receptor_database(
    database='CellChat', 
    species='human', 
    signaling_type=None
)
    
df_ligrec.columns = ['ligand', 'receptor', 'pathway', 'signaling']  

In [5]:
ct.tl.spatial_communication(adata,
    database_name='user_database', df_ligrec=df_ligrec, dis_thr=200, heteromeric=True)

In [6]:
adata

AnnData object with n_obs × n_vars = 5778 × 3549
    obs: 'cell_type', 'author_cell_type', 'cell_type_int', 'leiden', 'leiden_R', 'cell_type_2'
    uns: 'author_cell_type_colors', 'cell_type_2_colors', 'cell_type_colors', 'dendrogram_leiden', 'leiden', 'leiden_R', 'leiden_colors', 'neighbors', 'pca', 'umap', 'commot-user_database-info'
    obsm: 'X_pca', 'X_umap', 'ora_estimate', 'ora_pvals', 'spatial', 'spatial_unscaled', 'commot-user_database-sum-sender', 'commot-user_database-sum-receiver'
    varm: 'PCs'
    layers: 'imputed_count', 'normalized_count'
    obsp: 'connectivities', 'distances', 'commot-user_database-EFNA5-EPHA3', 'commot-user_database-EFNA5-EPHA2', 'commot-user_database-EFNA5-EPHA1', 'commot-user_database-EFNA5-EPHA4', 'commot-user_database-SEMA6A-PLXNA4', 'commot-user_database-CD96-NECTIN1', 'commot-user_database-CD96-PVR', 'commot-user_database-CXCL10-CXCR3', 'commot-user_database-LCK-CD8A_CD8B', 'commot-user_database-NRXN3-NLGN3', 'commot-user_database-NRXN3-NLGN1'

In [7]:
# adata.write_h5ad('commot.h5ad')
# adata = sc.read_h5ad('commot.h5ad')

In [8]:
lr_info = {k.replace('commot-user_database-', ''): v for k, v in adata.obsp.items() if 'commot-user_database-' in k}
len(lr_info)

239

In [9]:
df_ligrec['name'] = df_ligrec['ligand'] + '-' + df_ligrec['receptor']
len(df_ligrec['name'].unique())

1938

In [10]:
df_ligrec = df_ligrec[df_ligrec['name'].isin(lr_info.keys())]
df_ligrec['signaling'].value_counts()

signaling
ECM-Receptor          89
Cell-Cell Contact     80
Secreted Signaling    69
Name: count, dtype: int64

### Get cluster communication scores

In [11]:
from tqdm import tqdm

for name in tqdm(df_ligrec['name'].unique()):

    ct.tl.cluster_communication(adata, database_name='user_database', pathway_name=name, clustering='cell_type',
        random_seed=12, n_permutations=100)


100%|██████████| 238/238 [21:02<00:00,  5.30s/it]


In [12]:
# adata.write_h5ad('commot_cluster.h5ad')

In [13]:
from collections import defaultdict
data_dict = defaultdict(dict)

for name in df_ligrec['name']:
    data_dict[name]['communication_matrix'] = adata.uns[f'commot_cluster-cell_type-user_database-{name}']['communication_matrix']
    data_dict[name]['communication_pvalue'] = adata.uns[f'commot_cluster-cell_type-user_database-{name}']['communication_pvalue']

import pickle
with open('/ix/djishnu/shared/djishnu_kor11/commot_outputs/tonsil_communication.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

In [14]:
data_dict[name]['communication_matrix']

Unnamed: 0,B memory,B naive,CD4+ T,DC,GC B,Macrophages,Plasma,T cells,Tfh
B memory,2.095644e-06,8.353067e-07,8.335533e-07,9.249579e-07,6.119164e-08,7.404731e-07,2.814607e-06,3.001041e-06,2.311913e-10
B naive,2.911999e-07,6.657985e-07,2.378677e-06,3.667573e-08,1.562063e-06,3.319903e-06,1.50329e-06,9.967709e-08,0.0
CD4+ T,7.257167e-08,6.924273e-08,1.68049e-05,2.143465e-06,9.715337e-07,0.0,7.411088e-07,4.356478e-06,5.572553e-06
DC,2.225611e-06,1.272428e-07,1.054148e-05,1.51253e-05,1.650814e-06,5.050991e-07,7.16326e-06,7.371988e-06,8.933529e-07
GC B,0.0,0.0,1.071443e-06,1.7981e-08,5.581995e-07,4.424655e-07,1.024771e-06,4.748916e-10,2.253601e-10
Macrophages,1.228234e-06,4.553331e-08,6.682417e-06,1.449201e-06,2.606084e-06,9.31586e-05,1.992549e-05,1.184459e-05,2.167524e-05
Plasma,2.945086e-07,1.055453e-08,4.184535e-08,2.742177e-07,4.761751e-06,1.184002e-06,3.048466e-05,1.720666e-07,6.53745e-09
T cells,3.099752e-06,7.434104e-07,1.100197e-05,5.704448e-06,1.816207e-06,1.882568e-07,1.19864e-06,2.079213e-05,1.575489e-06
Tfh,1.388513e-06,1.575063e-08,2.881256e-06,2.852561e-06,5.456191e-06,2.105352e-07,1.051707e-05,1.903669e-06,5.313902e-07


In [15]:
def get_sig_interactions(value_matrix, p_matrix, pval=0.05):
    p_matrix = np.where(p_matrix < pval, 1, 0)
    return value_matrix * p_matrix

interactions = {}
for lig, rec in tqdm(zip(df_ligrec['ligand'], df_ligrec['receptor'])):
    name = lig + '-' + rec

    value_matrix = adata.uns[f'commot_cluster-cell_type-user_database-{name}']['communication_matrix']
    p_matrix = adata.uns[f'commot_cluster-cell_type-user_database-{name}']['communication_pvalue']

    sig_matrix = get_sig_interactions(value_matrix, p_matrix)
    
    if sig_matrix.sum().sum() > 0:
        interactions[name] = sig_matrix
    
len(interactions)

238it [00:00, 1727.19it/s]


238

### Get expanded LR masks

In [16]:
import sys
sys.path.append('../../src')

from spaceoracle.tools.network import expand_paired_interactions

In [17]:
expanded = expand_paired_interactions(df_ligrec)

genes = set(expanded.ligand) | set(expanded.receptor)
genes = list(genes)
len(genes)

198

In [18]:
df_ligrec['name'] = df_ligrec['ligand'] + '@' + df_ligrec['receptor']
len(df_ligrec['name'].unique())

238

In [19]:
x, y = zip(*[name.split('@') for name in df_ligrec['name']])
y = list(y)

In [21]:
# units2genes = {lig: lig.split('_') for lig in x}
# units = units2genes.keys()
# cell_thresholds = df
# counts_df = adata.to_df(layer='imputed_count')


In [22]:
from collections import defaultdict

lr_units = defaultdict(lambda: defaultdict(list))

for lig, rec, name in zip(expanded['ligand'], expanded['receptor'], expanded['name']):
    lr_units[name]

In [23]:
# create cell x gene matrix
ct_masks = {ct: adata.obs['cell_type'] == ct for ct in adata.obs['cell_type'].unique()}

df = pd.DataFrame(index=adata.obs_names, columns=genes)
df = df.fillna(0)

for name, lig, rec in tqdm(zip(expanded.name, expanded.ligand, expanded.receptor), total=len(expanded)):

    interaction_df = interactions[name]

    tmp = interactions[name].sum(axis=0)
    for ct, val in zip(interactions[name].index, tmp):
        df.loc[ct_masks[ct], lig] += tmp[ct]
    
    tmp = interactions[name].sum(axis=1)
    for ct, val in zip(interactions[name].columns, tmp):
        df.loc[ct_masks[ct], rec] += tmp[ct]

df

100%|██████████| 320/320 [00:07<00:00, 43.61it/s]


Unnamed: 0_level_0,EFNA5,SEMA6A,CD96,CXCL10,TNFRSF9,NCAM2,LCK,NRXN3,CSF1R,CD44,...,NLGN1,CD28,ANGPTL1,COL6A1,ITGA4,CD36,NPR2,CCR6,ICOSLG,PTPRM
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGCGCCTTG-1,0.000333,0.000000,0.000011,0.000008,0.000022,0.000005,0.000132,0.000083,0.000144,0.000158,...,0.000000,0.000000,0.000094,0.000262,0.000000,0.000000,0.000000,0.000048,0.000225,0.001721
AAACCCAAGTGGACGT-1,0.000017,0.000000,0.000062,0.000000,0.000000,0.000000,0.000000,0.000083,0.000223,0.001443,...,0.000004,0.000484,0.000000,0.000225,0.000178,0.000000,0.000051,0.000000,0.000080,0.000192
AAACCCACAGAAGTGC-1,0.000028,0.000000,0.000000,0.000000,0.000009,0.000008,0.000000,0.000085,0.000000,0.000438,...,0.000000,0.000997,0.000000,0.000150,0.000016,0.000000,0.000000,0.000000,0.000000,0.000000
AAACCCAGTCATTGCA-1,0.000629,0.000320,0.000517,0.000004,0.000000,0.000128,0.000007,0.000230,0.001678,0.006365,...,0.000181,0.000684,0.000203,0.000525,0.001482,0.000041,0.000123,0.000077,0.000015,0.004682
AAACCCATCATCGCAA-1,0.000090,0.000000,0.000000,0.000000,0.000013,0.000000,0.000000,0.000090,0.000064,0.002305,...,0.000000,0.000611,0.000000,0.000147,0.000000,0.000012,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCAGGGACTA-1,0.000028,0.000000,0.000000,0.000000,0.000009,0.000008,0.000000,0.000085,0.000000,0.000438,...,0.000000,0.000997,0.000000,0.000150,0.000016,0.000000,0.000000,0.000000,0.000000,0.000000
TTTGTTGCATTGTAGC-1,0.000017,0.000000,0.000062,0.000000,0.000000,0.000000,0.000000,0.000083,0.000223,0.001443,...,0.000004,0.000484,0.000000,0.000225,0.000178,0.000000,0.000051,0.000000,0.000080,0.000192
TTTGTTGGTACCACGC-1,0.000028,0.000000,0.000000,0.000000,0.000009,0.000008,0.000000,0.000085,0.000000,0.000438,...,0.000000,0.000997,0.000000,0.000150,0.000016,0.000000,0.000000,0.000000,0.000000,0.000000
TTTGTTGGTCTGTCCT-1,0.000125,0.000035,0.000000,0.000004,0.000000,0.000021,0.000030,0.000081,0.000057,0.000057,...,0.000000,0.000000,0.000000,0.000124,0.000000,0.000000,0.000000,0.000012,0.000091,0.000260


In [29]:
print('Number of LR filtered using celltype specificity:')
np.where(df > 0, 1, 0).sum().sum() / (df.shape[0] * df.shape[1])

Number of LR filtered using celltype specificity:


0.5733555702403055

In [25]:
# df.to_parquet('/ix/djishnu/shared/djishnu_kor11/miscellaneous/tonsil_commot_LRs.parquet')
df.to_parquet('/ix/djishnu/shared/djishnu_kor11/commot_outputs/tonsil_LRs.parquet')


### Get true LR pairs

In [30]:
genes = set(df_ligrec.ligand) | set(df_ligrec.receptor)
genes = list(genes)
len(genes)

199

In [33]:
# create cell x LR unit matrix
ct_masks = {ct: adata.obs['cell_type'] == ct for ct in adata.obs['cell_type'].unique()}

df = pd.DataFrame(index=adata.obs_names, columns=genes)
df = df.fillna(0)

for lig, rec in tqdm(zip(df_ligrec.ligand, df_ligrec.receptor), total=len(df_ligrec)):
    name = lig + '-' + rec
    interaction_df = interactions[name]

    tmp = interactions[name].sum(axis=0)
    for ct, val in zip(interactions[name].index, tmp):
        df.loc[ct_masks[ct], lig] += tmp[ct]
    
    tmp = interactions[name].sum(axis=1)
    for ct, val in zip(interactions[name].columns, tmp):
        df.loc[ct_masks[ct], rec] += tmp[ct]

df

100%|██████████| 238/238 [00:06<00:00, 37.11it/s]


Unnamed: 0_level_0,EFNA5,SEMA6A,CD96,CXCL10,TNFRSF9,NCAM2,LCK,NRXN3,CSF1R,ITGA1_ITGB1,...,CD8B,NLGN1,CD28,ANGPTL1,COL6A1,CD36,NPR2,CCR6,ICOSLG,PTPRM
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGCGCCTTG-1,0.000333,0.000000,0.000011,0.000008,0.000022,0.000005,0.000066,0.000083,0.000144,0.000013,...,0.000000,0.000000,0.000000,0.000047,0.000221,0.000000,0.000000,0.000048,0.000225,0.001721
AAACCCAAGTGGACGT-1,0.000017,0.000000,0.000062,0.000000,0.000000,0.000000,0.000000,0.000083,0.000223,0.000051,...,0.000014,0.000004,0.000484,0.000000,0.000214,0.000000,0.000051,0.000000,0.000080,0.000192
AAACCCACAGAAGTGC-1,0.000028,0.000000,0.000000,0.000000,0.000009,0.000008,0.000000,0.000085,0.000000,0.000019,...,0.000000,0.000000,0.000997,0.000000,0.000125,0.000000,0.000000,0.000000,0.000000,0.000000
AAACCCAGTCATTGCA-1,0.000629,0.000320,0.000517,0.000004,0.000000,0.000128,0.000003,0.000230,0.001678,0.000764,...,0.000000,0.000181,0.000684,0.000101,0.000407,0.000041,0.000123,0.000077,0.000015,0.004682
AAACCCATCATCGCAA-1,0.000090,0.000000,0.000000,0.000000,0.000013,0.000000,0.000000,0.000090,0.000064,0.000148,...,0.000000,0.000000,0.000611,0.000000,0.000143,0.000012,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCAGGGACTA-1,0.000028,0.000000,0.000000,0.000000,0.000009,0.000008,0.000000,0.000085,0.000000,0.000019,...,0.000000,0.000000,0.000997,0.000000,0.000125,0.000000,0.000000,0.000000,0.000000,0.000000
TTTGTTGCATTGTAGC-1,0.000017,0.000000,0.000062,0.000000,0.000000,0.000000,0.000000,0.000083,0.000223,0.000051,...,0.000014,0.000004,0.000484,0.000000,0.000214,0.000000,0.000051,0.000000,0.000080,0.000192
TTTGTTGGTACCACGC-1,0.000028,0.000000,0.000000,0.000000,0.000009,0.000008,0.000000,0.000085,0.000000,0.000019,...,0.000000,0.000000,0.000997,0.000000,0.000125,0.000000,0.000000,0.000000,0.000000,0.000000
TTTGTTGGTCTGTCCT-1,0.000125,0.000035,0.000000,0.000004,0.000000,0.000021,0.000015,0.000081,0.000057,0.000012,...,0.000012,0.000000,0.000000,0.000000,0.000120,0.000000,0.000000,0.000012,0.000091,0.000260


In [34]:
# df.to_parquet('/ix/djishnu/shared/djishnu_kor11/miscellaneous/tonsil_commot_LRs_units.parquet')
df.to_parquet('/ix/djishnu/shared/djishnu_kor11/commot_outputs/tonsil_LRs_units.parquet')


### Scratch

In [None]:
# # def count_interactions(matrix):
# #     mask = matrix.astype(bool).toarray()
# #     mask = np.maximum(mask, mask.T)
# #     mask = np.triu(mask, k=1)
# #     return mask.sum()

# def count_interactions(matrix):
#     return matrix.sum()

# num_values = {k: count_interactions(lr_info[k]) for k in lr_info.keys()}

In [None]:
# import matplotlib.pyplot as plt

# plt.hist(list(num_values.values()), bins=1000)
# plt.semilogy()

# # threshold = round(adata.n_obs * 0.05)
# threshold = np.percentile(list(num_values.values()), 10)

# plt.text(threshold, plt.ylim()[1] * 0.9, f'Threshold: {threshold}', color='red', ha='center')
# plt.axvline(threshold, color='red', linestyle='dashed', linewidth=1)
# plt.axvspan(0, threshold, color='red', alpha=0.3)
# plt.xlim(0, 20000)
# plt.xlabel('Number of interactions')

# plt.show()

In [None]:
from collections import defaultdict

celltypes = adata.obs['cell_type'].unique()
interactions = defaultdict(lambda: defaultdict(dict))

for a in celltypes:
    a_mask = adata.obs['cell_type'] == a
    
    for b in celltypes:
        b_mask = adata.obs['cell_type'] == b
        
        for k, v in lr_info.items():
            if k == 'total=total':
                continue

            
            interactions[a][b][k]=np.sum(v[a_mask, :][:, b_mask].astype(bool))

len(interactions)

In [None]:
celltypes

In [None]:
sig_interactions = defaultdict(lambda: defaultdict(list))
discard = defaultdict(lambda: defaultdict(list))

cell_counts = {k: (adata.obs['cell_type'] == k).sum() for k in celltypes}

for sender in celltypes:
    for receiver in celltypes:
        
        tot_cells = cell_counts[sender] + cell_counts[receiver]
        
            
            observed = np.sum(v)
            null_distribution = [np.sum(np.random.permutation(v)) for _ in range(1000)]
            p_value = np.mean([null >= observed for null in null_distribution])
            
            if p_value < 0.05:
                sig_interactions[sender][receiver].append(name)
            else:
                discard[sender][receiver].append(name)

In [None]:
len(sig_interactions['T cells']['T cells']), len(discard['T cells']['T cells'])

In [None]:
interactions[sender][receiver].items()

In [None]:
sender = 'GC B'
receiver = 'Tfh'

sig_vals = [interactions[sender][receiver][s] for s in sig_interactions[sender][receiver]]
discard_vals = [interactions[sender][receiver][s] for s in discard[sender][receiver]]


In [None]:
interactions[sender][receiver]['CDH2-CDH2']

In [None]:
sig_interactions[sender][receiver]

In [None]:
plt.hist(sig_vals, bins=1000, alpha=0.5, label='Significant', color='blue')
# _ = plt.hist(discard_vals, bins=1000, alpha=0.5, label='Discarded', color='red')
plt.ylim(0, 20)
plt.xlim(0, 5000)

In [None]:


sig_interactions['GC B']['Tfh']