Compute NATMI communication scores for each context (sample) of the BALF data, to use as input to Tensor-cell2cell.

env_name: cci_dt

In [24]:
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
import scanpy as sc

import cell2cell as c2c


# Number of worker processes requested from NATMI (forwarded as --coreNum).
n_cores = 20

# Local clone of NATMI:
#   git clone https://github.com/asrhou/NATMI.git
#   commit: f35f677cb2e4c8a176a6c5501a9483c9e28661a1
# Extra packages installed in the environment for NATMI:
#   xlsxwriter=3.0.2
#   xlrd=2.0.1
natmi_path = '/data2/hratch/Software/NATMI/'

# Root directory under which the intermediate inputs/outputs are written.
rev_path = '/data3/hratch/tc2c_analyses_1/natcomm_revisions/'

# BALF expression data, stored as log(1+CPM) (file provided by Erick).
expr_files = '/data2/eric/Tensor-Revisions/COVID-19-BALF-log1p.h5ad'

In [1]:
import sys
sys.path.insert(1, '/home/hratch/Projects/CCC/tc2c_analyses_1/notebooks/natcomm_revisions/')
from utility import natmi_edgelist_to_communication_matrix

NATMI uses connectomeDB2020 by default

# Prepare the Expression Data and Metadata (MD)

In [13]:
def format_for_natmi(adata_sample):
    """Format the AnnData of a single sample for input to NATMI.

    Parameters
    ----------
    adata_sample : AnnData-like
        Object exposing ``X`` (cells x genes matrix, sparse or dense,
        expected to hold log(1 + CPM) values), ``var`` (indexed by gene)
        and ``obs`` (indexed by cell, with a ``'celltype'`` column).

    Returns
    -------
    expr : pandas.DataFrame
        Back-transformed (CPM) expression with genes as rows and cells as
        columns.
    md : pandas.DataFrame
        One row per cell with the columns ``'Cell'`` (the ``obs`` index
        value) and ``'Annotation'`` (the cell type).
    """
    # Accept both sparse matrices (which need densifying) and dense arrays.
    X = adata_sample.X
    log_cpm = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    # expm1 is the exact inverse of log1p and avoids the precision loss of
    # computing exp(x) - 1 for small x. Transpose to genes x cells.
    expr = pd.DataFrame(
        np.expm1(log_cpm).T,
        index=adata_sample.var.index.tolist(),
        columns=adata_sample.obs.index.tolist(),
    )

    # Build the annotation table explicitly so that the cell column is always
    # called 'Cell', even when the obs index carries its own name (in which
    # case reset_index() would not produce a column called 'index').
    cell_types = adata_sample.obs['celltype']
    md = pd.DataFrame({
        'Cell': cell_types.index.tolist(),
        'Annotation': cell_types.reset_index(drop=True),
    })

    return expr, md

def run_natmi(fp):
    """Build the shell command that runs NATMI on one sample's inputs.

    Nothing is executed here; the command string is returned so that the
    caller can run it.

    Parameters
    ----------
    fp : str
        Path prefix of the input files written from the output of
        'format_for_natmi': ``fp + '_CPM_expr.csv'`` and
        ``fp + '_metadata.csv'``. Its last path component is taken as the
        sample name and used as the name of the output directory.

    Returns
    -------
    str
        Shell-quoted command line invoking NATMI's ``ExtractEdges.py``.
    """
    import shlex

    # Derive the sample name from the argument rather than relying on a
    # global loop variable called `sample` being in scope.
    sample_name = os.path.basename(fp)
    out_dir = (rev_path + 'interim/tc2c_external_inputs/natmi/natmi_outputs/'
               + sample_name)

    # NATMI's defaults (connectomeDB2020 i.e. lrc2p, human species, gene
    # symbols) are the intended settings, so --interDB etc. are not passed.
    cmd = [
        'python', natmi_path + 'ExtractEdges.py',
        '--emFile', fp + '_CPM_expr.csv',
        '--annFile', fp + '_metadata.csv',
    ]
    if n_cores is not None and n_cores > 1:
        cmd += ['--coreNum', str(n_cores)]
    cmd += ['--out', out_dir]

    # Quote each part so paths containing spaces or shell metacharacters
    # survive the trip through the shell; plain paths are left unchanged.
    return ' '.join(shlex.quote(part) for part in cmd)

In [149]:
import subprocess

# Directory layout for the NATMI inputs, raw outputs and the final matrices.
natmi_dir = rev_path + 'interim/tc2c_external_inputs/natmi/'
natmi_input_dir = natmi_dir + 'natmi_inputs/'
natmi_output_dir = natmi_dir + 'natmi_outputs/'
for directory in (natmi_input_dir, natmi_output_dir):
    os.makedirs(directory, exist_ok=True)

adata = sc.read_h5ad(expr_files) # log(1+CPM) BALF from Erick
samples = adata.obs['sample'].unique()

for sample in tqdm(samples):
    fp = natmi_input_dir + sample

    adata_sample = adata[adata.obs['sample'] == sample] # subset to sample of interest
    expr, md = format_for_natmi(adata_sample) # format for input to NATMI

    # files must be written because NATMI is command-line only
    expr.to_csv(fp + '_CPM_expr.csv', index=True, header=True)
    md.to_csv(fp + '_metadata.csv', index=False, header=True)

    cmd = run_natmi(fp)
    # NATMI must be run from within its own directory (otherwise permission
    # errors), so set the working directory explicitly. check=True raises
    # CalledProcessError if NATMI fails, instead of silently continuing and
    # reading a missing or stale edge file below.
    subprocess.run(cmd, shell=True, cwd=natmi_path, check=True)

    # load resultant edge list and format for input to tensor-cell2cell
    edge_file = natmi_output_dir + sample + '/Edges_lrc2p.csv'
    cm = natmi_edgelist_to_communication_matrix(edge_file)
    cm.to_csv(natmi_dir + sample + '_communication_matrix.csv')

  expr = pd.DataFrame(np.exp(adata_sample.X.toarray()) - 1 ).T
