In [1]:
import sys
sys.path.insert(1, '../scripts/') # comment out in python script
from load_environmental_variables import *

In [2]:
import cell2cell as c2c
import numpy as np
import pandas as pd
import h5py

from tqdm.auto import tqdm

import glob
import os
from multiprocessing import Pool

  import pandas.util.testing as tm


In [3]:
print('Load params')

# Input / output locations. `local_data_path` is not defined in this notebook;
# presumably it is provided by the `load_environmental_variables` star-import.
# NOTE: the folder spelling 'psuedotime' is kept as-is because the HDF5 export
# cell further down reads from exactly this path.
files = {'ppi': local_data_path + 'raw/Human-2020-Cabello-Aguilar-LR-pairs.csv', 
        'output_folder': local_data_path + 'interim/get_CCI_psuedotime/'}

# makedirs(exist_ok=True) creates missing parent folders too and is safe to
# re-run (no check-then-create race as with isdir() + mkdir()).
os.makedirs(files['output_folder'], exist_ok=True)

rnaseq_setup, ppi_setup, cutoff_setup, analysis_setup = dict(), dict(), dict(), dict()

# Options forwarded to c2c.io.load_rnaseq() inside get_CCI().
rnaseq_setup['gene_col'] = None
rnaseq_setup['drop_nangenes'] = True
rnaseq_setup['log_transform'] = False

# Columns of the PPI table handed to c2c.io.load_ppi() as interaction columns.
ppi_setup['protein_cols'] = ['ligand', 'receptor']

# Passed as `gene_cutoffs` to c2c.core.InteractionSpace().
cutoff_setup['type'] = 'constant_value'
cutoff_setup['parameter'] = 0.1

# Scoring options passed to c2c.core.InteractionSpace(); 'cci_type' also decides
# in get_CCI() whether the PPI list is made bidirectional.
analysis_setup['communication_score'] = 'expression_thresholding'
analysis_setup['cci_score'] = 'bray_curtis'
analysis_setup['cci_type'] = 'undirected'


Load params


In [4]:
# Load the per-sample cell-type annotation table and derive a
# {SampleID: Cell_Type} lookup dictionary from it.
celltypes_csv = local_data_path + 'processed/5k_pbmc_celltypes_velocytoformatted.csv'
cell_types = c2c.io.load_table(celltypes_csv)
celltype_by_sample = cell_types[['SampleID', 'Cell_Type']].set_index('SampleID')
celltype_mapper = celltype_by_sample['Cell_Type'].to_dict()

/data2/hratch/immune_CCI_pseudotime/processed/5k_pbmc_celltypes_velocytoformatted.csv was correctly loaded


In [6]:
# Every CSV whose name starts with 'T-' in the projected-expression folder;
# get_CCI() reads this module-level list to report progress ("i of N").
filenames = glob.glob(
    local_data_path
    + 'interim/velocyto_analyses/projected_gene_expression_csvs/T-*.csv'
)


def get_CCI(file, counter):
    """Compute the CCI matrix for one expression CSV and save it to disk.

    The expression table in ``file`` and the PPI list in ``files['ppi']`` are
    loaded with cell2cell, an ``InteractionSpace`` is built from the
    module-level ``rnaseq_setup`` / ``ppi_setup`` / ``cutoff_setup`` /
    ``analysis_setup`` dictionaries, pairwise CCI scores are computed and the
    resulting matrix is written to
    ``files['output_folder'] + <basename> + '_CCI.csv'``.

    Processing is best-effort: any ordinary exception is reported on stdout
    (message plus traceback) and the function returns normally so that one bad
    file does not abort a batch run. ``KeyboardInterrupt`` / ``SystemExit`` are
    deliberately NOT swallowed so a run can still be cancelled.

    Parameters
    ----------
    file : str
        Path to the expression CSV to process.
    counter : int
        Position of this file in the batch; only used for progress printing
        together with ``len(filenames)`` (module-level list).

    Returns
    -------
    None
    """
    basename = os.path.basename(file).split('.csv')[0]
    print('Filename:' + basename)
    print('File number {} of {}'.format(counter, len(filenames)))
    try:
        print('Format rnaseq data')
        rnaseq_data = c2c.io.load_rnaseq(rnaseq_file=file,
                                         gene_column=rnaseq_setup['gene_col'],
                                         drop_nangenes=rnaseq_setup['drop_nangenes'],
                                         log_transformation=rnaseq_setup['log_transform'],
                                         format='auto',
                                         **{'index_col' : 0})
        print('Format ppi data')
        ppi_data = c2c.io.load_ppi(ppi_file=files['ppi'],
                                   interaction_columns=ppi_setup['protein_cols'],
                                   rnaseq_genes=list(rnaseq_data.index),
                                   format='auto')

        print('Set up parameters')
        if analysis_setup['cci_type'] == 'undirected':
            bi_ppi_data = c2c.preprocessing.bidirectional_ppi_for_cci(ppi_data=ppi_data, verbose=False)
            # only needed by the (currently disabled) communication-score step below
            ref_ppi = ppi_data
        else:
            bi_ppi_data = ppi_data.copy()
            ref_ppi = None

        print('Setup interaction space')
        interaction_space = c2c.core.InteractionSpace(rnaseq_data=rnaseq_data,
                                                      ppi_data=bi_ppi_data,
                                                      gene_cutoffs=cutoff_setup,
                                                      communication_score=analysis_setup['communication_score'],
                                                      cci_score=analysis_setup['cci_score'],
                                                      cci_type=analysis_setup['cci_type'],
                                                      verbose=False)
        # compute interactions
        print('Compute interactions')
        interaction_space.compute_pairwise_cci_scores(use_ppi_score=False, verbose=False)

        # # untested - don't need communication for now
        # # compute communication
        # interaction_space.compute_pairwise_communication_scores(ref_ppi_data=ref_ppi, cci_type='directed', verbose=False)

        print('Save CCI dataframe')
        interaction_space.interaction_elements['cci_matrix'].to_csv(files['output_folder'] + basename + '_CCI.csv')
        # interaction_space.interaction_elements['communication_matrix'].to_csv(files['output_folder'] + basename + '_CCC.csv')
    except Exception:
        # Keep going with the remaining files, but record *why* this one failed
        # (a bare `except:` used to hide the cause and also trapped Ctrl-C).
        import traceback
        print('CCI failed on ' + basename)
        print(traceback.format_exc())
    print('-----------------------------------------------------------------------------')

In [None]:
print('Begin parallelization')
# Each task is (file path, 1-based position in `filenames`), unpacked by starmap
# into get_CCI(file, counter). starmap() blocks until every task has finished,
# so the pool teardown performed when the `with` block exits cannot cut work
# short; it also guarantees the worker processes are released if starmap raises.
with Pool(processes = 20) as pool:
    pool.starmap(get_CCI, zip(filenames, range(1, len(filenames) + 1)))

In [5]:
# #run the following command in terminal
# cmd = 'python ' + root_path + 'scripts/get_CCI_pseudotime.py > ' + local_data_path 
# cmd += 'interim/get_CCI_pseudotime_terminal_output.txt'
# print(cmd)


python /home/hratch/Projects/immune_CCI_psuedotime/scripts/get_CCI_pseudotime.py > /data2/hratch/immune_CCI_pseudotime/interim/get_CCI_pseudotime_terminal_output.txt


In [37]:
# compress files to send to Cameron
# Packs every per-file CCI matrix CSV from the output folder into one HDF5 file,
# one dataset per CSV.
from tqdm import tqdm

cci_dir = local_data_path + 'interim/get_CCI_psuedotime/'
# NOTE: this rebinds the module-level `filenames` that get_CCI() reads for its
# progress message; re-run the glob cell before re-running the parallel step.
filenames = sorted(os.listdir(cci_dir))

# The context manager closes the HDF5 file even if reading/parsing a CSV fails.
with h5py.File(local_data_path + 'interim/CCI_dt.h5', 'w') as CCI_dt:
    for file in tqdm(filenames):
        # The second '-'-separated field of the file name is parsed as a float
        # and used (as a string) as the dataset name.
        timepoint = float(file.split('-')[1])
        df = pd.read_csv(cci_dir + file, index_col = 0)
        CCI_dt.create_dataset(str(timepoint), data=np.array(df))


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:14<23:56, 14.51s/it][A
  2%|▏         | 2/100 [00:29<23:46, 14.56s/it][A
  3%|▎         | 3/100 [00:42<23:10, 14.34s/it][A
  4%|▍         | 4/100 [00:56<22:37, 14.14s/it][A
  5%|▌         | 5/100 [01:10<22:12, 14.02s/it][A
  6%|▌         | 6/100 [01:24<21:47, 13.91s/it][A
  7%|▋         | 7/100 [01:37<21:25, 13.83s/it][A
  8%|▊         | 8/100 [01:51<21:11, 13.82s/it][A
  9%|▉         | 9/100 [02:05<20:51, 13.75s/it][A
 10%|█         | 10/100 [02:18<20:36, 13.74s/it][A
 11%|█         | 11/100 [02:32<20:18, 13.69s/it][A
 12%|█▏        | 12/100 [02:46<20:11, 13.77s/it][A
 13%|█▎        | 13/100 [02:59<19:51, 13.70s/it][A
 14%|█▍        | 14/100 [03:13<19:39, 13.71s/it][A
 15%|█▌        | 15/100 [03:27<19:24, 13.70s/it][A
 16%|█▌        | 16/100 [03:41<19:13, 13.73s/it][A
 17%|█▋        | 17/100 [03:54<18:53, 13.66s/it][A
 18%|█▊        | 18/100 [04:08<18:44, 13.71s/it][A
 19%|█▉        | 19/100 [04:2