In [1]:
import cell2cell as c2c
import scanpy as sc
import scanorama as sm
import scvi

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

import os

Global seed set to 0


# Load Data

In [2]:
data_folder = '../data/'

In [3]:
output_folder = './outputs/'
if not os.path.isdir(output_folder):
        os.mkdir(output_folder)

**scRNA-seq**

In [4]:
rna_folder = data_folder + 'COVID-19/'
rna_directory = os.fsencode(rna_folder)

In [5]:
data = dict()
for file in os.listdir(rna_directory):
    filename = os.fsdecode(file)
    if filename.endswith(".h5"): 
        print(filename)
        basename = os.path.basename(filename)
        sample = basename.split('_')[1]
        data[sample] = sc.read_10x_h5(rna_folder + filename)
    else:
        continue

GSM4475051_C148_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4475050_C100_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4339770_C142_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4475048_C51_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4339774_C146_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4475049_C52_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4475052_C149_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4339771_C143_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4475053_C152_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4339773_C145_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM4339772_C144_filtered_feature_bc_matrix.h5
GSM4339769_C141_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [6]:
meta = pd.read_csv(rna_folder + '/metadata.txt', sep='\t')

In [7]:
meta = meta.sort_values(['sample_new', 'sample'])

In [8]:
meta.head()

Unnamed: 0,ID,sample,sample_new,group,disease,hasnCoV,cluster,celltype
0,AAACCTGAGACACTAA_1,C51,HC1,HC,N,N,3,Macrophages
1,AAACCTGAGGAGTACC_1,C51,HC1,HC,N,N,3,Macrophages
2,AAACCTGAGGATATAC_1,C51,HC1,HC,N,N,3,Macrophages
3,AAACCTGAGGTCATCT_1,C51,HC1,HC,N,N,3,Macrophages
4,AAACCTGCACGGATAG_1,C51,HC1,HC,N,N,5,Macrophages


**LR pairs**

In [9]:
lr_pairs = pd.read_csv(data_folder + '/LR-pairs/Human-2020-Jin-LR-pairs.csv')

In [10]:
lr_pairs.head(2)

Unnamed: 0,interaction_name,pathway_name,ligand,receptor,agonist,antagonist,co_A_receptor,co_I_receptor,evidence,annotation,interaction_name_2
0,TGFB1_TGFBR1_TGFBR2,TGFb,TGFB1,TGFbR1_R2,TGFb agonist,TGFb antagonist,,TGFb inhibition receptor,KEGG: hsa04350,Secreted Signaling,TGFB1 - (TGFBR1+TGFBR2)
1,TGFB2_TGFBR1_TGFBR2,TGFb,TGFB2,TGFbR1_R2,TGFb agonist,TGFb antagonist,,TGFb inhibition receptor,KEGG: hsa04350,Secreted Signaling,TGFB2 - (TGFBR1+TGFBR2)


**Change annotations of protein complexes:** Use all capital letters for names of proteins and separate subunits by "&".

For example, complex composed by ProteinX and ProteinY would be **PROTEINX&PROTEINY**

In [11]:
# Change complex annotations
lr_pairs['ligand_symbol'] = lr_pairs.interaction_name_2.apply(lambda x: x.split(' - ')[0].upper())
lr_pairs['receptor_symbol'] = lr_pairs.interaction_name_2.apply(lambda x: x.split(' - ')[1].upper() \
                                                       .replace('(', '').replace(')', '').replace('+', '&'))

lr_pairs['c2c_interaction'] = lr_pairs.apply(lambda row: row['ligand_symbol'] + '^' + row['receptor_symbol'], axis=1)

In [12]:
# interaction columns:
int_columns = ('ligand_symbol', 'receptor_symbol')

Remove bidirectionality in the list of ligand-receptor pairs. That is, remove repeated interactions where both interactions are the same but in different order:

From this list:

| Ligand | Receptor |
| --- | --- |
| Protein A | Protein B |
| Protein B | Protein A |

We will have:

| Ligand | Receptor |
| --- | --- |
| Protein A | Protein B |

In [13]:
lr_pairs = c2c.preprocessing.remove_ppi_bidirectionality(ppi_data=lr_pairs, 
                                                         interaction_columns=int_columns
                                                         )

Removing bidirectionality of PPI network


In [14]:
lr_pairs.shape

(1991, 14)

**Generate a dictionary with function info for each LR pairs. Keys are LIGAND_NAME^RECEPTOR_NAME and values are the function in the annotation column in the dataframe containing ligand-receptor pairs.**

In [15]:
ppi_functions = dict()

for idx, row in lr_pairs.iterrows():
    ppi_label = row[int_columns[0]] + '^' + row[int_columns[1]]
    ppi_functions[ppi_label] = row['annotation']

# scRNA-seq Pre-processing

**Sample names**

In [16]:
sample_meta = meta[['sample', 'sample_new', 'disease']].drop_duplicates().reset_index(drop=True)
sample_meta = sample_meta.loc[sample_meta['sample'].isin(list(data.keys()))]

In [17]:
context_names = sample_meta['sample'].tolist()

In [18]:
context_labels = sample_meta['sample_new'].tolist()

**Pre-processing**

In [19]:
count_matrices = []
log_matrices = []
fraction_matrices = []

cell_number = dict()

for context in tqdm(context_names):
    df = data[context]
    df.var_names_make_unique()
    
    # Modify names of genes
    df.var.columns = [col.upper() for col in df.var.columns]
    # Modify names of cells
    df.obs.index = [idx.split('-')[0] for idx in df.obs.index]

    # Meta
    meta_context = meta.loc[meta['sample'] == context].set_index('ID')
    meta_context.index = [idx.split('_')[0] for idx in meta_context.index]
    meta_context.index.name = 'barcode'
    
    
    # Keep cells with metadata
    cells = list(meta_context.index)
    tmp_data = df[cells]
    tmp_data.obs = tmp_data.obs.join(meta_context)
    
    cell_num = tmp_data.obs.groupby('celltype').count().to_dict()['group']
    cell_num['Total'] = len(cells)
    cell_number[context] = cell_num
        
    # Keep genes in each sample with at least n single cells expressing it
    n = 4
    sc.pp.filter_genes(tmp_data, min_cells=n) # At least 4, or >3
        
    # Aggregate gene expression of single cells into cell types
    exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=tmp_data.to_df(),
                                                      metadata=meta_context,
                                                      barcode_col='barcode',
                                                      celltype_col='celltype',
                                                      method='nn_cell_fraction',
                                                     )
    
    count_matrices.append(tmp_data)    
    fraction_matrices.append(exp_df)
    
    # log(CPM+1)
    tmp_data = df[cells]
    tmp_data.obs = tmp_data.obs.join(meta_context)
    
    sc.pp.normalize_total(tmp_data, target_sum=1e6, inplace=True)
    sc.pp.filter_genes(tmp_data, min_cells=n)
    sc.pp.log1p(tmp_data)
    
    log_matrices.append(tmp_data)

  0%|          | 0/12 [00:00<?, ?it/s]

In [20]:
total = 0
for k, v in cell_number.items():
    total += v['Total']

In [21]:
total

63103

*Keep only genes present across all samples*

In [22]:
inner_genes = []
for df in fraction_matrices:
    inner_genes.append(df.index)
inner_genes = set.intersection(*map(set, inner_genes))

In [23]:
len(inner_genes)

11688

In [24]:
count_matrices = [adata[:, adata.var.index.isin(inner_genes)] for adata in count_matrices]
log_matrices = [adata[:, adata.var.index.isin(inner_genes)] for adata in log_matrices]
fraction_matrices = [df.loc[inner_genes, :] for df in fraction_matrices]

In [25]:
del data

# Batch effects & Integration

In [26]:
seed = 888

**SCANORAMA**

In [32]:
# Batch correction.
sm_corrected = sm.correct_scanpy(log_matrices)

Found 11688 genes among all datasets
[[0.00000000e+00 6.37924690e-01 1.48090413e-01 2.26052557e-03
  2.34672925e-03 1.92837466e-02 2.36574403e-03 1.18287201e-03
  7.73993808e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 7.66588986e-02 4.80361684e-03
  5.28014080e-03 1.37741047e-02 4.53820680e-03 2.69839323e-03
  7.73993808e-04 1.16550117e-03 0.00000000e+00 5.52295478e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.44193297e-01
  6.11847233e-02 6.88705234e-02 3.31254871e-02 8.96336711e-03
  1.47058824e-02 5.82750583e-03 1.44997583e-03 2.57209665e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  8.06688178e-01 7.24517906e-01 6.80983329e-02 1.41847980e-01
  1.07585139e-01 9.96503497e-02 6.81488642e-02 5.00517777e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 8.70523416e-01 1.45203872e-01 2.60780287e-01
  1.33900929e-01 1.28787879e-01 1.59497342e-01 7.87021056e-02]
 [0.00000000e+00 0.00000000e

**SCANVI**

In [26]:
adata = count_matrices[0].concatenate(count_matrices[1:], batch_key='batch_key')

In [27]:
#adata.write_h5ad('../COVID-19-BALF.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_new' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'group' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'disease' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'hasnCoV' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'celltype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'FEATURE_TYPES' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-0' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-1' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-10' as categorical

In [31]:
scvi.model.SCVI.setup_anndata(adata, batch_key='batch_key', labels_key='celltype')

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"batch_key"[0m[1m][0m                                           
[34mINFO    [0m Using labels from adata.obs[1m[[0m[32m"celltype"[0m[1m][0m                                             
[34mINFO    [0m Using data from adata.X                                                             
[34mINFO    [0m Successfully registered anndata object containing [1;36m63103[0m cells, [1;36m11688[0m vars, [1;36m12[0m       
         batches, [1;36m10[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates  
         and [1;36m0[0m extra continuous covariates.                                                  
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [32]:
scvi_model = scvi.model.SCVI(adata)

In [33]:
scanvi_model = scvi.model.SCANVI.from_scvi_model(scvi_model, unlabeled_category="Unknown", adata=adata)



In [34]:
scvi_df = scanvi_model.get_normalized_expression()

In [35]:
scvi_df_matrices = [scvi_df.loc[adata.obs.loc[adata.obs['sample'] == context].index] for context in context_names]

**ComBat**

In [31]:
logdata = log_matrices[0].concatenate(log_matrices[1:], batch_key='batch_key')

In [32]:
#logdata.write_h5ad('../COVID-19-BALF-log1p.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_new' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'group' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'disease' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'hasnCoV' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'celltype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'FEATURE_TYPES' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-0' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-1' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-10' as categorical

In [41]:
sc.pp.combat(logdata, key='batch_key')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_new' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'group' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'disease' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'hasnCoV' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'celltype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'FEATURE_TYPES' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-0' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-1' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'GENOME-10' as categorical

In [42]:
combat_matrices = [logdata[adata.obs.loc[adata.obs['sample'] == context].index] for context in context_names]

# Matrices to build the Tensors

In [36]:
exp_matrices = dict()

*Aggregate count matrix into mean*

In [34]:
avg_count_matrices = []

for df in tqdm(count_matrices):
    meta_context = df.obs.copy()
    meta_context.index.name = 'barcode'
    
    exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=df.to_df(),
                                                      metadata=meta_context,
                                                      barcode_col='barcode',
                                                      celltype_col='celltype',
                                                      method='average',
                                                     )
    
    avg_count_matrices.append(exp_df)
exp_matrices['Avg Raw Counts'] = avg_count_matrices

  0%|          | 0/12 [00:00<?, ?it/s]

*Aggregate log matrix into mean*

In [35]:
avg_log_matrices = []

for df in tqdm(log_matrices):
    meta_context = df.obs.copy()
    meta_context.index.name = 'barcode'
    
    exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=df.to_df(),
                                                      metadata=meta_context,
                                                      barcode_col='barcode',
                                                      celltype_col='celltype',
                                                      method='average',
                                                     )
    
    avg_log_matrices.append(exp_df)
exp_matrices['Avg log1p(CPM)'] = avg_log_matrices

  0%|          | 0/12 [00:00<?, ?it/s]

*Fraction of non-zero cells*

In [36]:
exp_matrices['Fraction Non-Zero Cells'] = fraction_matrices

*Aggregate scanorama correction into mean*

In [37]:
avg_sm_matrices = []

for df in tqdm(sm_corrected):
    meta_context = df.obs.copy()
    meta_context.index.name = 'barcode'
    
    exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=df.to_df(),
                                                      metadata=meta_context,
                                                      barcode_col='barcode',
                                                      celltype_col='celltype',
                                                      method='average',
                                                     )
    
    avg_sm_matrices.append(exp_df)
exp_matrices['Scanorama'] = avg_sm_matrices

  0%|          | 0/12 [00:00<?, ?it/s]

*Aggregate SCVI correction into mean*

In [37]:
avg_scvi_matrices = []

for i, context in tqdm(enumerate(context_names), total=len(context_names)):
    # Meta
    meta_context = meta.loc[meta['sample'] == context].set_index('ID')
    meta_context.index = [idx.split('_')[0] for idx in meta_context.index]
    meta_context.index.name = 'barcode'
    
    
    # Keep cells with metadata
    cells_ = list(meta_context.index)
    tmp_data = scvi_df_matrices[i]
    tmp_data.index = [idx.split('-')[0] for idx in tmp_data.index]
    tmp_data = tmp_data.loc[cells_]
        
    # Aggregate gene expression of single cells into cell types
    exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=tmp_data,
                                                      metadata=meta_context,
                                                      barcode_col='barcode',
                                                      celltype_col='celltype',
                                                      method='average',
                                                     )
    
    avg_scvi_matrices.append(exp_df)
exp_matrices['SCANVI'] = avg_scvi_matrices

  0%|          | 0/12 [00:00<?, ?it/s]

*Aggregate ComBat into mean*

In [43]:
avg_cb_matrices = []

for df in tqdm(combat_matrices):
    meta_context = df.obs.copy()
    meta_context.index.name = 'barcode'
    
    exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=df.to_df(),
                                                      metadata=meta_context,
                                                      barcode_col='barcode',
                                                      celltype_col='celltype',
                                                      method='average',
                                                     )
    
    avg_cb_matrices.append(exp_df)
exp_matrices['ComBat'] = avg_cb_matrices

  0%|          | 0/12 [00:00<?, ?it/s]

# Build Tensors

In [44]:
for method, matrices in exp_matrices.items():

    tensor = c2c.tensor.InteractionTensor(rnaseq_matrices=matrices,
                                          ppi_data=lr_pairs,
                                          context_names=context_labels,
                                          how='inner',
                                          complex_sep='&',
                                          interaction_columns=int_columns,
                                          communication_score='expression_mean'
                                         )
    c2c.io.export_variable_with_pickle(tensor, output_folder + '/Tensor_{}.pkl'.format(method.replace(' ','_')))
    print(tensor.tensor.shape)
    print('')

Getting expression values for protein complexes
Building tensor for the provided context
./outputs//Tensor_SCANVI.pkl  was correctly saved.
(12, 189, 6, 6)

Getting expression values for protein complexes
Building tensor for the provided context
./outputs//Tensor_ComBat.pkl  was correctly saved.
(12, 189, 6, 6)



**Tensor Metadata**

In [44]:
def meta_disease(x):
    if 'HC' in x:
        return 'Control'
    elif 'M' in x:
        return 'Moderate COVID-19'
    elif 'S' in x:
        return 'Severe COVID-19'
    else:
        return 'NA'
    
sample_disease = dict()
for idx, row in sample_meta.iterrows():
    sample_disease[row['sample_new']] = meta_disease(row['sample_new'])

In [45]:
meta_tf = c2c.tensor.generate_tensor_metadata(interaction_tensor=tensor,
                                              metadata_dicts=[sample_disease, ppi_functions, None, None],
                                              fill_with_order_elements=True
                                             )

In [46]:
c2c.io.export_variable_with_pickle(meta_tf, output_folder + '/Meta_Tensors.pkl')


./outputs//Meta_Tensors.pkl  was correctly saved.
