## Batch Process h5ad files for EnDecon and CARD analysis

### Use Kernal: sim_data

#### assuming the gene list is the same for sc reference and synthetic data, otherwise, you need to perform intersection for genes first!

Step-003: Process synthetic datasets for EnDecon input
Description: In this step, we will read in all synthetic files within the assigned folder, process the single cell reference and synthetic data into 4 csv files. These csv files will be the input files for EnDecon. Note that some of these csv files might have a large file size.




In [119]:
# pip install hdf5plugin
import pandas as pd
import scanpy as sc
import hdf5plugin #important to read files downloaded from s3
# hdf5plugin.version   #  should be at least 4.1.3
import os

## Data processing Functions

In [120]:
def SynData_pro (sim_file, ref_id, refdir):
    # Read data
    print ("Read Synthetic data: " + sim_file)
    sim_path = "/home/oneai/data/bucket_data_download/" + ref_id + "/" + sim_file
    sim_id = sim_file[:-5] #get rid of ".h5ad"
    adata_vis = sc.read_h5ad (sim_path)
    adata_vis.obs['sample'] = sim_id
    
    syndir = refdir + sim_id + "/"
    if not os.path.exists(syndir):
        os.mkdir(syndir)
    
    #Process synthetic data
    # File 1: Spatial count
    spatial_count_df = pd.DataFrame(adata_vis.X.transpose(), columns= list(adata_vis.obs.index) , index= list(adata_vis.var.index))
    file1 = syndir + 'spatial_count.csv'
    spatial_count_df.to_csv(file1, index = True)
    print ("Export file 1/2: spatial_count")
    
    # File 2: Spatial location
    spatial_location_df = pd.DataFrame(adata_vis.obsm['spatial'], columns = ["x","y"])
    spatial_location_df.index = spatial_location_df.index.map(str)
    spatial_location_df.index = "X" + spatial_location_df.index
    file2 = syndir + 'spatial_location.csv'
    spatial_location_df.to_csv(file2, index = True)
    print ("Export file 2/2: spatial_location")
    
    return file1, file2

In [121]:
def RefData_pro (ref_id, refdir, tail_of_ref):
    # Read data
    print ("Read reference data: " + ref_id)
    ref_run_name = "/home/oneai/data/bucket_data_download/" + ref_id      #for reference_signatures
    adata_ref = sc.read_h5ad(ref_run_name + "/Reference_for_deconv" + tail_of_ref + ".h5ad")
    
    #    
    #Process reference data
    # File 1: sc meta 
    sc_meta_df = pd.DataFrame (adata_ref.obs, columns = ['cell_type', 'sample'])
    # for CARD dense matrix, "-" will be "." 
    sc_meta_df.index = sc_meta_df.index.str.replace('-','.')
    sc_meta_df['cellID'] = sc_meta_df.index
    file1 = refdir +'sc_meta' + str(tail_of_ref) + '.csv'
    sc_meta_df.to_csv(file1, index = True)
    print ("Export file 1/2: sc_meta")
    
    # File 2: sc count 
    sc_count_df = adata_ref.X.todense()
    sc_count_df = pd.DataFrame(sc_count_df.transpose(), columns= list(adata_ref.obs.index) , index= list(adata_ref.var.index))
    file2 =  refdir +'sc_count' + str(tail_of_ref) + '.csv'
    sc_count_df.to_csv(file2, index = True)
    print ("Export file 2/2: sc_count")
    
    return file1, file2

## Step 1: Input of file path. Batch running


Here I made slightly adjust for the input for 3 scenarios, but the main body of the code remains the same. The major difference is: for scenario 2, I added a tail for ref_id indicating the source of reference data. For scenario 1 and 3, there is no tail added.

#### For scenario 1 or 3, modify ref_id input here

In [135]:
ref_id = '6702e4f7a944d6095c2c2d1496311866' # for scenario 3
# ref_id = 'b2c29bbd640a75d9d4415fead7f854d4' # For scenario 1

#In the case of haveing multiple reference data, attaching tail for reference, otherwise, tail is empty
tail_list =[''] # No tail for scenario 1 and 3

synthetic_folder = "/home/oneai/data/bucket_data_download/" +ref_id

#### For scenario 2, modify ref_id input here, with tails of reference file

In [129]:
ref_id = 'feff311061feb64e82aaf93071ed1d86' #for scenario 2, Different subject

tail_list = [
'_Banovich_Kropski_2020',
'_Lafyatis_Rojas_2019',
'_Teichmann_Meyer_2019',
'_Krasnow_2020',
'_Meyer_2019',
'_Atlas'
]

# modify this for atlas reference
'''
ref_id = '8186a67c130de14f068e45cf7b611538' #for scenario 2, Atlas
tail_list = [
'_Atlas'
]
'''

synthetic_folder = "/home/oneai/data/bucket_data_download/" +ref_id

In [136]:
# get a list of simulated files in this folder
# file named start with "simulated_SRT_dataset"
#sim_file_list = [file_name for file_name in os.listdir(synthetic_folder) if 'simulated_SRT_dataset' in file_name]

sim_file_list = [file_name for file_name in os.listdir(synthetic_folder) if ('simulated_SRT_dataset' in file_name)&(".h5ad" in file_name.lower())]
sim_file_list

['simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea_mixlevel3.h5ad',
 'simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea_mixlevel2.h5ad',
 'simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea_mixlevel1.h5ad',
 'simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea_mixlevel0.h5ad']

## Step 2 process reference data file

In [125]:
refdir = "/home/oneai/data/EnDecon_input/" + ref_id + "/"
if not os.path.exists(refdir):
    os.mkdir(refdir)

for tail_of_ref in tail_list:
    print( "Now processing sc_reference file:   Reference_for_deconv" + str(tail_of_ref) + '.h5ad')
    # Process reference dataset
    sc_meta, sc_count = RefData_pro (ref_id, refdir, tail_of_ref)


Now processing sc_reference file:   Reference_for_deconv.h5ad
Read reference data: 6702e4f7a944d6095c2c2d1496311866
Export file 1/2: sc_meta
Export file 2/2: sc_count


## Step 3 process spatial synthetic data

In [63]:
# Process syntetic dataset
for file in sim_file_list: 
    print (file)
    spatial_count, spatial_location = SynData_pro (file, ref_id, refdir)
    

simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea.h5ad
Read Synthetic data: simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea.h5ad
Export file 1/2: spatial_count
Export file 2/2: spatial_location


In [102]:
spatial_count

'/home/oneai/data/EnDecon_input/6702e4f7a944d6095c2c2d1496311866/simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea_mixlevel2/spatial_count.csv'

In [103]:
test = pd.read_csv(spatial_count)

In [104]:
test

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,390,391,392,393,394,395,396,397,398,399
0,ENSG00000000003,0,0,0,0,0,0,0,0,0,...,3,0,2,0,2,0,2,0,2,0
1,ENSG00000000005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSG00000000419,4,1,4,1,4,1,4,1,4,...,2,2,2,1,2,2,2,1,2,1
3,ENSG00000000457,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000000460,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28019,ENSG00000283078,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28020,ENSG00000283103,0,0,0,1,0,1,0,1,0,...,3,2,3,2,3,2,3,2,3,3
28021,ENSG00000283117,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28022,ENSG00000283118,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
gd = pd.read_csv('/home/oneai/data/bucket_data_download/6702e4f7a944d6095c2c2d1496311866/simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea_mixlevel2_gd_prop.csv')

In [106]:
gd

Unnamed: 0.1,Unnamed: 0,B cell,"CD4-positive, alpha-beta T cell","CD8-positive, alpha-beta T cell",alveolar capillary type 1 endothelial cell,alveolar capillary type 2 endothelial cell,epithelial cell of lung,fibroblast of lung,tracheobronchial smooth muscle cell,type I pneumocyte
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.4,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.3,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.1,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.4,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...
395,395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
396,396,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.1,0.0
397,397,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.2,0.0
398,398,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.2,0.0
