## Batch Process h5ad files for EnDecon and CARD analysis
<font color="red">For scenario 1 and 3 datasets</font>

Step-003: Process synthetic datasets for EnDecon input
Description: In this step, we will read in all synthetic files within the assigned folder, process the single cell reference and synthetic data into 4 csv files. These csv files will be the input files for EnDecon. Note that some of these csv files might have a large file size.


### Use Kernal: sim_data

#### assuming the gene list is the same for sc reference and synthetic data, otherwise, you need to perform intersection for genes first!

In [1]:
# pip install hdf5plugin
import pandas as pd
import scanpy as sc
import hdf5plugin #important to read files downloaded from s3
# hdf5plugin.version   #  should be at least 4.1.3
import os

## Data processing Functions

In [2]:
def SynData_pro (sim_file, ref_id, refdir):
    # Read data
    print ("Read Synthetic data: " + sim_file)
    sim_path = "/home/oneai/data/bucket_data_download/" + ref_id + "/" + sim_file
    sim_id = sim_file[:-5] #get rid of ".h5ad"
    adata_vis = sc.read_h5ad (sim_path)
    adata_vis.obs['sample'] = sim_id
    
    syndir = refdir + sim_id + "/"
    if not os.path.exists(syndir):
        os.mkdir(syndir)
    
    #Process synthetic data
    # File 1: Spatial count
    spatial_count_df = pd.DataFrame(adata_vis.X.transpose(), columns= list(adata_vis.obs.index) , index= list(adata_vis.var.index))
    file1 = syndir + 'spatial_count.csv'
    spatial_count_df.to_csv(file1, index = True)
    print ("Export file 1/2: spatial_count")
    
    # File 2: Spatial location
    spatial_location_df = pd.DataFrame(adata_vis.obsm['spatial'], columns = ["x","y"])
    spatial_location_df.index = spatial_location_df.index.map(str)
    spatial_location_df.index = "X" + spatial_location_df.index
    file2 = syndir + 'spatial_location.csv'
    spatial_location_df.to_csv(file2, index = True)
    print ("Export file 2/2: spatial_location")
    
    return file1, file2

In [3]:
def RefData_pro (ref_id, refdir):
    # Read data
    print ("Read reference data: " + ref_id)
    ref_run_name = "/home/oneai/data/bucket_data_download/" + ref_id      #for reference_signatures
    adata_ref = sc.read_h5ad(ref_run_name + "/Reference_for_deconv.h5ad")
    
    #    
    #Process reference data
    # File 1: sc meta 
    sc_meta_df = pd.DataFrame (adata_ref.obs, columns = ['cell_type', 'sample'])
    # for CARD dense matrix, "-" will be "." 
    sc_meta_df.index = sc_meta_df.index.str.replace('-','.')
    sc_meta_df['cellID'] = sc_meta_df.index
    file1 = refdir + 'sc_meta.csv'
    sc_meta_df.to_csv(file1, index = True)
    print ("Export file 1/2: sc_meta")
    
    # File 2: sc count 
    sc_count_df = adata_ref.X.todense()
    sc_count_df = pd.DataFrame(sc_count_df.transpose(), columns= list(adata_ref.obs.index) , index= list(adata_ref.var.index))
    file2 = refdir + 'sc_count.csv'
    sc_count_df.to_csv(file2, index = True)
    print ("Export file 2/2: sc_count")
    
    return file1, file2

## Batch running

### Step1  Your input in this cell only, change ref_if for scenario 1 or 3

In [10]:
#ref_id Should be the folder name under /results/syn_deconv/

#ref_id = '6702e4f7a944d6095c2c2d1496311866' # for scenario 3
ref_id = 'b2c29bbd640a75d9d4415fead7f854d4' # For scenario 1
synthetic_folder = "/home/oneai/data/bucket_data_download/" +ref_id

In [11]:
# get a list of simulated files in this folder
# file named start with "simulated_SRT_dataset"
#sim_file_list = [file_name for file_name in os.listdir(synthetic_folder) if 'simulated_SRT_dataset' in file_name]

sim_file_list = [file_name for file_name in os.listdir(synthetic_folder) if ('simulated_SRT_dataset' in file_name)&(".h5ad" in file_name.lower())]
sim_file_list

['simulated_SRT_dataset_94a0d58229d38a8cfe6af48f0bc9ffe7.h5ad',
 'simulated_SRT_dataset_dcc01a2a5f65d552a54ee0d1f8863acc.h5ad',
 'simulated_SRT_dataset_c456392019d1940e7e699a14aec5bcf6.h5ad',
 'simulated_SRT_dataset_4555a63ca602ee279f81713e3f30adfb.h5ad',
 'simulated_SRT_dataset_8dbc6acb6712fee359bf3afe1857c536.h5ad',
 'simulated_SRT_dataset_49db3a23106485d79a11c5f9ec2be108.h5ad',
 'simulated_SRT_dataset_24bc51ffa07c227a8aaab7f616fa7166.h5ad',
 'simulated_SRT_dataset_24d8649ad90c3e20cc02ad42f5246bd1.h5ad']

### Step2 Batch processing single cell reference data and synthetic data
This steps takes longer time

In [12]:
refdir = "/home/oneai/data/EnDecon_input/" + ref_id + "/"
if not os.path.exists(refdir):
    os.mkdir(refdir)

# Process reference dataset
sc_meta, sc_count = RefData_pro (ref_id, refdir)

# Process syntetic dataset
for file in sim_file_list: 
    print (file)
    spatial_count, spatial_location = SynData_pro (file, ref_id, refdir)
    

simulated_SRT_dataset_94a0d58229d38a8cfe6af48f0bc9ffe7.h5ad
Read Synthetic data: simulated_SRT_dataset_94a0d58229d38a8cfe6af48f0bc9ffe7.h5ad
Export file 1/2: spatial_count
Export file 2/2: spatial_location
simulated_SRT_dataset_dcc01a2a5f65d552a54ee0d1f8863acc.h5ad
Read Synthetic data: simulated_SRT_dataset_dcc01a2a5f65d552a54ee0d1f8863acc.h5ad
Export file 1/2: spatial_count
Export file 2/2: spatial_location
simulated_SRT_dataset_c456392019d1940e7e699a14aec5bcf6.h5ad
Read Synthetic data: simulated_SRT_dataset_c456392019d1940e7e699a14aec5bcf6.h5ad
Export file 1/2: spatial_count
Export file 2/2: spatial_location
simulated_SRT_dataset_4555a63ca602ee279f81713e3f30adfb.h5ad
Read Synthetic data: simulated_SRT_dataset_4555a63ca602ee279f81713e3f30adfb.h5ad
Export file 1/2: spatial_count
Export file 2/2: spatial_location
simulated_SRT_dataset_8dbc6acb6712fee359bf3afe1857c536.h5ad
Read Synthetic data: simulated_SRT_dataset_8dbc6acb6712fee359bf3afe1857c536.h5ad
Export file 1/2: spatial_count
Expo

### Optional: check the output files

In [16]:
sim_id = '43885de192f6b406821c09b949f8e5ea'

In [17]:
sc_count = pd.read_csv('/home/oneai/data/EnDecon_input/'+ ref_id + '/simulated_SRT_dataset_' + sim_id + '/spatial_count.csv')

In [18]:
sc_meta

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,390,391,392,393,394,395,396,397,398,399
0,ENSG00000000003,0,0,0,0,0,0,0,0,0,...,3,0,3,0,3,0,2,0,2,0
1,ENSG00000000005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSG00000000419,4,1,3,1,4,1,4,1,3,...,2,1,2,2,2,1,2,1,2,2
3,ENSG00000000457,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000000460,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28019,ENSG00000283078,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28020,ENSG00000283103,0,1,0,0,0,1,0,0,0,...,3,2,3,2,3,3,3,2,3,2
28021,ENSG00000283117,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28022,ENSG00000283118,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
spatial_count = pd.read_csv('/home/oneai/data/EnDecon_input/6702e4f7a944d6095c2c2d1496311866/simulated_SRT_dataset_43885de192f6b406821c09b949f8e5ea/spatial_count.csv')

In [20]:
spatial_count

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,390,391,392,393,394,395,396,397,398,399
0,ENSG00000000003,0,0,0,0,0,0,0,0,0,...,3,0,3,0,3,0,2,0,2,0
1,ENSG00000000005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSG00000000419,4,1,3,1,4,1,4,1,3,...,2,1,2,2,2,1,2,1,2,2
3,ENSG00000000457,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000000460,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28019,ENSG00000283078,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28020,ENSG00000283103,0,1,0,0,0,1,0,0,0,...,3,2,3,2,3,3,3,2,3,2
28021,ENSG00000283117,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28022,ENSG00000283118,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
