## CODE TO GET A SET WITH RANDOMLY MIXED CONDITIONS

<div class="alert alert-info">
    <b>Notes about usage: </b>
<ul style="list-style-type:decimal">
    <li>Set parameters </li>
    <li>Import dependencies</li>
    <li>Run required methods</li>
    <li>Check anndata object </li>
</ul>

</div>



___
### Parameters

In [34]:
folder_path = "/home/oneai/data/bucket_data_download/b2c29bbd640a75d9d4415fead7f854d4/pool"

metadata_feature_list= ['CancerousCells',
 'CellNumberPerSpot',
 'CellTypeList',
 'CellTypeNumber',
 'DataQuality',
 'Dataorigin',
 'DomainCellTypeForEachRegion',
 'Scenarios',
 'Subject',
 'Tissue']

seed=40

___
### Dependencies

In [35]:
import os
import collections
import itertools
import random

import numpy as np
import pandas as pd

import scanpy as sc
from anndata import AnnData
import hdf5plugin


___
### Methods

In [36]:
def get_mix_meta_template_dict(metadata_feature_list:list):
    """This method generate an empty dictionary used to save 
    :params metadata_feature_list (list): feature stored at metadata.
    """
    filter_dict = collections.defaultdict(dict)
    for key in metadata_feature_list:
        filter_dict[key] = []  
    filter_dict.update({'ids': []})
    return filter_dict

def get_mix_metadata(metadata_feature_list: list, metadata: dict, force_to_str=True):
    """This method is used to aggregate metadata comming from mixed dataset generation
    :params metadata_feature_list (list): feature stored at metadata.
    :params metadata (dict): dictionary with metadata from each synthetic dataset
    """
    mix_meta_dict = dict(get_mix_meta_template_dict(metadata_feature_list))

    for sample in metadata.keys():
        mix_meta_dict["ids"].append(sample)
        if force_to_str:
            for feature in list(mix_meta_dict.keys())[0:-1]:
                mix_meta_dict[feature].append(str(metadata[sample][feature]))
        else:
            for feature in list(mix_meta_dict.keys())[0:-1]:
                mix_meta_dict[feature].append(metadata[sample][feature])
    return mix_meta_dict

#def get_mixed_dataset(folder_path:str):
#    """ This method take all """

def check_list_of_list_size(lists):
    """ function used to check consistency """
    if all(len(lists[0]) == len(l) for l in lists[1:]):
        return True
    else:
        return False
    
def check_for_length(lists:list, msg:str):
    if check_list_of_list_size(lists):
        print(f"same number of {msg} detected")
    else:
        print(f"different number of {msg} detected, execution will brake")
        raise
        
def get_anndata_for_sim(X:np.array,
                     coordinates:np.array,
                     ground_truth_cc:pd.DataFrame,
                     ground_truth_cp: pd.DataFrame,
                     configure:dict,
                     spot_names:pd.DataFrame.index,
                     gene_names:pd.DataFrame.index):
    """This method generate a anndata object used for data simulation
    params: X (no.array): count matix with spots in rows and genes in columns
    params: coordinates (np.array) :numpy array with X1 and X2 coordinates from each spot.
    params: ground_truth_cc (pd.DataFrame) : dataframe with ground truth cell counts for each spot
    params: ground_truth_cp (pd.DataFrame) : dataframe with ground truth cell proportions for each spot
    """
    adata = AnnData(X, 
                    obsm={"spatial": coordinates}, 
                    dtype=np.int64)
    adata.uns['ground_truth']={"cell_proportion":ground_truth_cp, 'cell_count':ground_truth_cc}
    adata.uns['configure']  = configure
    adata.var= gene_names
    adata.obs= spot_names
    return adata


def get_mixed_dataset_from_path(folder_path:str, metadata_feature_list:dict, seed=40 ):
    """ This method generate an anndata object with a random mix of dataset builded for fixed conditions
    In order to run, desired h5ad files generated with simulation pipeline need to be placed at the same folder_path
    and a dict with metadata stored at configure bucket from anndata need to be provided. An anndata object will be provided.
    with randomnluy selected spots and metadata from samples stored at the folder_path
    :params folder path : directory
    :params metadata_feature: dictionary with metadata expected at configure file
    :params seed (int): random seed used for the sampling 
    """
    files_list = os.listdir(folder_path)
    h5ad_list = [item for item in files_list if ".h5ad" in item.lower() ]
    
    print ("h5ad files to be mixed: " + str(h5ad_list)) #JX added
    n_files = len(h5ad_list)
    X=[]
    ground_truth_cc=[]
    ground_truth_cp=[]
    cell_types=[]
    obs=[]
    var=[]
    spatial=[]
    configure={}

    for file in h5ad_list:
        anndata=sc.read_h5ad(os.path.join(folder_path,file))
        X.append(anndata.X)
        ground_truth_cc.append(anndata.uns['ground_truth']['cell_count'])
        ground_truth_cp.append(anndata.uns['ground_truth']['cell_proportion'])
        obs.append(anndata.obs)
        var.append(anndata.var)
        spatial.append(anndata.obsm['spatial'])
        configure[file]=anndata.uns['configure']

    check_for_length(obs, "spots")
    check_for_length(var, "genes")
    check_for_length(spatial, "spots coordinates")

    #fixed features (will not change)
    obs = obs[0]
    var = var[0]
    spatial = spatial[0]


    X=np.array(X)
    new_X= []
    new_ground_truth_cc=pd.DataFrame()
    new_ground_truth_cp=pd.DataFrame()
    configure = get_mix_metadata(metadata_feature_list, configure,force_to_str=True)

    #getting mixed results
    random.seed(seed)
    for spot in range(len(obs)):
        rand = random.randint(0, len(X)-1)
        new_X.append(X[rand, spot, :])
        new_ground_truth_cc = pd.concat([new_ground_truth_cc, ground_truth_cc[rand].iloc[spot, :]],axis=1 )
        new_ground_truth_cp= pd.concat([new_ground_truth_cp, ground_truth_cp[rand].iloc[spot, :]],axis=1)

    new_ground_truth_cc = new_ground_truth_cc.transpose()
    new_ground_truth_cp = new_ground_truth_cp.transpose()

    print(f"generating anndata object")
    adata = get_anndata_for_sim(
        X=np.array(new_X),
        coordinates=spatial,
        ground_truth_cc = new_ground_truth_cc,
        ground_truth_cp=new_ground_truth_cp,
        configure=configure,
        spot_names= obs,
        gene_names= var,
    )
    return adata

In [37]:
adata = get_mixed_dataset_from_path(folder_path, metadata_feature_list, seed=seed )

h5ad files to be mixed: ['simulated_SRT_dataset_24d8649ad90c3e20cc02ad42f5246bd1.h5ad', 'simulated_SRT_dataset_c456392019d1940e7e699a14aec5bcf6.h5ad', 'simulated_SRT_dataset_94a0d58229d38a8cfe6af48f0bc9ffe7.h5ad']
same number of spots detected
same number of genes detected
same number of spots coordinates detected
generating anndata object


___
### Check outputs

In [38]:
adata.X

array([[ 5,  0, 22, ...,  0,  0,  0],
       [ 3,  0, 12, ...,  0,  0,  0],
       [ 4,  0,  6, ...,  0,  0,  0],
       ...,
       [ 3,  0, 11, ...,  0,  0,  0],
       [ 2,  0, 17, ...,  0,  0,  0],
       [ 3,  0, 11, ...,  0,  0,  0]])

In [39]:
adata.X.shape

(400, 28024)

In [40]:
adata.uns['ground_truth']['cell_proportion']

Unnamed: 0,fibroblast of lung,B cell,alveolar capillary type 1 endothelial cell,epithelial cell of lung
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
395,1.0,0.0,0.0,0.0
396,1.0,0.0,0.0,0.0
397,1.0,0.0,0.0,0.0
398,1.0,0.0,0.0,0.0


In [41]:
adata.uns['ground_truth']['cell_count']

Unnamed: 0,fibroblast of lung,B cell,alveolar capillary type 1 endothelial cell,epithelial cell of lung
0,20.0,0.0,0.0,0.0
1,10.0,0.0,0.0,0.0
2,10.0,0.0,0.0,0.0
3,5.0,0.0,0.0,0.0
4,5.0,0.0,0.0,0.0
...,...,...,...,...
395,10.0,0.0,0.0,0.0
396,10.0,0.0,0.0,0.0
397,10.0,0.0,0.0,0.0
398,20.0,0.0,0.0,0.0


In [42]:
adata.obsm["spatial"]

array([[ 2.5,  2.5],
       [ 7.5,  2.5],
       [12.5,  2.5],
       [17.5,  2.5],
       [22.5,  2.5],
       [27.5,  2.5],
       [32.5,  2.5],
       [37.5,  2.5],
       [42.5,  2.5],
       [47.5,  2.5],
       [52.5,  2.5],
       [57.5,  2.5],
       [62.5,  2.5],
       [67.5,  2.5],
       [72.5,  2.5],
       [77.5,  2.5],
       [82.5,  2.5],
       [87.5,  2.5],
       [92.5,  2.5],
       [97.5,  2.5],
       [ 2.5,  7.5],
       [ 7.5,  7.5],
       [12.5,  7.5],
       [17.5,  7.5],
       [22.5,  7.5],
       [27.5,  7.5],
       [32.5,  7.5],
       [37.5,  7.5],
       [42.5,  7.5],
       [47.5,  7.5],
       [52.5,  7.5],
       [57.5,  7.5],
       [62.5,  7.5],
       [67.5,  7.5],
       [72.5,  7.5],
       [77.5,  7.5],
       [82.5,  7.5],
       [87.5,  7.5],
       [92.5,  7.5],
       [97.5,  7.5],
       [ 2.5, 12.5],
       [ 7.5, 12.5],
       [12.5, 12.5],
       [17.5, 12.5],
       [22.5, 12.5],
       [27.5, 12.5],
       [32.5, 12.5],
       [37.5,

In [43]:
adata.uns["configure"]

{'CancerousCells': ['No', 'No', 'No'],
 'CellNumberPerSpot': ['5', '20', '10'],
 'CellTypeList': ["['fibroblast of lung' 'epithelial cell of lung'\n 'alveolar capillary type 1 endothelial cell' 'B cell']",
  "['fibroblast of lung' 'epithelial cell of lung'\n 'alveolar capillary type 1 endothelial cell' 'B cell']",
  "['fibroblast of lung' 'epithelial cell of lung'\n 'alveolar capillary type 1 endothelial cell' 'B cell']"],
 'CellTypeNumber': ['4', '4', '4'],
 'DataQuality': ['Clean', 'Clean', 'Clean'],
 'Dataorigin': ['hlca', 'hlca', 'hlca'],
 'DomainCellTypeForEachRegion': ["{'A': 'fibroblast of lung', 'B': 'epithelial cell of lung', 'C': 'alveolar capillary type 1 endothelial cell', 'D': 'B cell'}",
  "{'A': 'fibroblast of lung', 'B': 'epithelial cell of lung', 'C': 'alveolar capillary type 1 endothelial cell', 'D': 'B cell'}",
  "{'A': 'fibroblast of lung', 'B': 'epithelial cell of lung', 'C': 'alveolar capillary type 1 endothelial cell', 'D': 'B cell'}"],
 'Scenarios': ['Same subje

In [44]:
import hashlib
dataset_id = hashlib.md5(str(adata.uns['configure']).encode()).hexdigest()
print (dataset_id)
print (folder_path)

8dbc6acb6712fee359bf3afe1857c536
/home/oneai/data/bucket_data_download/b2c29bbd640a75d9d4415fead7f854d4/pool


In [45]:
import hdf5plugin
adata.write_h5ad(
    os.path.join(folder_path, 'simulated_SRT_dataset_'+ dataset_id + ".h5ad"),
    compression=hdf5plugin.FILTERS["zstd"]
)

#generate groundtruth csv file
adata.uns['ground_truth']["cell_proportion"].to_csv(folder_path + '/simulated_SRT_dataset_' + dataset_id + '_gd_prop.csv')
adata.uns['ground_truth']["cell_count"].to_csv(folder_path + '/simulated_SRT_dataset_' + dataset_id +  '_gd_count.csv')

# Finally, remember to move generated files put of pool folder