In [4]:
from scipy.stats import median_abs_deviation

import sys
sys.path.insert(0, '/home/workspace/mm_analysis')
sys.path.insert(0, '/home/workspace/')

from py_util import *
from utilities import *

# Workspace root; every other location below is derived from it.
hdir = '/home/workspace'

# Working directory of the EXP-01244 analysis.
wdir = "".join((hdir, "/mm_analysis/EXP-01244"))

# Directory holding serialized AnnData objects. The trailing slash is kept
# because file names are appended to this string directly.
objdir = "".join((wdir, "/object_building/objects/"))

# Load the processed, merged AnnData object from disk.
adata = sc.read_h5ad("".join((objdir, "processed_merged_adata.h5ad")))

# Downsampled

In [21]:
# Root directory of the Cell Ranger outputs for EXP-01244
cr_outs_path = os.path.join(hdir, "mm_analysis/EXP-01244/data/EXP-01244_cr_outs")

# Dictionary mapping user-friendly sample names to their corresponding IDs
sample_dict = {
    'week2': "OR07965-01",
    'week3': "OR07965-02",
    'week4': "OR00001",
    'bm': "BMC07965-007",
    'msc': "CELL00911",
}

# Dictionary mapping user-friendly sample names to display labels
name_dict = {
    'week2': "Week 2",
    'week3': "Week 3",
    'week4': "Week 4",
    'bm': "BMMC Start Sample",
    'msc': "MSC Start Sample",
}

# Create reverse mapping from sample IDs to their user-friendly names
id_to_sample = {v: k for k, v in sample_dict.items()}

# Find all sample_filtered_feature_bc_matrix.h5 files in the directory structure.
# os.walk yields directories in arbitrary (filesystem-dependent) order, so the
# result is sorted: the order of these paths determines the row order of the
# concatenated objects built from them, and therefore which cells a seeded
# subsample selects.
h5_name = 'sample_filtered_feature_bc_matrix.h5'
h5_paths = sorted(
    os.path.join(root, h5_name)
    for root, _, files in os.walk(cr_outs_path)
    if h5_name in files
)

# IDs of the starting materials, looked up from sample_dict so the IDs are
# written down in exactly one place.
starters = [sample_dict['bm'], sample_dict['msc']]

# Every sample ID that is not a starting material
samples = [sample_id for sample_id in sample_dict.values() if sample_id not in starters]

# Merged AnnData object per sample, keyed by sample type (e.g. 'week2')
final_adatas = {}

for sample in samples:

    # Get only the sample h5 paths (substring match on the sample ID)
    paths = [path for path in h5_paths if sample in path]

    # Fail early with a clear message instead of letting the concat below
    # fail on an empty collection.
    if not paths:
        raise FileNotFoundError(
            f"No sample_filtered_feature_bc_matrix.h5 found for sample {sample!r} "
            f"under {cr_outs_path!r}"
        )

    # Dictionary to store AnnData objects for each replicate of this sample
    adatas = {}

    # Process each H5 file
    for path in paths:
        # Extract sample name: the directory directly below 'per_sample_outs/'
        name = path.split('per_sample_outs/')[1].split('/')[0]

        # Read the H5 file and create AnnData object
        rep_adata = sc.read_10x_h5(path)
        rep_adata.var_names_make_unique()

        rep_adata.obs['sample'] = name

        # Strip a trailing "_<digits>" suffix to get the shared ID of batched replicates
        rep_adata.obs['base_sample'] = rep_adata.obs['sample'].str.replace(r'_\d+$', '', regex=True)
        # Add sample names (week2, week3, etc.)
        rep_adata.obs['sample_type'] = rep_adata.obs['base_sample'].replace(id_to_sample)
        # Add pretty names (Week 2, Week 3, etc.)
        rep_adata.obs['name'] = rep_adata.obs['sample_type'].replace(name_dict)

        # The object was just read from disk and is not referenced elsewhere,
        # so it is stored directly rather than as a copy.
        adatas[name] = rep_adata

    # NOTE(review): barcodes may repeat across replicates, which would leave
    # non-unique obs names after concatenation - confirm whether that matters
    # downstream.
    merged = ad.concat(adatas.values(), join='outer', merge='same')

    # Key the merged object by the first sample type present in it
    final_adatas[merged.obs['sample_type'].unique()[0]] = merged

# Downsample every sample to the lowest cell count found among them
# (Week 3: 1092 when this was written). The target is computed from the data
# so it cannot go stale or exceed the size of the smallest sample.
if not final_adatas:
    raise ValueError("final_adatas is empty; nothing to downsample")

n_cells_target = min(sample_adata.n_obs for sample_adata in final_adatas.values())

ds_adata = {}

for name, sample_adata in final_adatas.items():
    # copy=True makes subsample return a new object and leave its input
    # untouched, so no extra copy of the input is needed.
    ds_adata[name] = sc.pp.subsample(
        sample_adata,
        n_obs = n_cells_target,
        random_state = 0,
        copy = True
    )

# Combine the downsampled samples into a single object
adata = ad.concat(ds_adata.values(), join = 'outer', merge = 'same')

adata.write(objdir + 'downsampled_all_organoids_adata.h5ad', compression='gzip')

... storing 'sample' as categorical
... storing 'base_sample' as categorical
... storing 'sample_type' as categorical
... storing 'name' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical
