In [2]:
import pandas as pd
import numpy as np
from scipy import ndimage as ndi
import glob

from PIL import Image
from numpy import asarray

from scipy.sparse import csr_matrix

from joblib import Parallel, delayed
from tqdm import tqdm

import os

# Every input/output path used below is relative, so it resolves against this directory.
# NOTE(review): hard-coded absolute path; the notebook only runs where this directory exists.
os.chdir('/active/paper')

In [3]:
# Tab-separated per-sample metadata table, indexed by the 'sample' column.
midbrain_metadata = pd.read_csv(
    'input/metadata/sample_metadata_midbrain.txt',
    sep='\t',
    index_col='sample',
)
midbrain_metadata

Unnamed: 0_level_0,sample_name,region,age,genotype,mouse,dopa_patch_x,dopa_patch_y
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FP200000454TL_E2,YOUNG_WT_REL204.1a,MB,YOUNG,WT,REL204.1a,16000,9000
FP200000542BR_A5,YOUNG_WT_REL204.1b,MB,YOUNG,WT,REL204.1b,6000,12000
FP200000542BR_A3,YOUNG_OVX_REL206.1f,MB,YOUNG,OVX,REL206.1f,11500,11000
FP200000542BR_F3,YOUNG_OVX_REL207.1a,MB,YOUNG,OVX,REL207.1a,8500,12500
FP200000578TR_C3,OLD_WT_REL121.1a,MB,OLD,WT,REL121.1a,15000,11500
FP200000542BR_F4,OLD_WT_REL121.1b,MB,OLD,WT,REL121.1b,10000,12000
FP200000578TR_C5,OLD_OVX_REL120.1b,MB,OLD,OVX,REL120.1b,16000,16000
FP200000542BR_A1,OLD_OVX_REL120.1a,MB,OLD,OVX,REL120.1a,11000,11500


In [4]:
files = glob.glob("input/masks/20211214/*.png")

# Map sample id -> mask image path, keeping only samples listed in the
# midbrain metadata index.
mask_dict = {}

for mask_file in sorted(files):
    file_name = mask_file.rsplit("/", 1)[-1]
    # The sample id is the text after the last 'spot_intensity_' and before
    # the first '.png' in the file name.
    after_prefix = file_name.rsplit('spot_intensity_', 1)[-1]
    label = after_prefix.split('.png', 1)[0]
    if label in midbrain_metadata.index:
        mask_dict[label] = mask_file

mask_dict

{'FP200000454TL_E2': 'input/masks/20211214/spot_intensity_FP200000454TL_E2.png',
 'FP200000542BR_A1': 'input/masks/20211214/spot_intensity_FP200000542BR_A1.png',
 'FP200000542BR_A3': 'input/masks/20211214/spot_intensity_FP200000542BR_A3.png',
 'FP200000542BR_A5': 'input/masks/20211214/spot_intensity_FP200000542BR_A5.png',
 'FP200000542BR_F3': 'input/masks/20211214/spot_intensity_FP200000542BR_F3.png',
 'FP200000542BR_F4': 'input/masks/20211214/spot_intensity_FP200000542BR_F4.png',
 'FP200000578TR_C3': 'input/masks/20211214/spot_intensity_FP200000578TR_C3.png',
 'FP200000578TR_C5': 'input/masks/20211214/spot_intensity_FP200000578TR_C5.png'}

In [8]:
files = glob.glob("input/gems/midbrain/*bin1.Lasso.gem.gz")

# Map sample id -> GEM file path, keeping only samples listed in the
# midbrain metadata index.
gem_dict = {}

for gem_file in sorted(files):
    file_name = gem_file.rsplit("/", 1)[-1]
    # The sample id is everything before the first '.bin1.Lasso.gem.gz'.
    label = file_name.split('.bin1.Lasso.gem.gz', 1)[0]
    if label in midbrain_metadata.index:
        gem_dict[label] = gem_file

gem_dict

{'FP200000454TL_E2': 'input/gems/midbrain/FP200000454TL_E2.bin1.Lasso.gem.gz',
 'FP200000542BR_A1': 'input/gems/midbrain/FP200000542BR_A1.bin1.Lasso.gem.gz',
 'FP200000542BR_A3': 'input/gems/midbrain/FP200000542BR_A3.bin1.Lasso.gem.gz',
 'FP200000542BR_A5': 'input/gems/midbrain/FP200000542BR_A5.bin1.Lasso.gem.gz',
 'FP200000542BR_F3': 'input/gems/midbrain/FP200000542BR_F3.bin1.Lasso.gem.gz',
 'FP200000542BR_F4': 'input/gems/midbrain/FP200000542BR_F4.bin1.Lasso.gem.gz',
 'FP200000578TR_C3': 'input/gems/midbrain/FP200000578TR_C3.bin1.Lasso.gem.gz',
 'FP200000578TR_C5': 'input/gems/midbrain/FP200000578TR_C5.bin1.Lasso.gem.gz'}

In [15]:
def create_mask_df(sample_name, folder):
    """Label the objects in a sample's mask image and write them as a pixel table.

    The mask image registered for ``sample_name`` in the module-level
    ``mask_dict`` is loaded, its non-zero regions are labelled with
    ``scipy.ndimage.label``, and every labelled (non-zero) pixel is written as
    one row of ``input/masks_dfs/midbrain/<folder>/<sample_name>_mask.csv``
    with index columns ``x``, ``y`` and a ``label`` column.

    Only the mask table is written; GEM counts are not read or joined here.

    Parameters
    ----------
    sample_name : str
        Key into ``mask_dict``; also used as the output file-name prefix.
    folder : str
        Sub-directory of ``input/masks_dfs/midbrain/`` to write into. It is
        created if it does not exist.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``sample_name`` is not a key of ``mask_dict``.
    """
    # Raise Pillow's pixel-count limit so very large mask images can be opened.
    Image.MAX_IMAGE_PIXELS = 10000000000

    print(sample_name)

    mask_file = mask_dict[sample_name]

    # Use a context manager so the underlying file handle is released as soon
    # as the pixel data has been copied into a numpy array.
    with Image.open(mask_file) as image:
        data = asarray(image)

    # Assign an integer id to each connected group of non-zero pixels.
    # Assumes the mask is a single-channel 2-D image (csr_matrix below needs 2-D) -- TODO confirm.
    label_objects, nb_labels = ndi.label(data)

    # A sparse representation stores only the non-zero (labelled) pixels, so
    # unlabelled background pixels produce no rows.
    coo = csr_matrix(label_objects).tocoo(copy=False)

    # NOTE(review): 'x' is taken from the array row index and 'y' from the
    # column index; confirm this matches the coordinate convention of the
    # data this table is later joined with.
    mask_df = (
        pd.DataFrame({'x': coo.row, 'y': coo.col, 'label': coo.data})
        .sort_values(['x', 'y'])
        .set_index(['x', 'y'])
    )

    path = 'input/masks_dfs/midbrain/' + folder
    # exist_ok avoids the check-then-create race when several joblib workers
    # try to create the same output directory at once.
    os.makedirs(path, exist_ok=True)

    mask_df.to_csv(path + '/' + sample_name + '_mask.csv')

In [10]:
# Wrap the sample ids in a progress bar; iterating a dict yields its keys.
inputs = tqdm(list(mask_dict))
inputs

  0%|                                                                                                                                                                                                                  | 0/8 [00:00<?, ?it/s]

<tqdm.std.tqdm at 0x7fc9f3af0100>

In [16]:
# Run a single sample (the first key of mask_dict) before processing the rest.
create_mask_df(list(mask_dict)[0], folder='20220907')

FP200000454TL_E2


In [7]:
inputs = tqdm(list(mask_dict.keys()))

# create_mask_df has a required `folder` argument; calling it with only the
# sample name makes every task raise TypeError. Pass the output sub-folder
# explicitly so all samples are written to the same dated directory.
Parallel(n_jobs=2)(
    delayed(create_mask_df)(sample_name, folder='20220907')
    for sample_name in inputs
)

100%|██████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.62s/it]


FP200000542BR_A1
FP200000542BR_F3
FP200000578TR_C3




[None, None, None, None, None, None, None]

FP200000542BR_A3
FP200000542BR_A5
FP200000542BR_F4
FP200000578TR_C5
