### Step2: cell typing

In [1]:
import numpy as np
import pandas as pd
import anndata
import matplotlib.pyplot as plt
import pickle
import scipy.stats as stats
from collections import defaultdict
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import matplotlib.gridspec as gridspec
import matplotlib.colors as colors
import random
import math
import timeit
# start = timeit.default_timer()
# stop = timeit.default_timer()
# print(f'Time: {stop - start}') 
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
pd.options.mode.chained_assignment = None  # default='warn'
from collections import Counter

red = '#c0362c'
lightgreen = '#93c572'
darkgreen = '#4c9141'
lightblue = '#5d8aa8'
darkblue = '#2e589c'
white = '#fafafa'
lightgray = '#d3d3d3'
darkgray ='#545454'
lightorange = '#fabc2e'
darkorange = '#fb9912'
yellow = '#e4d00a'



#### load labeled cells from step1

In [2]:
# pkl
outfile = 'output_step1/crop_data_allcells.pkl'
# save
# pickle_dict = {}
# pickle_dict['crop_cell_df'] = crop_cell_df
# pickle_dict['cl_1centroid'] = cl_1centroid
# pickle_dict['seg_df_com'] = seg_df_com
# pickle_dict['centroid_df_com'] = centroid_df_com
# with open(outfile, 'wb') as f:
#     pickle.dump(pickle_dict, f)
    
# load
with open(outfile, 'rb') as f:
    pickle_dict = pickle.load(f)
crop_cell_df = pickle_dict['crop_cell_df']
cl_1centroid = pickle_dict['cl_1centroid']
seg_df_com = pickle_dict['seg_df_com']
centroid_df_com = pickle_dict['centroid_df_com']

In [3]:
# rename
crop_cell_df = crop_cell_df.rename(columns={'label': 'cell', 
                                            'target_gene': 'gene', 
                                            'x': '_x',
                                            'y': '_y',
                                            'global_x': 'x',
                                            'global_y': 'y',
                                            'global_z': 'z',
                                            'center_x': 'centerX',
                                            'center_y': 'centerY',
                                           })

In [4]:
crop_cell_df.head()

Unnamed: 0,barcode_id,x,y,z,_x,_y,fov,gene,cell,centerX,centerY,sc_total
41968214,77,-3691.7622,1518.1683,0.0,678.88007,1375.7654,846,Slc17a7,100011840018250995710399369016464324637,-3693.983511,1515.632457,367
41969082,113,-3692.968,1514.7861,0.0,667.81635,1344.7354,846,Tox,100011840018250995710399369016464324637,-3693.983511,1515.632457,367
41969810,133,-3695.886,1518.9193,0.0,641.04626,1382.6552,846,Etv1,100011840018250995710399369016464324637,-3693.983511,1515.632457,367
41970990,204,-3697.1086,1518.0354,0.0,629.83014,1374.5458,846,Vgf,100011840018250995710399369016464324637,-3693.983511,1515.632457,367
41972011,248,-3691.7605,1519.7418,0.0,678.89496,1390.2002,846,Satb1,100011840018250995710399369016464324637,-3693.983511,1515.632457,367


In [5]:
cell_list = crop_cell_df.cell.unique()
print(f'#cells {len(cell_list)}') # #cells 58355

print(len(cl_1centroid))
print(crop_cell_df.cell.nunique())

#cells 58355
58355
58355


In [6]:
gene_list = crop_cell_df.gene.unique()
print(f'#genes {len(gene_list)}') # #genes 1240

#genes 1240


#### initial cell typing

In [7]:
##### 44 marker genes in Stereoseq paper
# figure S1 - J
geneMarker44List = ['Cldn5', 'Flt1', 'Ptgds', 'Hbb-bs', 'Acta2', 'Plp1', 'Mbp', 'Olig2', 'Pdgtra', 'Sox10', 'Cx3cr1', 'P2ry12', 'Csf1r', 'C1qa', 
                    'Sox9', 'Aqp4', 'Gfap', 'Mfge8', 'Sparc', 'Agt', 'Slc4a4', 'Slc1a3', 'Slc1a2', 
                    'Reln', 'Pvalb', 'Sst', 'Npy', 'Vip', 'Gad2', 'Gad1', 'Tcf7l2', 'Prkcd', 'Prox1', 
                    'Hpca', 'Th', 'Rprm', 'Cpne4', 'Myl4', 'Rorb', 'Cux2', 'Lamp5', 'Slc17a6', 'Slc17a7', 'Tubb3']
print(f'marker44 {len(geneMarker44List)}')

marker44 44


In [8]:
marker_ = []
for g in geneMarker44List:
    if g in gene_list:
        marker_.append(g)
    
print(f'marker_ {len(marker_)}')
print(marker_)

marker_ 22
['Cldn5', 'Acta2', 'Sox10', 'Sox9', 'Aqp4', 'Gfap', 'Mfge8', 'Agt', 'Reln', 'Pvalb', 'Gad2', 'Tcf7l2', 'Prkcd', 'Prox1', 'Th', 'Rprm', 'Myl4', 'Rorb', 'Cux2', 'Lamp5', 'Slc17a6', 'Slc17a7']


In [9]:
marker22 = np.array(marker_)

In [10]:
marker_EX = ['Rprm', 'Myl4', 'Rorb', 'Cux2', 'Lamp5', 'Slc17a7']
marker_IN = ['Reln', 'Pvalb', 'Gad2']
marker_Astr = ['Sox9', 'Aqp4', 'Gfap', 'Mfge8', 'Agt']
marker_Oligo = ['Sox10']
marker_else = ['Cldn5', 'Acta2', 'Tcf7l2', 'Prkcd', 'Prox1', 'Th', 'Slc17a6']

idx_EX = np.array([16,17,18,19,20,22])-1
idx_IN = np.array([9,10,11])-1
idx_Astr = np.array([4,5,6,7,8])-1
idx_Oligo = np.array([3])-1
idx_else = np.array([1,2,12,13,14,15,21])-1

In [11]:
print(marker22[idx_EX])
print(marker22[idx_IN])
print(marker22[idx_Astr])
print(marker22[idx_Oligo])
print(marker22[idx_else])

['Rprm' 'Myl4' 'Rorb' 'Cux2' 'Lamp5' 'Slc17a7']
['Reln' 'Pvalb' 'Gad2']
['Sox9' 'Aqp4' 'Gfap' 'Mfge8' 'Agt']
['Sox10']
['Cldn5' 'Acta2' 'Tcf7l2' 'Prkcd' 'Prox1' 'Th' 'Slc17a6']


In [12]:
# save marker genes
outfile = 'output_step2/marker_dict.pkl'
# save
pickle_dict = {}
pickle_dict['geneMarker44List'] = geneMarker44List
pickle_dict['marker22'] = marker22
pickle_dict['marker_EX'] = marker_EX
pickle_dict['marker_IN'] = marker_IN
pickle_dict['marker_Astr'] = marker_Astr
pickle_dict['marker_Oligo'] = marker_Oligo
pickle_dict['marker_else'] = marker_else
pickle_dict['idx_EX'] = idx_EX
pickle_dict['idx_IN'] = idx_IN
pickle_dict['idx_Astr'] = idx_Astr
pickle_dict['idx_Oligo'] = idx_Oligo
pickle_dict['idx_else'] = idx_else
with open(outfile, 'wb') as f:
    pickle.dump(pickle_dict, f)

# load
# with open(outfile, 'rb') as f:
#     pickle_dict = pickle.load(f)
# registered_dict = pickle_dict['df_registered']

In [13]:
# the expr of each marker gene
marker_dict = {}
for g in marker22[:]:
    crop_cell_df_g = crop_cell_df[crop_cell_df.gene==g]
    count_g = crop_cell_df_g.groupby('cell').size()
    marker_dict[f'{g}_cl'] = count_g.index.to_list()
    marker_dict[f'{g}_expr'] = count_g.values.tolist()

In [14]:
# marker expr for cell_list
res_list = []
for g in marker22[:]:
    data = cell_list
    keys = marker_dict[f'{g}_cl']
    vals = marker_dict[f'{g}_expr']
    memo = defaultdict(lambda: (0))
    for k, v in zip(keys, vals):
        if k not in memo:
            memo[k] = v
        else:
            raise ValueError('duplicated keys')

    res = np.array(list(map(lambda x: memo[x], data)))
    #print(len(res), len(data), len(keys), len(vals))
    res_list.append(res.tolist())
    
expr_mat = np.array(res_list).transpose()
print(expr_mat.shape) # (58355, 22)

(58355, 22)


In [15]:
# find sc total for `cell_list`
sct_df = crop_cell_df.groupby('cell')['sc_total'].first().head()
data = cell_list
keys = sct_df.index.to_list()
vals = sct_df.values.tolist()
memo = defaultdict(lambda: (0))
for k, v in zip(keys, vals):
    if k not in memo:
        memo[k] = v
    else:
        raise ValueError('duplicated keys')

sc_total_cl = np.array(list(map(lambda x: memo[x], data)))
print(len(sc_total_cl))

58355


In [16]:
# std `expr_mat` by sc_total
MG = 22
c0 = 1e-8
sc_total_mat = np.tile(np.array(sc_total_cl), (MG, 1)).transpose()
print(sc_total_mat.shape)
expr_mat_sc = expr_mat/(sc_total_mat+c0)
print(expr_mat_sc.shape)

(58355, 22)
(58355, 22)


In [17]:
# std across cells
c0 = 1e-8
expr_mat_stdC = np.zeros(expr_mat_sc.shape)
for ig in range(expr_mat_sc.shape[1]):
    expr_mat_stdC[:,ig] = (expr_mat_sc[:,ig]-np.mean(expr_mat_sc[:,ig]))/(np.std(expr_mat_sc[:,ig])+c0)
print(expr_mat_stdC.shape)

(58355, 22)


In [18]:
# std across genes
expr_mat_stdC_stdG = np.zeros(expr_mat_stdC.shape)
for ic in range(expr_mat.shape[0]):
    expr_mat_stdC_stdG[ic,:] = (expr_mat_stdC[ic,:]-np.mean(expr_mat_stdC[ic,:]))/np.std(expr_mat_stdC[ic,:])
print(expr_mat_stdC_stdG.shape)

(58355, 22)


In [19]:
# sum for each of the 5 catergory (4 markers 1 else)
expr_mat_cat5 = np.zeros((expr_mat_stdC_stdG.shape[0],5))

# 6 EX markers
expr_mat_cat5[:,0] = np.sum(expr_mat_stdC_stdG[:,idx_EX], axis=1)/len(idx_EX)
# 3 IN markers
expr_mat_cat5[:,1] = np.sum(expr_mat_stdC_stdG[:,idx_IN], axis=1)/len(idx_EX)
# 5 Astr markers
expr_mat_cat5[:,2] = np.sum(expr_mat_stdC_stdG[:,idx_Astr], axis=1)/len(idx_Astr)
# 1 Oligo markers
expr_mat_cat5[:,3] = np.sum(expr_mat_stdC_stdG[:,idx_Oligo], axis=1)/len(idx_Oligo)
# 7 else
expr_mat_cat5[:,4] = np.sum(expr_mat_stdC_stdG[:,idx_else], axis=1)/len(idx_else)

print(expr_mat_cat5.shape)

(58355, 5)


In [20]:
# argmax as initial cell type
idx2type_dict = {0:'EX', 1:'IN', 2:'Astr', 3:'Oligo', 4:'else'}
type_list_ = []
for ic in range(expr_mat_cat5.shape[0]):
    idx = np.argmax(expr_mat_cat5[ic,:])
    type_list_.append(idx2type_dict[idx])
print(Counter(type_list_))
# Counter({'else': 14309, 'EX': 13725, 'Oligo': 12150, 'Astr': 11480, 'IN': 6691})
type_list_ = np.array(type_list_)
print(len(type_list_))
# 58355

Counter({'else': 14309, 'EX': 13725, 'Oligo': 12150, 'Astr': 11480, 'IN': 6691})
58355


In [21]:
# for cells with less than 10 counts in marker22, change type to 'less'
idx_less10 = np.sum(expr_mat, axis=1)<10
print(np.sum(idx_less10))
type_list = type_list_.copy()
type_list[idx_less10] = 'less'
print(Counter(type_list))
print(len(type_list))

11147
Counter({'EX': 12736, 'less': 11147, 'else': 10269, 'Astr': 9772, 'Oligo': 9324, 'IN': 5107})
58355


In [22]:
# add cell type to df
data = crop_cell_df.cell.tolist()
keys = cell_list
vals = type_list
memo = defaultdict(lambda: (0))
for k, v in zip(keys, vals):
    if k not in memo:
        memo[k] = v
    else:
        raise ValueError('duplicated keys')

res = np.array(list(map(lambda x: memo[x], data)))

print(len(res), len(data), len(keys), len(vals))

# append
crop_cell_df['type'] = np.array(res)

16720026 16720026 58355 58355


In [25]:
# save intermediate results
outfile = 'output_step2/typed_new_merfish_data_dict.pkl'
# save
pickle_dict = {}
pickle_dict['cell_mask_df'] = seg_df_com
pickle_dict['data_df'] = crop_cell_df
pickle_dict['centroid_df_com'] = centroid_df_com
with open(outfile, 'wb') as f:
    pickle.dump(pickle_dict, f)

# load
# with open(outfile, 'rb') as f:
#     pickle_dict = pickle.load(f)
# registered_dict = pickle_dict['df_registered']

#### quality control

QC 1:\
cb at diff z should roughly agree\
using KL divergence of segs now\
can change to iou if want

QC2:\
center should not be too close to boundary\
check on z=0 only

didn't QC on cell size\
seems brain cells size can vary\
allow varing cell sizes

*QC 1*

In [23]:
# cell list with valid type
print(len(cell_list))
cell_list_withtype = cell_list[
    np.logical_or.reduce((type_list=='EX', 
                          type_list=='IN', 
                          type_list=='Astr', 
                          type_list=='Oligo'))
]
print(len(cell_list_withtype))

58355
36939


In [24]:
# seg with KLD at any z > this will be excluded
# chose this bc step1 under section "explore KL divergense of two segs" 
# the cells in the plot
KLD_thre = 0.5 #!!!<<<

#!!!<<< number of x/y bins
# 5 seems good
D = 5 

In [25]:
cell_list_KLD = []

start = timeit.default_timer()

Z = 5 # num of z panels
for ic, c in enumerate(cell_list_withtype[:]):
    if ic%5e3 == 0:
        print(np.round(ic/len(cell_list_withtype),1))
        
    seg_df_c = seg_df_com[seg_df_com['Unnamed: 0']==c]
    
    # all bc x/y vals
    cb_x_allz = np.empty([0])
    cb_y_allz = np.empty([0])
    for z in range(Z):
        cb_x_str = seg_df_c['boundaryX_z'+str(z)].iloc[0]
        cb_y_str = seg_df_c['boundaryY_z'+str(z)].iloc[0]
        cb_x = np.fromstring(cb_x_str, sep=',')
        cb_y = np.fromstring(cb_y_str, sep=',')
        cb_x_allz = np.concatenate((cb_x_allz, cb_x))
        cb_y_allz = np.concatenate((cb_y_allz, cb_y))
    xmin_ = cb_x_allz.min() - c0
    xmax_ = cb_x_allz.max() + c0
    ymin_ = cb_y_allz.min() - c0
    ymax_ = cb_y_allz.max() + c0

    delta_x = (xmax_-xmin_)/D
    delta_y = (ymax_-ymin_)/D
    
    # label of each bin
    prob_x = {}
    prob_y = {}
    for z in range(Z):
        cb_x_str = seg_df_c['boundaryX_z'+str(z)].iloc[0]
        cb_y_str = seg_df_c['boundaryY_z'+str(z)].iloc[0]
        cb_x = np.fromstring(cb_x_str, sep=',')
        cb_y = np.fromstring(cb_y_str, sep=',')
        lab_x = (cb_x-xmin_)//delta_x
        lab_y = (cb_y-ymin_)//delta_y
        count_x = np.zeros(D)
        count_y = np.zeros(D)
        arr_x = np.array(list(Counter(lab_x).items())).astype(int)
        arr_y = np.array(list(Counter(lab_y).items())).astype(int)
        count_x[arr_x[:,0]] = arr_x[:,1]
        count_y[arr_y[:,0]] = arr_y[:,1]
        prob_x[z] = count_x/np.sum(count_x)
        prob_y[z] = count_y/np.sum(count_y)
        
        
    # KL-d between z0 and others
    kld_x = np.zeros(Z)
    kld_y = np.zeros(Z)
    for z in range(Z):
        kld_x[z] = np.sum(prob_x[z]*np.log((prob_x[z]+c0)/(prob_x[0]+c0) + c0))
        kld_y[z] = np.sum(prob_y[z]*np.log((prob_y[z]+c0)/(prob_y[0]+c0) + c0))
        
    if kld_x.max()<KLD_thre and kld_y.max()<KLD_thre:
        cell_list_KLD.append(c)
        
stop = timeit.default_timer()
print(f'Time: {stop - start}') 

print(len(cell_list_withtype))
print(len(cell_list_KLD))

# Time: 241.59815689409152
# 36939
# 32439

0.0
0.1
0.3
0.4
0.5
0.7
0.8
0.9
Time: 254.59678228758276
36939
32439


*QC 2*

In [41]:
# remove cells with center too close to cell boundary
# keep cells if dmin/dmax > DR_thre
DR_thre = 0.5

In [42]:
cell_list_KLD_DR = []
z = 0 # only look at z=0 panel

start = timeit.default_timer()

for ic, c in enumerate(cell_list_KLD[:]):
    if ic%5e3 == 0:
        print(np.round(ic/len(cell_list_KLD), 1))
        
    seg_df_c = seg_df_com[seg_df_com['Unnamed: 0']==c]
    
    # all bc at z=0
    cb_x_str = seg_df_c['boundaryX_z'+str(z)].iloc[0]
    cb_y_str = seg_df_c['boundaryY_z'+str(z)].iloc[0]
    cb_x = np.fromstring(cb_x_str, sep=',')
    cb_y = np.fromstring(cb_y_str, sep=',')
    
    # center
    cx = centroid_df_com[centroid_df_com.index==c].center_x.iloc[0]
    cy = centroid_df_com[centroid_df_com.index==c].center_y.iloc[0]

    # cb dist to center
    d = ((cb_x - cx)**2 + (cb_y - cy)**2)**(1/2)
    r = d.min()/d.max()
    
    # keep cells with ratio > DR_thre
    if r > DR_thre:
        cell_list_KLD_DR.append(c)
        
stop = timeit.default_timer()
print(f'Time: {stop - start}') 

print(len(cell_list_KLD))
print(len(cell_list_KLD_DR))

# Time: 289.10265821171924
# 32439
# 26068

0.0
0.2
0.3
0.5
0.6
0.8
0.9
Time: 331.8782531078905
32439
26068


In [56]:
crop_cell_df_QC = crop_cell_df[crop_cell_df.cell.isin(cell_list_KLD_DR)]

#### gene/cell lists

In [44]:
crop_cell_df_QC['umi'] = 1

In [45]:
print(np.sum(crop_cell_df_QC.cell=='0'))
print(np.sum(np.isnan(crop_cell_df_QC.centerX)))
print(crop_cell_df_QC.type.unique())

0
0
['EX' 'IN' 'Astr' 'Oligo']


In [46]:
data_df = crop_cell_df_QC

In [47]:
data_df.head(2)

Unnamed: 0,barcode_id,x,y,z,_x,_y,fov,gene,cell,centerX,centerY,sc_total,type,umi
1709435,13,4744.9375,-460.2649,0.0,305.0219,1553.6246,39,Lamp5,100053190361369505905550305146712371108,4752.94644,-463.275989,746,EX,1
1709436,13,4751.493,-460.17047,0.0,365.16663,1554.491,39,Lamp5,100053190361369505905550305146712371108,4752.94644,-463.275989,746,EX,1


In [48]:
type_list = ['EX', 'IN', 'Astr', 'Oligo']

In [49]:
# filtering params
# 1. min #cells available for a gene
nc_avl_min = 200
# 2. >=`nc_expr_thre` number of cells with gene counts>=`expr_thre`
expr_thre = 3
nc_expr_thre = 5
# 3. sort the genes and get the dataframe

In [50]:
# 1. nc_avl_min
gene_list_dict1 = {}
df_dict1 = {}
for t in type_list:
    df_t = data_df[data_df.type==t]
    nc_avl_t = df_t.groupby('gene')['cell'].nunique()
    nc_avl_t_filtered = nc_avl_t[nc_avl_t>=nc_avl_min]
    gene_list_dict1[t] = nc_avl_t_filtered.index.to_numpy()
    df_dict1[t] = df_t[df_t.gene.isin(gene_list_dict1[t])]
    print(f'type {t} #genes={len(gene_list_dict1[t])}')

type EX #genes=1031
type IN #genes=738
type Astr #genes=660
type Oligo #genes=656


In [51]:
# 2. nc_expr_thre and expr_thre
gene_list_dict2 = {}
for t in type_list:
    df_t = df_dict1[t]
    df_t_gbG = df_t.groupby('gene')
    gl_t = []
    for g in gene_list_dict1[t][:]:
        df_t_g = df_t_gbG.get_group(g)
        sc_umi_g = df_t_g.groupby('cell')['umi'].sum()
        if np.sum(sc_umi_g>=expr_thre)>=nc_expr_thre:
            gl_t.append(g)
    gene_list_dict2[t] = np.array(gl_t)
    print(f'type {t} #genes={len(gl_t)}')

type EX #genes=799
type IN #genes=682
type Astr #genes=597
type Oligo #genes=561


In [52]:
# 3. get the df and sort the genes
gene_list_dict3 = {}
cell_list_dict3 = {}
df_dict3 = {}
for t in type_list:
    gl_t2 = gene_list_dict2[t]
    df_t = data_df[data_df.type==t]
    df_t3 = df_t[df_t.gene.isin(gl_t2)]
    gl_t3 = df_t3.groupby('gene')['umi'].sum().sort_values(ascending=False).index.to_numpy()
    gene_list_dict3[t] = gl_t3
    df_dict3[t] = df_t3
    cell_list_dict3[t] = df_t3.cell.unique().tolist()

In [53]:
data_df = pd.concat(list(df_dict3.values()))
gene_list_dict = gene_list_dict3
cell_list_dict = cell_list_dict3
cell_list_all = [item for sublist in list(cell_list_dict.values()) for item in sublist]

In [54]:
print(f'total #cells {len(cell_list_all)}')
for t in type_list:
    print(f'{t} #genes {len(gene_list_dict[t])} #cells {len(cell_list_dict[t])}')

total #cells 26068
EX #genes 799 #cells 8930
IN #genes 682 #cells 3700
Astr #genes 597 #cells 6493
Oligo #genes 561 #cells 6945


In [55]:
# check the number of cells avl for a few genes
t = 'EX'
igl = [0, 10, 100, 200, 500, 798]
for ig in igl:
    g = gene_list_dict[t][ig]
    df_g = data_df[(data_df.type==t)&(data_df.gene==g)]
    print(f'{ig}th gene #cells avl {df_g.cell.nunique()}')

0th gene #cells avl 8371
10th gene #cells avl 2873
100th gene #cells avl 4834
200th gene #cells avl 1913
500th gene #cells avl 969
798th gene #cells avl 207


#### prepare cell masks/boundaries

In [140]:
seg_df_com.head(2)

Unnamed: 0.1,Unnamed: 0,boundaryX_z0,boundaryY_z0,boundaryX_z1,boundaryY_z1,boundaryX_z2,boundaryY_z2,boundaryX_z3,boundaryY_z3,boundaryX_z4,boundaryY_z4
0,4195605887788691952135636853561851,"3250.893838056922, 3250.893838056922, 3250.893...","2744.462888389826, 2744.3538883924484, 2744.24...","3247.6238381356, 3247.6238381356, 3247.6238381...","2747.4058883190155, 2747.296888321638, 2747.18...","3246.8608381539584, 3246.8608381539584, 3246.8...","2747.7328883111477, 2747.6238883137703, 2747.5...","3246.8608381539584, 3246.8608381539584, 3246.8...","2747.4058883190155, 2747.296888321638, 2747.18...","3250.56683806479, 3250.56683806479, 3250.56683...","2743.0458884239197, 2742.9368884265423, 2742.8..."
1,5780261735764654093526972708487477,"-7246.938868483901, -7246.938868483901, -7246....","-1168.5700405836105, -1168.679040580988, -1168...","-7247.592868468166, -7247.592868468166, -7247....","-1172.6030404865742, -1172.7120404839516, -117...","-7249.772868415714, -7249.772868415714, -7249....","-1166.3900406360626, -1166.49904063344, -1166....","-7249.4458684235815, -7249.4458684235815, -724...","-1166.3900406360626, -1166.49904063344, -1166....","-7249.118868431449, -7249.118868431449, -7249....","-1166.0630406439304, -1166.1720406413078, -116..."


In [141]:
start = timeit.default_timer()

Z = 5
cb_dict = {}

for ic, c in enumerate(cell_list_all):
    seg_c = seg_df_com[seg_df_com['Unnamed: 0']==c]
    for z in range(Z):
        cb_z_c_x = seg_c['boundaryX_z'+str(z)].iloc[0]
        cb_z_c_y = seg_c['boundaryY_z'+str(z)].iloc[0]
        cb_x = np.fromstring(cb_z_c_x, sep=',')
        cb_y = np.fromstring(cb_z_c_y, sep=',')
        cb_df_c = pd.DataFrame({'x': cb_x, 'y': cb_y})
        cb_df_c['cell'] = c
        cb_df_c['z'] = z
        cb_dict[f'{c}_z{z}'] = cb_df_c

stop = timeit.default_timer()
print('Time: ', stop - start)

# concatenate all cell_masks_dict to one df
cb_df = pd.concat(list(cb_dict.values()))

# Time:  175.61210712324828

Time:  175.61210712324828


In [143]:
print(cb_df.cell.nunique())
print(len(cell_list_all))

26068
26068


In [142]:
cb_df.head(2)

Unnamed: 0,x,y,cell,z
0,4763.58484,-461.640989,100053190361369505905550305146712371108,0
1,4763.58484,-461.749989,100053190361369505905550305146712371108,0


#### save prepared data

In [145]:
outfile = 'output_step2/merfish_data_dict.pkl'
# save
pickle_dict = {}
pickle_dict['type_list'] = type_list
pickle_dict['gene_list_dict'] = gene_list_dict
pickle_dict['cell_list_dict'] = cell_list_dict
pickle_dict['cell_list_all'] = cell_list_all
pickle_dict['cell_mask_df'] = cb_df #!!!
pickle_dict['data_df'] = data_df
with open(outfile, 'wb') as f:
    pickle.dump(pickle_dict, f)

# load
# with open(outfile, 'rb') as f:
#     pickle_dict = pickle.load(f)
# registered_dict = pickle_dict['df_registered']