### Step3: prepare data as pickled dict for ELLA analysis

In [1]:
import numpy as np
import pandas as pd
import anndata
import matplotlib.pyplot as plt
import cv2
from sklearn.cluster import KMeans
import pickle
import timeit



#### Load data

In [2]:
##### load the step-2 output: processed gene expression table and cell mask
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself
outfile = 'output_step2/stereoseq_df_dict.pkl'
with open(outfile, 'rb') as f:
    pickle_dict = pickle.load(f)
# the pickled dict holds the expression dataframe under 'df' and the mask array under 'mask'
df, mask = pickle_dict['df'], pickle_dict['mask']
df.head()

Unnamed: 0,gene,x,y,umi,xr,yr,cell_r,centerX_r,centerY_r,centerX,...,cell,sc_xrange,sc_yrange,sc_xmin,sc_xmax,sc_ymin,sc_ymax,sc_total,sc_ngene,type
12,Cr1l,26717,13204,1,717,3704,12014,712,3700,26712,...,12014,36,25,26704,26740,13191,13216,1666,724,17
32,Cr1l,28282,14437,1,2282,4937,35736,2291,4944,28291,...,35736,35,33,28271,28306,14417,14450,1367,689,14
140,Cr1l,29228,15955,1,3228,6455,49512,3234,6442,29234,...,49512,20,25,29225,29245,15933,15958,736,442,8
454,Cr1l,28116,13051,1,2116,3551,33156,2114,3556,28114,...,33156,15,27,28106,28121,13042,13069,825,408,14
459,Cr1l,26934,12445,1,934,2945,15337,946,2941,26946,...,15337,27,14,26928,26955,12435,12449,671,340,2


In [3]:
# number of distinct genes in the loaded table
df['gene'].nunique()

26895

#### Prepare df and masks

In [4]:
##### keep only cells whose x-range and y-range both lie in [12, 31]
##### (bounds noted by the author as approx. the 0.5 and 0.95 quantiles)
# `between` is inclusive on both ends, i.e. 12 <= range <= 31
keep_x = df['sc_xrange'].between(12, 31)
keep_y = df['sc_yrange'].between(12, 31)
# copy so that later operations act on an independent frame, not a view of the unfiltered one
df = df.loc[keep_x & keep_y].copy()
print('df #cells', df['cell'].nunique())

df #cells 90974


#### Gene and cell lists for each type

In [5]:
##### sorted list of the cell type labels present in df
type_list = sorted(df['type'].unique().tolist())
print(type_list)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


In [22]:
##### filtering params (applied per cell type in the three cells below)
# 1. min #cells available for a gene
#    (a gene is kept for a type only if it occurs in >= `nc_avl_min` distinct cells of that type)
nc_avl_min = 50
# 2. >=`nc_expr_thre` number of cells with gene counts>=`expr_thre`
#    (the per-cell count of a gene is its `umi` summed within the cell)
expr_thre = 1
nc_expr_thre = 200
# 3. sort the genes and get the df
#    (no parameter: surviving genes are ordered by their total `umi`, highest first)

In [23]:
# 1. nc_avl_min: per cell type, keep genes that occur in at least `nc_avl_min` distinct cells
gene_list_dict1 = {}  # type -> array of genes passing this filter
df_dict1 = {}  # type -> rows of that type restricted to the passing genes
for t in type_list:
    df_t = df.loc[df['type'] == t]
    # number of distinct cells in which each gene occurs within this type
    nc_avl_t = df_t.groupby('gene')['cell'].nunique()
    passing = nc_avl_t.index[(nc_avl_t >= nc_avl_min).to_numpy()].to_numpy()
    gene_list_dict1[t] = passing
    df_dict1[t] = df_t.loc[df_t['gene'].isin(passing)]
    print(f'type {t} #genes={len(gene_list_dict1[t])}')

type 1 #genes=5745
type 2 #genes=7094
type 3 #genes=5395
type 4 #genes=7493
type 5 #genes=8207
type 6 #genes=8047
type 7 #genes=8035
type 8 #genes=8630
type 9 #genes=7253
type 10 #genes=7969
type 11 #genes=7894
type 12 #genes=4732
type 13 #genes=7729
type 14 #genes=7916
type 15 #genes=4978
type 16 #genes=8217
type 17 #genes=6572
type 18 #genes=6071
type 19 #genes=7622
type 20 #genes=8720
type 21 #genes=8011
type 22 #genes=1337
type 23 #genes=8080
type 24 #genes=7372
type 25 #genes=7410


In [24]:
# 2. nc_expr_thre and expr_thre
# Keep a gene for cell type t when at least `nc_expr_thre` cells of that type
# have a summed umi count for the gene that is >= `expr_thre`.
gene_list_dict2 = {}
for t in type_list:
    df_t = df_dict1[t]
    # Total umi per (gene, cell) pair in a single pass, instead of running a
    # separate get_group + groupby for every gene.
    # observed=True restricts the result to pairs that actually occur, which
    # matters only if the grouping columns are categorical.
    sc_umi = df_t.groupby(['gene', 'cell'], observed=True)['umi'].sum()
    # per gene: number of cells whose count reaches the expression threshold
    nc_expr = (sc_umi >= expr_thre).groupby(level='gene', observed=True).sum()
    # genes come out in sorted (groupby) order, matching gene_list_dict1[t]
    gl_t = nc_expr.index[(nc_expr >= nc_expr_thre).to_numpy()].tolist()
    gene_list_dict2[t] = np.array(gl_t)
    print(f'type {t} #genes={len(gl_t)}')

type 1 #genes=1043
type 2 #genes=1553
type 3 #genes=815
type 4 #genes=1770
type 5 #genes=2256
type 6 #genes=1998
type 7 #genes=1912
type 8 #genes=2338
type 9 #genes=1532
type 10 #genes=2047
type 11 #genes=2049
type 12 #genes=813
type 13 #genes=1910
type 14 #genes=1874
type 15 #genes=726
type 16 #genes=2178
type 17 #genes=1183
type 18 #genes=1029
type 19 #genes=2047
type 20 #genes=2892
type 21 #genes=1947
type 22 #genes=195
type 23 #genes=2115
type 24 #genes=1778
type 25 #genes=1821


In [25]:
# 3. per cell type: restrict the table to the genes that passed filter 2,
#    rank those genes by total umi (highest first) and list the remaining cells
gene_list_dict3 = {}  # type -> genes ordered by decreasing total umi
cell_list_dict3 = {}  # type -> cells that still have rows after gene filtering
df_dict3 = {}  # type -> filtered rows
for t in type_list:
    df_t = df_dict1[t]
    df_t3 = df_t.loc[df_t['gene'].isin(gene_list_dict2[t])]
    umi_per_gene = df_t3.groupby('gene')['umi'].sum()
    ranked = umi_per_gene.sort_values(ascending=False)
    df_dict3[t] = df_t3
    gene_list_dict3[t] = ranked.index.to_numpy()
    cell_list_dict3[t] = df_t3['cell'].unique().tolist()

In [26]:
# stack the per-type filtered tables into a single dataframe
data_df = pd.concat([df_dict3[t] for t in df_dict3])
# from here on, work with the fully filtered gene and cell lists
gene_list_dict, cell_list_dict = gene_list_dict3, cell_list_dict3
# flatten the per-type cell lists (in dict order) into one list
cell_list_all = []
for cl_t in cell_list_dict.values():
    cell_list_all.extend(cl_t)

#### Prepare cell masks

In [28]:
start = timeit.default_timer()

n_pad = 5 # pad for better cropping cell masks <<<<<

# The crop window of a cell is read from the first row of that cell in df.
# Fetch all windows in one pass rather than materialising (and copying) the
# sub-dataframe of every cell just to read four scalars.
bbox_cols = ['sc_xmin', 'sc_xmax', 'sc_ymin', 'sc_ymax']
bbox_by_cell = df.drop_duplicates('cell').set_index('cell')[bbox_cols]
bbox_rows = bbox_by_cell.loc[cell_list_all].itertuples(index=False, name=None)

cell_masks_dict = {}

for c, (xmin, xmax, ymin, ymax) in zip(cell_list_all, bbox_rows):
    # Clamp the lower corner at 0: a negative slice start would wrap around to
    # the far end of the array and silently give an empty or wrong crop for
    # cells lying within n_pad of the mask origin. Upper bounds beyond the
    # array size are already truncated by slicing.
    x0 = max(int(xmin) - n_pad, 0)
    y0 = max(int(ymin) - n_pad, 0)
    mask_c = mask[x0:(int(xmax) + n_pad), y0:(int(ymax) + n_pad)]

    # positions inside the crop that carry this cell's id, shifted back to
    # coordinates of the full mask
    hits = np.where(mask_c == int(c))
    cell_masks_dict[c] = pd.DataFrame({'x': hits[0] + x0, 'y': hits[1] + y0, 'cell': c})

# concatenate all cell_masks_dict to one df (per-cell row indices are kept as-is)
cell_masks = pd.concat(list(cell_masks_dict.values()))

stop = timeit.default_timer()
print('Time: ', stop - start) # was ~116s with the per-cell get_group lookup

Time:  116.09417752409354


In [30]:
# preview the first rows of the per-pixel cell mask table (columns x, y, cell)
cell_masks.head(5)

Unnamed: 0,x,y,cell
0,29908,16502,56638
1,29908,16503,56638
2,29908,16504,56638
3,29908,16505,56638
4,29908,16506,56638


In [31]:
##### change col data type
# cell ids and type labels are handled as strings from here on
df = df.astype({'cell': str, 'type': str})
cell_masks = cell_masks.astype({'cell': str})

print(df.cell.iloc[0])
print(df.type.iloc[0])
print(cell_masks.cell.iloc[0])

type_list = [str(x) for x in type_list]
cell_list_all = [str(x) for x in cell_list_all]
# Re-key the per-type dicts from their own items instead of looking them up
# with int(t): the result is the same on the first run, and the cell can be
# executed again without a KeyError once the keys are already strings.
cell_list_dict = {str(t): [str(x) for x in cl_t] for t, cl_t in cell_list_dict.items()}
gene_list_dict = {str(t): gl_t for t, gl_t in gene_list_dict.items()}

49512
8
56638


In [32]:
# preview the mask table again after the 'cell' column was cast to str
cell_masks.head(5)

Unnamed: 0,x,y,cell
0,29908,16502,56638
1,29908,16503,56638
2,29908,16504,56638
3,29908,16505,56638
4,29908,16506,56638


In [13]:
# preview the first rows of the expression table
df.head(5)

Unnamed: 0,gene,x,y,umi,xr,yr,cell_r,centerX_r,centerY_r,centerX,...,cell,sc_xrange,sc_yrange,sc_xmin,sc_xmax,sc_ymin,sc_ymax,sc_total,sc_ngene,type
140,Cr1l,29228,15955,1,3228,6455,49512,3234,6442,29234,...,49512,20,25,29225,29245,15933,15958,736,442,8
454,Cr1l,28116,13051,1,2116,3551,33156,2114,3556,28114,...,33156,15,27,28106,28121,13042,13069,825,408,14
459,Cr1l,26934,12445,1,934,2945,15337,946,2941,26946,...,15337,27,14,26928,26955,12435,12449,671,340,2
532,Cd46,26126,10847,1,126,1347,2251,123,1360,26123,...,2251,22,21,26114,26136,10847,10868,1004,507,5
546,Gm16897,28922,12434,1,2922,2934,45268,2919,2935,28919,...,45268,17,19,28910,28927,12424,12443,491,294,23


#### Save prepared data

In [33]:
##### save data
import os

outfile = 'output_step3/stereoseq_data_dict.pkl'
# open(..., 'wb') does not create missing folders, so make sure the output dir exists
os.makedirs(os.path.dirname(outfile), exist_ok=True)
# save
pickle_dict = {
    'type_list': type_list,
    'gene_list_dict': gene_list_dict,
    'cell_list_dict': cell_list_dict,
    'cell_list_all': cell_list_all,
    'cell_mask_df': cell_masks,
    # NOTE(review): this stores `df` (all genes of the size-filtered cells), not the
    # gene-filtered `data_df` built earlier, which is otherwise unused -- confirm intended
    'data_df': df,
}
with open(outfile, 'wb') as f:
    pickle.dump(pickle_dict, f)

# load
# with open(outfile, 'rb') as f:
#     pickle_dict = pickle.load(f)
# data_df = pickle_dict['data_df']