In [1]:
from ALLCools.clustering import *
from ALLCools.mcds import MCDS
from wmb import brain, aibs, broad, mm10

import pandas as pd
import anndata
import seaborn as sns
import matplotlib.pyplot as plt
from ALLCools.plot import *

import scanpy as sc
from wmb import cemba
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
import glob
import numpy as np


In [2]:
# Label for the cell group being processed.
# NOTE(review): not referenced by any other cell visible in this notebook --
# confirm whether it is still needed.
group_name = 'All'

## select gene features

In [4]:
# Open the MERFISH zarr store with genes as the feature dimension.
# NOTE(review): absolute, machine-specific path -- consider moving it to a
# configuration cell so the notebook can run elsewhere.
var_dim = 'gene'
merfish_ds = MCDS.open('/cemba/MERFISH/MERFISH.TotalWatershedCell.BasicFilter.zarr', var_dim=var_dim)
merfish_ds

Unnamed: 0,Array,Chunk
Bytes,1.13 GiB,38.15 MiB
Shape,"(613096, 494)","(100000, 100)"
Count,36 Tasks,35 Chunks
Type,uint32,numpy.ndarray
"Array Chunk Bytes 1.13 GiB 38.15 MiB Shape (613096, 494) (100000, 100) Count 36 Tasks 35 Chunks Type uint32 numpy.ndarray",494  613096,

Unnamed: 0,Array,Chunk
Bytes,1.13 GiB,38.15 MiB
Shape,"(613096, 494)","(100000, 100)"
Count,36 Tasks,35 Chunks
Type,uint32,numpy.ndarray


In [5]:
# mm10 gene annotation; its index is matched against the MERFISH gene IDs below.
gene_meta = mm10.get_gene_metadata()

# Table of MERFISH genes: gene ID (index) -> gene name (column 'gene').
merfish_gene = pd.DataFrame(
    merfish_ds['gene'].values,
    index=merfish_ds['gene_id'].values,
    columns=['gene'],
).rename_axis('gene_id')

# Number of MERFISH gene IDs that have an entry in the annotation table.
n_annotated = merfish_gene.index.isin(gene_meta.index).sum()
print(n_annotated)


493


In [6]:
# Chromosomes whose genes are excluded from the shared feature set.
chrom_to_remove = ['chrX', 'chrY', 'chrM', 'chrL']

# Index-aligned assignment: genes without an entry in gene_meta get NaN.
merfish_gene['chrom'] = gene_meta['chrom']

# Number of MERFISH genes located on an excluded chromosome.
on_excluded_chrom = merfish_gene['chrom'].isin(chrom_to_remove)
print(on_excluded_chrom.sum())


6


In [7]:
# Open the AIBS 10x v2 RNA zarr store (path constant provided by `wmb.aibs`),
# again with genes as the feature dimension.
var_dim = 'gene'
rna_ds = MCDS.open(aibs.AIBS_TENX_V2_ZARR_PATH, var_dim=var_dim)
rna_ds

Unnamed: 0,Array,Chunk
Bytes,500.57 GiB,615.79 MiB
Shape,"(4162025, 32285)","(5000, 32285)"
Count,834 Tasks,833 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 500.57 GiB 615.79 MiB Shape (4162025, 32285) (5000, 32285) Count 834 Tasks 833 Chunks Type float32 numpy.ndarray",32285  4162025,

Unnamed: 0,Array,Chunk
Bytes,500.57 GiB,615.79 MiB
Shape,"(4162025, 32285)","(5000, 32285)"
Count,834 Tasks,833 Chunks
Type,float32,numpy.ndarray


In [9]:
# Gene coordinate of the RNA dataset as a pandas Series; in the displayed
# output both its index and its values are Ensembl gene IDs.
rna_gene = rna_ds['gene'].to_pandas()
rna_gene

gene
ENSMUSG00000051951    ENSMUSG00000051951
ENSMUSG00000089699    ENSMUSG00000089699
ENSMUSG00000102331    ENSMUSG00000102331
ENSMUSG00000102343    ENSMUSG00000102343
ENSMUSG00000025900    ENSMUSG00000025900
                             ...        
ENSMUSG00000095523    ENSMUSG00000095523
ENSMUSG00000095475    ENSMUSG00000095475
ENSMUSG00000094855    ENSMUSG00000094855
ENSMUSG00000095019    ENSMUSG00000095019
ENSMUSG00000095041    ENSMUSG00000095041
Length: 32285, dtype: object

In [10]:
# A MERFISH gene is kept when it
#   1. appears among the values of `rna_gene` (measured in the 10x data),
#   2. is not located on one of the excluded chromosomes, and
#   3. has an entry in the gene annotation table.
in_rna = merfish_gene.index.isin(rna_gene)
on_excluded_chrom = merfish_gene['chrom'].isin(chrom_to_remove)
in_annotation = merfish_gene.index.isin(gene_meta.index)

selg = in_rna & ~on_excluded_chrom & in_annotation
print(selg.sum())

487


In [12]:
# Persist the selected genes (gene_id index, gene-name column) so the same
# feature set can be reloaded from disk.
merfish_gene.loc[selg, 'gene'].to_csv('common_gene.csv')

In [13]:
# Reload the shared gene list; the first CSV column (gene_id) becomes the index
# and the remaining column holds the gene name.
common_gene = pd.read_csv('common_gene.csv', header=0, index_col=0)
common_gene

Unnamed: 0_level_0,gene
gene_id,Unnamed: 1_level_1
ENSMUSG00000038257,Glra3
ENSMUSG00000000305,Cdh4
ENSMUSG00000000708,Kat2b
ENSMUSG00000001120,Pcbp3
ENSMUSG00000003411,Rab3b
...,...
ENSMUSG00000013846,St3gal1
ENSMUSG00000022332,Khdrbs3
ENSMUSG00000022883,Robo1
ENSMUSG00000050272,Dscam


## get RNA adata

In [14]:
# Restrict the RNA dataset to the shared genes; the RNA gene coordinate holds
# gene IDs, so it is intersected with the gene_id index of `common_gene`.
rna_gene_ids = rna_ds.get_index(var_dim)
shared_rna_genes = rna_gene_ids.intersection(common_gene.index)
rna_ds = rna_ds.sel({var_dim: shared_rna_genes})
rna_ds

Unnamed: 0,Array,Chunk
Bytes,7.55 GiB,9.29 MiB
Shape,"(4162025, 487)","(5000, 487)"
Count,1667 Tasks,833 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 7.55 GiB 9.29 MiB Shape (4162025, 487) (5000, 487) Count 1667 Tasks 833 Chunks Type float32 numpy.ndarray",487  4162025,

Unnamed: 0,Array,Chunk
Bytes,7.55 GiB,9.29 MiB
Shape,"(4162025, 487)","(5000, 487)"
Count,1667 Tasks,833 Chunks
Type,float32,numpy.ndarray


In [15]:
# Restrict the MERFISH dataset to the shared genes; here the gene coordinate is
# intersected with the gene names stored in `common_gene['gene']`.
merfish_gene_names = merfish_ds.get_index(var_dim)
shared_merfish_genes = merfish_gene_names.intersection(common_gene['gene'].values)
merfish_ds = merfish_ds.sel({var_dim: shared_merfish_genes})
merfish_ds

Unnamed: 0,Array,Chunk
Bytes,1.11 GiB,38.15 MiB
Shape,"(613096, 487)","(100000, 100)"
Count,71 Tasks,35 Chunks
Type,uint32,numpy.ndarray
"Array Chunk Bytes 1.11 GiB 38.15 MiB Shape (613096, 487) (100000, 100) Count 71 Tasks 35 Chunks Type uint32 numpy.ndarray",487  613096,

Unnamed: 0,Array,Chunk
Bytes,1.11 GiB,38.15 MiB
Shape,"(613096, 487)","(100000, 100)"
Count,71 Tasks,35 Chunks
Type,uint32,numpy.ndarray


In [16]:
# Load the RNA counts of the gene-subset dataset into an AnnData object,
# 50,000 cells per loading chunk. The data-array name is derived from `var_dim`.
ref_adata = rna_ds.get_count_adata(da_name=f'{var_dim}_da', loading_chunk=50000)
ref_adata

Loading chunk 0-50000/4162025
Loading chunk 50000-100000/4162025
Loading chunk 100000-150000/4162025
Loading chunk 150000-200000/4162025
Loading chunk 200000-250000/4162025
Loading chunk 250000-300000/4162025
Loading chunk 300000-350000/4162025
Loading chunk 350000-400000/4162025
Loading chunk 400000-450000/4162025
Loading chunk 450000-500000/4162025
Loading chunk 500000-550000/4162025
Loading chunk 550000-600000/4162025
Loading chunk 600000-650000/4162025
Loading chunk 650000-700000/4162025
Loading chunk 700000-750000/4162025
Loading chunk 750000-800000/4162025
Loading chunk 800000-850000/4162025
Loading chunk 850000-900000/4162025
Loading chunk 900000-950000/4162025
Loading chunk 950000-1000000/4162025
Loading chunk 1000000-1050000/4162025
Loading chunk 1050000-1100000/4162025
Loading chunk 1100000-1150000/4162025
Loading chunk 1150000-1200000/4162025
Loading chunk 1200000-1250000/4162025
Loading chunk 1250000-1300000/4162025
Loading chunk 1300000-1350000/4162025
Loading chunk 135000

AnnData object with n_obs × n_vars = 4162025 × 487
    obs: 'count', 'umi_count'
    var: 'name'

In [18]:
# Optional shortcut (disabled): reload the reference AnnData that a later cell
# writes to 'aibs_10x.h5ad' instead of rebuilding it from the zarr store.
# NOTE(review): that file is written after normalization, log1p and the SVD
# projection, so if it is loaded here the annotation/normalization cells below
# must be skipped -- otherwise the data would be transformed a second time.
# ref_adata = anndata.read_h5ad('aibs_10x.h5ad')
# ref_adata

In [19]:
# Annotation fields carried over into the reference AnnData.
annot_columns = ['L1', 'L2', 'L3', 'L1_annot', 'L2_annot',
                 'DissectionRegion', 'SubRegion', 'MajorRegion']

# Keep only annotated cells that are also present in the loaded count matrix.
aibs_annot = aibs.get_tenx_annot()
annotated_cells = aibs_annot.get_index('cell').intersection(ref_adata.obs.index)
aibs_annot = aibs_annot.sel({'cell': annotated_cells})

annot = aibs_annot[annot_columns].to_pandas()
annot

Unnamed: 0_level_0,L1,L2,L3,L1_annot,L2_annot,DissectionRegion,SubRegion,MajorRegion
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
L8TX_180406_01_B01-TCTTCGGCAGCCTTTC-1,6,85,1,Glut,L2 IT PPP-APr Glut,PAR-POST-PRE-SUB-ProS,RHP,HPF
L8TX_180406_01_B01-CGTCTACTCCTAGGGC-1,6,85,1,Glut,L2 IT PPP-APr Glut,PAR-POST-PRE-SUB-ProS,RHP,HPF
L8TX_180406_01_E01-AGCATACTCCTGTACC-1,6,85,1,Glut,L2 IT PPP-APr Glut,PAR-POST-PRE-SUB-ProS,RHP,HPF
L8TX_180406_01_B01-CGTTCTGAGAGGTTGC-1,6,85,1,Glut,L2 IT PPP-APr Glut,PAR-POST-PRE-SUB-ProS,RHP,HPF
L8TX_180406_01_B01-CGTTGGGGTTTACTCT-1,6,85,1,Glut,L2 IT PPP-APr Glut,PAR-POST-PRE-SUB-ProS,RHP,HPF
...,...,...,...,...,...,...,...,...
L8TX_190822_01_B12-CCTACGTCATACTGTG-1,6,8,5411,Glut,APN C1ql4 Glut,MB - MRN-CUN-RN-RR-PPN-NB-SAG-PBG,MB,MB
L8TX_190725_01_E08-ATTCAGGAGGTGGGTT-1,6,8,5411,Glut,APN C1ql4 Glut,MB - SC,MB,MB
L8TX_200130_01_B02-CCATAAGTCGAGTACT-1,6,8,5411,Glut,APN C1ql4 Glut,MB - MRN-CUN-RN-RR-PN,MB,MB
L8TX_200130_01_G02-GAGTGAGTCCCAGCGA-1,6,8,5411,Glut,APN C1ql4 Glut,MB - MRN-CUN-RN-RR-PN,MB,MB


In [20]:
# Keep only annotated cells, in the order of `annot`, and order the genes like
# `common_gene.index` so the columns match the MERFISH matrix built later.
ref_adata = ref_adata[annot.index, common_gene.index].copy()
# Append the annotation columns to the per-cell metadata (same cell index).
# NOTE: re-running this cell would append the annotation columns a second time.
ref_adata.obs = pd.concat([ref_adata.obs, annot], axis=1)
ref_adata

AnnData object with n_obs × n_vars = 4065284 × 487
    obs: 'count', 'umi_count', 'L1', 'L2', 'L3', 'L1_annot', 'L2_annot', 'DissectionRegion', 'SubRegion', 'MajorRegion'
    var: 'name'

In [21]:
# Depth-normalize the 10x counts and log-transform them:
#   X[i, j] <- log1p(X[i, j] / umi_count[i] * median(umi_count))
# NOTE: this cell is not idempotent -- running it twice normalizes and
# log-transforms the matrix twice.
#
# Only the stored entries are rescaled. Repeating each cell's total once per
# stored entry lines up with `.data` only when the matrix is in CSR layout
# (entries stored row by row), so convert explicitly instead of relying on the
# format left behind by the earlier subsetting. For a matrix that is already
# CSR, `tocsr()` returns the same object, so nothing is copied.
ref_counts = ref_adata.X.tocsr()
ref_umi_total = ref_adata.obs['umi_count']
ref_entries_per_cell = ref_counts.getnnz(axis=1)
ref_counts.data = (ref_counts.data
                   / np.repeat(ref_umi_total.values, ref_entries_per_cell)
                   * ref_umi_total.median())
ref_adata.X = ref_counts
sc.pp.log1p(ref_adata)

In [22]:
# Pick the reference (10x RNA) cells used to fit the decomposition: a fixed-seed
# random sample of `n_train_cell` cells, or every cell when the dataset is not
# larger than that.
np.random.seed(0)

n_train_cell = 100000
n_ref_cell = ref_adata.shape[0]

if n_ref_cell > n_train_cell:
    train_cell = np.zeros(n_ref_cell, dtype=bool)
    # Sample without replacement so exactly `n_train_cell` cells are flagged.
    sampled_rows = np.random.choice(np.arange(n_ref_cell), n_train_cell, replace=False)
    train_cell[sampled_rows] = True
else:
    train_cell = np.ones(n_ref_cell, dtype=bool)

ref_adata.obs['Train'] = train_cell.copy()
ref_adata.obs['Train'].sum()


100000

In [23]:
# Fit a truncated SVD on the training cells. The number of components is capped
# at 100 and must stay below both the number of training cells and the number
# of genes.
n_train = ref_adata.obs['Train'].sum()
n_gene = ref_adata.shape[1]
ndim = min(100, n_train - 1, n_gene - 1)

model = TruncatedSVD(n_components=ndim, algorithm='arpack', random_state=0)
train_matrix = ref_adata.X[ref_adata.obs['Train'].values]
model.fit(train_matrix)

# Components with a zero singular value are dropped from later projections.
sel_dim = model.singular_values_ != 0
print(sel_dim.sum())


100


In [24]:
## Transform 10x
# Project all reference cells with the fitted model, `chunk_size` rows at a
# time, then keep the selected components and divide each one by its singular
# value. `chunk_size` is reused by the MERFISH transform cell.

chunk_size = 50000
chunk_starts = range(0, ref_adata.shape[0], chunk_size)
chunks = [
    model.transform(ref_adata.X[start:start + chunk_size])
    for start in chunk_starts
]

ref_pcs = np.concatenate(chunks, axis=0)[:, sel_dim]
ref_pcs /= model.singular_values_[sel_dim]
ref_adata.obsm['X_pca'] = ref_pcs


In [25]:
# Count how many components of 'X_pca' pass the significance test at
# p_cutoff=0.1. update=False is passed, and `npc` is not used by any later
# cell shown in this notebook.
npc = significant_pc_test(ref_adata, p_cutoff=0.1, update=False, obsm='X_pca')


Downsample PC matrix to 50000 cells to calculate significant PC components
43 components passed P cutoff of 0.1.


In [26]:
# Save the processed reference (normalized, log-transformed, with 'X_pca') to disk.
ref_adata.write_h5ad('aibs_10x.h5ad')


## get MERFISH adata

In [27]:
# Load the MERFISH counts of the gene-subset dataset into an AnnData object,
# 50,000 cells per loading chunk.
qry_adata = merfish_ds.get_count_adata(da_name='count', loading_chunk=50000)
qry_adata


Loading chunk 0-50000/613096
Loading chunk 50000-100000/613096
Loading chunk 100000-150000/613096
Loading chunk 150000-200000/613096
Loading chunk 200000-250000/613096
Loading chunk 250000-300000/613096
Loading chunk 300000-350000/613096
Loading chunk 350000-400000/613096
Loading chunk 400000-450000/613096
Loading chunk 450000-500000/613096
Loading chunk 500000-550000/613096
Loading chunk 550000-600000/613096
Loading chunk 600000-613096/613096


AnnData object with n_obs × n_vars = 613096 × 487
    obs: 'blank_count', 'n_counts', 'n_genes'
    var: 'id'

In [28]:
# Put the genes in the same order as `common_gene` (selected by gene name), then
# relabel them with the matching gene IDs so the columns line up with the
# reference matrix, whose genes were ordered by `common_gene.index`.
qry_adata = qry_adata[:, common_gene['gene'].values].copy()
qry_adata.var.index = common_gene.index


In [29]:
# Depth-normalize the MERFISH counts and log-transform them:
#   X[i, j] <- log1p(X[i, j] / n_counts[i] * median(n_counts))
# NOTE: this cell is not idempotent -- running it twice normalizes and
# log-transforms the matrix twice.
#
# Only the stored entries are rescaled. Repeating each cell's total once per
# stored entry lines up with `.data` only when the matrix is in CSR layout
# (entries stored row by row), so convert explicitly instead of relying on the
# format left behind by the earlier subsetting. For a matrix that is already
# CSR, `tocsr()` returns the same object, so nothing is copied.
qry_counts = qry_adata.X.tocsr()
qry_count_total = qry_adata.obs['n_counts']
qry_entries_per_cell = qry_counts.getnnz(axis=1)
qry_counts.data = (qry_counts.data
                   / np.repeat(qry_count_total.values, qry_entries_per_cell)
                   * qry_count_total.median())
qry_adata.X = qry_counts
sc.pp.log1p(qry_adata)

In [30]:
# Project the MERFISH cells with the SVD model fitted on the 10x reference.
# The matrix is passed as-is (no per-gene centering or scaling), the same way
# the model was fitted. `chunk_size` is defined in the 10x transform cell.
chunks = []
for chunk_start in range(0, qry_adata.shape[0], chunk_size):
    chunk_stop = chunk_start + chunk_size
    chunks.append(model.transform(qry_adata.X[chunk_start:chunk_stop]))
    print(chunk_start)

# Keep the selected components and divide each one by its singular value,
# mirroring the treatment of the reference embedding.
qry_pcs = np.concatenate(chunks, axis=0)[:, sel_dim]
qry_pcs /= model.singular_values_[sel_dim]
qry_adata.obsm['X_pca'] = qry_pcs


0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000


In [31]:
# Save the processed MERFISH query (normalized, log-transformed, with 'X_pca') to disk.
qry_adata.write_h5ad('merfish.h5ad')
