In [1]:
import glob
import time

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import pynndescent
import seaborn as sns
from ALLCools.clustering import significant_pc_test, tsne
from ALLCools.integration import SeuratIntegration
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from wmb import aibs, brain, broad, cemba, mm10


In [2]:
# Label for this run; not referenced by any other cell visible in this notebook.
group_name = "All"


## Load adata (AIBS 10x reference and MERFISH query)

In [3]:
def dump_embedding(adata, name, n_dim=2):
    """Copy the leading embedding coordinates from ``adata.obsm`` into ``adata.obs``.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (column-assignable) and ``obsm`` (mapping of
        2-D arrays). Modified in place.
    name
        Embedding name; coordinates are read from ``adata.obsm["X_<name>"]``
        and written to the obs columns ``"<name>_0"``, ``"<name>_1"``, ...
    n_dim
        Number of leading coordinate columns to copy.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    obsm_key = f"X_{name}"
    for dim in range(n_dim):
        column = f"{name}_{dim}"
        adata.obs[column] = adata.obsm[obsm_key][:, dim]
    return adata


In [4]:
# Load the reference dataset and drop genes with (almost) no spread across cells.
ref_adata = anndata.read_h5ad("aibs_10x.h5ad")

# Per-gene first and second moments over cells.
# assumes X is a scipy sparse matrix (relies on .multiply and .A1) — TODO confirm
ref_gene_mean = ref_adata.X.mean(axis=0).A1
ref_gene_sq_mean = ref_adata.X.multiply(ref_adata.X).mean(axis=0).A1
ref_adata.var['mean'] = ref_gene_mean
# NOTE(review): E[x^2] - E[x]^2 is the variance, even though the column is named 'std'.
ref_adata.var['std'] = ref_gene_sq_mean - ref_gene_mean ** 2
print(ref_adata.var['std'].min())

# Keep genes above the 1e-5 cutoff; .copy() materializes the subset.
ref_keep_gene = ref_adata.var['std'] > 1e-5
ref_adata = ref_adata[:, ref_keep_gene].copy()


0.0034029149


In [5]:
# Load the query dataset and drop genes with (almost) no spread across cells.
qry_adata = anndata.read_h5ad("merfish.h5ad")

# Per-gene first and second moments over cells.
# assumes X is a scipy sparse matrix (relies on .multiply and .A1) — TODO confirm
qry_gene_mean = qry_adata.X.mean(axis=0).A1
qry_gene_sq_mean = qry_adata.X.multiply(qry_adata.X).mean(axis=0).A1
qry_adata.var['mean'] = qry_gene_mean
# NOTE(review): E[x^2] - E[x]^2 is the variance, even though the column is named 'std'.
qry_adata.var['std'] = qry_gene_sq_mean - qry_gene_mean ** 2
print(qry_adata.var['std'].min())

# Keep genes above the 1e-5 cutoff; .copy() materializes the subset.
qry_keep_gene = qry_adata.var['std'] > 1e-5
qry_adata = qry_adata[:, qry_keep_gene].copy()


0.005477670362666543


In [6]:
# Choose the dimensionalities used for integration:
#   ncell - total number of cells in the merged object
#   ncc   - number of CCA components passed to the anchor search
#   npc   - number of PCA components kept per dataset and corrected
ncell = ref_adata.shape[0] + qry_adata.shape[0]

# Start from the number of PCs that pass the significance test on the reference,
# then cap by 50, by the cell counts of both datasets and by a fifth of the
# reference gene count; never go below 5.
ncc = significant_pc_test(ref_adata, p_cutoff=0.1, update=False, obsm="X_pca")
ncc = min(50, ncc, ref_adata.shape[0] - 1, qry_adata.shape[0] - 1, ref_adata.shape[1] // 5)
ncc = max(ncc, 5)

# npc must not exceed the width of EITHER dataset's PCA embedding: both
# embeddings are later sliced to [:, :npc] and row-concatenated, which fails
# if the query has fewer components than the reference.
npc = min(
    50,
    ncc + 10,
    ref_adata.shape[0] - 1,
    ref_adata.obsm["X_pca"].shape[1],
    qry_adata.obsm["X_pca"].shape[1],
)
print(npc, ncc, ref_adata.shape[0], qry_adata.shape[0])


Downsample PC matrix to 50000 cells to calculate significant PC components
41 components passed P cutoff of 0.1.
50 41 4065284 613096


In [7]:
# Trim each dataset's PCA embedding to its first npc components and rescale
# every cell (row) with sklearn's normalize. Overwrites obsm["X_pca"] in place.
for dataset in (ref_adata, qry_adata):
    leading_pcs = dataset.obsm["X_pca"][:, :npc]
    dataset.obsm["X_pca"] = normalize(leading_pcs, axis=1)
    

In [8]:
# Tag every cell with its dataset of origin; the "Study" column is used later
# to tell reference and query cells apart in the merged object.
ref_adata.obs["Study"] = "AIBS_10x"
qry_adata.obs["Study"] = "MERFISH"

In [9]:
# Build a lightweight merged container: X is a placeholder column of ones
# (ncell x 1); the real payload is the stacked obs metadata, reference rows
# first and query rows second. Embeddings are attached to obsm in later cells.
adata_merge = anndata.AnnData(
    X=np.ones((ncell, 1)), obs=pd.concat([ref_adata.obs, qry_adata.obs], axis=0)
)

In [10]:
# Integrate reference and query into a shared, batch-corrected PCA space stored
# in adata_merge.obsm["X_pca_corrected"].
#   * fewer than 100 cells in either dataset -> Harmony on the stacked PCs
#     (or a plain copy when a dataset has fewer than 2 cells)
#   * otherwise -> ALLCools SeuratIntegration (CCA anchors, then correction)
if min(ref_adata.shape[0], qry_adata.shape[0])<100:
    # Stack the per-dataset PCs in the same order as adata_merge.obs (ref, then qry).
    adata_merge.obsm['X_pca'] = np.concatenate([ref_adata.obsm["X_pca"], qry_adata.obsm["X_pca"]], axis=0)
    if min(ref_adata.shape[0], qry_adata.shape[0])<2:
        # Too few cells to correct anything: use the uncorrected PCs as-is.
        adata_merge.obsm['X_pca_corrected'] = adata_merge.obsm['X_pca'].copy()
    else:
        # presumably guards Harmony's own default cluster count from dropping
        # below 2 on very small inputs — TODO confirm against harmonypy
        if np.min([np.round(ncell / 30.0), 100]).astype(int)<2:
            nclust = 2
        else:
            nclust = None
        sce.pp.harmony_integrate(adata_merge, key='Study', adjusted_basis='X_pca_corrected', max_iter_harmony=30, random_state=0, nclust=nclust)
else:
    integrator = SeuratIntegration()
    # Index 0 = reference, index 1 = query; the alignments below refer to these positions.
    adata_list = [ref_adata, qry_adata]
    start_time = time.time()
    # NOTE(review): key_anchor="X" presumably requires both datasets to share the
    # same genes in the same order, yet the low-spread gene filter is applied to
    # each dataset independently when it is loaded — confirm the var axes match.
    integrator.find_anchor(
        adata_list,
        k_local=None,
        key_local="X_pca",
        k_anchor=5,
        key_anchor="X",
        dim_red="cca",
        max_cc_cells=50000,
        k_score=30,
        # k_filter=min(200, ref_adata.shape[0] // 10),
        k_filter=None,
        scale1=True,
        scale2=True,
        # scale =[False, True]
        n_components=ncc,
        n_features=min(200, ncc * 10),
        alignments=[[[0], [1]]],
    )
    # Seconds spent finding anchors.
    print(time.time() - start_time)
    start_time = time.time()
    corrected = integrator.integrate(
        key_correct="X_pca",
        row_normalize=True,
        n_components=npc,
        # never ask for more weighting anchors than were found between datasets 0 and 1
        k_weight=min(100, integrator.anchor[(0, 1)].shape[0]),
        sd=1,
        alignments=[[[0], [1]]],
    )
    # Seconds spent on the correction step.
    print(time.time() - start_time)
    # presumably corrected[i] is the corrected embedding of adata_list[i] — TODO confirm
    # The query copy is row-normalized here, whereas the merged matrix stored
    # at the end of this branch keeps the values exactly as returned.
    qry_adata.obsm["X_pca_corrected"] = normalize(corrected[1], axis=1)
    # NOTE(review): this overwrites the same "merfish.h5ad" the query was loaded
    # from, now gene-filtered and with trimmed/normalized X_pca — confirm intended.
    qry_adata.write_h5ad("merfish.h5ad")
    integrator.save("integration_aibs10x_merfish")
    adata_merge.obsm["X_pca_corrected"] = np.concatenate(corrected, axis=0)


Find anchors across datasets.
Run CCA
non zero dims 41
Find Anchors using k=30
Score Anchors
Identified 579443 anchors between datasets 0 and 1.
1126.2117319107056
Merge datasets
[[0], [1]]
Initialize
Find nearest anchors. k_weight:  100
Normalize graph
Transform data
367.4503605365753


In [11]:
# Compute a 2-D t-SNE of the corrected embedding, copy the coordinates into obs
# (tsne_0 / tsne_1) and keep a second copy in obsm under a key that records npc.
start_time = time.time()
tsne(
    adata_merge,
    obsm="X_pca_corrected",
    metric="euclidean",
    # NOTE(review): meaning of -1 depends on the ALLCools tsne wrapper — confirm
    exaggeration=-1,
    perplexity=50,
    n_jobs=-1,
)
# The wrapper leaves its result in obsm["X_tsne"], which dump_embedding reads.
dump_embedding(adata_merge, "tsne")
adata_merge.obsm[f"u{npc}seurat_tsne"] = adata_merge.obsm["X_tsne"].copy()
# Elapsed seconds for t-SNE plus the bookkeeping above.
print(time.time() - start_time)


2988.3058655261993


In [16]:
# Make the L1/L2/L3 label columns plain strings. For AIBS 10x rows the labels
# are additionally round-tripped through float -> int so that a value such as
# "12.0" is stored as "12".
is_aibs = adata_merge.obs["Study"] == "AIBS_10x"
for level in ("L1", "L2", "L3"):
    adata_merge.obs[level] = adata_merge.obs[level].astype(str)
    aibs_labels = adata_merge.obs.loc[is_aibs, level]
    adata_merge.obs.loc[is_aibs, level] = aibs_labels.astype(float).astype(int).astype(str)
    

In [17]:
# Cast every object-dtype obs column to str, then save the merged object
# (presumably so that mixed-type columns serialize cleanly — TODO confirm).
object_cols = adata_merge.obs.dtypes == "object"
adata_merge.obs.loc[:, object_cols] = adata_merge.obs.loc[:, object_cols].astype(str)
adata_merge.write_h5ad("aibs10xmerfish.h5ad")


In [10]:
# Pick the cells for the downsampled object: every query cell plus at most
# `cell_per_cluster` randomly chosen reference cells from each L3 cluster.
cell_per_cluster = 100
selc = [qry_adata.obs.index]
np.random.seed(0)  # fixed seed so the reference subsample is repeatable
for _, members in ref_adata.obs.groupby('L3'):
    picked = members.index
    if members.shape[0] > cell_per_cluster:
        # sample without replacement
        picked = np.random.choice(picked, cell_per_cluster, False)
    selc.append(picked)

selc = np.concatenate(selc)
# Translate cell names into row positions of adata_merge. The second
# reset_index() adds the positional 'index' column; this relies on the obs
# index being named 'cell'.
selc = (
    adata_merge.obs
    .reset_index()
    .reset_index()
    .set_index('cell')
    .loc[selc]['index']
)


In [11]:
# Rebind adata_merge to the downsampled subset (placeholder X, selected obs rows
# and the matching rows of the corrected embedding). Because the name is
# reused, re-running this cell would index into the already-subset object.
adata_merge = anndata.AnnData(
    X=np.ones((len(selc), 1)),
    obs=adata_merge.obs.iloc[selc],
    obsm={'X_pca_corrected': adata_merge.obsm['X_pca_corrected'][selc]},
)
adata_merge


AnnData object with n_obs × n_vars = 1043707 × 1
    obs: 'count', 'umi_count', 'L1', 'L2', 'L3', 'L1_annot', 'L2_annot', 'DissectionRegion', 'SubRegion', 'MajorRegion', 'Train', 'Study', 'blank_count', 'n_counts', 'n_genes', 'tsne_0', 'tsne_1'
    obsm: 'X_pca_corrected'

In [1]:
# Build the 25-nearest-neighbour graph on the corrected embedding (cosine
# distance) for the current adata_merge, and report the elapsed seconds.
start_time = time.time()
sc.pp.neighbors(
    adata_merge,
    use_rep="X_pca_corrected",
    n_neighbors=25,
    random_state=0,
    metric="cosine",
)
print(time.time() - start_time)


In [13]:
# Cast every object-dtype obs column to str, then checkpoint the downsampled
# object (now carrying the neighbour graph) to disk.
object_cols = adata_merge.obs.dtypes == "object"
adata_merge.obs.loc[:, object_cols] = adata_merge.obs.loc[:, object_cols].astype(str)
adata_merge.write_h5ad("aibs10xmerfish_downsample.h5ad")


In [14]:
# Leiden clustering on the neighbour graph of the current adata_merge at
# resolution `res`; the labels are read back from obs['leiden'] further down.
start_time = time.time()
res = 1.0
sc.tl.leiden(adata_merge, resolution=res, random_state=0)
# Elapsed seconds for the clustering.
print(time.time() - start_time)


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


627.5981459617615


In [15]:
# Cast every object-dtype obs column to str, then overwrite the downsampled
# checkpoint so that it also contains the Leiden labels.
object_cols = adata_merge.obs.dtypes == "object"
adata_merge.obs.loc[:, object_cols] = adata_merge.obs.loc[:, object_cols].astype(str)
adata_merge.write_h5ad("aibs10xmerfish_downsample.h5ad")


In [16]:
# Keep the Leiden labels of the downsampled object in `tmp`; a later cell
# copies them onto the full merged object after it is reloaded from disk.
tmp = adata_merge.obs['leiden']
tmp

cell
Sagittal-R1-1043754162511822305451378439343902674      5
Sagittal-R1-115889605987730877346658197744887916157    0
Sagittal-R1-118999854507870250420648053432664395722    9
Sagittal-R1-124436133021557870166867524905466212748    0
Sagittal-R1-131672228322041345871158129298266933775    9
                                                      ..
L8TX_190822_01_B12-CCTACGTCATACTGTG-1                  1
L8TX_190725_01_E08-ATTCAGGAGGTGGGTT-1                  1
L8TX_200130_01_B02-CCATAAGTCGAGTACT-1                  1
L8TX_200130_01_G02-GAGTGAGTCCCAGCGA-1                  1
L8TX_200130_01_G02-GCCTGTTAGTCTTGGT-1                  1
Name: leiden, Length: 1043707, dtype: category
Categories (97, object): ['0', '1', '2', '3', ..., '93', '94', '95', '96']

In [17]:
# Rebind adata_merge to the full merged object saved earlier, replacing the
# downsampled object that currently holds this name.
adata_merge = anndata.read_h5ad("aibs10xmerfish.h5ad")
adata_merge


AnnData object with n_obs × n_vars = 4678380 × 1
    obs: 'count', 'umi_count', 'L1', 'L2', 'L3', 'L1_annot', 'L2_annot', 'DissectionRegion', 'SubRegion', 'MajorRegion', 'Train', 'Study', 'blank_count', 'n_counts', 'n_genes', 'tsne_0', 'tsne_1'
    obsm: 'X_pca_corrected', 'X_tsne', 'u50seurat_tsne'

In [18]:
# Attach the stashed labels to the current adata_merge. pandas aligns the Series
# on the obs index, so cells that are not present in `tmp` end up as NaN.
adata_merge.obs['leiden'] = tmp.copy()


In [19]:
# Cast every object-dtype obs column to str, then save the merged object with
# the newly attached 'leiden' column.
object_cols = adata_merge.obs.dtypes == "object"
adata_merge.obs.loc[:, object_cols] = adata_merge.obs.loc[:, object_cols].astype(str)
adata_merge.write_h5ad("aibs10xmerfish.h5ad")


In [20]:
# Build the 25-nearest-neighbour graph on the corrected embedding (cosine
# distance) for the current adata_merge, and report the elapsed seconds.
start_time = time.time()
sc.pp.neighbors(
    adata_merge,
    use_rep="X_pca_corrected",
    n_neighbors=25,
    random_state=0,
    metric="cosine",
)
print(time.time() - start_time)


810.1593112945557


In [21]:
# Cast every object-dtype obs column to str, then checkpoint the merged object
# (now carrying the neighbour graph) to disk.
object_cols = adata_merge.obs.dtypes == "object"
adata_merge.obs.loc[:, object_cols] = adata_merge.obs.loc[:, object_cols].astype(str)
adata_merge.write_h5ad("aibs10xmerfish.h5ad")


In [22]:
# Leiden clustering on the neighbour graph of the current adata_merge at
# resolution `res`.
# NOTE(review): no key_added is given, so the result lands in obs['leiden'] (the
# column read back after the earlier clustering run) and replaces the labels
# that were copied in from `tmp` — confirm that overwriting them is intended.
start_time = time.time()
res = 1.0
sc.tl.leiden(adata_merge, resolution=res, random_state=0)
# Elapsed seconds for the clustering.
print(time.time() - start_time)


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


9290.229736566544


In [23]:
# Cast every object-dtype obs column to str, then write the final merged object
# including the Leiden labels computed on it.
object_cols = adata_merge.obs.dtypes == "object"
adata_merge.obs.loc[:, object_cols] = adata_merge.obs.loc[:, object_cols].astype(str)
adata_merge.write_h5ad("aibs10xmerfish.h5ad")
