In [1]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ALLCools.clustering import *
from ALLCools.integration.seurat_class import SeuratIntegration
from ALLCools.plot import *
from wmb import aibs, brain, cemba

In [2]:
# Default run settings. The same names are assigned again in the
# "# Parameters" cell that follows, so those values win at run time.
cpu = 3
categorical_key = ["L2", "DissectionRegion"]
dataset = "AIBS_TENX"

In [3]:
# Parameters
# Run-specific values; `dataset`, `categorical_key` and `cpu` re-assign the
# defaults from the previous cell.
dataset = "AIBS_TENX"
group_name = "CorticalExc"
categorical_key = ["L2", "DissectionRegion"]
cpu = 3
mem_gb = 1


## Input PCA before integration

In [4]:
# Read the reference (mc_pca.h5ad) and the query (rna_pca.h5ad), in that order.
ref_adata, query_adata = (
    anndata.read_h5ad(path) for path in ("mc_pca.h5ad", "rna_pca.h5ad")
)

In [5]:
# Position in this list is how later cells address each dataset:
# index 0 is the reference, index 1 the query (e.g. alignments=[[[0], [1]]]).
adata_list = [ref_adata, query_adata]

### Init empty adata_merge

In [6]:
from scipy.sparse import csr_matrix

# Shape of the merged object: one row per cell across all inputs, and the
# same number of columns as the first input.
cells = sum(a.shape[0] for a in adata_list)
features = adata_list[0].shape[1]

# X is an empty sparse matrix of the right shape; obs is the inputs' obs
# stacked in `adata_list` order and var is taken from the first input.
merged_obs = pd.concat([a.obs for a in adata_list])
empty_x = csr_matrix((cells, features), dtype=np.float32)
adata_merge = anndata.AnnData(X=empty_x, obs=merged_obs, var=adata_list[0].var)

In [7]:
# Pick the RNA annotation source that matches `dataset`.
if dataset == "AIBS_SMART":
    rna_annot = aibs.get_smart_annot()
elif dataset == "AIBS_TENX":
    rna_annot = aibs.get_tenx_annot()
else:
    # `broad` is not among the names imported at the top of the notebook
    # (only aibs, brain and cemba are), so this branch used to fail with a
    # NameError. Import it here, only when it is actually needed.
    from wmb import broad

    rna_annot = broad.get_tenx_annot()

mc_annot = cemba.get_mc_annot()

# For every requested annotation column, stack the mC values on top of the
# RNA values and store them as strings on the merged object. Assignment to
# `obs` aligns on the cell index.
for key in categorical_key:
    adata_merge.obs[key] = pd.concat(
        [mc_annot[key].to_pandas(), rna_annot[key].to_pandas()]
    ).astype(str)

L4_annot does not exist in data_vars or coords, palette not added.


In [8]:
# Display both inputs (reference first, query second) for a quick shape check.
adata_list

[AnnData object with n_obs × n_vars = 123904 × 6231
     obs: 'count', 'umi_count', 'n_counts', 'Modality'
     var: 'chrom', 'name-RNA', 'start-RNA', 'end-RNA', 'mean-RNA', 'std-RNA', 'start-mC', 'end-mC', 'cov_mean-mC', 'cef-mC'
     obsm: 'X_pca',
 AnnData object with n_obs × n_vars = 1412319 × 6231
     obs: 'count', 'umi_count', 'n_counts', 'Modality'
     var: 'chrom', 'name-RNA', 'start-RNA', 'end-RNA', 'mean-RNA', 'std-RNA', 'start-mC', 'end-mC', 'cov_mean-mC', 'cef-mC'
     obsm: 'X_pca']

In [9]:
# Display the merged object; at this point it has obs/var but no embedding yet.
adata_merge

AnnData object with n_obs × n_vars = 1536223 × 6231
    obs: 'count', 'umi_count', 'n_counts', 'Modality', 'L2', 'DissectionRegion'
    var: 'chrom', 'name-RNA', 'start-RNA', 'end-RNA', 'mean-RNA', 'std-RNA', 'start-mC', 'end-mC', 'cov_mean-mC', 'cef-mC'

In [10]:
# Derive the number of CCA components from the width of the first dataset's
# PCA: use every PC when there are fewer than 10, otherwise use 10 fewer
# than the PC count but never less than 10.
n_pc = adata_list[0].obsm["X_pca"].shape[1]
n_cca_components = n_pc if n_pc < 10 else max(n_pc - 10, 10)

n_cca_components

56

In [11]:
# Cell count of the smallest "Modality" group; used below to cap `k_filter`.
modality_sizes = adata_merge.obs["Modality"].value_counts()
min_sample = modality_sizes.min()

## Integration and transform

In [12]:
# Integration object; the cells below call find_anchor, integrate and save on it.
integrator = SeuratIntegration()

In [13]:
# take ~2.5-3h for 300K mC + 4M 10X RNA
# All keyword arguments are gathered in one place so they are easy to review.
anchor_params = {
    "k_local": None,
    "key_local": "X_pca",
    "k_anchor": 5,
    "key_anchor": "X",
    "dim_red": "cca",
    "max_cc_cells": 100000,
    "k_score": 30,
    # never ask for more neighbours than the smallest modality has cells
    "k_filter": min(200, min_sample),
    "scale1": True,
    "scale2": True,
    "n_components": n_cca_components,
    "n_features": 200,
    # pair dataset 0 with dataset 1 (positions in `adata_list`)
    "alignments": [[[0], [1]]],
}
anchor = integrator.find_anchor(adata_list, **anchor_params)

Find anchors across datasets.


Run CCA


non zero dims 56


Find Anchors


Anchor selected with high CC feature graph: 73051 / 265490
Score Anchors


Identified 73051 anchors between datasets 0 and 1.


In [14]:
# Correct the "X_pca" embedding using the anchors found above.
integrate_params = {
    "key_correct": "X_pca",
    "row_normalize": True,
    "k_weight": 100,
    "sd": 1,
    "alignments": [[[0], [1]]],
}
corrected = integrator.integrate(**integrate_params)

# Stack the returned arrays row-wise and store them on the merged object.
# NOTE(review): assumes `corrected` holds one array per dataset in
# `adata_list` order, matching the row order of adata_merge.obs — confirm.
adata_merge.obsm["X_pca_integrate"] = np.concatenate(corrected, axis=0)

Merge datasets
[[0], [1]]


Initialize
Find nearest anchors. 

  data=np.array(corrected),


k_weight:  100


Normalize graph


Transform data


## Label transfer

In [15]:
# NOTE: label transfer is disabled in this notebook; the call is kept below,
# commented out, for reference.
# transfer_results = integrator.label_transfer(
#     ref=[0],
#     qry=[1],
#     categorical_key=categorical_key,
#     key_dist='X_pca'
# )
# for k, v in transfer_results.items():
#     v.to_hdf(f'{k}_transfer.hdf', key='data')
# integrator.save_transfer_results_to_adata(adata_merge, transfer_results)

## Save

In [16]:
# Write the merged object (including the "X_pca_integrate" embedding) to disk.
adata_merge.write_h5ad("final.h5ad")

In [17]:
# Display the merged object again; it now carries obsm["X_pca_integrate"].
adata_merge

AnnData object with n_obs × n_vars = 1536223 × 6231
    obs: 'count', 'umi_count', 'n_counts', 'Modality', 'L2', 'DissectionRegion'
    var: 'chrom', 'name-RNA', 'start-RNA', 'end-RNA', 'mean-RNA', 'std-RNA', 'start-mC', 'end-mC', 'cov_mean-mC', 'cef-mC'
    obsm: 'X_pca_integrate'

In [18]:
# Save the integrator under the name "integration".
integrator.save("integration")

In [19]:
import subprocess

# Delete the two input files read at the top of the notebook. `rm -f` does
# not complain when a file is already gone.
stale_inputs = ["mc_pca.h5ad", "rna_pca.h5ad"]
subprocess.run(["rm", "-f", *stale_inputs])

CompletedProcess(args=['rm', '-f', 'mc_pca.h5ad', 'rna_pca.h5ad'], returncode=0)