<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Parameters" data-toc-modified-id="Parameters-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data</a></span><ul class="toc-item"><li><span><a href="#Cell-meta" data-toc-modified-id="Cell-meta-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Cell meta</a></span></li><li><span><a href="#Gene-meta" data-toc-modified-id="Gene-meta-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Gene meta</a></span></li><li><span><a href="#MCDS" data-toc-modified-id="MCDS-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>MCDS</a></span><ul class="toc-item"><li><span><a href="#Prefiltered-features" data-toc-modified-id="Prefiltered-features-2.3.1"><span class="toc-item-num">2.3.1&nbsp;&nbsp;</span>Prefiltered features</a></span></li></ul></li></ul></li><li><span><a href="#Add-mC-rate" data-toc-modified-id="Add-mC-rate-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Add mC rate</a></span></li><li><span><a href="#Get-Anndata" data-toc-modified-id="Get-Anndata-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Get Anndata</a></span><ul class="toc-item"><li><span><a href="#CH" data-toc-modified-id="CH-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>CH</a></span></li><li><span><a href="#CG" data-toc-modified-id="CG-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>CG</a></span></li></ul></li><li><span><a href="#Prepare-Gene" data-toc-modified-id="Prepare-Gene-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Prepare Gene</a></span></li></ul></div>

In [1]:
# Create a dask.distributed client for the computations in this notebook.
# No scheduler address is passed, so the client is not attached to an
# existing cluster here. The dashboard address is pinned to port 46064.
from dask.distributed import Client
client = Client(dashboard_address=':46064')

In [2]:
import configparser
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy.api as sc
import seaborn as sns
import xarray as xr
import warnings

from cemba_data.local.mc.prepare_study import prepare_study
from cemba_data.plot import *
from cemba_data.plot.preprocessing import *
from cemba_data.tools.hdf5.anndata import highly_variable_methylation_feature
from ALLCools.mcds.MCDS import MCDS
from cemba_data.tools.hdf5.anndata import rank_features_groups

# Per Region Coords and Cluster

In [3]:
# Outputs are written relative to the current working directory; figures go
# into a `fig` sub-directory. Both directories are created if missing.
result_dir = pathlib.Path('.')
fig_dir = result_dir.joinpath('fig')
for out_dir in (result_dir, fig_dir):
    out_dir.mkdir(exist_ok=True)

## Parameters

In [5]:
# Region codes included in this analysis. They are matched against the
# `Region` column of the cell metadata and against the prefix (text before the
# first '-') of the MCDS file names.
regions = [
    '1A', '1B', '1C', '2A', '2B', '2C', '2D', '2E', '3A', '3B', '3C',
    '3D', '3E', '3F', '4A', '4B', '4C', '4D', '4E', '4F', '4G', '4H',
    '5A', '5B', '5C', '5D', '5E', '5F', '5G', '5H', '5J', '6A', '6B',
    '6C', '6D', '7B', '8B', '9H', '9J', '11E', '11F']
# Optional cap on the number of cells: when set to an integer not larger than
# the number of available cells, the cell metadata is randomly subsampled to
# this size. None keeps every cell.
max_cell = None

In [6]:
# Root directory of the project (converted to a pathlib.Path in a later cell).
project_dir = '/home/hanliu/project/mouse_rostral_brain/'

# Input tables: per-cell metadata (read with pandas msgpack) and a
# tab-separated gene annotation table.
cell_meta_path = '/home/hanliu/project/mouse_rostral_brain/study/regional_qc/cell_meta.after_qc.msg'
gene_annotation_path = '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz'

# Name of the dataset dimension used as the clustering feature.
clustering_feature = 'chrom100k'  # 100kb chromosome bins

# preprocess parameters
# NOTE(review): none of the values from here to the end of this cell are
# referenced in the visible part of this notebook, which loads pre-computed
# feature lists (total_hvf.txt, ch_hvf.txt, cg_hvf.txt) instead. Confirm
# whether they are still needed here or belong to another notebook.
min_feature_cov, max_feature_cov = 500, 3000

ch_hvf_top = 3000
min_ch_hvf_mean = 0.5
max_ch_hvf_mean = 2.5

cg_hvf_top = 3000
min_cg_hvf_mean = 0.5
max_cg_hvf_mean = 1.2

ch_pc_components = 25
cg_pc_components = 14

n_neighbors = 25
resolution = 0.8

mch_gene_score_cutoff = 5
mch_abs_log_fold_change = 1
n_marker_genes = 500

In [7]:
# Convert the project root to a Path so later cells can build paths with `/`.
project_dir = pathlib.Path(project_dir)

## Load data
### Cell meta

In [8]:
# Load the per-cell metadata and keep only cells that pass the filter and
# belong to one of the selected regions.
# NOTE: read_msgpack / to_msgpack were deprecated in pandas 0.25 and removed
# in pandas 1.0, so this cell requires pandas < 1.0.
cell_meta = pd.read_msgpack(cell_meta_path)
cell_meta = cell_meta[cell_meta['PassFilter'] & cell_meta['Region'].isin(regions)]

# Optional subsampling. A fixed seed makes the selected cells (and therefore
# the saved `used_cells.msg` and everything derived from it) identical across
# kernel restarts instead of changing on every run.
if (max_cell is not None) and (max_cell <= cell_meta.shape[0]):
    cell_meta = cell_meta.sample(max_cell, random_state=0)
# Record exactly which cells were used.
cell_meta.to_msgpack(result_dir / 'used_cells.msg')

### Gene meta

In [9]:
# Gene annotation table, indexed by the `gene_id` column of the file.
gene_meta = pd.read_csv(gene_annotation_path, index_col='gene_id', sep='\t')
# Rename the index to 'gene' (presumably to match the `gene` dimension of the
# MCDS loaded below - confirm).
gene_meta.index.name = 'gene'

# Lookup tables from gene name / un-versioned gene id (text before the first
# '.') to the full gene id. When a key occurs more than once, the last row in
# the table wins.
# `Series.items()` is used instead of `Series.iteritems()`, which is
# deprecated and removed in pandas 2.0; both yield the same (index, value)
# pairs.
gene_name_to_id = {name: gene_id
                   for gene_id, name in gene_meta['gene_name'].items()}
gene_id_base_to_id = {gene_id.split('.')[0]: gene_id
                      for gene_id in gene_meta.index}

### MCDS

In [10]:
# Collect the MCDS files of the selected regions: a file is kept when the part
# of its name before the first '-' is one of `regions`.
dataset_dir = project_dir.joinpath('dataset')
mcds_path_list = [
    mcds_file
    for mcds_file in dataset_dir.glob('*mcds')
    if mcds_file.name.partition('-')[0] in regions
]

#### Prefiltered features

In [16]:
# Feature ids to load from the MCDS, read from a header-less one-column text
# file. The file is not created in the visible part of this notebook and must
# already exist in the working directory.
total_hvf = pd.read_csv(
    'total_hvf.txt',
    names=['chrom100k'],
    header=None,
    index_col=0,
).index
total_hvf

Int64Index([   32,    33,    34,    35,    36,    45,    46,    47,    48,
               55,
            ...
            26315, 26316, 26317, 26318, 26319, 26320, 26321, 26322, 26332,
            26335],
           dtype='int64', name='chrom100k', length=5553)

In [17]:
# Open all selected MCDS files as one dataset, restricted to the retained
# cells and to the pre-selected features, chunked along `cell`.
# The feature dimension is taken from `clustering_feature` instead of a
# hard-coded `chrom100k=` keyword, so this cell stays consistent with the
# later cells that already use that parameter.
# All warnings raised while opening are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    mcds = MCDS.open(mcds_path_list,
                     use_cells=cell_meta[cell_meta['PassFilter']].index,
                     sel_dict={clustering_feature: total_hvf},
                     chunks={'cell': 1000})

In [18]:
# Show the dataset summary (dimensions and coordinates) as the cell output.
mcds

<xarray.MCDS>
Dimensions:              (cell: 100449, chrom100k: 5553, count_type: 2, gene: 55487, mc_type: 2)
Coordinates:
  * mc_type              (mc_type) object 'CGN' 'CHN'
  * gene                 (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG00000064372.1'
  * count_type           (count_type) object 'mc' 'cov'
    strand_type          <U4 'both'
    gene_chrom           (gene) object dask.array<shape=(55487,), chunksize=(55487,)>
    gene_start           (gene) int64 dask.array<shape=(55487,), chunksize=(55487,)>
    gene_end             (gene) int64 dask.array<shape=(55487,), chunksize=(55487,)>
  * cell                 (cell) object '1A_M_0' '1A_M_1' ... '11F_M_999'
  * chrom100k            (chrom100k) int64 32 33 34 35 ... 26322 26332 26335
    chrom100k_chrom      (chrom100k) object dask.array<shape=(5553,), chunksize=(5553,)>
    chrom100k_bin_start  (chrom100k) int64 dask.array<shape=(5553,), chunksize=(5553,)>
    chrom100k_bin_end    (chrom100k) int64 dask.array<shap

## Add mC rate

In [19]:
# Compute the mC rate for the clustering feature from its `<feature>_da` array.
# The following cells read `<feature>_da_rate`, so this call presumably adds
# that variable to `mcds` in place - confirm against the MCDS API.
mcds.add_mc_rate(dim=clustering_feature, da=f'{clustering_feature}_da')

## Get Anndata

### CH

In [21]:
# Feature ids used for the CH matrix, read from a header-less one-column text
# file. The file is not created in the visible part of this notebook and must
# already exist in the working directory.
ch_hvf = pd.read_csv(
    'ch_hvf.txt',
    names=['chrom100k'],
    header=None,
    index_col=0,
).index

In [22]:
# Build the CH AnnData: restrict the dataset to the features listed in
# `ch_hvf`, then export the rate array for the 'CHN' context.
# The selection is keyed by `clustering_feature` rather than a hard-coded
# `chrom100k=` keyword, so the selected dimension, the array name and
# `var_dim` all follow the same parameter.
ch_adata = mcds.sel(**{clustering_feature: ch_hvf}).to_ann(
    f'{clustering_feature}_da_rate',
    var_dim=clustering_feature,
    mc_type='CHN')

Transforming to str index.


In [25]:
# Save the CH AnnData to the result directory.
ch_adata.write_h5ad(result_dir / 'mch_adata.norm_per_cell.hvf.h5ad')

In [30]:
# Show the CH AnnData summary (n_obs x n_vars) as the cell output.
ch_adata

AnnData object with n_obs × n_vars = 100449 × 3000 

### CG

In [26]:
# Feature ids used for the CG matrix, read from a header-less one-column text
# file. The file is not created in the visible part of this notebook and must
# already exist in the working directory.
cg_hvf = pd.read_csv(
    'cg_hvf.txt',
    names=['chrom100k'],
    header=None,
    index_col=0,
).index

In [27]:
# Build the CG AnnData: restrict the dataset to the features listed in
# `cg_hvf`, then export the rate array for the 'CGN' context.
# The selection is keyed by `clustering_feature` rather than a hard-coded
# `chrom100k=` keyword, so the selected dimension, the array name and
# `var_dim` all follow the same parameter.
cg_adata = mcds.sel(**{clustering_feature: cg_hvf}).to_ann(
    f'{clustering_feature}_da_rate',
    var_dim=clustering_feature,
    mc_type='CGN')

Transforming to str index.


In [28]:
# Save the CG AnnData to the result directory.
cg_adata.write_h5ad(result_dir / 'mcg_adata.norm_per_cell.hvf.h5ad')

In [29]:
# Show the CG AnnData summary (n_obs x n_vars) as the cell output.
# NOTE(review): the recorded output has 2999 features while `cg_hvf_top` is
# 3000 - confirm the missing feature is expected.
cg_adata

AnnData object with n_obs × n_vars = 100449 × 2999 

## Prepare Gene

In [31]:
# Compute gene-level rates without holding the result in memory
# (in_memory=False), processing cells in chunks of 10000.
# NOTE(review): 'gene_rate' is passed as the output prefix, so results are
# presumably written to files starting with that prefix in the working
# directory - confirm against the MCDS API.
mcds.add_gene_rate(in_memory=False, output_prefix='gene_rate', cell_chunks=10000)

