# Prepare

In [1]:
import pathlib
import warnings

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from ALLCools.mcds.MCDS import MCDS
from cemba_data.plot import cutoff_vs_cell_remain
from pybedtools import BedTool

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)
  'DataArray', pd.Series, pd.DataFrame, pd.Panel]:


## Parameters

In [2]:
cell_tidy_data_path = 'cell_tidy_data.msg'
cluster_col = 'SubType'
clustering_feature = 'gene'
dask_distribute = True
in_memory = False
mcds_path_list = list(
    pathlib.Path('/home/hanliu/project/mouse_rostral_brain/dataset/').glob(
        '*mcds'))
exclude_chromosome = ['chrM', 'chrY']
black_list_path = '/home/hanliu/project/mouse_rostral_brain/misc/mm10-blacklist.v2.bed.gz'
min_feature_cov = 30
max_feature_cov = 8000
mc_type = 'CHN'
filter_by_ncbi = True
ncbi_path = '/home/hanliu/ref/ncbi/gene2ensembl.mouse.tsv.gz'

### Stable parameters

In [21]:
output_dir = 'Adata'
output_dir = pathlib.Path(output_dir)
output_dir.mkdir(exist_ok=True)

## Load Data

In [4]:
if dask_distribute:
    from dask.distributed import Client
    client = Client(dashboard_address=':5555')

Port 5555 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


### Clustering results

In [5]:
cell_tidy_data = pd.read_msgpack(cell_tidy_data_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


### Raw Count Matrix (before filter)

In [6]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    mcds = MCDS.open(mcds_path_list,
                     use_cells=cell_tidy_data.index,
                     chunks={'cell': 1000})
if in_memory:
    mcds.load()

In [7]:
mcds

<xarray.MCDS>
Dimensions:              (cell: 1000, chrom100k: 27269, count_type: 2, gene: 55487, mc_type: 2)
Coordinates:
  * mc_type              (mc_type) object 'CGN' 'CHN'
  * gene                 (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG00000064372.1'
  * count_type           (count_type) object 'mc' 'cov'
    strand_type          <U4 'both'
    gene_chrom           (gene) object dask.array<shape=(55487,), chunksize=(55487,)>
    gene_start           (gene) int64 dask.array<shape=(55487,), chunksize=(55487,)>
    gene_end             (gene) int64 dask.array<shape=(55487,), chunksize=(55487,)>
  * cell                 (cell) object '5E_M_1338' '5J-2_M_1215' ... '3F_M_2184'
  * chrom100k            (chrom100k) int64 0 1 2 3 4 ... 27265 27266 27267 27268
    chrom100k_chrom      (chrom100k) object dask.array<shape=(27269,), chunksize=(27269,)>
    chrom100k_bin_start  (chrom100k) int64 dask.array<shape=(27269,), chunksize=(27269,)>
    chrom100k_bin_end    (chrom100k) int64 

## Filter Feature

### Remove some chromosome

In [8]:
mcds = mcds.sel(
    {clustering_feature: ~mcds.coords[f'{clustering_feature}_chrom'].isin(exclude_chromosome)})

### Remove blacklist

In [9]:
spetial_char = '_bin' if clustering_feature.startswith('chrom') else ''

feature_bed_df = pd.DataFrame([
    mcds.coords[f'{clustering_feature}_chrom'].to_pandas(),
    mcds.coords[f'{clustering_feature}{spetial_char}_start'].to_pandas(),
    mcds.coords[f'{clustering_feature}{spetial_char}_end'].to_pandas()
],
                              index=['chrom', 'start', 'end'],
                              columns=mcds.get_index(clustering_feature)).T
feature_bed = BedTool.from_dataframe(feature_bed_df)

  3: pd.Panel}
  3: pd.Panel}
  3: pd.Panel}


In [10]:
black_list_bed = BedTool(black_list_path)
black_feature = feature_bed.intersect(black_list_bed, f=0.2, wa=True)
black_feature_index = black_feature.to_dataframe().set_index(
    ['chrom', 'start', 'end']).index
black_feature_id = pd.Index(feature_bed_df.reset_index()\
                       .set_index(['chrom', 'start', 'end'])\
                       .loc[black_feature_index][clustering_feature])

In [11]:
mcds = mcds.sel({clustering_feature: ~mcds.get_index(clustering_feature).isin(black_feature_id)})

## Filter feature by NCBI id
- Since Allen's data must have NCBI id to match

In [12]:
if clustering_feature == 'gene':
    have_entrez_id = pd.read_csv(ncbi_path, sep='\t', index_col='GeneID')['Ensembl_gene_identifier']
    have_entrez_id = set(have_entrez_id.drop_duplicates().tolist())
    _index = mcds.get_index(clustering_feature)
    judge = _index.map(lambda i: i.split('.')[0] in have_entrez_id)
    mcds = mcds.sel({clustering_feature: _index[judge]})

### Remove by mean cov

In [13]:
# # only calculate CHN
# fig, axes = cutoff_vs_cell_remain(mcds[f'{clustering_feature}_da']\
#                                   .sel(count_type='cov', mc_type='CHN')\
#                                   .mean(axis=0)\
#                                   .load())

In [14]:
before = mcds.coords[clustering_feature].size
mcds = mcds.filter_region_cov(dim=clustering_feature,
                              da=f'{clustering_feature}_da',
                              mc_type=mc_type,
                              min_cov=min_feature_cov,
                              max_cov=max_feature_cov)
after = mcds.coords[clustering_feature].size
print(
    f'Filter {clustering_feature} by [{min_feature_cov}, {max_feature_cov}], '
    f'filter rate {(after / before):.2f}. {after} features remained.')

Filter gene by [30, 8000], filter rate 0.89. 20799 features remained.


In [15]:
mcds

<xarray.MCDS>
Dimensions:              (cell: 1000, chrom100k: 27269, count_type: 2, gene: 20799, mc_type: 2)
Coordinates:
  * mc_type              (mc_type) object 'CGN' 'CHN'
  * gene                 (gene) object 'ENSMUSG00000051951.5' ... 'ENSMUSG00000035299.16'
  * count_type           (count_type) object 'mc' 'cov'
    strand_type          <U4 'both'
    gene_chrom           (gene) object dask.array<shape=(20799,), chunksize=(20799,)>
    gene_start           (gene) int64 dask.array<shape=(20799,), chunksize=(20799,)>
    gene_end             (gene) int64 dask.array<shape=(20799,), chunksize=(20799,)>
  * cell                 (cell) object '5E_M_1338' '5J-2_M_1215' ... '3F_M_2184'
  * chrom100k            (chrom100k) int64 0 1 2 3 4 ... 27265 27266 27267 27268
    chrom100k_chrom      (chrom100k) object dask.array<shape=(27269,), chunksize=(27269,)>
    chrom100k_bin_start  (chrom100k) int64 dask.array<shape=(27269,), chunksize=(27269,)>
    chrom100k_bin_end    (chrom100k) int64

## Add rate

In [16]:
mcds.add_mc_rate(dim=clustering_feature, da=f'{clustering_feature}_da')

In [17]:
data = mcds[f'{clustering_feature}_da_rate'].sel(mc_type=mc_type).values

## Make Adata

In [18]:
adata = anndata.AnnData(X=data,
                        obs=cell_tidy_data.reindex(mcds.get_index('cell')),
                        var=feature_bed_df.reindex(
                            mcds.get_index(clustering_feature)))

In [19]:
adata

AnnData object with n_obs × n_vars = 1000 × 20799 
    obs: 'AllcPath', 'CCC_Rate', 'CG_Rate', 'CG_RateAdj', 'CH_Rate', 'CH_RateAdj', 'FinalReads', 'InputReads', 'MappedReads', 'Region', 'index_name', 'uid', 'BamFilteringRate', 'MappingRate', 'Pos96', 'Plate', 'Col96', 'Row96', 'Col384', 'Row384', 'FACS_Date', 'Slice', 'MajorRegion', 'SubRegion', 'CellClass', 'l1-umap_0', 'l1-umap_1', 'l1-tsne_0', 'l1-tsne_1', 'MajorType', 'l2-umap_0', 'l2-umap_1', 'l2-tsne_0', 'l2-tsne_1', 'SubType', 'l3-umap_0', 'l3-umap_1', 'l3-tsne_0', 'l3-tsne_1', 'L1CellClass', 'class_tsne_0', 'class_tsne_1', 'class_umap_0', 'class_umap_1', 'Order', 'RegionName', 'DetailRegion', 'PotentialOverlap (MMB)', 'Anterior (CCF coords)', 'Posterior (CCF coords)', 'MajorRegionColor', 'SubRegionColor', 'DissectionRegionColor'
    var: 'chrom', 'start', 'end'

In [20]:
adata.write_h5ad(output_dir / 'mc.cell_by_feature.cov_filter.rate.h5ad')

... storing 'Region' as categorical
... storing 'index_name' as categorical
... storing 'uid' as categorical
... storing 'Pos96' as categorical
... storing 'Plate' as categorical
... storing 'MajorRegion' as categorical
... storing 'SubRegion' as categorical
... storing 'CellClass' as categorical
... storing 'MajorType' as categorical
... storing 'SubType' as categorical
... storing 'L1CellClass' as categorical
... storing 'RegionName' as categorical
... storing 'DetailRegion' as categorical
... storing 'PotentialOverlap (MMB)' as categorical
... storing 'MajorRegionColor' as categorical
... storing 'SubRegionColor' as categorical
... storing 'DissectionRegionColor' as categorical
... storing 'chrom' as categorical
... storing 'start' as categorical
... storing 'end' as categorical
