# Calculate Highly Variable Features And Get mC Fraction AnnData

## Purpose
The purpose of this step is to select highly variable features (HVF) and generate a cell-by-feature methylation fraction matrix for clustering. Highly variable features are selected by comparing each feature's normalized dispersion across cells.

## Input
- Filtered cell metadata;
- MCDS files;
- Feature list from basic feature filtering.

## Output
- Cell-by-HVF methylation fraction matrices stored in AnnData format, e.g., mCH adata and mCG adata (the mCG matrix is written to `mCG.HVF.h5ad`).

## Import

In [1]:
import yaml
import pandas as pd
import dask
import ALLCools
from ALLCools.mcds import MCDS
from wmb import cemba

## Parameters

In [2]:
# Load this notebook's settings from YAML and expose every top-level key as a
# notebook-level variable (e.g. `dataset`, `var_dim`).
with open('config/03b.yaml', 'r') as f:
    # An empty YAML file parses to None; fall back to an empty mapping so the
    # namespace update and the summary loop below still work.
    config = yaml.safe_load(f) or {}

# Write into globals() explicitly: mutating the dict returned by locals() only
# has an effect at module level, where it is the same namespace.
globals().update(config)

print('Notebook configs:')
for _k, _v in config.items():
    print(f'{_k} = {_v}')

Notebook configs:
dataset = mC
mcg_pattern = CGN
select_cells = select_cells.txt
var_dim = chrom100k


In [3]:
# Parameters
# Explicit values for the notebook settings; these assignments run after the
# YAML config cell and therefore take precedence over the values loaded there.
# NOTE(review): looks like a papermill-injected parameters cell — confirm
# before editing by hand.
dataset = "mC"  # selects the mC vs. m3C branches in the cells below
mcg_pattern = "CGN"  # passed as `mc_type` when building the mCG AnnData
select_cells = "select_cells.txt"  # forwarded to the cemba mapping-metric loader
var_dim = "chrom100k"  # feature dimension used when opening the MCDS


In [4]:
# Choose the MCDS location for the configured dataset: the snmC path when
# `dataset` is 'mC', the snm3C path for any other value.
mcds_path = (cemba.CEMBA_SNMC_MCDS_PATH
             if dataset == 'mC'
             else cemba.CEMBA_SNM3C_MCDS_PATH)

## Load Data

### Metadata

In [5]:
# Pick the mapping-metric loader that matches the dataset type, then call it
# once with the configured cell selection.
_load_mapping_metric = (cemba.get_mc_mapping_metric
                        if dataset == 'mC'
                        else cemba.get_m3c_mapping_metric)
metadata = _load_mapping_metric(select_cells=select_cells)

# One metadata row per cell, so the row count is the number of cells.
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')

Metadata of 12291 cells


In [6]:
# Preview the first rows of the cell metadata table as a quick sanity check.
metadata.head()

Unnamed: 0_level_0,mCCCFrac,mCGFrac,mCHFrac,FinalmCReads,DissectionRegion,Plate,Col384,Row384,Slice,Sample,Technology,InputReads,PassBasicQC,PlateNormCov
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
14B_M_0,0.006819,0.735136,0.01224,1061284.0,14B,CEMBA210617_14B_1,9,0,14,14B_210617,snmC-seq3,2643012,True,0.945824
14B_M_1,0.007127,0.750066,0.013914,1059639.0,14B,CEMBA210617_14B_1,20,0,14,14B_210617,snmC-seq3,2658348,True,0.944358
14B_M_2,0.007187,0.73675,0.014544,773689.0,14B,CEMBA210617_14B_1,21,0,14,14B_210617,snmC-seq3,1827306,True,0.689518
14B_M_3,0.008106,0.748119,0.014039,1567001.0,14B,CEMBA210617_14B_1,8,0,14,14B_210617,snmC-seq3,3767798,True,1.396523
14B_M_4,0.007109,0.758576,0.021379,1121801.0,14B,CEMBA210617_14B_1,9,1,14,14B_210617,snmC-seq3,3119214,True,0.999758


### MCDS

In [7]:
# Open the MCDS for every cell listed in the metadata index (no further cell
# subsetting happens here). While opening, dask's
# 'array.slicing.split_large_chunks' option is switched off.
with dask.config.set({'array.slicing.split_large_chunks': False}):
    total_mcds = MCDS.open(mcds_path, var_dim=var_dim, use_obs=metadata.index)

## Add mC Rate

In [8]:
# Add the mC rate for the `var_dim` features. The return value is not
# captured, so this call is relied on to update `total_mcds` itself.
# Per-cell normalization is enabled and `clip_norm_value` is set to 10
# (presumably an upper bound on the normalized values — confirm in ALLCools docs).
total_mcds.add_mc_rate(
    clip_norm_value=10,
    normalize_per_cell=True,
    var_dim=var_dim,
)

# Display the dataset so the newly added arrays can be inspected.
total_mcds

Unnamed: 0,Array,Chunk
Bytes,532.60 kiB,80.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 532.60 kiB 80.00 kiB Shape (27269,) (4096,) Count 7 Tasks 7 Chunks Type numpy.ndarray",27269  1,

Unnamed: 0,Array,Chunk
Bytes,532.60 kiB,80.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,213.04 kiB,32.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 213.04 kiB 32.00 kiB Shape (27269,) (4096,) Count 7 Tasks 7 Chunks Type float64 numpy.ndarray",27269  1,

Unnamed: 0,Array,Chunk
Bytes,213.04 kiB,32.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,213.04 kiB,32.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 213.04 kiB 32.00 kiB Shape (27269,) (4096,) Count 7 Tasks 7 Chunks Type int64 numpy.ndarray",27269  1,

Unnamed: 0,Array,Chunk
Bytes,213.04 kiB,32.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,213.04 kiB,32.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 213.04 kiB 32.00 kiB Shape (27269,) (4096,) Count 7 Tasks 7 Chunks Type int64 numpy.ndarray",27269  1,

Unnamed: 0,Array,Chunk
Bytes,213.04 kiB,32.00 kiB
Shape,"(27269,)","(4096,)"
Count,7 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.50 GiB,19.20 MiB
Shape,"(12291, 27269, 2, 2)","(2458, 4096, 1, 1)"
Count,169 Tasks,168 Chunks
Type,uint16,numpy.ndarray
"Array Chunk Bytes 2.50 GiB 19.20 MiB Shape (12291, 27269, 2, 2) (2458, 4096, 1, 1) Count 169 Tasks 168 Chunks Type uint16 numpy.ndarray",12291  1  2  2  27269,

Unnamed: 0,Array,Chunk
Bytes,2.50 GiB,19.20 MiB
Shape,"(12291, 27269, 2, 2)","(2458, 4096, 1, 1)"
Count,169 Tasks,168 Chunks
Type,uint16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.99 GiB,76.81 MiB
Shape,"(12291, 27269, 2)","(2458, 4096, 1)"
Count,1441 Tasks,84 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 4.99 GiB 76.81 MiB Shape (12291, 27269, 2) (2458, 4096, 1) Count 1441 Tasks 84 Chunks Type float64 numpy.ndarray",2  27269  12291,

Unnamed: 0,Array,Chunk
Bytes,4.99 GiB,76.81 MiB
Shape,"(12291, 27269, 2)","(2458, 4096, 1)"
Count,1441 Tasks,84 Chunks
Type,float64,numpy.ndarray


## Save AnnData

In [9]:
# Load the previously computed mCG highly-variable-feature table.
mcg_hvf = pd.read_hdf('mcg_hvf.hdf')

# Per-feature selection flags taken from the HVF table.
feature_select = mcg_hvf['feature_select']

# Attach the flags to the MCDS as a coordinate. The key is built from
# `mcg_pattern` (not a hard-coded "CGN") so it always matches the `mc_type`
# that is passed to `get_adata` when HVF selection is requested.
total_mcds.coords[f'{var_dim}_{mcg_pattern}_feature_select'] = feature_select

In [10]:
# Build the cell-by-feature AnnData for the mCG pattern, asking get_adata to
# restrict the features to the highly variable ones (`select_hvf=True`).
mcg_adata = total_mcds.get_adata(mc_type=mcg_pattern,
                                 var_dim=var_dim,
                                 select_hvf=True)

# Persist the matrix to disk; a plain string literal is enough because the
# file name has no placeholders.
mcg_adata.write_h5ad('mCG.HVF.h5ad')

# Show the AnnData summary (shape and var columns) as the cell output.
mcg_adata

  return func(*(_execute_task(a, cache) for a in args))


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'chrom' as categorical


AnnData object with n_obs × n_vars = 12291 × 15000
    var: 'chrom', 'cov_mean', 'end', 'start', 'CGN_feature_select'