# Calculate Highly Variable Features And Get mC Fraction AnnData

## Purpose
The purpose of this step is to select highly variable features (HVFs) and generate a cell-by-feature methylation fraction matrix for clustering. Highly variable features are selected by comparing each feature's normalized dispersion across cells.

## Input
- Filtered cell metadata;
- MCDS files;
- Feature list from basic feature filtering

## Output
- Cell-by-HVF methylation fraction matrices stored in AnnData format (e.g., an mCH AnnData and an mCG AnnData).

## Import

In [1]:
import yaml
import pandas as pd
import dask
from ALLCools.mcds import MCDS
from wmb import cemba

## Parameters

In [2]:
# Load the notebook configuration and expose every key as a notebook-level
# variable (e.g. `dataset`, `var_dim`, `n_top_feature`).
with open('config/02.yaml', 'r') as f:
    # An empty YAML document parses to None; fall back to an empty mapping so
    # the update below does not raise TypeError.
    config = yaml.safe_load(f) or {}

# Write into globals() explicitly. Updating the mapping returned by locals()
# only defines variables when the code happens to run at module scope, where
# locals() and globals() are the same namespace.
globals().update(config)

# Echo the effective configuration so it is recorded in the cell output.
print('Notebook configs:')
for _k, _v in config.items():
    print(f'{_k} = {_v}')

Notebook configs:
dataset = mC
downsample = 30000
feature_path = FeatureList.BasicFilter.txt
hvf_method = SVR
load = True
mcg_pattern = CGN
mch_pattern = CHN
n_top_feature = 15000
select_cells = select_cells.txt
var_dim = chrom100k


In [3]:
# Parameters
# Data source and feature space: `dataset` decides which MCDS path and
# metadata loader are used; `var_dim` is the feature dimension; the two
# patterns are the mc_type values passed to the HVF calculations.
dataset = 'mC'
var_dim = 'chrom100k'
mch_pattern = 'CHN'
mcg_pattern = 'CGN'

# Input files: the cell selection and the basic-filtered feature list.
select_cells = 'select_cells.txt'
feature_path = 'FeatureList.BasicFilter.txt'

# Highly variable feature selection: 'SVR' uses the SVR-based method, any
# other value uses the bin-based method.
hvf_method = 'SVR'
n_top_feature = 15000

# Runtime trade-offs: HVFs are computed on a random subset when there are more
# cells than `downsample`; `load` enables eager loading of the fraction array.
downsample = 30000
load = True


In [4]:
# Pick the MCDS location matching the configured dataset: 'mC' selects the
# snmC path, any other value falls through to the snm3C path.
mcds_path = (cemba.CEMBA_SNMC_MCDS_PATH
             if dataset == 'mC' else cemba.CEMBA_SNM3C_MCDS_PATH)

## Load Data

### Metadata

In [5]:
# Choose the mapping-metric loader for the configured dataset ('mC' -> mC
# loader, anything else -> m3C loader), then read the selected cells.
_get_mapping_metric = (cemba.get_mc_mapping_metric
                       if dataset == 'mC' else cemba.get_m3c_mapping_metric)
metadata = _get_mapping_metric(select_cells=select_cells)

# Row count of the metadata table; the downsampling step compares against it.
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')

Metadata of 12291 cells


In [6]:
# Preview the first rows of the cell metadata as a quick sanity check.
metadata.head()

Unnamed: 0_level_0,mCCCFrac,mCGFrac,mCHFrac,FinalmCReads,DissectionRegion,Plate,Col384,Row384,Slice,Sample,Technology,InputReads,PassBasicQC,PlateNormCov
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
14B_M_0,0.006819,0.735136,0.01224,1061284.0,14B,CEMBA210617_14B_1,9,0,14,14B_210617,snmC-seq3,2643012,True,0.945824
14B_M_1,0.007127,0.750066,0.013914,1059639.0,14B,CEMBA210617_14B_1,20,0,14,14B_210617,snmC-seq3,2658348,True,0.944358
14B_M_2,0.007187,0.73675,0.014544,773689.0,14B,CEMBA210617_14B_1,21,0,14,14B_210617,snmC-seq3,1827306,True,0.689518
14B_M_3,0.008106,0.748119,0.014039,1567001.0,14B,CEMBA210617_14B_1,8,0,14,14B_210617,snmC-seq3,3767798,True,1.396523
14B_M_4,0.007109,0.758576,0.021379,1121801.0,14B,CEMBA210617_14B_1,9,1,14,14B_210617,snmC-seq3,3119214,True,0.999758


### MCDS

In [7]:
# Read the feature IDs kept by basic filtering. The file has no header row and
# the IDs sit in its first column, which becomes the index.
use_features = pd.read_csv(feature_path, header=None, index_col=0).index
# Name the index after the configured feature dimension instead of a
# hard-coded 'chrom100k', so it stays in sync with the `var_dim` used to
# select these features from the MCDS.
use_features.name = var_dim

In [8]:
# Open the MCDS with dask's 'array.slicing.split_large_chunks' option set to
# False; the option only applies inside this `with` block.
with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    # Open with every cell in `metadata` (no downsampling at this point), then
    # keep only the features listed in `use_features` along `var_dim`.
    # NOTE(review): `use_obs` presumably restricts the dataset to these cell
    # IDs -- confirm against the ALLCools MCDS.open documentation.
    total_mcds = MCDS.open(mcds_path,
                           var_dim=var_dim,
                           use_obs=metadata.index).sel({var_dim: use_features})

## Add mC Rate

In [9]:
# Compute methylation rates for the `var_dim` features on the full dataset.
# The return value is not used, so the call is relied on to modify
# `total_mcds` in place.
# NOTE(review): presumably this adds the f'{var_dim}_da_frac' array that later
# cells read; `clip_norm_value=10` looks like an upper clip on the per-cell
# normalized value -- confirm against the ALLCools documentation.
total_mcds.add_mc_rate(var_dim=var_dim,
                       normalize_per_cell=True,
                       clip_norm_value=10)

# Display the dataset summary as the cell output.
total_mcds

Unnamed: 0,Array,Chunk
Bytes,441.86 kiB,75.35 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 441.86 kiB 75.35 kiB Shape (22623,) (3858,) Count 14 Tasks 7 Chunks Type numpy.ndarray",22623  1,

Unnamed: 0,Array,Chunk
Bytes,441.86 kiB,75.35 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,176.74 kiB,30.14 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 176.74 kiB 30.14 kiB Shape (22623,) (3858,) Count 14 Tasks 7 Chunks Type float64 numpy.ndarray",22623  1,

Unnamed: 0,Array,Chunk
Bytes,176.74 kiB,30.14 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,176.74 kiB,30.14 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 176.74 kiB 30.14 kiB Shape (22623,) (3858,) Count 14 Tasks 7 Chunks Type int64 numpy.ndarray",22623  1,

Unnamed: 0,Array,Chunk
Bytes,176.74 kiB,30.14 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,176.74 kiB,30.14 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 176.74 kiB 30.14 kiB Shape (22623,) (3858,) Count 14 Tasks 7 Chunks Type int64 numpy.ndarray",22623  1,

Unnamed: 0,Array,Chunk
Bytes,176.74 kiB,30.14 kiB
Shape,"(22623,)","(3858,)"
Count,14 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.07 GiB,18.09 MiB
Shape,"(12291, 22623, 2, 2)","(2458, 3858, 1, 1)"
Count,337 Tasks,168 Chunks
Type,uint16,numpy.ndarray
"Array Chunk Bytes 2.07 GiB 18.09 MiB Shape (12291, 22623, 2, 2) (2458, 3858, 1, 1) Count 337 Tasks 168 Chunks Type uint16 numpy.ndarray",12291  1  2  2  22623,

Unnamed: 0,Array,Chunk
Bytes,2.07 GiB,18.09 MiB
Shape,"(12291, 22623, 2, 2)","(2458, 3858, 1, 1)"
Count,337 Tasks,168 Chunks
Type,uint16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.14 GiB,72.35 MiB
Shape,"(12291, 22623, 2)","(2458, 3858, 1)"
Count,1609 Tasks,84 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 4.14 GiB 72.35 MiB Shape (12291, 22623, 2) (2458, 3858, 1) Count 1609 Tasks 84 Chunks Type float64 numpy.ndarray",2  22623  12291,

Unnamed: 0,Array,Chunk
Bytes,4.14 GiB,72.35 MiB
Shape,"(12291, 22623, 2)","(2458, 3858, 1)"
Count,1609 Tasks,84 Chunks
Type,float64,numpy.ndarray


### If downsample

In [10]:
# HVFs are computed on `mcds`: the full dataset unless downsampling is
# enabled and there are more cells than the requested sample size.
if not downsample or total_cells <= downsample:
    mcds = total_mcds
else:
    print(f'Downsample cells to {downsample} to calculate HVF.')
    # Fixed seed so the same cells are drawn on every run.
    downsample_cell_ids = metadata.sample(downsample, random_state=0).index
    # Boolean mask over the 'cell' index marking the sampled cells.
    _is_sampled = total_mcds.get_index('cell').isin(downsample_cell_ids)
    mcds = total_mcds.sel({'cell': _is_sampled})

In [11]:
# Eagerly load the fraction array only when `load` is enabled and the
# (possibly downsampled) dataset has at most 50000 cells.
if load and (mcds.get_index('cell').size <= 50000):
    # load the relevant data so the computation can be faster, watch out memory!
    mcds[f'{var_dim}_da_frac'].load()

  return func(*(_execute_task(a, cache) for a in args))


The RuntimeWarning is expected (due to cov == 0). You can ignore it.

## Highly Variable Feature

### mCH

In [12]:
# Arguments shared by both HVF methods for the mCH context.
_mch_hvf_kwargs = dict(var_dim=var_dim,
                       mc_type=mch_pattern,
                       n_top_feature=n_top_feature)

if hvf_method == 'SVR':
    # SVR-based selection, with plotting disabled.
    mch_hvf = mcds.calculate_hvf_svr(plot=False, **_mch_hvf_kwargs)
else:
    # Bin-based selection with the mCH-specific mean/coverage bin sizes.
    mch_hvf = mcds.calculate_hvf(min_mean=0,
                                 max_mean=5,
                                 bin_min_features=5,
                                 mean_binsize=0.05,
                                 cov_binsize=100,
                                 **_mch_hvf_kwargs)

Fitting SVR with gamma 0.0442, predicting feature dispersion using mc_frac_mean and cov_mean.


Total Feature Number:     22623
Highly Variable Feature:  15000 (66.3%)


In [13]:
# Save the mCH HVF result to 'mch_hvf.hdf' under the key 'data'.
mch_hvf.to_hdf('mch_hvf.hdf', key='data')

### mCG

In [14]:
# Arguments shared by both HVF methods for the mCG context.
_mcg_hvf_kwargs = dict(var_dim=var_dim,
                       mc_type=mcg_pattern,
                       n_top_feature=n_top_feature)

if hvf_method == 'SVR':
    # SVR-based selection, with plotting disabled.
    mcg_hvf = mcds.calculate_hvf_svr(plot=False, **_mcg_hvf_kwargs)
else:
    # Bin-based selection with the mCG-specific mean/coverage bin sizes.
    mcg_hvf = mcds.calculate_hvf(min_mean=0,
                                 max_mean=5,
                                 bin_min_features=5,
                                 mean_binsize=0.02,
                                 cov_binsize=20,
                                 **_mcg_hvf_kwargs)

Fitting SVR with gamma 0.0442, predicting feature dispersion using mc_frac_mean and cov_mean.


Total Feature Number:     22623
Highly Variable Feature:  15000 (66.3%)


In [15]:
# Save the mCG HVF result to 'mcg_hvf.hdf' under the key 'data'.
mcg_hvf.to_hdf('mcg_hvf.hdf', key='data')