# Differentially Methylated Genes - Pairwise

In [1]:
import pandas as pd
import anndata
import xarray as xr
from ALLCools.plot import *
from ALLCools.mcds import MCDS
from ALLCools.clustering import PairwiseDMG, cluster_enriched_features
import pathlib

## Parameters

In [3]:
# --- Inputs ---------------------------------------------------------------
# AnnData file read by the "Load" section; its .obs table supplies the
# per-cell metadata, including the cluster labels.
adata_path = '../step_by_step/100kb/adata.with_coords.h5ad'
# Column of adata.obs holding the cluster label of each cell.
cluster_col = 'L1'

# Point this at the directory that holds your gene-level MCDS files
# (the notebook reads GeneMetadata.csv.gz and *_da_frac.mcds from it).
gene_fraction_dir = 'gene_frac/'
# Name assigned to the cell-metadata index.
obs_dim = 'cell'
# Prefix of the MCDS data variable that is analysed (f'{var_dim}_da_frac').
var_dim = 'gene'

# --- Pairwise DMG settings ------------------------------------------------
# Value selected along the `mc_type` coordinate; also part of output file names.
mc_type = 'CHN'
# The remaining settings are forwarded unchanged to PairwiseDMG.
top_n = 1_000
adj_p_cutoff = 0.001
delta_rate_cutoff = 0.3
auroc_cutoff = 0.9
random_state = 0
n_jobs = 30

## Load

In [4]:
# Read the clustered cells; adata.obs carries the per-cell metadata.
adata = anndata.read_h5ad(adata_path)

# rename_axis returns a new frame, so adata.obs keeps its own index name.
cell_meta = adata.obs.rename_axis(index=obs_dim)

# Gene annotation table shipped alongside the gene-level MCDS files.
gene_meta_path = f'{gene_fraction_dir}/GeneMetadata.csv.gz'
gene_meta = pd.read_csv(gene_meta_path, index_col=0)

# Open every matching MCDS as one dataset.
# NOTE(review): use_obs presumably restricts the dataset to the cells in
# cell_meta - confirm against the ALLCools MCDS.open documentation.
mcds_pattern = f'{gene_fraction_dir}/*_da_frac.mcds'
gene_mcds = MCDS.open(mcds_pattern, use_obs=cell_meta.index)
gene_mcds

Unnamed: 0,Array,Chunk
Bytes,285.31 kB,285.31 kB
Shape,"(35664,)","(35664,)"
Count,15 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 15 Tasks 1 Chunks Type object numpy.ndarray",35664  1,

Unnamed: 0,Array,Chunk
Bytes,285.31 kB,285.31 kB
Shape,"(35664,)","(35664,)"
Count,15 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.31 kB,285.31 kB
Shape,"(35664,)","(35664,)"
Count,13 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray",35664  1,

Unnamed: 0,Array,Chunk
Bytes,285.31 kB,285.31 kB
Shape,"(35664,)","(35664,)"
Count,13 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.31 kB,285.31 kB
Shape,"(35664,)","(35664,)"
Count,13 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray",35664  1,

Unnamed: 0,Array,Chunk
Bytes,285.31 kB,285.31 kB
Shape,"(35664,)","(35664,)"
Count,13 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.41 GB,374.90 MB
Shape,"(4958, 35664, 2)","(1314, 35664, 2)"
Count,18 Tasks,4 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.41 GB 374.90 MB Shape (4958, 35664, 2) (1314, 35664, 2) Count 18 Tasks 4 Chunks Type float32 numpy.ndarray",2  35664  4958,

Unnamed: 0,Array,Chunk
Bytes,1.41 GB,374.90 MB
Shape,"(4958, 35664, 2)","(1314, 35664, 2)"
Count,18 Tasks,4 Chunks
Type,float32,numpy.ndarray


## Pairwise DMG

In [5]:
# Configure the pairwise DMG caller from the values in the Parameters section.
# NOTE(review): max_cell_per_group is hard-coded here rather than set in the
# Parameters section; it presumably caps the number of cells used per
# cluster - confirm against the ALLCools documentation.
pwdmg = PairwiseDMG(
    max_cell_per_group=1000,
    top_n=top_n,
    adj_p_cutoff=adj_p_cutoff,
    delta_rate_cutoff=delta_rate_cutoff,
    auroc_cutoff=auroc_cutoff,
    random_state=random_state,
    n_jobs=n_jobs,
)

In [6]:
# Gene-level values for the chosen methylation type, and one cluster label per cell.
gene_frac = gene_mcds[f'{var_dim}_da_frac'].sel(mc_type=mc_type)
cluster_labels = cell_meta[cluster_col]

# Run every pairwise cluster comparison; progress is printed as pairs finish.
pwdmg.fit_predict(x=gene_frac, groups=cluster_labels)

Generating cluster AnnData files
Computing pairwise DMG
406 pairwise DMGs
1/406 finished
41/406 finished
81/406 finished
121/406 finished
161/406 finished
201/406 finished
241/406 finished
281/406 finished
321/406 finished
361/406 finished
401/406 finished


In [7]:
# Persist the full pairwise DMG table under the key 'data', then preview it.
dmg_table = pwdmg.dmg_table
dmg_table.to_hdf(f'{cluster_col}.PairwiseDMG.{mc_type}.hdf', key='data')
dmg_table.head()

Unnamed: 0_level_0,pvals_adj,left-right,delta,hypo_in,hyper_in,AUROC
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000037610.15,7.447165e-59,c1-c11,1.401063,c11,c1,0.999361
ENSMUSG00000040118.15,7.447165e-59,c1-c11,-1.480275,c1,c11,1.0
ENSMUSG00000010066.15,7.447165e-59,c1-c11,1.672248,c11,c1,0.999306
ENSMUSG00000026058.11,7.447165e-59,c1-c11,-0.809622,c1,c11,0.999927
ENSMUSG00000031543.18,7.447165e-59,c1-c11,2.033618,c11,c1,0.999178


## Aggregating Cluster DMG

Each cluster's DMGs are scored by a weighted total AUROC aggregated across all of its pairwise comparisons.

### Aggregate Pairwise Comparisons

In [11]:
# Combine the pairwise results into one entry per cluster
# (iterated below as cluster -> DMG scores).
cluster_dmgs = pwdmg.aggregate_pairwise_dmg(
    adata,
    groupby=cluster_col,
)

In [13]:
# Save each cluster's aggregated DMG scores under a key named after the cluster.
# mode='w' starts from an empty file on every run: HDFStore's default append
# mode would keep keys for clusters written by an earlier run (e.g. after
# re-clustering) and does not reclaim the space of overwritten keys.
min_dmg_score = 0.0001  # entries with a score at or below this are not saved
with pd.HDFStore(f'{cluster_col}.ClusterRankedPWDMG.{mc_type}.hdf', mode='w') as hdf:
    for cluster, dmgs in cluster_dmgs.items():
        hdf[cluster] = dmgs[dmgs > min_dmg_score]

#