In [1]:
import pandas as pd
import anndata
import numpy

## Parameters

In [2]:
# Minimum methylation-rate difference (cluster_to minus cluster_from) a DMR must
# exceed to be kept when the pairwise DMR index is built further down.
delta_dmr_rate_cutoff = 0.3

In [3]:
# Parameters
# NOTE(review): this re-assigns the same value as the previous cell; presumably a
# tool-injected parameters cell (e.g. papermill) that overrides the default — confirm.
delta_dmr_rate_cutoff = 0.3


## Load Data

In [4]:
# Read the hit matrix and expose it as a dense boolean DataFrame:
# rows are DMRs (obs_names), columns are clusters (var_names).
hits_adata = anndata.read_h5ad('RelatedClusterAndDMR.hypo_hits.h5ad')
dmr_hits = pd.DataFrame(
    hits_adata.X.astype(bool).todense(),
    index=hits_adata.obs_names,
    columns=hits_adata.var_names,
)
dmr_hits.head()

index,CA1_Chrm3,CA3_Cadm2,DG_dg-all
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Subchr1_14,True,False,False
Subchr1_37,False,True,False
Subchr1_38,True,True,False
Subchr1_61,False,False,True
Subchr1_79,True,False,True


In [5]:
# Load the per-DMR rate table (presumably mCG rate, judging by the file name);
# the preview below shows DMRs as rows and clusters as columns.
# NOTE(review): pd.read_msgpack is deprecated since pandas 0.25 and removed in 1.0;
# the upstream file needs re-exporting to a supported format before pandas is upgraded.
dmr_rate = pd.read_msgpack('RelatedClusterAndDMR.mcg_rate.msg')
dmr_rate.head()

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,CA1_Chrm3,CA3_Cadm2,DG_dg-all
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Subchr1_14,0.419355,0.482759,0.672131
Subchr1_37,0.58427,0.248,0.669725
Subchr1_38,0.290909,0.462687,0.907407
Subchr1_61,0.574713,0.659341,0.418182
Subchr1_79,0.357143,0.510638,0.067797


In [6]:
# Load the DMG table and normalise cluster names (spaces -> underscores) so they
# match the column labels used by dmr_hits / dmr_rate.
dmg = pd.read_msgpack('RelatedDMG.msg')
for cluster_col in ('cluster_from', 'cluster_to'):
    dmg[cluster_col] = dmg[cluster_col].str.replace(' ', '_')
dmg.head()

Unnamed: 0,pvals_adj,gene_id,cluster_from,cluster_to,gene_name,-lgp,AUROC
0,0.0,ENSMUSG00000098760.1,CA1_Chrm3,DG_dg-all,Gm2164,1000.0,0.999939
1,0.0,ENSMUSG00000054728.16,CA1_Chrm3,DG_dg-all,Phactr1,1000.0,0.999897
2,0.0,ENSMUSG00000034275.18,CA1_Chrm3,DG_dg-all,Igsf9b,1000.0,0.999874
3,0.0,ENSMUSG00000027674.16,CA1_Chrm3,DG_dg-all,Pex5l,1000.0,0.999839
4,0.0,ENSMUSG00000057716.6,CA1_Chrm3,DG_dg-all,Tmem178b,1000.0,0.999798


In [7]:
# Load the DMR-gene correlation records; the preview below shows the columns
# DMR, Gene, Corr, chrom and DMR_to_gene_dist.
# NOTE(review): pd.read_msgpack is deprecated since pandas 0.25 and removed in 1.0.
corr = pd.read_msgpack('RelatedCorr.msg')
corr.head()

Unnamed: 0,DMR,Gene,Corr,chrom,DMR_to_gene_dist
154,Subchr1_284,ENSMUSG00000051951.5,0.316056,chr1,-480154.0
168,Subchr1_298,ENSMUSG00000051951.5,0.452112,chr1,-472864.0
204,Subchr1_308,ENSMUSG00000051951.5,0.396429,chr1,-467551.0
212,Subchr1_309,ENSMUSG00000051951.5,0.372922,chr1,-466636.0
273,Subchr1_328,ENSMUSG00000051951.5,0.536126,chr1,-454392.5


## Prepare pairwise DMR index

In [8]:
# Map each (cluster_from, cluster_to) pair found in dmg to the index of DMRs that
# are hypo hits in cluster_from, not hypo hits in cluster_to, and whose rate
# in cluster_to exceeds the rate in cluster_from by more than the cutoff.
pairwise_dmr_index = {}
cluster_pairs = dmg[['cluster_from', 'cluster_to']].drop_duplicates()
for cluster_from, cluster_to in cluster_pairs.itertuples(index=False, name=None):
    # cluster_from is the hypo side, cluster_to the hyper side
    delta_passes = (dmr_rate[cluster_to] - dmr_rate[cluster_from]) > delta_dmr_rate_cutoff
    not_hypo_in_to = ~dmr_hits[cluster_to]
    hypo_in_from = dmr_hits[cluster_from]
    selected = not_hypo_in_to & hypo_in_from & delta_passes

    # keep only the DMR labels where every condition holds
    pairwise_dmr_index[(cluster_from, cluster_to)] = selected[selected].index

## Final DMG and DMR lists for each cluster pair

In [9]:
# For every (cluster_from, cluster_to) pair, keep the correlation records whose DMR
# is in the pair's DMR index and whose gene is in the pair's DMG list, then store the
# surviving DMRs, genes and correlation rows under '<cluster_from>/<cluster_to>/...'.
# NOTE(review): HDFStore opens in append mode by default, so keys written by an
# earlier run stay in the file; consider mode='w' if this notebook is the only writer.
# NOTE(review): cluster names such as 'DG_dg-all' are not valid Python identifiers,
# which is presumably what triggers the PyTables naming warning seen in the output;
# the data is still written.
with pd.HDFStore('FinalDMGDMR.h5') as f:
    for (cluster_from, cluster_to), cluster_pair_df in dmg.groupby(
        ['cluster_from', 'cluster_to']):
        # first get related dmr and dmg separately
        cluster_pair_related_dmr = pairwise_dmr_index[(cluster_from, cluster_to)]
        cluster_pair_related_gene = cluster_pair_df['gene_id'].unique()

        # use them to filter corr
        cluster_pair_related_corr = corr[
            corr['DMR'].isin(cluster_pair_related_dmr)
            & corr['Gene'].isin(cluster_pair_related_gene)]

        # then use the remaining dmr and dmg as the final lists
        # the remaining DMR / DMG are:
        # - sig hypo in one cluster compared to the other
        # - present in the correlation table (sig correlation)
        cluster_pair_related_dmr = pd.Series(cluster_pair_related_corr['DMR'].unique())
        cluster_pair_related_gene = pd.Series(cluster_pair_related_corr['Gene'].unique())

        print(cluster_from, cluster_to, cluster_pair_related_dmr.size, cluster_pair_related_gene.size)

        # all three objects of a pair share the same key prefix
        key_prefix = f'{cluster_from}/{cluster_to}'
        f[f'{key_prefix}/DMR'] = cluster_pair_related_dmr
        f[f'{key_prefix}/Gene'] = cluster_pair_related_gene
        f[f'{key_prefix}/Corr'] = cluster_pair_related_corr

CA1_Chrm3 CA3_Cadm2 18516 470
CA1_Chrm3 DG_dg-all 29715 435


  check_attribute_name(name)


CA3_Cadm2 CA1_Chrm3 10307 329
CA3_Cadm2 DG_dg-all 20839 451
DG_dg-all CA1_Chrm3 12631 549
DG_dg-all CA3_Cadm2 12817 691
