In [1]:
import pandas as pd
import anndata
import numpy

## Parameter

In [2]:
# Threshold used when selecting pairwise DMRs further down: a DMR is kept for a
# (cluster_from, cluster_to) pair only if dmr_rate[cluster_to] - dmr_rate[cluster_from]
# is strictly greater than this value.
delta_dmr_rate_cutoff = 0.3

In [3]:
# Parameters
# NOTE(review): this cell re-assigns the same name and value as the previous cell; it looks
# like an injected-parameters cell (e.g. written by papermill) — confirm before removing
# either copy.
delta_dmr_rate_cutoff = 0.3


## Load Data

In [4]:
# Load the hypo-hit matrix and expose it as a boolean DataFrame whose rows are the AnnData
# observations and whose columns are the AnnData variables (DMRs x clusters, judging by the
# preview below).
# The AnnData object gets its own name so that `dmr_hits` only ever refers to the DataFrame,
# and `.toarray()` is used so the frame is built from a plain ndarray rather than np.matrix.
hypo_hits_adata = anndata.read_h5ad('RelatedClusterAndDMR.hypo_hits.h5ad')
dmr_hits = pd.DataFrame(
    hypo_hits_adata.X.astype(bool).toarray(),
    index=hypo_hits_adata.obs_names,
    columns=hypo_hits_adata.var_names,
)
# The AnnData object is not needed once the DataFrame exists; drop it to release memory.
del hypo_hits_adata
dmr_hits.head()

index,CA1_Ak5,CA1_Chrm3,CA1_Kif26a,CA1_Lingo2,CA1_Ptprg,CA3_Cadm2,CA3_Efnb2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Subchr1_14,False,True,False,False,False,False,False
Subchr1_23,False,False,False,False,False,False,True
Subchr1_37,False,False,False,False,False,True,True
Subchr1_38,False,True,False,False,False,True,True
Subchr1_65,False,False,False,True,False,False,False


In [5]:
# Load the per-DMR, per-cluster rate table (same row/column labels as the hit matrix in the
# preview); judging by the file name these are mCG rates — TODO confirm.
# NOTE(review): pd.read_msgpack prints a deprecation warning in this run and was removed in
# pandas 1.0, so this file must be re-exported (e.g. to parquet) before pandas is upgraded.
dmr_rate = pd.read_msgpack('RelatedClusterAndDMR.mcg_rate.msg')
dmr_rate.head()

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,CA1_Ak5,CA1_Chrm3,CA1_Kif26a,CA1_Lingo2,CA1_Ptprg,CA3_Cadm2,CA3_Efnb2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Subchr1_14,0.828571,0.419355,0.7,1.0,0.767442,0.482759,0.8125
Subchr1_23,0.956522,0.859375,0.8,0.4,0.860465,0.851852,0.444444
Subchr1_37,0.716216,0.58427,0.846154,0.875,0.694444,0.248,0.285714
Subchr1_38,0.8,0.290909,0.875,0.857143,0.803279,0.462687,0.454545
Subchr1_65,1.0,1.0,1.0,0.5,1.0,0.978723,1.0


In [6]:
# Load the pairwise gene table and normalise the cluster labels in both cluster columns by
# replacing spaces with underscores, so they match the column names of the DMR tables.
dmg = pd.read_msgpack('RelatedDMG.msg')
for cluster_col in ('cluster_from', 'cluster_to'):
    dmg[cluster_col] = dmg[cluster_col].str.replace(' ', '_')
dmg.head()

Unnamed: 0,pvals_adj,gene_id,cluster_from,cluster_to,gene_name,-lgp,AUROC
0,0.0,ENSMUSG00000098760.1,CA1_Chrm3,CA3_Cadm2,Gm2164,1000.0,0.999988
1,0.0,ENSMUSG00000009681.10,CA1_Chrm3,CA3_Cadm2,Bcr,1000.0,0.999958
2,0.0,ENSMUSG00000052105.17,CA1_Chrm3,CA3_Cadm2,Mtcl1,1000.0,0.999687
3,0.0,ENSMUSG00000048251.15,CA1_Chrm3,CA3_Cadm2,Bcl11b,1000.0,0.999449
4,0.0,ENSMUSG00000104283.1,CA1_Chrm3,CA3_Cadm2,Gm37459,1000.0,0.999421


In [7]:
# Load the DMR-gene correlation table; the preview shows one row per (DMR, Gene) pair with
# its correlation, chromosome and DMR-to-gene distance.
# NOTE(review): pd.read_msgpack was removed in pandas 1.0, so this file must be re-exported
# (e.g. to parquet) before pandas is upgraded.
corr = pd.read_msgpack('RelatedCorr.msg')
corr.head()

Unnamed: 0,DMR,Gene,Corr,chrom,DMR_to_gene_dist
1,Subchr1_79,ENSMUSG00000089699.1,0.314964,chr1,-405545.0
10,Subchr1_96,ENSMUSG00000089699.1,0.49814,chr1,-394994.0
36,Subchr1_135,ENSMUSG00000089699.1,0.335649,chr1,-370929.0
73,Subchr1_176,ENSMUSG00000089699.1,0.508618,chr1,-347283.0
83,Subchr1_177,ENSMUSG00000089699.1,0.577361,chr1,-347191.0


## Prepare pairwise DMRs (hypo in `cluster_from`, not hypo in `cluster_to`)

In [8]:
# Build, for every distinct (cluster_from, cluster_to) pair found in `dmg`, the index of DMRs
# that are a hypo hit in cluster_from, not a hypo hit in cluster_to, and whose rate in
# cluster_to exceeds the rate in cluster_from by more than `delta_dmr_rate_cutoff`.
cluster_pairs = dmg[['cluster_from', 'cluster_to']].drop_duplicates()

pairwise_dmr_index = {}
for hypo_cluster, hyper_cluster in cluster_pairs.itertuples(index=False, name=None):
    # hypo_cluster plays the role of cluster_from, hyper_cluster the role of cluster_to
    not_hit_in_hyper = ~dmr_hits[hyper_cluster]
    hit_in_hypo = dmr_hits[hypo_cluster]
    rate_gain = dmr_rate[hyper_cluster] - dmr_rate[hypo_cluster]
    large_enough_gain = rate_gain > delta_dmr_rate_cutoff

    # all three conditions must hold for a DMR to be kept
    selected = not_hit_in_hyper & hit_in_hypo & large_enough_gain

    # keep only the labels of the DMRs where the combined mask is True
    pairwise_dmr_index[(hypo_cluster, hyper_cluster)] = selected[selected].index

## Final DMG and DMR lists for each cluster pair

In [9]:
# For every (cluster_from, cluster_to) pair, keep only the DMR-gene correlation rows whose
# DMR is in `pairwise_dmr_index` for that pair and whose gene appears in the pair's rows of
# `dmg`, then store the surviving DMRs, genes and correlation rows in the HDF5 file under
# '<cluster_from>/<cluster_to>/DMR', '.../Gene' and '.../Corr'.
# NOTE(review): the store is opened with HDFStore's default mode, so keys written by an
# earlier run for pairs that no longer exist are not removed — confirm whether the file
# should be recreated on each run.
with pd.HDFStore('FinalDMGDMR.h5') as f:
    for (cluster_from, cluster_to), cluster_pair_df in dmg.groupby(
            ['cluster_from', 'cluster_to']):
        # first get the related DMRs and DMGs separately
        cluster_pair_related_dmr = pairwise_dmr_index[(cluster_from, cluster_to)]
        cluster_pair_related_gene = cluster_pair_df['gene_id'].unique()

        # use them to filter corr: both the DMR and the gene of a row must be related
        cluster_pair_related_corr = corr[
            corr['DMR'].isin(cluster_pair_related_dmr)
            & corr['Gene'].isin(cluster_pair_related_gene)]

        # then use the DMRs and DMGs that remain in corr as the final lists;
        # the remaining DMRs and DMGs:
        # - are significantly hypo in one cluster compared to the other
        # - have a significant correlation
        cluster_pair_related_dmr = pd.Series(cluster_pair_related_corr['DMR'].unique())
        cluster_pair_related_gene = pd.Series(cluster_pair_related_corr['Gene'].unique())

        # one progress line per pair: cluster names, number of DMRs, number of genes
        print(cluster_from, cluster_to, cluster_pair_related_dmr.size, cluster_pair_related_gene.size)

        # all three objects of a pair share the same HDF5 group
        key_prefix = f'{cluster_from}/{cluster_to}'
        f[f'{key_prefix}/DMR'] = cluster_pair_related_dmr
        f[f'{key_prefix}/Gene'] = cluster_pair_related_gene
        f[f'{key_prefix}/Corr'] = cluster_pair_related_corr

CA1_Ak5 CA1_Chrm3 496 66
CA1_Ak5 CA1_Kif26a 1311 100
CA1_Ak5 CA1_Lingo2 2126 158
CA1_Ak5 CA1_Ptprg 71 9
CA1_Ak5 CA3_Cadm2 2678 246
CA1_Ak5 CA3_Efnb2 1911 155
CA1_Chrm3 CA1_Ak5 8712 122
CA1_Chrm3 CA1_Kif26a 13071 275
CA1_Chrm3 CA1_Lingo2 14297 359
CA1_Chrm3 CA1_Ptprg 2685 60
CA1_Chrm3 CA3_Cadm2 18516 470
CA1_Chrm3 CA3_Efnb2 34351 670
CA1_Kif26a CA1_Ak5 5813 186
CA1_Kif26a CA1_Chrm3 4153 280
CA1_Kif26a CA1_Lingo2 7991 420
CA1_Kif26a CA1_Ptprg 2512 156
CA1_Kif26a CA3_Cadm2 16519 687
CA1_Kif26a CA3_Efnb2 19552 709
CA1_Lingo2 CA1_Ak5 6613 249
CA1_Lingo2 CA1_Chrm3 4754 317
CA1_Lingo2 CA1_Kif26a 6139 343
CA1_Lingo2 CA1_Ptprg 2998 168
CA1_Lingo2 CA3_Cadm2 8313 523
CA1_Lingo2 CA3_Efnb2 12697 635
CA1_Ptprg CA1_Ak5 1155 31
CA1_Ptprg CA1_Chrm3 748 60
CA1_Ptprg CA1_Kif26a 3690 111
CA1_Ptprg CA1_Lingo2 5206 199
CA1_Ptprg CA3_Cadm2 9999 388
CA1_Ptprg CA3_Efnb2 16240 461
CA3_Cadm2 CA1_Ak5 16816 278
CA3_Cadm2 CA1_Chrm3 10307 329
CA3_Cadm2 CA1_Kif26a 27485 501
CA3_Cadm2 CA1_Lingo2 16058 409
CA3_Cadm2 CA