In [1]:
import pathlib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import anndata
from pybedtools import BedTool

## Parameter

In [2]:
dmr_path = '/home/hanliu/project/mouse_rostral_brain/DMR/ITSpatial/ITSpatial_rms_results_collapsed.tsv'
dmr_prefix = 'DGmCH'

black_list_path = '/home/hanliu/ref/blacklist/mm10-blacklist.v2.bed.gz'
skip_quantile = 0.25
delta_to_mean = 0.1
dms_cutoff = 1

In [3]:
# Parameters
dmr_path = "/home/hanliu/project/mouse_rostral_brain/DMR/SubType/raw/SubType-chr11/SubType-chr11_rms_results_collapsed.tsv"
dmr_prefix = "Sub"
black_list_path = "/home/hanliu/ref/blacklist/mm10-blacklist.v2.bed.gz"
skip_quantile = 0.25
delta_to_mean = 0.3
dms_cutoff = 1


## Read DMR

In [4]:
# get column and n_sample
dmr_column = pd.read_csv(dmr_path, sep='\t', nrows=0).columns
n_sample = dmr_column.size - 6
dmr_column = dmr_column.map(lambda i: i[18:] if 'methylation_level_' in i else i)
samples = dmr_column[6:]

dmr_region_infos = []
mc_rates = []
hypo_sig_dict = {sample: [] for sample in samples}
hyper_sig_dict = {sample: [] for sample in samples}

dmr_df = pd.read_csv(dmr_path, sep='\t', skiprows=1, names=dmr_column)
print(dmr_df.shape[0], 'dmr before filter')
dmr_df.index = dmr_df.index.map(lambda i: f'{dmr_prefix}_{i}')

# save raw info
dmr_bed = dmr_df.iloc[:, :4].copy()
mc_rate = dmr_df.iloc[:, 6:].copy()

233508 dmr before filter


## Save unfiltered dmr info

In [5]:
with pd.HDFStore(pathlib.Path(dmr_path).parent / 'DMRInfo.h5') as hdf:
    hdf['Rate'] = mc_rate
    hdf['Bed'] = dmr_bed

In [6]:
dmr_bed.reset_index().iloc[:, [1, 2, 3, 0]].to_csv(pathlib.Path(dmr_path).parent / 'TotalDMR.nofilter.bed', 
                                                   index=None, header=None, sep='\t')

## Filter DMR based on min-max delta and blacklist

In [7]:
# filter by min max delta first, if DMR do not pass min max filter, it won't have sig sample anyway
delta_judge = (mc_rate.max(axis=1) - mc_rate.min(axis=1)) > delta_to_mean
delta_index = mc_rate.index[delta_judge]

print(delta_index.size, 'dmr pass delta filter')

218014 dmr pass delta filter


In [8]:
# filter by blacklist
black_list = pd.read_csv(black_list_path, sep='\t', header=None)
black_list = BedTool.from_dataframe(black_list)

_dmr_bed = BedTool.from_dataframe(dmr_bed.reset_index().iloc[:, [1,2,3,0]])
filtered_dmr_bed = _dmr_bed.intersect(black_list, v=True).to_dataframe().set_index('name')

white_index = filtered_dmr_bed.index
print(white_index.size, 'dmr pass blacklist filter')

229555 dmr pass blacklist filter


In [9]:
dmr_df = dmr_df.loc[delta_index & white_index]
dmr_df = dmr_df[dmr_df['number_of_dms'] >= dms_cutoff]
print(dmr_df.shape[0], 'dmr after filter')
dmr_df.head()

214399 dmr after filter


Unnamed: 0,#chr,start,end,number_of_dms,hypermethylated_samples,hypomethylated_samples,CT-L6_Il1rap,NP-L6_Cntnap5a,CGE-Lamp5_Sorcs1,CGE-Vip_Grm8,...,Foxp2_Trpc7,CGE-Lamp5_Grk5,IT-L5_Cdh8,IG-CA2_Peak1,ANP_anp-dg,OLF_Xkr6,D1L-Fstl4_Cadm1,IT-L4_Shc3,D1L-PAL_Plcxd3,L6b_Pkhd1
Sub_277,chr11,3201252,3201252,1,"LSX-Inh_Dock10,ASC_cortex-olf,DG_dg-all,CA3-St...","PT-L5_Tenm2,Unc5c_Unc5c,Gfra1_Gfra1,PT-L5_Kcnh...",0.710526,,0.6,,...,0.653846,0.709677,0.65,1.0,1.0,1.0,0.75,0.636364,0.764706,0.8
Sub_278,chr11,3201508,3201734,4,"CT-L6_Il1rap,LSX-Inh_Dock10,LSX-Inh_Lats2,CLA_...","MSN-D2_Nrp2,LSX-Inh_Zeb2,OLF_Trpc4,MGE-Pvalb_T...",0.293785,0.293333,0.045455,0.16,...,0.125,0.112426,0.248555,0.25,0.142857,0.230769,0.046154,0.183007,0.064103,0.35
Sub_279,chr11,3203739,3203746,2,"IT-L5_Grik3,MGE-Sst_Bmper,NP-L6_Cyp7b1,CLA_Nrp...","MSN-D2_Nrp2,OLF_Trpc4,MSN-D2_Slc24a2,D1L-Fstl4...",0.720721,0.698113,0.785714,0.75,...,0.518072,0.616,0.672897,0.642857,0.666667,0.857143,0.394737,0.616071,0.73494,1.0
Sub_280,chr11,3204045,3204262,3,"PAL-Inh_Meis2,MGE-Pvalb_Sema5a,PT-L5_Ptprt,IT-...","CT-L6_Il1rap,CT-L6_Megf9,MSN-D2_Nrp2,L6b_Nrp2,...",0.649123,0.763889,0.96875,0.8,...,0.816,0.765306,0.916129,0.978723,0.75,0.666667,0.806452,0.944444,0.861111,0.444444
Sub_281,chr11,3204354,3204466,3,"CGE-Lamp5_Sorcs1,LSX-Inh_Dock10,CGE-Vip_Ccser1...","CT-L6_Il1rap,NP-L6_Cntnap5a,CT-L6_Megf9,NP-L6_...",0.015625,0.010417,0.913043,0.90625,...,0.607407,0.805556,0.433526,0.171875,0.777778,0.714286,0.753086,0.29,0.827586,0.0


## Calculate sample delta to robust mean

In [10]:
mc_rate = mc_rate.loc[dmr_df.index].copy()
n_sample = mc_rate.shape[1]

assert skip_quantile < 0.5
skip_n = min(1, round(n_sample * skip_quantile))

In [11]:
# ~1M rows / min, depending on n_sample, not very slow
def robust_mean(row):
    return np.nanmean(np.sort(row)[skip_n:-skip_n])
robust_mean = np.apply_along_axis(robust_mean, 1, mc_rate.values)
robust_mean = pd.Series(robust_mean, index=mc_rate.index)

In [12]:
delta = mc_rate - robust_mean[:, None]
# 0 is hypo DMR, 1 is hyper DMR, nan is not significant
judge = (delta > 0)[delta.abs() > delta_to_mean]

In [13]:
robust_mean_hypo_records = {}
robust_mean_hyper_records = {}

for sample, sample_col in judge.iteritems():
    hypo_dmr = sample_col[sample_col == 0].index
    hyper_dmr = sample_col[sample_col == 1].index
    
    robust_mean_hypo_records[sample] = hypo_dmr
    robust_mean_hyper_records[sample] = hyper_dmr

## Assign DMR to each sample

In [14]:
# parse results and add the robust mean judge
for sample in samples:
    # here saved a list of np.array
    hypo_index = dmr_df[dmr_df['hypomethylated_samples'].fillna('').apply(lambda i: sample in i)].index
    hyper_index = dmr_df[dmr_df['hypermethylated_samples'].fillna('').apply(lambda i: sample in i)].index
    hypo_sig_dict[sample] = hypo_index & robust_mean_hypo_records[sample]
    hyper_sig_dict[sample] = hyper_index & robust_mean_hyper_records[sample]
    print(sample, 'HypoDMR ', hypo_sig_dict[sample].size, sep='\t')
    print(sample, 'HyperDMR', hyper_sig_dict[sample].size, sep='\t')


CT-L6_Il1rap	HypoDMR 	18637
CT-L6_Il1rap	HyperDMR	2235


NP-L6_Cntnap5a	HypoDMR 	6166
NP-L6_Cntnap5a	HyperDMR	11709


CGE-Lamp5_Sorcs1	HypoDMR 	11948
CGE-Lamp5_Sorcs1	HyperDMR	6410


CGE-Vip_Grm8	HypoDMR 	8019
CGE-Vip_Grm8	HyperDMR	6813


LSX-Inh_Dock10	HypoDMR 	4725
LSX-Inh_Dock10	HyperDMR	10245


CGE-Vip_Ccser1	HypoDMR 	6908
CGE-Vip_Ccser1	HyperDMR	9095
CGE-Vip_Ntng1	HypoDMR 	5803
CGE-Vip_Ntng1	HyperDMR	3569


ASC_cortex-olf	HypoDMR 	20441
ASC_cortex-olf	HyperDMR	9373


MGE-Sst_Chodl	HypoDMR 	6696
MGE-Sst_Chodl	HyperDMR	7547


IT-L6_Oxr1	HypoDMR 	17098
IT-L6_Oxr1	HyperDMR	2485


VLMC_Col4a1	HypoDMR 	29571
VLMC_Col4a1	HyperDMR	8786


LSX-Inh_Lats2	HypoDMR 	14515
LSX-Inh_Lats2	HyperDMR	4587


L6b_Kcnk2	HypoDMR 	15389
L6b_Kcnk2	HyperDMR	5258
Chd7_Megf11	HypoDMR 	9724
Chd7_Megf11	HyperDMR	7747


MGE-Sst_Bmper	HypoDMR 	4777
MGE-Sst_Bmper	HyperDMR	11607


CT-L6_Megf9	HypoDMR 	16964
CT-L6_Megf9	HyperDMR	2232
Chd7_Kcnc2	HypoDMR 	4392
Chd7_Kcnc2	HyperDMR	5287


DG-po_Kctd8	HypoDMR 	13279
DG-po_Kctd8	HyperDMR	1431


DG_dg-all	HypoDMR 	30888
DG_dg-all	HyperDMR	2081


NP-L6_Cyp7b1	HypoDMR 	7756
NP-L6_Cyp7b1	HyperDMR	10980


D1L-Fstl4_Crim1	HypoDMR 	8820
D1L-Fstl4_Crim1	HyperDMR	7575


PT-L5_Tenm2	HypoDMR 	18556
PT-L5_Tenm2	HyperDMR	1894


Unc5c_Unc5c	HypoDMR 	13181
Unc5c_Unc5c	HyperDMR	5935


CGE-Lamp5_Grid1	HypoDMR 	18451
CGE-Lamp5_Grid1	HyperDMR	3962


OLF-Exc_Pld5	HypoDMR 	18368
OLF-Exc_Pld5	HyperDMR	1311


PT-L5_Tmtc2	HypoDMR 	22561
PT-L5_Tmtc2	HyperDMR	2226


CLA_Cdh8	HypoDMR 	18153
CLA_Cdh8	HyperDMR	3659


CA3-St18_Tead1	HypoDMR 	27149
CA3-St18_Tead1	HyperDMR	3290


PAL-Inh_Meis2	HypoDMR 	186
PAL-Inh_Meis2	HyperDMR	8128


NP-L6_Boc	HypoDMR 	7087
NP-L6_Boc	HyperDMR	11290


IT-L23_Foxp1	HypoDMR 	21028
IT-L23_Foxp1	HyperDMR	1855


MGC_mgc-all	HypoDMR 	31286
MGC_mgc-all	HyperDMR	13064


Chd7_Trpc7	HypoDMR 	5474
Chd7_Trpc7	HyperDMR	12015


LSX-Inh_Nxph1	HypoDMR 	7383
LSX-Inh_Nxph1	HyperDMR	12093


CA3-St18_Nuak1	HypoDMR 	16436
CA3-St18_Nuak1	HyperDMR	1960


Gfra1_Gfra1	HypoDMR 	28576
Gfra1_Gfra1	HyperDMR	840


MSN-D2_Nrp2	HypoDMR 	10813
MSN-D2_Nrp2	HyperDMR	6409


PT-L5_Kcnh1	HypoDMR 	28787
PT-L5_Kcnh1	HyperDMR	1448


LSX-Inh_Zeb2	HypoDMR 	12121
LSX-Inh_Zeb2	HyperDMR	6696


DG-po_Bcl11a	HypoDMR 	8452
DG-po_Bcl11a	HyperDMR	146


L6b_Nrp2	HypoDMR 	8508
L6b_Nrp2	HyperDMR	8823


PAL-Inh_Tmem178	HypoDMR 	2880
PAL-Inh_Tmem178	HyperDMR	15496


PAL-Inh_Tcf7l2	HypoDMR 	2797
PAL-Inh_Tcf7l2	HyperDMR	14102


CT-L6_Hcrtr2	HypoDMR 	21895
CT-L6_Hcrtr2	HyperDMR	2945


OLF-Exc_Cdh9	HypoDMR 	16291
OLF-Exc_Cdh9	HyperDMR	1164


PT-L5_Abca12	HypoDMR 	16543
PT-L5_Abca12	HyperDMR	1382


MSN-D1_Plxnc1	HypoDMR 	11635
MSN-D1_Plxnc1	HyperDMR	7634


D1L-Fstl4_Sipa1l2	HypoDMR 	10123
D1L-Fstl4_Sipa1l2	HyperDMR	8135


PAL-Inh_Chat	HypoDMR 	5399
PAL-Inh_Chat	HyperDMR	16348


PT-L5_Unc5b	HypoDMR 	16361
PT-L5_Unc5b	HyperDMR	2176


CLA_Nrp2	HypoDMR 	16345
CLA_Nrp2	HyperDMR	5091


MGE-Pvalb_Ptprk	HypoDMR 	6067
MGE-Pvalb_Ptprk	HyperDMR	4730


NP-L6_Cntnap4	HypoDMR 	8700
NP-L6_Cntnap4	HyperDMR	4861


ODC_odc-small	HypoDMR 	16801
ODC_odc-small	HyperDMR	9570


IG-CA2_Chrm3	HypoDMR 	35151
IG-CA2_Chrm3	HyperDMR	1909


CGE-Vip_Clstn2	HypoDMR 	11125
CGE-Vip_Clstn2	HyperDMR	8009


PAL-Inh_Deptor	HypoDMR 	7650
PAL-Inh_Deptor	HyperDMR	2989


OLF_Trpc4	HypoDMR 	10230
OLF_Trpc4	HyperDMR	3963


MGE-Pvalb_Entpd3	HypoDMR 	7357
MGE-Pvalb_Entpd3	HyperDMR	7994


OLF_Pag1	HypoDMR 	10014
OLF_Pag1	HyperDMR	2988


EP_Tspan5	HypoDMR 	15699
EP_Tspan5	HyperDMR	2579


CA3_Efnb2	HypoDMR 	20017
CA3_Efnb2	HyperDMR	2726


CA3_Cadm2	HypoDMR 	32230
CA3_Cadm2	HyperDMR	1653


CA1_Chrm3	HypoDMR 	39672
CA1_Chrm3	HyperDMR	550


MGE-Sst_Ubtd1	HypoDMR 	6379
MGE-Sst_Ubtd1	HyperDMR	8916


PT-L5_Plcb4	HypoDMR 	16297
PT-L5_Plcb4	HyperDMR	2468


CA1_Kif26a	HypoDMR 	14551
CA1_Kif26a	HyperDMR	1208
EP_Adcy8	HypoDMR 	13436


EP_Adcy8	HyperDMR	2973


MGE-Pvalb_Thsd7a	HypoDMR 	9885
MGE-Pvalb_Thsd7a	HyperDMR	7333


MSN-D2_Slc24a2	HypoDMR 	13046
MSN-D2_Slc24a2	HyperDMR	6929


MGE-Sst_Kcnip4	HypoDMR 	6063
MGE-Sst_Kcnip4	HyperDMR	11081


MGE-Sst_Rxra	HypoDMR 	5257
MGE-Sst_Rxra	HyperDMR	9658


LSX-Inh_Foxp2	HypoDMR 	8504
LSX-Inh_Foxp2	HyperDMR	11845


PAL-Inh_Onecut2	HypoDMR 	3795
PAL-Inh_Onecut2	HyperDMR	16168


LSX-Inh_Enox1	HypoDMR 	9552
LSX-Inh_Enox1	HyperDMR	8452


CA1_Ptprg	HypoDMR 	26780
CA1_Ptprg	HyperDMR	655


CGE-Vip_Ptprm	HypoDMR 	5678
CGE-Vip_Ptprm	HyperDMR	9145


OPC_opc-small	HypoDMR 	26682
OPC_opc-small	HyperDMR	4757


L6b_Adcy8	HypoDMR 	9202
L6b_Adcy8	HyperDMR	7681


OLF_Gabbr2	HypoDMR 	7588
OLF_Gabbr2	HyperDMR	3353


IT-L23_Tenm2	HypoDMR 	22021
IT-L23_Tenm2	HyperDMR	1010


PAL-Inh_Igdcc3	HypoDMR 	1359
PAL-Inh_Igdcc3	HyperDMR	15166


MSN-D2_Casz1	HypoDMR 	11399
MSN-D2_Casz1	HyperDMR	8349


IT-L5_Etv1	HypoDMR 	17622
IT-L5_Etv1	HyperDMR	1808


CA1_Lingo2	HypoDMR 	13766
CA1_Lingo2	HyperDMR	162


PT-L5_Nectin1	HypoDMR 	23896
PT-L5_Nectin1	HyperDMR	1409


D1L-Fstl4_Grm3	HypoDMR 	13913
D1L-Fstl4_Grm3	HyperDMR	6556


PT-L5_Astn2	HypoDMR 	15430
PT-L5_Astn2	HyperDMR	1393


MGE-Sst_Dock4	HypoDMR 	8930
MGE-Sst_Dock4	HyperDMR	10088


IT-L23_Ptprt	HypoDMR 	17557
IT-L23_Ptprt	HyperDMR	1231


MSN-D2_Col14a1	HypoDMR 	13273
MSN-D2_Col14a1	HyperDMR	7807


OLF-Exc_Unc13c	HypoDMR 	16758
OLF-Exc_Unc13c	HyperDMR	2102


CT-L6_Map4	HypoDMR 	18082
CT-L6_Map4	HyperDMR	2926


IG-CA2_Xpr1	HypoDMR 	21028
IG-CA2_Xpr1	HyperDMR	2885


VLMC_Mapk4	HypoDMR 	21386
VLMC_Mapk4	HyperDMR	10015


ANP_anp-olf-cnu	HypoDMR 	17635
ANP_anp-olf-cnu	HyperDMR	7082


CLA_Bcl11a	HypoDMR 	18829
CLA_Bcl11a	HyperDMR	2954


IT-L23_Cux1	HypoDMR 	27730
IT-L23_Cux1	HyperDMR	643


CGE-Lamp5_Nrxn3	HypoDMR 	9390
CGE-Lamp5_Nrxn3	HyperDMR	3788


EC_Sema3g	HypoDMR 	11694
EC_Sema3g	HyperDMR	997


MGE-Sst_Rerg	HypoDMR 	7826
MGE-Sst_Rerg	HyperDMR	9777


DG-po_Calb2	HypoDMR 	18418
DG-po_Calb2	HyperDMR	1162


MSN-D1_Ntn1	HypoDMR 	12227
MSN-D1_Ntn1	HyperDMR	7724


MSN-D1_Hrh1	HypoDMR 	16554
MSN-D1_Hrh1	HyperDMR	7873


MGE-Sst_Ptpre	HypoDMR 	8685
MGE-Sst_Ptpre	HyperDMR	8413


MGE-Sst_Frmd6	HypoDMR 	5853
MGE-Sst_Frmd6	HyperDMR	9179


MGE-Pvalb_Gfra2	HypoDMR 	9221
MGE-Pvalb_Gfra2	HyperDMR	10019


EP_Rgs8	HypoDMR 	14564
EP_Rgs8	HyperDMR	3527


D1L-PAL_Flrt2	HypoDMR 	7438
D1L-PAL_Flrt2	HyperDMR	10956


VLMC-Pia_vlmc-pia-all	HypoDMR 	25957
VLMC-Pia_vlmc-pia-all	HyperDMR	9329


IT-L6_Man1c1	HypoDMR 	17503
IT-L6_Man1c1	HyperDMR	1526


OLF-Exc_Sgcd	HypoDMR 	15889
OLF-Exc_Sgcd	HyperDMR	6280


OLF-Exc_Lrrtm3	HypoDMR 	18933
OLF-Exc_Lrrtm3	HyperDMR	1580


IT-L5_Grik3	HypoDMR 	20960
IT-L5_Grik3	HyperDMR	1467


Foxp2_Homer2	HypoDMR 	5765
Foxp2_Homer2	HyperDMR	6900


IT-L6_Fstl4	HypoDMR 	22740
IT-L6_Fstl4	HyperDMR	1684


MGE-Sst_Etv1	HypoDMR 	5310
MGE-Sst_Etv1	HyperDMR	5975


D1L-Fstl4_Trps1	HypoDMR 	14514
D1L-Fstl4_Trps1	HyperDMR	6621


MSN-D1_Khdrbs3	HypoDMR 	17020
MSN-D1_Khdrbs3	HyperDMR	8054


MGE-Sst_Unc5b	HypoDMR 	8567
MGE-Sst_Unc5b	HyperDMR	7471


IT-L6_Cadps2	HypoDMR 	19736
IT-L6_Cadps2	HyperDMR	2015


LSX-Inh_Cacna1i	HypoDMR 	4407
LSX-Inh_Cacna1i	HyperDMR	6091


Foxp2_Inpp4b	HypoDMR 	5865
Foxp2_Inpp4b	HyperDMR	7512


NP-L6_Olfml2b	HypoDMR 	6651
NP-L6_Olfml2b	HyperDMR	7987


MGE-Pvalb_Sema5a	HypoDMR 	9482
MGE-Pvalb_Sema5a	HyperDMR	7924


MGE-Pvalb_Cnih3	HypoDMR 	8774
MGE-Pvalb_Cnih3	HyperDMR	10097


Foxp2_Dchs2	HypoDMR 	7249
Foxp2_Dchs2	HyperDMR	6847


ASC_str-hpf	HypoDMR 	18055
ASC_str-hpf	HyperDMR	8216


CGE-Vip_Robo1	HypoDMR 	7854
CGE-Vip_Robo1	HyperDMR	12512


OLF_Kcnd3	HypoDMR 	6805
OLF_Kcnd3	HyperDMR	2968


CA3-St18_Epha5	HypoDMR 	27491
CA3-St18_Epha5	HyperDMR	1931


PT-L5_Ptprt	HypoDMR 	13809
PT-L5_Ptprt	HyperDMR	2178


CA1_Ak5	HypoDMR 	16775
CA1_Ak5	HyperDMR	696


CGE-Vip_Fstl4	HypoDMR 	9356
CGE-Vip_Fstl4	HyperDMR	8056


IT-L4_Astn2	HypoDMR 	25698
IT-L4_Astn2	HyperDMR	1320


OLF-Exc_Cux2	HypoDMR 	16579
OLF-Exc_Cux2	HyperDMR	2408


CGE-Lamp5_Dock5	HypoDMR 	21155
CGE-Lamp5_Dock5	HyperDMR	3511


ASC_mid	HypoDMR 	19401
ASC_mid	HyperDMR	8882


PAL-Inh_Meis1	HypoDMR 	6908
PAL-Inh_Meis1	HyperDMR	7658


PAL-Inh_Ptprd	HypoDMR 	2050
PAL-Inh_Ptprd	HyperDMR	17377


CGE-Vip_Galnt17	HypoDMR 	9505
CGE-Vip_Galnt17	HyperDMR	9666


EC_Abhd2	HypoDMR 	37831
EC_Abhd2	HyperDMR	7267


ODC_odc-large	HypoDMR 	17954
ODC_odc-large	HyperDMR	8473


PAL-Inh_Rarb	HypoDMR 	3551
PAL-Inh_Rarb	HyperDMR	16167


OLF-Exc_Rmst	HypoDMR 	3561
OLF-Exc_Rmst	HyperDMR	1610


OPC_opc-large	HypoDMR 	22855
OPC_opc-large	HyperDMR	6691


NP-L6_Kcnab1	HypoDMR 	5922
NP-L6_Kcnab1	HyperDMR	11192


PC_pc-all	HypoDMR 	49550
PC_pc-all	HyperDMR	7362


MGE-Pvalb_Cacna1i	HypoDMR 	12976
MGE-Pvalb_Cacna1i	HyperDMR	7802


OLF-Exc_Bmpr1b	HypoDMR 	17267
OLF-Exc_Bmpr1b	HyperDMR	475
OLF_Mapk10	HypoDMR 	7982
OLF_Mapk10	HyperDMR	2175


Foxp2_Trpc7	HypoDMR 	13249
Foxp2_Trpc7	HyperDMR	6336


CGE-Lamp5_Grk5	HypoDMR 	21182
CGE-Lamp5_Grk5	HyperDMR	3531


IT-L5_Cdh8	HypoDMR 	24206
IT-L5_Cdh8	HyperDMR	1477


IG-CA2_Peak1	HypoDMR 	33994
IG-CA2_Peak1	HyperDMR	2337


ANP_anp-dg	HypoDMR 	28507
ANP_anp-dg	HyperDMR	4468
OLF_Xkr6	HypoDMR 	11978
OLF_Xkr6	HyperDMR	2399


D1L-Fstl4_Cadm1	HypoDMR 	8203
D1L-Fstl4_Cadm1	HyperDMR	7377


IT-L4_Shc3	HypoDMR 	24104
IT-L4_Shc3	HyperDMR	1582


D1L-PAL_Plcxd3	HypoDMR 	7072
D1L-PAL_Plcxd3	HyperDMR	11354
L6b_Pkhd1	HypoDMR 	4756
L6b_Pkhd1	HyperDMR	1008


In [15]:
with pd.HDFStore(pathlib.Path(dmr_path).parent / 'DMRInfo.h5') as hdf:
    for sample, hypo_index in hypo_sig_dict.items():
        hdf[f'HypoDMR/{sample}'] = pd.Series(hypo_index)
    for sample, hyper_index in hyper_sig_dict.items():
        hdf[f'HyperDMR/{sample}'] = pd.Series(hyper_index)

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


## Dump DMR bed

In [16]:
hypo_dir = pathlib.Path(dmr_path).parent / 'HypoDMR'
hypo_dir.mkdir(exist_ok=True)
for sample, hypo_index in hypo_sig_dict.items():
    _bed = dmr_bed.loc[hypo_index].reset_index().iloc[:, [1, 2, 3, 0]].to_csv(
        hypo_dir / f'{sample}.DMS{dms_cutoff}.bed', sep='\t', index=None, header=None)

In [17]:
hyper_dir = pathlib.Path(dmr_path).parent / 'HyperDMR'
hyper_dir.mkdir(exist_ok=True)
for sample, hyper_index in hyper_sig_dict.items():
    _bed = dmr_bed.loc[hyper_index].reset_index().iloc[:, [1, 2, 3, 0]].to_csv(
        hyper_dir / f'{sample}.DMS{dms_cutoff}.bed', sep='\t', index=None, header=None)

## DMR hits matrix

In [18]:
sig_dict = hypo_sig_dict

rows = []
cols = []
datas = []
for i, (sample, dmr_index) in enumerate(sig_dict.items()):
    col = dmr_index.map(lambda i: i.split('_')[1]).astype(int).values
    row = (np.ones_like(col) * i).astype(int)
    data = np.ones_like(col)
    rows.append(row)
    cols.append(col)
    datas.append(data)
datas = np.concatenate(datas)
cols = np.concatenate(cols)
rows = np.concatenate(rows)
hits = csr_matrix((datas, (rows, cols)),
                  shape=(mc_rate.shape[1], dmr_bed.shape[0]))

# obs is DMR, var is sample, because all analysis is dmr focused
dmr_hits = anndata.AnnData(X=hits.T,
                           obs=dmr_bed,
                           var=pd.DataFrame([], index=mc_rate.columns))

dmr_hits.write_h5ad(hypo_dir / 'TotalHits.h5ad')


... storing '#chr' as categorical


In [19]:
sig_dict = hyper_sig_dict

rows = []
cols = []
datas = []
for i, (sample, dmr_index) in enumerate(sig_dict.items()):
    col = dmr_index.map(lambda i: i.split('_')[1]).astype(int).values
    row = (np.ones_like(col) * i).astype(int)
    data = np.ones_like(col)
    rows.append(row)
    cols.append(col)
    datas.append(data)
datas = np.concatenate(datas)
cols = np.concatenate(cols)
rows = np.concatenate(rows)
hits = csr_matrix((datas, (rows, cols)),
                  shape=(mc_rate.shape[1], dmr_bed.shape[0]))

# obs is DMR, var is sample, because all analysis is dmr focused
dmr_hits = anndata.AnnData(X=hits.T,
                           obs=dmr_bed,
                           var=pd.DataFrame([], index=mc_rate.columns))

dmr_hits.write_h5ad(hyper_dir / 'TotalHits.h5ad')

... storing '#chr' as categorical
