In [1]:
import pathlib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import anndata
from pybedtools import BedTool

## Parameter

In [2]:
dmr_path = '/home/hanliu/project/mouse_rostral_brain/DMR/ITSpatial/ITSpatial_rms_results_collapsed.tsv'
dmr_prefix = 'DGmCH'

black_list_path = '/home/hanliu/ref/blacklist/mm10-blacklist.v2.bed.gz'
skip_quantile = 0.25
delta_to_mean = 0.1
dms_cutoff = 1

In [3]:
# Parameters
dmr_path = "/home/hanliu/project/mouse_rostral_brain/DMR/ITSpatial_DissectionRegion/ITSpatial_rms_results_collapsed.tsv"
dmr_prefix = "ITSpatial"
black_list_path = "/home/hanliu/ref/blacklist/mm10-blacklist.v2.bed.gz"
skip_quantile = 0.25
delta_to_mean = 0.3
dms_cutoff = 2


## Read DMR

In [4]:
# get column and n_sample
dmr_column = pd.read_csv(dmr_path, sep='\t', nrows=0).columns
n_sample = dmr_column.size - 6
dmr_column = dmr_column.map(lambda i: i[18:] if 'methylation_level_' in i else i)
samples = dmr_column[6:]

dmr_region_infos = []
mc_rates = []
hypo_sig_dict = {sample: [] for sample in samples}
hyper_sig_dict = {sample: [] for sample in samples}

dmr_df = pd.read_csv(dmr_path, sep='\t', skiprows=1, names=dmr_column)
print(dmr_df.shape[0], 'dmr before filter')
dmr_df.index = dmr_df.index.map(lambda i: f'{dmr_prefix}_{i}')

# save raw info
dmr_bed = dmr_df.iloc[:, :4].copy()
mc_rate = dmr_df.iloc[:, 6:].copy()

1520507 dmr before filter


## Save unfiltered dmr info

In [5]:
with pd.HDFStore(pathlib.Path(dmr_path).parent / 'DMRInfo.h5') as hdf:
    hdf['Rate'] = mc_rate
    hdf['Bed'] = dmr_bed

In [6]:
dmr_bed.reset_index().iloc[:, [1, 2, 3, 0]].to_csv(pathlib.Path(dmr_path).parent / 'TotalDMR.nofilter.bed', 
                                                   index=None, header=None, sep='\t')

## Filter DMR based on min-max delta and blacklist

In [7]:
# filter by min max delta first, if DMR do not pass min max filter, it won't have sig sample anyway
delta_judge = (mc_rate.max(axis=1) - mc_rate.min(axis=1)) > delta_to_mean
delta_index = mc_rate.index[delta_judge]

print(delta_index.size, 'dmr pass delta filter')

1334398 dmr pass delta filter


In [8]:
# filter by blacklist
black_list = pd.read_csv(black_list_path, sep='\t', header=None)
black_list = BedTool.from_dataframe(black_list)

_dmr_bed = BedTool.from_dataframe(dmr_bed.reset_index().iloc[:, [1,2,3,0]])
filtered_dmr_bed = _dmr_bed.intersect(black_list, v=True).to_dataframe().set_index('name')

white_index = filtered_dmr_bed.index
print(white_index.size, 'dmr pass blacklist filter')

1500682 dmr pass blacklist filter


In [9]:
dmr_df = dmr_df.loc[delta_index & white_index]
dmr_df = dmr_df[dmr_df['number_of_dms'] >= dms_cutoff]
print(dmr_df.shape[0], 'dmr after filter')
dmr_df.head()

739486 dmr after filter


Unnamed: 0,#chr,start,end,number_of_dms,hypermethylated_samples,hypomethylated_samples,IT-L6+ACA-1,IT-L5+MOs-2,IT-L23+MOs-1,IT-L5+ACA-3,...,IT-L5+SSs-1,IT-L6+PFC-2,IT-L6+MOs-2,IT-L5+SSp-3,IT-L6+PFC-1,IT-L6+ACA-2,IT-L6+SSp-3,IT-L6+ACA-3,IT-L6+MOp-4,IT-L6+MOp-3
ITSpatial_4,chr1,3013472,3013579,3,IT-L23+PFC-1,"IT-L23+SSp-1,IT-L23+SSs-2,IT-L4+MOp-1,IT-L23+S...",0.947368,0.911765,0.824324,0.923077,...,0.842105,0.865385,0.935484,0.921569,0.931034,1.0,0.829268,0.96875,0.95,0.82
ITSpatial_7,chr1,3027398,3027514,2,"IT-L5+ACA-3,IT-L6+MOp-2,IT-L5+MOp-4,IT-L5+ACA-...","IT-L23+SSp-1,IT-L23+SSp-2,IT-L23+MOp-2,IT-L23+...",0.818182,0.706897,0.445783,0.790323,...,0.821429,0.719298,0.674419,0.674419,0.657895,0.685185,0.653061,0.829787,0.767442,0.710526
ITSpatial_10,chr1,3046233,3046317,2,,"IT-L23+SSp-1,IT-L23+MOp-2,IT-L23+SSp-3,IT-L23+...",0.964286,0.854167,0.793103,0.842857,...,0.941176,0.967213,0.925926,0.902439,0.928571,0.911765,0.875,0.860465,0.942857,0.949153
ITSpatial_11,chr1,3046826,3047027,2,IT-L5+SSp-3,"IT-L4+SSp-1,IT-L4+SSp-3,IT-L4+SSp-2,IT-L4+SSp-...",0.6875,0.854839,0.761905,0.957746,...,0.857143,0.814815,0.852941,0.98,0.76,0.735294,0.87234,0.75,0.729167,0.875
ITSpatial_13,chr1,3057752,3057873,3,"IT-L23+MOp-1,IT-L23+PFC-1,IT-L5+AI,IT-L6+SSs-1...","IT-L5+ACA-3,IT-L5+SSp-5,IT-L23+ACA-3,IT-L4+SSp...",0.863636,0.769231,0.793651,0.47619,...,0.809524,0.933333,0.857143,0.630435,0.933333,0.930233,0.84375,0.95082,0.825397,0.923077


## Calculate sample delta to robust mean

In [10]:
mc_rate = mc_rate.loc[dmr_df.index].copy()
n_sample = mc_rate.shape[1]

assert skip_quantile < 0.5
skip_n = min(1, round(n_sample * skip_quantile))

In [11]:
# ~1M rows / min, depending on n_sample, not very slow
def robust_mean(row):
    return np.nanmean(np.sort(row)[skip_n:-skip_n])
robust_mean = np.apply_along_axis(robust_mean, 1, mc_rate.values)
robust_mean = pd.Series(robust_mean, index=mc_rate.index)

In [12]:
delta = mc_rate - robust_mean[:, None]
# 0 is hypo DMR, 1 is hyper DMR, nan is not significant
judge = (delta > 0)[delta.abs() > delta_to_mean]

In [13]:
robust_mean_hypo_records = {}
robust_mean_hyper_records = {}

for sample, sample_col in judge.iteritems():
    hypo_dmr = sample_col[sample_col == 0].index
    hyper_dmr = sample_col[sample_col == 1].index
    
    robust_mean_hypo_records[sample] = hypo_dmr
    robust_mean_hyper_records[sample] = hyper_dmr

## Assign DMR to each sample

In [14]:
# parse results and add the robust mean judge
for sample in samples:
    # here saved a list of np.array
    hypo_index = dmr_df[dmr_df['hypomethylated_samples'].fillna('').apply(lambda i: sample in i)].index
    hyper_index = dmr_df[dmr_df['hypermethylated_samples'].fillna('').apply(lambda i: sample in i)].index
    hypo_sig_dict[sample] = hypo_index & robust_mean_hypo_records[sample]
    hyper_sig_dict[sample] = hyper_index & robust_mean_hyper_records[sample]
    print(sample, 'HypoDMR ', hypo_sig_dict[sample].size, sep='\t')
    print(sample, 'HyperDMR', hyper_sig_dict[sample].size, sep='\t')


IT-L6+ACA-1	HypoDMR 	8872
IT-L6+ACA-1	HyperDMR	26373


IT-L5+MOs-2	HypoDMR 	12288
IT-L5+MOs-2	HyperDMR	562


IT-L23+MOs-1	HypoDMR 	26249
IT-L23+MOs-1	HyperDMR	3418


IT-L5+ACA-3	HypoDMR 	13995
IT-L5+ACA-3	HyperDMR	21308


IT-L6+MOp-2	HypoDMR 	16510
IT-L6+MOp-2	HyperDMR	11372


IT-L6+MOs-3	HypoDMR 	18954
IT-L6+MOs-3	HyperDMR	15670


IT-L5+SSp-5	HypoDMR 	8004
IT-L5+SSp-5	HyperDMR	3639


IT-L6+SSs-2	HypoDMR 	5670
IT-L6+SSs-2	HyperDMR	11268


IT-L4+MOs-3	HypoDMR 	15313
IT-L4+MOs-3	HyperDMR	3392


IT-L5+SSp-2	HypoDMR 	26589
IT-L5+SSp-2	HyperDMR	6673


IT-L4+MOs-1	HypoDMR 	28219
IT-L4+MOs-1	HyperDMR	6886


IT-L23+ACA-3	HypoDMR 	12076
IT-L23+ACA-3	HyperDMR	7023


IT-L23+PFC-2	HypoDMR 	17763
IT-L23+PFC-2	HyperDMR	12923


IT-L23+ACA-1	HypoDMR 	14664
IT-L23+ACA-1	HyperDMR	8339


IT-L23+MOp-3	HypoDMR 	29238
IT-L23+MOp-3	HyperDMR	2991


IT-L4+MOp-3	HypoDMR 	16283
IT-L4+MOp-3	HyperDMR	4529


IT-L23+SSp-1	HypoDMR 	33608
IT-L23+SSp-1	HyperDMR	4064


IT-L23+SSp-2	HypoDMR 	39447
IT-L23+SSp-2	HyperDMR	4748


IT-L23+MOs-3	HypoDMR 	19616
IT-L23+MOs-3	HyperDMR	2296


IT-L23+ACA-2	HypoDMR 	13877
IT-L23+ACA-2	HyperDMR	4952


IT-L23+MOp-2	HypoDMR 	34715
IT-L23+MOp-2	HyperDMR	3226


IT-L23+SSs-2	HypoDMR 	41435
IT-L23+SSs-2	HyperDMR	5059


IT-L4+SSp-1	HypoDMR 	32249
IT-L4+SSp-1	HyperDMR	11747


IT-L23+SSp-3	HypoDMR 	32603
IT-L23+SSp-3	HyperDMR	4654


IT-L4+MOp-4	HypoDMR 	11068
IT-L4+MOp-4	HyperDMR	4846


IT-L4+MOp-1	HypoDMR 	28489
IT-L4+MOp-1	HyperDMR	7988


IT-L23+SSp-5	HypoDMR 	18590
IT-L23+SSp-5	HyperDMR	6130


IT-L4+SSp-2	HypoDMR 	36507
IT-L4+SSp-2	HyperDMR	12951


IT-L23+SSp-4	HypoDMR 	25530
IT-L23+SSp-4	HyperDMR	5674


IT-L23+AI	HypoDMR 	18230
IT-L23+AI	HyperDMR	13430


IT-L23+SSs-1	HypoDMR 	25346
IT-L23+SSs-1	HyperDMR	4450


IT-L5+MOp-3	HypoDMR 	14971
IT-L5+MOp-3	HyperDMR	1091


IT-L4+SSp-5	HypoDMR 	14702
IT-L4+SSp-5	HyperDMR	8898


IT-L23+MOs-2	HypoDMR 	22323
IT-L23+MOs-2	HyperDMR	1843


IT-L5+MOp-2	HypoDMR 	18997
IT-L5+MOp-2	HyperDMR	946


IT-L4+SSp-4	HypoDMR 	22620
IT-L4+SSp-4	HyperDMR	10375


IT-L23+MOp-1	HypoDMR 	36149
IT-L23+MOp-1	HyperDMR	5263


IT-L5+MOs-3	HypoDMR 	8939
IT-L5+MOs-3	HyperDMR	1130


IT-L5+ACA-1	HypoDMR 	8135
IT-L5+ACA-1	HyperDMR	10833


IT-L5+ORB	HypoDMR 	11562
IT-L5+ORB	HyperDMR	7598


IT-L4+SSs-2	HypoDMR 	37579
IT-L4+SSs-2	HyperDMR	11516


IT-L4+SSp-3	HypoDMR 	34970
IT-L4+SSp-3	HyperDMR	11501


IT-L4+SSs-1	HypoDMR 	31504
IT-L4+SSs-1	HyperDMR	11533


IT-L23+ORB	HypoDMR 	6914
IT-L23+ORB	HyperDMR	6211


IT-L23+PFC-1	HypoDMR 	19581
IT-L23+PFC-1	HyperDMR	26082


IT-L5+PFC-2	HypoDMR 	9508
IT-L5+PFC-2	HyperDMR	10853


IT-L5+MOs-1	HypoDMR 	22605
IT-L5+MOs-1	HyperDMR	1714


IT-L4+MOp-2	HypoDMR 	26524
IT-L4+MOp-2	HyperDMR	7939


IT-L5+AI	HypoDMR 	20276
IT-L5+AI	HyperDMR	10148


IT-L23+MOp-4	HypoDMR 	21478
IT-L23+MOp-4	HyperDMR	3928


IT-L5+PFC-1	HypoDMR 	13892
IT-L5+PFC-1	HyperDMR	18257


IT-L5+SSp-1	HypoDMR 	16197
IT-L5+SSp-1	HyperDMR	3364


IT-L5+MOp-4	HypoDMR 	9513
IT-L5+MOp-4	HyperDMR	1032


IT-L5+ACA-2	HypoDMR 	11292
IT-L5+ACA-2	HyperDMR	10285


IT-L5+MOp-1	HypoDMR 	19100
IT-L5+MOp-1	HyperDMR	916


IT-L5+SSs-2	HypoDMR 	26174
IT-L5+SSs-2	HyperDMR	7747


IT-L6+AI	HypoDMR 	7369
IT-L6+AI	HyperDMR	15784


IT-L6+SSp-2	HypoDMR 	11748
IT-L6+SSp-2	HyperDMR	16159


IT-L5+SSp-4	HypoDMR 	12800
IT-L5+SSp-4	HyperDMR	3961


IT-L6+SSp-4	HypoDMR 	5719
IT-L6+SSp-4	HyperDMR	11432


IT-L6+SSs-1	HypoDMR 	3580
IT-L6+SSs-1	HyperDMR	10205


IT-L6+SSp-1	HypoDMR 	7486
IT-L6+SSp-1	HyperDMR	10353


IT-L6+SSp-5	HypoDMR 	4879
IT-L6+SSp-5	HyperDMR	12273


IT-L5+SSs-1	HypoDMR 	17647
IT-L5+SSs-1	HyperDMR	7646


IT-L6+PFC-2	HypoDMR 	7093
IT-L6+PFC-2	HyperDMR	25612


IT-L6+MOs-2	HypoDMR 	21337
IT-L6+MOs-2	HyperDMR	14455


IT-L5+SSp-3	HypoDMR 	19427
IT-L5+SSp-3	HyperDMR	4000


IT-L6+PFC-1	HypoDMR 	7477
IT-L6+PFC-1	HyperDMR	25276


IT-L6+ACA-2	HypoDMR 	7990
IT-L6+ACA-2	HyperDMR	22491


IT-L6+SSp-3	HypoDMR 	8558
IT-L6+SSp-3	HyperDMR	13105


IT-L6+ACA-3	HypoDMR 	9309
IT-L6+ACA-3	HyperDMR	24155


IT-L6+MOp-4	HypoDMR 	12982
IT-L6+MOp-4	HyperDMR	10130


IT-L6+MOp-3	HypoDMR 	19113
IT-L6+MOp-3	HyperDMR	11024


In [15]:
with pd.HDFStore(pathlib.Path(dmr_path).parent / 'DMRInfo.h5') as hdf:
    for sample, hypo_index in hypo_sig_dict.items():
        hdf[f'HypoDMR/{sample}'] = pd.Series(hypo_index)
    for sample, hyper_index in hyper_sig_dict.items():
        hdf[f'HyperDMR/{sample}'] = pd.Series(hyper_index)

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)


  check_attribute_name(name)


## Dump DMR bed

In [16]:
hypo_dir = pathlib.Path(dmr_path).parent / 'HypoDMR'
hypo_dir.mkdir(exist_ok=True)
for sample, hypo_index in hypo_sig_dict.items():
    _bed = dmr_bed.loc[hypo_index].reset_index().iloc[:, [1, 2, 3, 0]].to_csv(
        hypo_dir / f'{sample}.DMS{dms_cutoff}.bed', sep='\t', index=None, header=None)

In [17]:
hyper_dir = pathlib.Path(dmr_path).parent / 'HyperDMR'
hyper_dir.mkdir(exist_ok=True)
for sample, hyper_index in hyper_sig_dict.items():
    _bed = dmr_bed.loc[hyper_index].reset_index().iloc[:, [1, 2, 3, 0]].to_csv(
        hyper_dir / f'{sample}.DMS{dms_cutoff}.bed', sep='\t', index=None, header=None)

## DMR hits matrix

In [18]:
sig_dict = hypo_sig_dict

rows = []
cols = []
datas = []
for i, (sample, dmr_index) in enumerate(sig_dict.items()):
    col = dmr_index.map(lambda i: i.split('_')[1]).astype(int).values
    row = (np.ones_like(col) * i).astype(int)
    data = np.ones_like(col)
    rows.append(row)
    cols.append(col)
    datas.append(data)
datas = np.concatenate(datas)
cols = np.concatenate(cols)
rows = np.concatenate(rows)
hits = csr_matrix((datas, (rows, cols)),
                  shape=(mc_rate.shape[1], dmr_bed.shape[0]))

# obs is DMR, var is sample, because all analysis is dmr focused
dmr_hits = anndata.AnnData(X=hits.T,
                           obs=dmr_bed,
                           var=pd.DataFrame([], index=mc_rate.columns))

dmr_hits.write_h5ad(hypo_dir / 'TotalHits.h5ad')


... storing '#chr' as categorical


In [19]:
sig_dict = hyper_sig_dict

rows = []
cols = []
datas = []
for i, (sample, dmr_index) in enumerate(sig_dict.items()):
    col = dmr_index.map(lambda i: i.split('_')[1]).astype(int).values
    row = (np.ones_like(col) * i).astype(int)
    data = np.ones_like(col)
    rows.append(row)
    cols.append(col)
    datas.append(data)
datas = np.concatenate(datas)
cols = np.concatenate(cols)
rows = np.concatenate(rows)
hits = csr_matrix((datas, (rows, cols)),
                  shape=(mc_rate.shape[1], dmr_bed.shape[0]))

# obs is DMR, var is sample, because all analysis is dmr focused
dmr_hits = anndata.AnnData(X=hits.T,
                           obs=dmr_bed,
                           var=pd.DataFrame([], index=mc_rate.columns))

dmr_hits.write_h5ad(hyper_dir / 'TotalHits.h5ad')

... storing '#chr' as categorical
