In [1]:
import pandas as pd
import numpy as np
import pathlib

In [2]:
def __judge_row(row, skip_n, hypo_delta, hyper_delta):
    """Re-check one DMR's hypo-/hyper-sample calls against a delta cutoff.

    Parameters
    ----------
    row : pd.Series
        One DMR row restricted to the columns from position 4 of the DMR
        table onward: position 0 holds the called hyper-methylated samples,
        position 1 the called hypo-methylated samples (methylpy: hyper is
        col 4, hypo is col 5), and every remaining position holds one
        sample's methylation level, labelled by sample name.
    skip_n : int
        Number of samples trimmed from EACH end of the sorted levels before
        the mean is taken. ``0`` means "no trimming".
    hypo_delta : float
        A called hypo sample is kept only if it lies more than this far
        below the trimmed mean.
    hyper_delta : float
        A called hyper sample is kept only if it lies more than this far
        above the trimmed mean.

    Returns
    -------
    tuple of (pd.Index, pd.Index)
        ``(true_hypo_sample, true_hyper_sample)``: the sample names that
        were called by methylpy AND pass the delta cutoff.
    """
    # the first 2 positions are the hyper / hypo sample columns
    hyper_samples, hypo_samples = row.iloc[0], row.iloc[1]
    # the row is object dtype (it also carries the two sample lists),
    # so cast the levels to float before doing arithmetic on them
    mc_rate = row.iloc[2:].astype(float)

    # Trimmed ("robust") mean. The upper bound is computed explicitly because
    # a plain ``[skip_n:-skip_n]`` slice is EMPTY when skip_n == 0, which would
    # turn the mean into NaN and silently reject every sample.
    # NOTE(review): sort_values() puts NaN levels last, so missing values are
    # trimmed together with the highest levels - confirm this is intended.
    ordered = mc_rate.sort_values()
    robust_mean = ordered.iloc[skip_n:len(ordered) - skip_n].mean()

    # hyper- hypo- still need to be significant
    far_below = mc_rate.index[(robust_mean - mc_rate) > hypo_delta]
    far_above = mc_rate.index[(mc_rate - robust_mean) > hyper_delta]
    # explicit set intersection; `Index & Index` no longer means that in pandas 2
    true_hypo_sample = far_below.intersection(hypo_samples)
    true_hyper_sample = far_above.intersection(hyper_samples)

    return true_hypo_sample, true_hyper_sample


def __process_chunk(chunk, skip_n, hypo_delta, hyper_delta):
    """Apply the delta filter to every DMR row of one chunk of the DMR table.

    Parameters
    ----------
    chunk : pd.DataFrame
        Rows of the DMR table. Columns 4 and 5 hold comma-separated sample
        names (missing values allowed); columns from 6 onward hold the
        per-sample methylation levels. The chunk itself is not modified.
    skip_n, hypo_delta, hyper_delta
        Passed through unchanged to ``__judge_row``.

    Returns
    -------
    pd.Series
        One entry per row of ``chunk`` (same index); each entry is the
        ``(hypo, hyper)`` pair returned by ``__judge_row`` for that row.
    """
    # Work on a copy so the caller's chunk keeps its original string columns.
    mc_info = chunk.iloc[:, 4:].copy()

    # change str to index (a missing value becomes an Index holding only '')
    for sample_col in mc_info.columns[:2]:
        mc_info[sample_col] = mc_info[sample_col].fillna('').map(
            lambda names: pd.Index(names.split(',')))

    # judge each row separately
    result = mc_info.apply(
        lambda dmr_row: __judge_row(dmr_row,
                                    skip_n=skip_n,
                                    hypo_delta=hypo_delta,
                                    hyper_delta=hyper_delta),
        axis=1)
    return result

In [3]:
# Tab-separated DMR table that the cells below read with pd.read_csv(sep='\t').
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
dmr_path = '/home/hanliu/project/mouse_rostral_brain/DMR/MajorType/MajorType_rms_results_collapsed.tsv'

In [4]:
# Fraction of samples trimmed from each end of the sorted methylation levels
# when skip_n is derived (skip_n = n_sample * quantile).
quantile = 0.2
# A called hypo sample must be more than this far below the trimmed mean.
hypo_delta = 0.3
# A called hyper sample must be more than this far above the trimmed mean.
hyper_delta = 0.1

In [8]:
# get column and n_sample
# Only the header is read (nrows=0). The first 6 columns are treated as
# non-sample columns; every later column is one sample's methylation level.
mc_prefix = 'methylation_level_'
dmr_column = pd.read_csv(dmr_path, sep='\t', nrows=0).columns
n_sample = dmr_column.size - 6
# Strip the prefix only where the name really starts with it, and derive the
# cut length from the prefix itself instead of a hard-coded character count.
dmr_column = dmr_column.map(
    lambda name: name[len(mc_prefix):] if name.startswith(mc_prefix) else name)
samples = dmr_column[6:]
# calculate skip_n
skip_n = int(max(1, n_sample * quantile))
# With too few samples left after trimming, disable trimming altogether.
# NOTE(review): consumers must treat skip_n == 0 as "no trimming"; a plain
# `[skip_n:-skip_n]` slice is empty when skip_n is 0.
if n_sample - skip_n * 2 < 4:
    skip_n = 0


In [None]:
# Stream the DMR table in chunks, keep the region / rate columns, and record
# for every sample which DMR rows stay hypo- / hyper-methylated after the
# delta filter. Everything is written to 'MajorType.DMR.h5' at the end.
dmr_region_infos = []
mc_rates = []
hypo_sig_dict = {sample: [] for sample in samples}
hyper_sig_dict = {sample: [] for sample in samples}

n = 0
for chunk in pd.read_csv(dmr_path, sep='\t', chunksize=1000000, skiprows=1, names=dmr_column):
    n += 1
    print(n)

    # save basic info
    dmr_region_info = chunk.iloc[:, :4]
    mc_rate = chunk.iloc[:, 6:]
    mc_rates.append(mc_rate)
    dmr_region_infos.append(dmr_region_info)

    # judge sig sample with delta
    result = __process_chunk(chunk, skip_n=skip_n, hypo_delta=hypo_delta, hyper_delta=hyper_delta)

    # parse results: a single pass over the rows, instead of re-scanning every
    # row once per sample and direction
    chunk_hypo = {sample: [] for sample in samples}
    chunk_hyper = {sample: [] for sample in samples}
    for dmr_id, (hypo_names, hyper_names) in result.items():
        for name in hypo_names:
            chunk_hypo[name].append(dmr_id)
        for name in hyper_names:
            chunk_hyper[name].append(dmr_id)

    # here saved a list of np.array (one array per chunk, in row order)
    id_dtype = result.index.dtype
    for sample in samples:
        hypo_sig_dict[sample].append(np.asarray(chunk_hypo[sample], dtype=id_dtype))
        hyper_sig_dict[sample].append(np.asarray(chunk_hyper[sample], dtype=id_dtype))

# here saved a pd.Index for easy selecting dmr belong to sample
hypo_sig_dict = {k: pd.Index(np.concatenate(v)) for k, v in hypo_sig_dict.items()}
hyper_sig_dict = {k: pd.Index(np.concatenate(v)) for k, v in hyper_sig_dict.items()}

# total dmr region and rate info
total_dmr_region_info = pd.concat(dmr_region_infos)
total_mc_rate = pd.concat(mc_rates)

# append mode: existing keys are replaced, any other keys in the file are kept
with pd.HDFStore('MajorType.DMR.h5', 'a') as hdf:
    hdf['Rate'] = total_mc_rate
    hdf['Bed'] = total_dmr_region_info

    for sample, hypo_index in hypo_sig_dict.items():
        hdf[f'HypoDMR/{sample}'] = pd.Series(hypo_index)
    for sample, hyper_index in hyper_sig_dict.items():
        hdf[f'HyperDMR/{sample}'] = pd.Series(hyper_index)

1


## Dump DMR

In [5]:
# Minimum 'number_of_dms' a DMR needs to be written to the per-sample BED
# files; the value is also embedded in the output file names.
dms_cutoff = 1

In [6]:
# Output folders for the per-sample hypo- / hyper-DMR BED files
# (relative to the working directory; an existing folder is reused).
hypo_dir = pathlib.Path('HypoBed')
hyper_dir = pathlib.Path('HyperBed')

hypo_dir.mkdir(exist_ok=True)
hyper_dir.mkdir(exist_ok=True)

In [9]:
# Load the region table and the per-sample DMR row ids back from the store.
hypo_index_dict = {}
hyper_index_dict = {}

# Open read-only: the default mode is append, which would create an empty
# file when the store is missing instead of failing right away.
with pd.HDFStore('MajorType.DMR.h5', mode='r') as hdf:
    dmr_bed = hdf['Bed']
    for sample in samples:
        hypo_index_dict[sample] = hdf[f'HypoDMR/{sample}']
        hyper_index_dict[sample] = hdf[f'HyperDMR/{sample}']

In [10]:
def _select_named_dmr(bed, row_ids, min_dms):
    """Pick the rows ``row_ids`` of ``bed`` that have at least ``min_dms`` DMS.

    The row id is turned into a ``Major_<id>`` name and moved into the first
    column of the returned frame; the remaining columns keep their order.
    """
    picked = bed.loc[row_ids]
    # Build the mask from the selected rows themselves. A mask computed on the
    # full table has a different index and makes pandas reindex it with a
    # "Boolean Series key will be reindexed" warning.
    picked = picked[picked['number_of_dms'] >= min_dms].copy()

    # ADD INDEX TO DMR
    picked.index = picked.index.map(lambda i: f'Major_{i}')
    return picked.reset_index(drop=False)


for sample in samples:
    hypo_index = hypo_index_dict[sample].values
    hyper_index = hyper_index_dict[sample].values

    hypo_bed = _select_named_dmr(dmr_bed, hypo_index, dms_cutoff)
    hyper_bed = _select_named_dmr(dmr_bed, hyper_index, dms_cutoff)

    # columns 1-3 are the first three table columns, column 0 is the DMR name,
    # so the name is written last
    hypo_bed.iloc[:, [1, 2, 3, 0]].to_csv(
        hypo_dir / f'{sample}.HypoDMR.DMS{dms_cutoff}.bed',
        header=None,
        index=None,
        sep='\t')
    hyper_bed.iloc[:, [1, 2, 3, 0]].to_csv(
        hyper_dir / f'{sample}.HyperDMR.DMS{dms_cutoff}.bed',
        header=None,
        index=None,
        sep='\t')
    print(sample, hypo_bed.shape[0], hyper_bed.shape[0])

  """
  import sys


IT-L23 287531 133827
ASC 225133 351537
DG-po 222168 13961
EC 557680 61204
PAL-Inh 5919 711369
IT-L6 246852 227029
LSX-Inh 80176 554844
IG-CA2 389300 124794
OLF-Exc 164180 220695
Unc5c 179383 75965
MGC 674356 369491
OLF 87791 224821
NP-L6 77080 639091
L6b 119475 523210
Gfra1 287492 5023
CT-L6 228623 313953
ODC 254194 354838
VLMC-Pia 397393 147872
IT-L4 327129 129858
Foxp2 96231 373261
MSN-D2 169580 402523
ANP 230357 123521
CGE-Vip 79417 454002
Chd7 61978 234761
CA3-St18 328161 106834
CGE-Lamp5 253206 240993
D1L-PAL 98521 341287
OPC 368619 228459
MGE-Pvalb 128885 543579
EP 185907 182131
PC 958587 88367
PT-L5 261820 259780
MGE-Sst 64006 580597
CLA 239452 275393
D1L-Fstl4 138892 382145
CA1 330653 67215
VLMC 369718 250618
CA3 368749 123398
IT-L5 292052 199178
MSN-D1 206516 428474
DG 387400 117673


In [11]:
# Reload the full region table and turn the row id into a 'Major_<id>' name
# column placed in front of the other columns.
# Open read-only: the default append mode would create an empty file when the
# store is missing instead of failing right away.
with pd.HDFStore('MajorType.DMR.h5', mode='r') as hdf:
    dmr_bed = hdf['Bed']
dmr_bed.index = dmr_bed.index.map(lambda i: f'Major_{i}')
# NOTE: this rebinds `dmr_bed` to a frame with one extra leading column, so
# code that picks columns by position sees them shifted by one afterwards.
dmr_bed = dmr_bed.reset_index(drop=False)

In [10]:
# Write every DMR as a header-less, tab-separated file: columns 1-3 first,
# then column 0 (the DMR name) last.
dmr_bed.iloc[:, [1, 2, 3, 0]].to_csv(
    'MajorDMR.total.bed',
    header=None,
    index=None,
    sep='\t',
)