In [1]:
import pandas as pd
import numpy as np
import anndata
from scipy.sparse import csr_matrix
import pathlib
import pyBigWig
import pybedtools
import subprocess
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
records = []

## ATAC

In [3]:
bw_dir = pathlib.Path(
    '/home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ATAC/')

for p in bw_dir.glob('*FB*bw'):
    time = '.'.join(p.name.split('.')[0].split('_')[:-1])
    records.append({'Path': str(p), 'Type': 'ATAC', 'DevTime': time})

## ChIP

In [4]:
bw_dir = pathlib.Path(
    '/home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/')

for p in bw_dir.glob('*FB*bw'):
    *time, tissue, data_type = p.name.split('.')[0].split('_')
    time = '.'.join(time)
    records.append({'Path': str(p), 'Type': data_type, 'DevTime': time})

## mC

In [5]:
bw_dir = pathlib.Path(
    '/home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/WGBS/BW/')

for p in bw_dir.glob('*rate*bw'):
    infos, mc_type, *_ = p.name.split('.')
    *time, tissue, rep = infos.split('_')
    mc_type = mc_type.split('-')[0]
    time = '.'.join(time)
    records.append({'Path': str(p), 'Type': mc_type, 'DevTime': time, 'Rep': rep})

In [6]:
total_bw = pd.DataFrame(records).fillna(1)
total_bw['DevTime'] = total_bw['DevTime'].apply(lambda i: i.replace('AD.', ''))
total_bw.shape

(100, 4)

In [7]:
total_bw.to_csv('BW_list.csv')

In [8]:
total_bw['DevTime'].value_counts()

E14.5    13
E13.5    13
E11.5    13
E15.5    13
E12.5    13
E16.5    13
P0       13
E10.5     6
P21       1
P56       1
P7        1
Name: DevTime, dtype: int64

In [9]:
total_bw['Type'].value_counts()

CHN         16
CGN         16
CAN         16
H3K27ac     10
H3K4me2      7
H3K4me3      7
H3K27me3     7
ATAC         7
H3K9ac       7
H3K4me1      7
Name: Type, dtype: int64

## DMR

In [10]:
slop = 250
pybedtools.BedTool(
    './merged_loops_merged.anchor.bed'
).saveas('DMR_temp.bed')#.slop(b=slop, g='/home/hanliu/ref/mouse/genome/mm10.main.chrom.sizes').saveas('DMR_temp.bed')

<BedTool(DMR_temp.bed)>

## Scan BW

In [11]:
in_bed = pathlib.Path('DMR_temp.bed').absolute()

cmds = []
for _, (path, dtype, time, rep) in total_bw.iterrows():
    output_path = pathlib.Path(f'{dtype}_{time}_{rep}.tab').absolute()
    in_bw = path
    cmd = f'bigWigAverageOverBed {in_bw} {in_bed} {output_path}'
    cmds.append(cmd)

## Run

In [12]:
def runner(cmd):
    subprocess.run(cmd,
                   check=True,
                   shell=True,
                   stderr=subprocess.PIPE,
                   stdout=subprocess.PIPE)


with ProcessPoolExecutor(30) as executor:
    futures = {}
    for cmd in cmds:
        f = executor.submit(runner, cmd)
        futures[f] = cmd

    for f in as_completed(futures):
        f.result()
        print(futures[f])

bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/E15_5_FB_H3K9ac.subtract.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/H3K9ac_E15.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/E11_5_FB_H3K27ac.subtract.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/H3K27ac_E11.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/AD_P21_FB_H3K27ac.subtract.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/H3K27ac_P21_1.tab
bigWigAverageOverBed /home/hanliu/ddn/ha

bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/E12_5_FB_H3K4me1.subtract.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/H3K4me1_E12.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ATAC/E11_5_FB.pooled.FE.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/ATAC_E11.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/E12_5_FB_H3K27me3.subtract.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/H3K27me3_E12.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/

bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/WGBS/BW/E16_5_FB_2.CAN-Both.rate.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/CAN_E16.5_2.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/E16_5_FB_H3K27me3.subtract.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/H3K27me3_E16.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/ChIP/E11_5_FB_H3K4me2.subtract.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/H3K4me2_E11.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn

bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/WGBS/BW/E12_5_FB_1.CHN-Both.rate.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/CHN_E12.5_1.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/WGBS/BW/E10_5_FB_2.CGN-Both.rate.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/CGN_E10.5_2.tab
bigWigAverageOverBed /home/hanliu/ddn/hanliu/Yupeng_ENCODE_developmental_mouse_tissue/WGBS/BW/E11_5_FB_2.CGN-Both.rate.bw /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/DMR_temp.bed /home/hanliu/project/mouse_rostral_brain/study/CompareREPTILE/AnnotateDMRWithENCODE/CGN_E11.5_2.tab
bigWigAverageOverBed /home/hanliu/ddn/hanli

## Assemble


In [13]:
output_files = list(pathlib.Path().glob('*tab'))

In [14]:
records = []
for p in output_files:
    dtype, time, rep = p.name[:-4].split('_')
    records.append([dtype, time, rep, p])
output_df = pd.DataFrame(records)
output_df.index = output_df[0] + '_' + output_df[1] + '_' + output_df[2].astype(str)

In [15]:
mean_records = []
for i, p in enumerate(output_df[3]):
    print(i, p)
    df = pd.read_csv(p, sep='\t', header=None)
    mean_records.append(df[5])

0 H3K27me3_P0_1.tab
1 H3K27me3_E13.5_1.tab
2 ATAC_E11.5_1.tab
3 ATAC_E12.5_1.tab
4 H3K4me3_E12.5_1.tab
5 ATAC_E13.5_1.tab
6 H3K27ac_E14.5_1.tab
7 H3K27me3_E14.5_1.tab
8 ATAC_P0_1.tab
9 H3K4me3_E14.5_1.tab
10 H3K4me1_E14.5_1.tab
11 H3K9ac_E14.5_1.tab
12 H3K9ac_E15.5_1.tab
13 H3K4me3_P0_1.tab
14 ATAC_E14.5_1.tab
15 H3K4me3_E15.5_1.tab
16 H3K4me2_E15.5_1.tab
17 H3K4me3_E16.5_1.tab
18 H3K9ac_E16.5_1.tab
19 H3K4me1_E15.5_1.tab
20 H3K9ac_E12.5_1.tab
21 H3K27ac_P21_1.tab
22 H3K4me2_E13.5_1.tab
23 H3K9ac_E11.5_1.tab
24 H3K4me2_P0_1.tab
25 H3K27ac_E15.5_1.tab
26 H3K4me1_E12.5_1.tab
27 H3K27me3_E12.5_1.tab
28 H3K4me1_E11.5_1.tab
29 H3K27ac_E16.5_1.tab
30 H3K27me3_E15.5_1.tab
31 H3K4me2_E14.5_1.tab
32 H3K4me1_E16.5_1.tab
33 H3K27ac_E12.5_1.tab
34 H3K4me1_E13.5_1.tab
35 H3K4me2_E12.5_1.tab
36 H3K4me2_E11.5_1.tab
37 H3K4me3_E13.5_1.tab
38 H3K27ac_E13.5_1.tab
39 H3K9ac_P0_1.tab
40 H3K27ac_P56_1.tab
41 H3K9ac_E13.5_1.tab
42 H3K27me3_E16.5_1.tab
43 H3K4me2_E16.5_1.tab
44 H3K27me3_E11.5_1.tab
45 H3K4me

In [16]:
total_data = np.vstack([i.values for i in mean_records])

In [17]:
adata = anndata.AnnData(X=csr_matrix(total_data.T),
                        obs=pd.DataFrame([], index=df[0]),
                        var=output_df)
adata.var.columns = ['DataType', 'DevTime', 'Rep', 'input_path']

In [18]:
adata.write_h5ad('LoopAnchor.DMR.ENCODE_FB_anno.h5ad')

... storing 'DataType' as categorical
... storing 'DevTime' as categorical
... storing 'Rep' as categorical


In [21]:
adata.obs

0_25k
1_25k
2_25k
0_10k
2_10k
3_25k
1_10k
3_10k
4_25k
6_25k
7_25k
