In [1]:
import pandas as pd
import numpy as np
import pybedtools
import pathlib
import subprocess
from concurrent.futures import ProcessPoolExecutor, as_completed

## Prepare Region

In [2]:
black_list = pybedtools.BedTool('/home/hanliu/ref/blacklist/mm10-blacklist.v2.bed.gz')
genome_bin = pybedtools.BedTool('mm10.w5k.bed')
remove_black_list = genome_bin.intersect(black_list, v=True)
remove_black_list.saveas('mm10.w5k.remove_black.bed')

<BedTool(mm10.w5k.remove_black.bed)>

## BW paths

In [2]:
bw_dir = pathlib.Path('/gale/ddn/cemba/CEMBA/hanliu/mouse_rostral_brain/MultiLevelCluster/BW/SubType')
bw_paths = {i.name.split('.')[1]: str(i) for i in bw_dir.glob('*.CHN-Both.rate.bw')}

## bigWigAverageOverBed

In [3]:
cpu = 10

In [16]:
output_dir = '/home/hanliu/ddn/hanliu/mouse_rostral_brain/MultiLevelCluster/SubTypeBins/BW_Mean'
input_bed = pathlib.Path('mm10.w5k.remove_black.bed').absolute()

def runner(i):
    subprocess.run(i, shell=True, check=True)
    
with ProcessPoolExecutor(cpu) as executor:
    futures = []
    for name, bw in bw_paths.items():
        output_path = f'{output_dir}/{name}.mean.tsv'
        if pathlib.Path(output_path).exists():
            continue
        cmd = f'bigWigAverageOverBed {bw} {input_bed} {output_path}'
        
        future = executor.submit(runner, cmd)
        futures.append(future)
    
    for future in as_completed(futures):
        future.result()

## Aggregate

In [4]:
file_paths = list(pathlib.Path('BW_Mean/').glob('*.mean.tsv'))

In [5]:
region_index = pd.read_csv('mm10.w5k.remove_black.bed', index_col=3, header=None, sep='\t').index

In [6]:
records = []
for path in file_paths:
    name = path.name.split('.')[0]
    print(name)
    df = pd.read_csv(path, sep='\t', header=None, index_col=0, usecols=[0, 2, 5])
    data = df[df[2] > 1000][5].reindex(region_index)
    data.name = name
    records.append(data)
    
total_rate = pd.DataFrame(records)

VLMC_Col4a1
MGE-Pvalb_Gfra2
CGE-Vip_Ccser1
CGE-Lamp5_Nrxn3
Gfra1_Gfra1
EC_Abhd2
IT-L4_Astn2
LSX-Inh_Zeb2
CGE-Vip_Galnt17
LSX-Inh_Lats2
Chd7_Kcnc2
LSX-Inh_Dock10
LSX-Inh_Foxp2
MGE-Sst_Ptpre
ASC_str-hpf
MGE-Sst_Ubtd1
OLF-Exc_Unc13c
ANP_anp-dg
IT-L4_Shc3
DG-po_Kctd8
CGE-Vip_Clstn2
Foxp2_Homer2
ASC_mid
MSN-D2_Nrp2
OPC_opc-large
MSN-D2_Col14a1
NP-L6_Boc
PAL-Inh_Igdcc3
PT-L5_Kcnh1
MGC_mgc-all
EP_Rgs8
NP-L6_Olfml2b
MSN-D1_Plxnc1
MGE-Sst_Chodl
MGE-Sst_Dock4
PT-L5_Plcb4
CA3_Efnb2
LSX-Inh_Nxph1
ASC_cortex-olf
OLF-Exc_Sgcd
Chd7_Megf11
CA3-St18_Nuak1
IG-CA2_Peak1
MSN-D1_Khdrbs3
MGE-Sst_Rerg
ODC_odc-small
OLF-Exc_Rmst
MGE-Sst_Kcnip4
IG-CA2_Chrm3
OPC_opc-small
ODC_odc-large
CGE-Lamp5_Grk5
CA3-St18_Epha5
MGE-Pvalb_Cacna1i
CA3-St18_Tead1
CLA_Cdh8
IT-L23_Foxp1
Chd7_Trpc7
D1L-PAL_Flrt2
MGE-Sst_Frmd6
CGE-Vip_Ntng1
MSN-D2_Slc24a2
IT-L6_Oxr1
IT-L23_Tenm2
MSN-D2_Casz1
CLA_Nrp2
L6b_Adcy8
ANP_anp-olf-cnu
PT-L5_Tenm2
PT-L5_Ptprt
PT-L5_Tmtc2
MGE-Pvalb_Entpd3
D1L-Fstl4_Trps1
OLF_Pag1
MGE-Sst_Etv1
PAL-Inh_Chat
IT

In [7]:
total_rate = total_rate.loc[:, total_rate.isna().sum(axis=0) <= 2].copy()
_median = total_rate.median(axis=0)
total_rate_no_na = total_rate.fillna(_median)

assert total_rate_no_na.isna().values.sum() == 0
total_rate_no_na = total_rate_no_na.T

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """


Index(['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1',
       'chr1',
       ...
       'chrY', 'chrY', 'chrY', 'chrY', 'chrY', 'chrY', 'chrM', 'chrM', 'chrM',
       'chrM'],
      dtype='object', name=3, length=484530)

In [17]:
output_dir = 'SubType_5kb_mCH_per_chrom'
output_dir = pathlib.Path(output_dir)
output_dir.mkdir(exist_ok=True)

chroms = total_rate_no_na.index.map(lambda i: i.split('_')[0])
for chrom, sub_df in total_rate_no_na.groupby(chroms):
    sub_df.to_msgpack(output_dir / f'{chrom}.msg')
    print(chrom)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  import sys


chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrM
chrX
chrY
