In [1]:
import os, sys
import time
import argparse
import multiprocessing
import numpy as np
from utils.io import compactM, spreadM, downsampling
from all_parser import *

In [None]:
# python data_generate.py -hr 10kb -lr 40kb -s all -chunk 40 -stride 40 -bound 201 -scale 1 -c GM12878

In [4]:

# Dataset and resolution labels used throughout this notebook.
cell_line = "GM12878"
high_res = "10kb"
low_res = "40kb"
ratio = 16  # default is 16

# Cap the worker count at 23; use every core when the machine has fewer.
pool_num = min(23, multiprocessing.cpu_count())


In [11]:
# NOTE(review): root_dir is not defined in the visible cells — presumably it
# comes from `from all_parser import *`; confirm.
data_dir = os.path.join(root_dir, 'mat', cell_line)

# Every directory entry whose name contains the high-resolution label.
in_files = [
    os.path.join(data_dir, name)
    for name in os.listdir(data_dir)
    if high_res in name
]


In [13]:
# Announce the source/target resolution labels and the downsampling ratio for this run.
print(f'Generating {low_res} files from {high_res} files by {ratio}x downsampling.')

Generating 40kb files from 10kb files by 16x downsampling.


In [None]:
def downsample(in_file, low_res, ratio):
    """Downsample one Hi-C ``.npz`` archive and save the result beside it.

    Reads the ``hic`` and ``compact`` arrays from ``in_file``, passes ``hic``
    and ``ratio`` to ``downsampling`` and writes a compressed archive named
    ``<chr>_<low_res>.npz`` into the same directory, where ``<chr>`` is the
    part of the input file name before the first underscore.

    Args:
        in_file: Path to the input ``.npz`` archive; it must contain the
            ``hic`` and ``compact`` entries.
        low_res: Resolution label used in the output file name (e.g. "40kb").
        ratio: Downsampling factor forwarded to ``downsampling`` and stored
            in the output archive.
            NOTE(review): its exact meaning is defined by
            ``utils.io.downsampling`` — confirm there.

    Returns:
        None. The result is written to disk and the output path is printed.
    """
    # np.load on an .npz returns an NpzFile that keeps the file handle open;
    # the context manager closes it once both arrays have been read.
    with np.load(in_file) as data:
        hic = data['hic']  # data
        compact_idx = data['compact']

    down_hic = downsampling(hic, ratio)

    # The chromosome name is the file-name prefix before the first underscore.
    chr_name = os.path.basename(in_file).split('_')[0]

    out_file = os.path.join(os.path.dirname(in_file), f'{chr_name}_{low_res}.npz')

    np.savez_compressed(out_file, hic=down_hic, compact=compact_idx, ratio=ratio)
    print('Saving file:', out_file)

In [24]:
# Try the function on a single chromosome file before the full parallel run.
file = "data/RaoHiC/mat/GM12878/chr18_10kb.npz"
print(low_res, ratio, sep='\n')
downsample(file, low_res, ratio)

40kb
16
Saving file: data/RaoHiC/mat/GM12878/chr18_40kb.npz


In [19]:
# Downsample every input file in parallel and surface any worker failure.
start = time.time()
print(f'Start a multiprocess pool with process_num = {pool_num}')

failures = []
with multiprocessing.Pool(pool_num) as pool:
    # Keep each AsyncResult: an exception raised inside a worker is only
    # re-raised in the parent when .get() is called, so discarding the
    # handles would hide failed files.
    pending = []
    for file in in_files:
        pending.append((file, pool.apply_async(downsample, (file, low_res, ratio))))
    pool.close()

    for file, result in pending:
        try:
            result.get()
        except Exception as err:
            # Record the failure and keep going so one bad file does not
            # stop the remaining files from being reported.
            failures.append((file, err))
            print(f'Downsampling failed for {file}: {err!r}')
    pool.join()

if failures:
    failed_files = ', '.join(path for path, _ in failures)
    raise RuntimeError(f'{len(failures)} downsampling task(s) failed: {failed_files}')

print(f'All downsampling processes done. Running cost is {(time.time()-start)/60:.1f} min.')

Start a multiprocess pool with process_num = 23
Saving file: data/RaoHiC/mat/GM12878/chr18_40kb.npz
All downsampling processes done. Running cost is 0.3 min.
