In [2]:
import os, sys
import time
import argparse
import multiprocessing
import torch
import numpy as np
from utils.io import compactM, divide, pooling
from all_parser import *

In [27]:
#生成数据
def deephic_divider(n, high_file, down_file, scale=1, pool_type='max', chunk=40, stride=40, bound=201, lr_cutoff=100, hr_cutoff=255):
    
    hic_data = np.load(high_file)
    
    compact_idx = hic_data['compact']
    full_size = hic_data['hic'].shape[0]
    print("raw shape",hic_data['hic'].shape)
    hic = compactM(hic_data['hic'], compact_idx)# Compacting
    print("hic shape",hic.shape)

    #低分辨率的
    down_data = np.load(down_file)
    down_hic = compactM(down_data['hic'], compact_idx)

    # Clamping
    hic = np.minimum(hr_cutoff, hic)
    down_hic = np.minimum(lr_cutoff, down_hic)

    # Rescaling
    hic = hic / np.max(hic)
    down_hic = down_hic / lr_cutoff
    # Deviding and Pooling (pooling is not performed actually)
    #TODO:div_inds?
    div_dhic, div_inds = divide(down_hic, n, chunk, stride, bound)
    div_dhic = pooling(div_dhic, scale, pool_type=pool_type, verbose=False).numpy()

    div_hhic, _ = divide(hic, n, chunk, stride, bound, verbose=True)

    #返回的顺序
    return n, div_dhic, div_hhic, div_inds, compact_idx, full_size

In [4]:
#  python data_generate.py -hr 10kb -lr 40kb -s all -chunk 40 -stride 40 -bound 201 -scale 1 -c GM12878
cell_line = "GM12878"
high_res = "10kb"
low_res = "40kb"
lr_cutoff =100
dataset = "all"

chunk = 40
stride = 40 #步长
bound = 201
scale = 1
pool_type ="max"

chr_list = set_dict["human"]
postfix = cell_line.lower() if dataset == 'all' else dataset
pool_str = 'nonpool' if scale == 1 else f'{pool_type}pool{scale}'
print(f'Going to read {high_res} and {low_res} data, then deviding matrices with {pool_str}')

Going to read 10kb and 40kb data, then deviding matrices with nonpool


In [5]:
pool_num = 23 if multiprocessing.cpu_count() > 23 else multiprocessing.cpu_count()

data_dir = os.path.join(root_dir, 'mat', cell_line)
out_dir = os.path.join(root_dir, 'data')
mkdir(out_dir)

start = time.time()
pool = multiprocessing.Pool(processes=pool_num)
print(f'Start a multiprocess pool with processes = {pool_num} for generating DeepHiC data')
results = []

for n in chr_list:

    #文件名
    high_file = os.path.join(data_dir, f'chr{n}_{high_res}.npz')
    down_file = os.path.join(data_dir, f'chr{n}_{low_res}.npz')

    kwargs = {'scale':scale, 'pool_type':pool_type, 'chunk':chunk, 'stride':stride, 'bound':bound, 'lr_cutoff': lr_cutoff}

    res = pool.apply_async(deephic_divider, (n, high_file, down_file,), kwargs)
    results.append(res)


pool.close()
pool.join()
print(f'All DeepHiC data generated. Running cost is {(time.time()-start)/60:.1f} min.')




Start a multiprocess pool with processes = 23 for generating DeepHiC data
All DeepHiC data generated. Running cost is 0.1 min.


In [28]:
n = 18
n, div_dhic, div_hhic, div_inds, compact_idx, full_size = deephic_divider(n, high_file, down_file,scale,pool_type,chunk,stride,bound,lr_cutoff)

raw shape (7816, 7816)
hic shape (7468, 7468)
[Chr18] Deviding HiC matrix (7468x7468) into 2016 samples with chunk=40, stride=40, bound=201


In [23]:
print(div_dhic.shape)
print(div_hhic.shape)
print(div_inds.shape)
print(compact_idx.shape)
print(full_size)

(2016, 1, 40, 40)
(2016, 1, 40, 40)
(2016, 4)
(7468,)
7816


In [6]:
# return: n, div_dhic, div_hhic, div_inds, compact_idx, full_size
data = np.concatenate([r.get()[1] for r in results])
target = np.concatenate([r.get()[2] for r in results])
inds = np.concatenate([r.get()[3] for r in results])


KeyError: '18'

In [None]:


compacts = {r.get()[0]: r.get()[4] for r in results}
sizes = {r.get()[0]: r.get()[5] for r in results}

filename = f'deephic_{high_res}{low_res}_c{chunk}_s{stride}_b{bound}_{pool_str}_{postfix}.npz'
deephic_file = os.path.join(out_dir, filename)


#保存文件
np.savez_compressed(deephic_file, data=data, target=target, inds=inds, compacts=compacts, sizes=sizes)
print('Saving file:', deephic_file)

In [29]:
deephic_file

NameError: name 'deephic_file' is not defined