In [None]:
import os, sys, math, random, argparse
import numpy as np

# Seed Python's built-in random module so repeated runs draw the same sequence.
random.seed(a=100)

# Directory the process was started from.
path = os.getcwd()

In [None]:
# Parse the command-line arguments.
parser = argparse.ArgumentParser(
    description='Train deep learning model for enhancing Hi-C contact map',
    add_help=True,
)
req_args = parser.add_argument_group('Required Arguments')

# One row per option: (flag, destination attribute, extra add_argument kwargs, help text).
# Every option is mandatory and is registered in the order listed here.
_option_table = (
    ('-i', 'input_data_dir', {},
     'REQUIRED: Hi-C data directory containig numpy matrix files (Directory of Hi-C matrix for training input) ==== (example) /HiHiC-main/data_DFHiC===='),
    ('-p', 'output_model_dir', {},
     'REQUIRED: Directory to save training weight (Directory for saving pretrained model) ==== (example) /HiHiC-main/pretrained ===='),
    ('-m', 'model', {'choices': ['HiCARN', 'DeepHiC', 'HiCNN2', 'HiCSR', 'DFHiC', 'hicplus', 'SRHiC']},
     'REQUIRED: Model name that you want to use (One of HiCARN, DeepHiC, HiCNN2, HiCSR, DFHiC, hicplus, and SRHiC) ==== (example) DFHiC ===='),
    ('-e', 'epoch', {},
     'REQUIRED: Number of train epoch ==== (example) 500 ===='),
    ('-g', 'gpu', {},
     'REQUIRED: Number of gpu for training  ==== (example) 0 ===='),
    ('-o', 'output_performance_dir', {},
     'REQUIRED: Directory to save training performance trend ==== (example) HiHiC-main/performance ===='),
)
for _flag, _dest, _extra, _help in _option_table:
    req_args.add_argument(_flag, dest=_dest, required=True, help=_help, **_extra)

args = parser.parse_args()

# Expose the parsed values as plain module-level variables.
# No type= conversion is configured, so every value (including epoch and gpu) is a string.
input_data_dir, output_model_dir, output_performance_dir = (
    args.input_data_dir,
    args.output_model_dir,
    args.output_performance_dir,
)
model, epoch, gpu = args.model, args.epoch, args.gpu

print(f"\n\n\n...Start {model} training...\nusing Hi-C data of {input_data_dir}")

In [None]:
# Export the train/validation/test sub-matrices in the on-disk layout that the
# selected model expects.
#
# NOTE(review): `output_dir`, `data_ratio`, `hr_contacts_dict` and the
# `*_data_split` functions are not defined in the cells visible above this one;
# they must already exist in the kernel before this cell runs — confirm where
# they come from so the notebook survives Restart & Run All.
save_dir = f'{output_dir}/data_{model}/'
os.makedirs(save_dir, exist_ok=True)  # idempotent: re-running the cell must not fail


def _chromosomes(first, last):
    """Return the names 'chr<first>' ... 'chr<last>', both ends inclusive."""
    return [f'chr{idx}' for idx in range(first, last + 1)]


if model == "DFHiC":
    hr_mats_train,lr_mats_train,distance_train = DFHiC_data_split(_chromosomes(1, 17)) # train: 1~17
    hr_mats_test,lr_mats_test,distance_test = DFHiC_data_split(_chromosomes(18, 22)) # test: 18~22

    np.savez(save_dir+f'train_data_raw_ratio{data_ratio}.npz', train_lr=lr_mats_train,train_hr=hr_mats_train,distance=distance_train)
    np.savez(save_dir+f'test_data_raw_ratio{data_ratio}.npz', test_lr=lr_mats_test,test_hr=hr_mats_test,distance=distance_test)

elif model in ("DeepHiC", "deepHiC", "HiCARN"):
    # DeepHiC and HiCARN share one layout and differ only in the split function.
    # "deepHiC" is accepted as a legacy spelling; the -m option defined in this
    # notebook only offers "DeepHiC", which the previous comparison never matched.
    _split_fn = HiCARN_data_split if model == "HiCARN" else DeepHiC_data_split

    hr_mats_train,lr_mats_train,coordinates_train = _split_fn(_chromosomes(1, 14)) # train:1~14
    hr_mats_valid,lr_mats_valid,coordinates_valid = _split_fn(_chromosomes(15, 17)) # valid:15~17
    hr_mats_test,lr_mats_test,coordinates_test = _split_fn(_chromosomes(18, 22)) # test:18~22

    # Indices of the non-zero entries (first axis) of every high-resolution contact array,
    # keyed by the integer chromosome number.
    compacts = {int(k.split('chr')[1]) : np.nonzero(v)[0] for k, v in hr_contacts_dict.items()}

    # Second column of chromosome.txt as int, keyed by the chromosome label with the
    # 'chr' prefix removed. The file is read from the current working directory.
    size = {}
    with open('chromosome.txt') as chrom_file:  # context manager closes the handle
        for line in chrom_file:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            size[fields[0].split('chr')[1]] = int(fields[1])

    os.makedirs(save_dir+'Train_and_Validation/', exist_ok=True)
    os.makedirs(save_dir+'Test/', exist_ok=True)

    # `compacts` and `size` are dicts, so numpy stores them as pickled object arrays;
    # readers need np.load(..., allow_pickle=True) to get them back.
    np.savez(save_dir+f'Train_and_Validation/train_ratio{data_ratio}.npz', data=lr_mats_train,target=hr_mats_train,inds=np.array(coordinates_train, dtype=np.int_),compacts=compacts,size=size)
    np.savez(save_dir+f'Train_and_Validation/valid_ratio{data_ratio}.npz', data=lr_mats_valid,target=hr_mats_valid,inds=np.array(coordinates_valid, dtype=np.int_),compacts=compacts,size=size)
    np.savez(save_dir+f'Test/test_ratio{data_ratio}.npz', data=lr_mats_test,target=hr_mats_test,inds=np.array(coordinates_test, dtype=np.int_),compacts=compacts,size=size)

elif model in ("HiCNN2", "HiCNN"):
    # The -m option defined in this notebook offers "HiCNN2"; "HiCNN" is kept as a
    # legacy spelling.
    # NOTE(review): assumes HiCNN2 training consumes this HiCNN layout — confirm.
    # No test split is written for this model.
    hr_mats_train,lr_mats_train,hr_coordinates_train,lr_coordinates_train = HiCNN_data_split(_chromosomes(1, 14)) # train:1~14
    hr_mats_valid,lr_mats_valid,hr_coordinates_valid,lr_coordinates_valid = HiCNN_data_split(_chromosomes(15, 17)) # valid:15~17

    np.save(save_dir+f'subMats_train_target_ratio{data_ratio}', hr_mats_train)
    np.save(save_dir+f'subMats_train_ratio{data_ratio}', lr_mats_train)
    np.save(save_dir+'index_train_target', hr_coordinates_train)
    np.save(save_dir+'index_train_data', lr_coordinates_train)
    np.save(save_dir+f'subMats_valid_target_ratio{data_ratio}', hr_mats_valid)
    np.save(save_dir+f'subMats_valid_ratio{data_ratio}', lr_mats_valid)
    np.save(save_dir+'index_valid_target', hr_coordinates_valid)
    np.save(save_dir+'index_valid_data', lr_coordinates_valid)

elif model == "SRHiC":
    hr_mats_train,lr_mats_train = SRHiC_data_split(_chromosomes(1, 17)) # train: 1~17
    # chr18~22 are split here and written out as the *validation* file below.
    hr_mats_test,lr_mats_test = SRHiC_data_split(_chromosomes(18, 22))

    # Each sample becomes one 2-D array: channel 0 of the high-resolution patch is
    # extended with 12 zero rows of width 28 (axis 1), then placed to the right of
    # channel 0 of the low-resolution patch (axis 2).
    # NOTE(review): presumably low-res patches are 40x40 and high-res 28x28 — confirm.
    train = np.concatenate((lr_mats_train[:,0,:,:], np.concatenate((hr_mats_train[:,0,:,:],np.zeros((hr_mats_train.shape[0],12,28))), axis=1)), axis=2)
    valid = np.concatenate((lr_mats_test[:,0,:,:], np.concatenate((hr_mats_test[:,0,:,:],np.zeros((hr_mats_test.shape[0],12,28))), axis=1)), axis=2)

    np.save(save_dir+f'train_data_raw_ratio{data_ratio}', train)
    np.save(save_dir+f'valid_data_raw_ratio{data_ratio}', valid)

elif model == "hicplus":
    hr_mats_train,lr_mats_train,hr_coordinates_train,lr_coordinates_train = hicplus_data_split(_chromosomes(1, 17)) # train:1~17
    hr_mats_test,lr_mats_test,hr_coordinates_test,lr_coordinates_test = hicplus_data_split(_chromosomes(18, 22)) # test:18~22

    np.save(save_dir+f'subMats_train_target_ratio{data_ratio}', hr_mats_train)
    np.save(save_dir+f'subMats_train_ratio{data_ratio}', lr_mats_train)
    np.save(save_dir+'index_train_target', hr_coordinates_train)
    np.save(save_dir+'index_train_data', lr_coordinates_train)
    np.save(save_dir+f'subMats_test_target_ratio{data_ratio}', hr_mats_test)
    np.save(save_dir+f'subMats_test_ratio{data_ratio}', lr_mats_test)
    np.save(save_dir+'index_test_target', hr_coordinates_test)
    np.save(save_dir+'index_test_data', lr_coordinates_test)

else:
    # Explicit error instead of `assert`, which is stripped under `python -O` and
    # gave no hint about which model was rejected (e.g. HiCSR has no layout here).
    raise ValueError(f"No data layout is defined for model '{model}'")

print(f"\n\n... Generated data is saved in {save_dir}...")