In [5]:
import os

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data # 获取迭代数据
from torch.utils.data import Dataset,TensorDataset,DataLoader
from torch.autograd import Variable # 获取变量



import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import cooler



## Positive samples: windows around K562 SV breakpoints

In [6]:
# Load the SV table and keep only the rows whose 'Cell Line' is K562.
sv_list = pd.read_csv("./ref_data/sv_list.csv")
is_k562 = sv_list['Cell Line'].eq("K562")
k562_list = sv_list.loc[is_k562]

In [14]:
# Shape of the subset of K562 SVs whose two breakpoints lie on the same chromosome.
same_chrom = k562_list['chrom1'].eq(k562_list['chrom2'])
k562_list.loc[same_chrom].shape




(36, 6)

In [15]:
# List every entry of the K562 single-cell cooler directory.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# sample order (and therefore the arrays saved later) stable between runs.
file_list = sorted(os.listdir("./raw_data/scihic/K562/cooler/"))
# Keep only the .mcool files.
mcool_list = [i for i in file_list if i.endswith(".mcool")]
# NOTE(review): the original comment here announced a filter dropping files
# whose name contains "bulk", but no such filter was ever written. Confirm
# whether bulk files should be excluded from mcool_list.


In [96]:
def _find_bin_index(bin_table, chrom, position):
    """Return the index label of the bin on `chrom` that contains `position`.

    Bins are treated as half-open intervals ``[start, end)``, so a position
    lying exactly on a bin start belongs to that bin.

    Raises:
        IndexError: if no row of `bin_table` contains the position.
    """
    hits = bin_table.index[
        (bin_table['chrom'] == chrom)
        & (bin_table['start'] <= position)
        & (position < bin_table['end'])
    ]
    if len(hits) == 0:
        raise IndexError(f"no bin contains {chrom}:{position}")
    return hits[0]


def get_pos_submatrix(clr,window,label_list):
    """Cut one contact submatrix around every SV breakpoint pair.

    Args:
        clr: object exposing ``bins()[:]`` (a table with ``chrom``, ``start``
            and ``end`` columns) and ``matrix(balance=False)`` (sliceable in
            two dimensions); the notebook passes a ``cooler.Cooler``.
        window: window size in bins. The cut spans ``int((window-1)/2)`` bins
            on each side of the centre bin, so an odd ``window`` yields a
            ``window x window`` submatrix.
        label_list: DataFrame with the columns ``chrom1``, ``chrom2``,
            ``breakpoint1``, ``breakpoint2`` and ``strands`` (one SV per row).

    Returns:
        ``(pos_data_list, pos_label_list)``: the submatrices and the matching
        ``strands`` values, one entry per row of `label_list`, in row order.
        Windows that run past the matrix edge are zero-padded so every
        submatrix has the same shape.

    Raises:
        IndexError: if a breakpoint does not fall inside any bin.
    """
    bin_table = clr.bins()[:]
    # The matrix is indexed by bin: one row/column per row of the bin table.
    n_bins = len(bin_table)
    # Build the matrix selector once instead of once per SV.
    contact_matrix = clr.matrix(balance=False)
    half = int((window-1)/2)
    size = 2*half + 1

    pos_data_list = []
    pos_label_list = []

    sv_rows = zip(
        label_list['chrom1'], label_list['chrom2'],
        label_list['breakpoint1'], label_list['breakpoint2'],
        label_list['strands'],
    )
    for chrom1, chrom2, breakpoint1, breakpoint2, strands in sv_rows:
        # Centre bins of the window (index labels of the bin table).
        x_c = _find_bin_index(bin_table, chrom1, breakpoint1)
        y_c = _find_bin_index(bin_table, chrom2, breakpoint2)

        # Requested half-open extent, possibly reaching outside the matrix.
        r0, r1 = x_c - half, x_c + half + 1
        c0, c1 = y_c - half, y_c + half + 1
        # Clamp to the matrix: a negative start or an end past the last bin
        # must not be handed to the slicer.
        rr0, rr1 = max(r0, 0), min(r1, n_bins)
        cc0, cc1 = max(c0, 0), min(c1, n_bins)

        submatrix = contact_matrix[rr0:rr1, cc0:cc1]
        if (rr0, rr1, cc0, cc1) != (r0, r1, c0, c1):
            # Zero-pad the part of the window that lies outside the matrix so
            # all samples keep the same (size, size) shape.
            padded = np.zeros((size, size), dtype=np.asarray(submatrix).dtype)
            padded[rr0-r0:rr1-r0, cc0-c0:cc1-c0] = submatrix
            submatrix = padded

        pos_data_list.append(submatrix)
        pos_label_list.append(strands)

    return pos_data_list,pos_label_list



In [97]:
resolution = 200000
window = 21
cool_dir = "raw_data/scihic/K562/cooler/"
# For every K562 .mcool file, cut the windows around all K562 SV breakpoints.
pos_data_list = []
pos_label_list = []
for mc in mcool_list:
    clr = cooler.Cooler(cool_dir+mc+"::/resolutions/"+str(resolution))
    sc_pos_data_list,sc_pos_label_list = get_pos_submatrix(clr,window,k562_list)
    # extend keeps both lists flat, so no separate flattening pass is needed.
    pos_data_list.extend(sc_pos_data_list)
    pos_label_list.extend(sc_pos_label_list)

# Stack into arrays; the matrices are stored as float32.
pos_data_list = np.array(pos_data_list).astype(np.float32)
pos_label_list = np.array(pos_label_list)
# Save; create the output directory first so np.save cannot fail on a fresh checkout.
os.makedirs("input_data", exist_ok=True)
np.save("input_data/pos_data_list.npy",pos_data_list)
np.save("input_data/pos_label_list.npy",pos_label_list)


In [98]:
# Reload the positive samples and their labels from the .npy files on disk.
pos_data_list, pos_label_list = (
    np.load("input_data/pos_data_list.npy"),
    np.load("input_data/pos_label_list.npy"),
)

In [99]:
# Sanity check: shape of the positive-sample array.
pos_data_list.shape

(14784, 21, 21)

## Negative samples: random windows from GM12878

In [100]:
# List every entry of the GM12878 single-cell cooler directory.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# sample order stable between runs.
gm12878_file_list = sorted(os.listdir("./raw_data/scihic/GM12878/cooler/"))
# Keep only the .mcool files.
gm12878_mcool_list = [i for i in gm12878_file_list if i.endswith(".mcool")]



In [117]:

def get_matrix_gm12878(num_sample,window,clr,rng=None):
    """Draw random ``window x window`` submatrices, all labelled "no".

    Args:
        num_sample: number of windows to draw.
        window: side length of each window, in matrix rows/columns.
        clr: object whose ``matrix(balance=False)[:]`` returns the full 2-D
            matrix; the notebook passes a ``cooler.Cooler``.
        rng: optional ``numpy.random.Generator`` used for the draws. When
            omitted, the global ``np.random`` state is used.

    Returns:
        ``(rand_data_list, rand_label_list)``: the sampled submatrices and a
        list of "no" labels of the same length.

    Raises:
        ValueError: if the matrix is smaller than `window` along either axis.
    """
    rand_data_list = []
    rand_label_list = []
    if num_sample <= 0:
        return rand_data_list,rand_label_list

    # Load the full matrix once; it does not change between samples.
    whole_matrix = clr.matrix(balance=False)[:]
    n_rows, n_cols = whole_matrix.shape
    if n_rows < window or n_cols < window:
        raise ValueError(
            f"matrix of shape {whole_matrix.shape} is smaller than window {window}"
        )

    draw = np.random.randint if rng is None else rng.integers
    for _ in range(num_sample):
        # The upper bound is exclusive, so +1 makes the last valid start
        # position (n - window) reachable.
        x = draw(0, n_rows-window+1)
        y = draw(0, n_cols-window+1)
        rand_data_list.append(whole_matrix[x:x+window,y:y+window])
        rand_label_list.append("no")
    return rand_data_list,rand_label_list


In [118]:
cool_dir = "raw_data/scihic/GM12878/cooler/"
resolution = 200000
window = 21
num = 5  # random windows drawn per GM12878 file
seed = 0

# Seed the global NumPy RNG so the negative set is the same on every run.
np.random.seed(seed)

# One sub-list per file; the following cell flattens these lists.
neg12878_data_list = []
neg12878_label_list = []

for mc in gm12878_mcool_list:
    clr = cooler.Cooler(cool_dir+mc+"::/resolutions/"+str(resolution))
    sc_neg_data_list,sc_neg_label_list = get_matrix_gm12878(num,window,clr)
    neg12878_data_list.append(sc_neg_data_list)
    neg12878_label_list.append(sc_neg_label_list)

In [124]:
# Flatten the per-file sub-lists into arrays. The isinstance guards make the
# cell safe to re-run: once the names hold ndarrays, flattening again would
# split every matrix into rows and every label string into characters.
if isinstance(neg12878_data_list, list):
    neg12878_data_list = np.array(
        [item for sublist in neg12878_data_list for item in sublist]
    )
if isinstance(neg12878_label_list, list):
    neg12878_label_list = np.array(
        [item for sublist in neg12878_label_list for item in sublist]
    )
# Store the matrices as float32.
neg12878_data_list = neg12878_data_list.astype(np.float32)
# Save; create the output directory first so np.save cannot fail on a fresh checkout.
os.makedirs("input_data", exist_ok=True)
np.save("input_data/neg12878_data_list.npy",neg12878_data_list)
np.save("input_data/neg12878_label_list.npy",neg12878_label_list)

In [127]:
def determine_label(tmp_bin_x,tmp_bin_y,k562_list):
    """Return the strand label of the first SV that falls inside a window.

    A window is described by the bins along its two axes. An SV matches when
    one breakpoint lies in an x bin and the other in a y bin, in either
    assignment (breakpoint1 on x and breakpoint2 on y, or the reverse). Bins
    are treated as half-open intervals ``[start, end)``, so a breakpoint on a
    bin start belongs to that bin.

    Args:
        tmp_bin_x: DataFrame of the bins along the window's first axis, with
            ``chrom``, ``start`` and ``end`` columns.
        tmp_bin_y: same, for the second axis.
        k562_list: DataFrame of SVs with ``chrom1``, ``chrom2``,
            ``breakpoint1``, ``breakpoint2`` and ``strands`` columns.

    Returns:
        The ``strands`` value of the first match (scanning x bins, then
        y bins, then SVs), or "no" when no SV falls inside the window.
    """
    # Pull the columns out once; per-element .iat access inside three nested
    # loops is needlessly slow.
    x_bins = list(zip(tmp_bin_x['chrom'], tmp_bin_x['start'], tmp_bin_x['end']))
    y_bins = list(zip(tmp_bin_y['chrom'], tmp_bin_y['start'], tmp_bin_y['end']))
    svs = list(zip(
        k562_list['chrom1'], k562_list['chrom2'],
        k562_list['breakpoint1'], k562_list['breakpoint2'],
        k562_list['strands'],
    ))

    for chr1, start1, end1 in x_bins:
        for chr2, start2, end2 in y_bins:
            for chrom1, chrom2, breakpoint1, breakpoint2, strands in svs:
                forward = (
                    chr1 == chrom1 and chr2 == chrom2
                    and start1 <= breakpoint1 < end1
                    and start2 <= breakpoint2 < end2
                )
                mirrored = (
                    chr1 == chrom2 and chr2 == chrom1
                    and start2 <= breakpoint1 < end2
                    and start1 <= breakpoint2 < end1
                )
                if forward or mirrored:
                    return strands
    return "no"

          

In [126]:
# Sample windows from K562 (original note: "very few, but fine").
# NOTE(review): `matrix1` and `bin_table` are not defined in any visible cell
# (`bin_table` only exists as a local inside get_pos_submatrix), so this cell
# relies on leftover kernel state — define them before this cell or drop it.
neg_data_list = []
neg_label_list = []

n_k562_samples = 50
max_start = 7000  # exclusive upper bound for the window start index
sample_x1=np.random.randint(max_start, size=(n_k562_samples))
sample_y1=np.random.randint(max_start, size=(n_k562_samples))
sample_x2 = sample_x1 + window
sample_y2 = sample_y1 + window

for x,y in zip(sample_x1,sample_y1):
    submatrix = matrix1[x:x+window, y:y+window]
    nozeron = np.count_nonzero(submatrix)  # currently unused
    # Positional slicing excludes the end, matching the matrix slice above.
    # The previous label-based .loc[x:x+window] includes the end label and so
    # pulled in window+1 bins, letting an SV just outside the window set the label.
    tmp_bin_x = bin_table.iloc[x:x+window,:]
    tmp_bin_y = bin_table.iloc[y:y+window,:]
    # Label the window with the strands of any K562 SV it contains, else "no".
    label = determine_label(tmp_bin_x,tmp_bin_y,k562_list)

    neg_data_list.append(submatrix)
    neg_label_list.append(label)


In [128]:
# Stack positives first, then the GM12878 negatives, for both data and labels.
all_data, all_label = (
    np.concatenate(parts, axis=0)
    for parts in (
        (pos_data_list, neg12878_data_list),
        (pos_label_list, neg12878_label_list),
    )
)

In [129]:
# Insert a singleton second axis: (n_samples, 1, window, window).
n_samples = all_data.shape[0]
all_data = np.reshape(all_data, (n_samples, 1, window, window))

In [130]:
# Sanity check: shape of the combined sample array after the reshape.
all_data.shape

(16494, 1, 21, 21)

In [41]:
# Persist the combined sample array.
np.save('input_data/data.npy',all_data)   # save in .npy format

In [44]:
# Integer class for each label string: the four strand combinations, then 'no'.
label_dic = dict(zip(('++', '+-', '-+', '--', 'no'), range(5)))

In [46]:
#将b按照label_dic转成对应int
all_label_index = [label_dic[i] for i in all_label]
np.save('input_data/label.npy',all_label_index)