In [1]:
import os
import h5py
import numpy as np

In [2]:
# Notebook-wide IPython setup.
# autoreload mode 2 re-imports edited modules before each cell is executed.
# NOTE(review): no local modules are imported in the visible cells — confirm this is still needed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
# NOTE(review): no plotting is done in the visible cells — confirm this is still needed.
%matplotlib inline

In [3]:
# One-hot lookup table indexed by base code:
# row 0 = N / padding (all zeros), row 1 = A, row 2 = C, row 3 = G, row 4 = T.
IN_MAP = np.asarray(
    [[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
)


def create_datapoints(seq, concentration, read_length, pad_length=50):
    """One-hot encode a single read and normalise its metadata.

    Parameters
    ----------
    seq : str
        Nucleotide sequence (case-insensitive A/C/G/T/N). A trailing line
        terminator ("\\n" or "\\r\\n") is removed if present; a sequence
        without one is used as-is.
    concentration : int or int-convertible
        Concentration value attached to the read. A value of 0 is replaced
        by the placeholder 20.
    read_length : int or int-convertible
        Read length value attached to the read; returned as ``int``.
    pad_length : int, optional
        Sequences shorter than this are right-padded with N (all-zero rows).
        Longer sequences are NOT truncated. Defaults to 50.

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(max(len(seq), pad_length), 4)`` with one one-hot
        row per position, in A, C, G, T column order.
    F : int
        The concentration (20 when the input concentration was 0).
    L : int
        The read length.

    Raises
    ------
    ValueError
        If the sequence contains a character other than A, C, G, T or N.
    """
    F = int(concentration)
    L = int(read_length)

    # Strip only line terminators: slicing off the last character
    # unconditionally would drop a real base when the line has no newline.
    bases = seq.rstrip("\r\n").upper()

    base_codes = {"N": 0, "A": 1, "C": 2, "G": 3, "T": 4}
    try:
        codes = [base_codes[base] for base in bases]
    except KeyError as err:
        raise ValueError(
            f"unsupported base {err.args[0]!r} in sequence {bases!r}"
        ) from None

    # Right-pad with the N code; a non-positive repeat count adds nothing,
    # so reads longer than pad_length keep their full length.
    codes.extend([0] * (pad_length - len(codes)))

    X = IN_MAP[np.asarray(codes, dtype="int8")]

    if F == 0:
        F = 20  # use an arbitrary placeholder value for zero concentration

    return X, F, L

In [4]:
def generate_dataset(target_protein, data_dir, t_file, c_file, protein_concentration, each_class_train_size):
    """Build the train/test HDF5 files for one protein from two read files.

    Reads from ``t_file`` are labelled 1 and tagged with
    ``protein_concentration``; reads from ``c_file`` (background) are
    labelled 0 and passed to ``create_datapoints`` with concentration 0.
    For each class the first ``each_class_train_size`` reads go to the train
    split and the following reads, up to line index
    ``each_class_train_size * 4.4``, go to the test split.

    Two files are written under ``<data_dir>/protein_<target_protein>/``:
    ``rbns_train.h5`` and ``rbns_test.h5``. Each holds four resizable
    datasets: ``X`` (encoded sequences), ``Y`` (labels), ``F``
    (concentrations) and ``L`` (read lengths). Target rows come first,
    control rows are appended after them. Existing files are overwritten.

    Parameters
    ----------
    target_protein : str
        Protein name, used only to name the output directory.
    data_dir : str
        Parent directory for the output; created if missing.
    t_file : str
        Path of the read file for the target (label 1) class.
    c_file : str
        Path of the read file for the control (label 0) class.
    protein_concentration : int
        Concentration recorded for every target read.
    each_class_train_size : int
        Number of training reads to take per class.

    Raises
    ------
    ValueError
        If either read file yields no reads for the train or the test split.
        When this happens for ``c_file`` the target-only files have already
        been written.
    """
    out_dir = os.path.join(data_dir, f"protein_{target_protein}")
    os.makedirs(out_dir, exist_ok=True)
    h5_paths = {
        "train": os.path.join(out_dir, 'rbns' + '_' + 'train' + '.h5'),
        "test": os.path.join(out_dir, 'rbns' + '_' + 'test' + '.h5'),
    }

    # Target class creates the files; control class is appended afterwards.
    # Only one class is held in memory at a time.
    sources = (
        (t_file, protein_concentration, 1, False),
        (c_file, 0, 0, True),
    )
    for reads_file, concentration, label, append in sources:
        splits = _read_splits(reads_file, concentration, label, each_class_train_size)
        if not append:
            print('Xtrain', splits["train"]["X"].shape)
            print('Xtest', splits["test"]["X"].shape)
        for split_name, arrays in splits.items():
            _write_split(h5_paths[split_name], arrays, append)
        del splits  # release this class's arrays before reading the next file


def _read_splits(reads_file, concentration, label, train_size):
    """Read one class from ``reads_file`` and return its train/test arrays.

    Returns ``{"train": {...}, "test": {...}}`` where each inner dict maps
    the dataset names "X", "Y", "F", "L" to NumPy arrays.
    """
    columns = ("X", "Y", "F", "L")
    rows = {
        "train": {key: [] for key in columns},
        "test": {key: [] for key in columns},
    }
    # Only lines with index % 4 == 1 are used, i.e. the second line of every
    # 4-line record (presumably FASTQ-style records; confirm against the data).
    train_limit = train_size * 4
    total_limit = train_size * 4.4

    with open(reads_file, 'r') as reads:
        for line_no, line in enumerate(reads):
            if line_no > total_limit:
                break
            if line_no % 4 != 1:
                continue
            encoded, conc, read_len = create_datapoints(
                line, concentration=concentration, read_length=20
            )
            if line_no < train_limit:
                bucket = rows["train"]
            elif line_no < total_limit:
                bucket = rows["test"]
            else:
                continue
            bucket["X"].append(encoded)
            bucket["Y"].append(label)
            bucket["F"].append(conc)
            bucket["L"].append(read_len)

    splits = {}
    for split_name, cols in rows.items():
        if not cols["X"]:
            # An empty split cannot be stored with the 3-D layout used for X
            # and cannot be appended; fail with a clear message instead.
            raise ValueError(
                f"no reads for the {split_name} split in {reads_file!r} "
                f"(each_class_train_size={train_size})"
            )
        # Convert each column once; these arrays can be large.
        splits[split_name] = {key: np.asarray(values) for key, values in cols.items()}
    return splits


def _write_split(h5_path, arrays, append):
    """Create ``h5_path`` from ``arrays``, or append ``arrays`` to it.

    The file is opened in a ``with`` block so it is closed even if a write
    fails part-way through.
    """
    with h5py.File(h5_path, 'a' if append else 'w') as h5f:
        for key, values in arrays.items():
            if append:
                dset = h5f[key]
                n_old = dset.shape[0]
                dset.resize(n_old + values.shape[0], axis=0)
                dset[n_old:] = values
            else:
                # Unlimited first axis so the control class can be appended.
                maxshape = (None, None, 4) if key == 'X' else (None, )
                h5f.create_dataset(key, data=values, maxshape=maxshape)

In [5]:
# Root of the raw read files, relative to the notebook's working directory.
data_dir = "../rbns_raw_data"
# Per-protein input directories. The trailing slash is kept on purpose:
# file names are appended to these by plain string concatenation.
HNRNPA1_DATA_DIR = data_dir + "/HNRNPA1_split_reads/"
RALY_DATA_DIR = data_dir + "/RALY/"

In [6]:
# HNRNPA1, 10,000 training reads per class -> ../dataset_10k/protein_HNRNPA1/
generate_dataset(
    target_protein='HNRNPA1',
    data_dir="../dataset_10k/",
    t_file=HNRNPA1_DATA_DIR + 'HNRNPA1_0nM_5.reads',
    c_file=HNRNPA1_DATA_DIR + 'HNRNPA1_0nM_input.reads',
    protein_concentration=5,
    each_class_train_size=10000,
)

Xtrain (10000, 50, 4)
Xtest (1000, 50, 4)


In [7]:
# HNRNPA1, 500,000 training reads per class -> ../dataset_500k/protein_HNRNPA1/
generate_dataset(
    target_protein='HNRNPA1',
    data_dir="../dataset_500k/",
    t_file=HNRNPA1_DATA_DIR + 'HNRNPA1_0nM_5.reads',
    c_file=HNRNPA1_DATA_DIR + 'HNRNPA1_0nM_input.reads',
    protein_concentration=5,
    each_class_train_size=500000,
)

Xtrain (500000, 50, 4)
Xtest (50000, 50, 4)


In [8]:
# RALY, 10,000 training reads per class -> ../dataset_10k/protein_RALY/
generate_dataset(
    target_protein='RALY',
    data_dir="../dataset_10k/",
    t_file=RALY_DATA_DIR + 'RALY_80.reads',
    c_file=RALY_DATA_DIR + 'RALY_input.reads',
    protein_concentration=80,
    each_class_train_size=10000,
)

Xtrain (10000, 50, 4)
Xtest (1000, 50, 4)


In [9]:
# RALY, 500,000 training reads per class -> ../dataset_500k/protein_RALY/
generate_dataset(
    target_protein='RALY',
    data_dir="../dataset_500k/",
    t_file=RALY_DATA_DIR + 'RALY_80.reads',
    c_file=RALY_DATA_DIR + 'RALY_input.reads',
    protein_concentration=80,
    each_class_train_size=500000,
)

Xtrain (500000, 50, 4)
Xtest (50000, 50, 4)
