#### Data Preparation and Preprocessing

In [1]:
from utility import *
import os, gzip, pickle
import pandas as pd
from natsort import natsorted

##### Make Dataframe from dataset

In [33]:
# Make Dataframe with clip dataset
# Final output : DataFrame[@Sequence @RBP @Class]


# Various type of RBPs in Train/Test Dataset
clip_list = [
    'Ago-EIF',
    'Ago2-MNase',
    'Ago2-1',
    'Ago2-2',
    'Ago2',
    'eIF4III-1',
    'eIF4III-2',
    'ELAVL1-1',
    'ELAVL1-MNase',
    'ELAVL1A',
    'ELAVL1-2',
    'ESWR1',
    'FUS',
    'Mut_FUS',
    'IGFBP1-3',
    'hnRNPC-1',
    'hnRNPC-2',
    'hnRNPL-1',
    'hnRNPL-2',
    'hnRNPL-like',
    'MOV10',
    'Nsun2',
    'PUM2',
    'QKI',
    'SRSF1',
    'TAF15',
    'TDP-43',
    'TIA1',
    'TIAL1',
    'U2AF2',
    'U2AF2-KD'
]

# read (FASTA) file path and return list of parsed object (sequences)
# Input : FASTA file (path), Class file (path) --> per Protein
# Output : list of (sequence, class) tuples
def read_fa(fa_path, clss_path):
    seqs = []
    seq = ''    
    with gzip.open(fa_path, 'r') as fa, gzip.open(clss_path, 'r') as cl:# files are compressed with bgzip
        clss = cl.readlines()
        clss = [int(x.decode('utf-8')) for x in clss[1:]] # class label

        i = 0
        for line in fa:
            line = line.decode('utf-8') # bytes -> string
            if line[0] == '>': # need to skip this header line with after extra jobs
                if len(seqs) == 0 and seq == '': # just skip header of fisrst sequence in fa
                    continue
                else:
                    seqs.append((seq, clss[i])) # add sequence to seqs
                    i += 1
                    seq = '' # reinitialize with empty string
                    continue
            else:
                seq += line.rstrip() # concatenate sequence without '\n' to seq
        
        seqs.append((seq, clss[i]))
    
    return seqs


# Make DataFrame
def mk_frame(data_rt_head, file_list, seq_tail, clss_tail, df_name):
    seqs = {}
    i = 0
    
    for file in file_list:
        seq_path = data_rt_head + '/' + file + seq_tail
        clss_path = data_rt_head + '/' + file + clss_tail
        seqs[clip_list[i]] = read_fa(seq_path, clss_path)
        i += 1
    
    # len(seqs) : number of RBPs
    # len(seqs[clip_list[0]]) : number of seqs per RBP -> 1K(test) or 5K(train) or 10K(test) or 30K(train)
    if len(seqs) != 31 or len(seqs[clip_list[0]]) not in [1000, 5000, 10000, 30000]:
        print('len(seqs) :', len(seqs))
        print('len(seqs[clip_list[0]])', len(seqs[clip_list[0]]))
        raise IndexNotMatching

    df_seqs = pd.DataFrame(
        [(seq, rbp, clss) for (rbp, seq_clss_pair) in seqs.items() for (seq, clss) in seq_clss_pair],
        #[(var, key) for (key, L) in seqs_test.items() for var in L],
        columns=['Sequence', 'RBP', 'Class']
    )

    df_seqs['RBP'] = df_seqs['RBP'].astype('category')
    df_seqs['Class'] = df_seqs['Class'].astype('category')

    df_seqs.to_feather('./../dataset/objs/' + df_name + '.ftr') # save dataframe as object

    return df_seqs


def concat_frame(dfs, df_name):
    if type(dfs) != 'list':
        return dfs
    
    df_total = pd.concat(dfs)

    df_total.to_feather('./../dataset/objs/' + df_name + '.ftr')

    return df_total


# Single Sample set in 5K dataset
data_rt_5k = "./../dataset/clip5000"
files_5k = natsorted(os.listdir(data_rt_5k))
train_seq_tail_5k = "/5000/training_sample_0/sequences.fa.gz"
train_clss_tail_5k = train_seq_tail_5k.replace('sequences.fa', 'matrix_Response.tab')
test_seq_tail_5k = train_seq_tail_5k.replace('training', 'test')
test_clss_tail_5k = train_clss_tail_5k.replace('training', 'test')

# 3 Sample sets in 30K dataset
data_rt_30k = "./../dataset/clip"
files_30k = natsorted(os.listdir(data_rt_30k))
train_seqs_tails_30k = ["/30000/training_sample_" + str(i) + "/sequences.fa.gz" for i in range(3)]
train_clss_tails_30k = [seq_tail.replace('sequences.fa', 'matrix_Response.tab') for seq_tail in train_seqs_tails_30k]
test_seqs_tails_30k = [seq_tail.replace('training', 'test') for seq_tail in train_seqs_tails_30k]
test_clss_tails_30k = [seq_tail.replace('sequences.fa', 'matrix_Response.tab') for seq_tail in test_seqs_tails_30k]

In [36]:
df_train_5k = mk_frame(
    data_rt_head=data_rt_5k,
    file_list=files_5k,
    seq_tail=train_seq_tail_5k,
    clss_tail=train_clss_tail_5k,
    df_name='train_5k'
)

df_test_5k = mk_frame(
    data_rt_head=data_rt_5k,
    file_list=files_5k,
    seq_tail=test_seq_tail_5k,
    clss_tail=test_clss_tail_5k,
    df_name='test_5k'
)


df_train_lst = [
    mk_frame(data_rt_30k, files_30k, train_30k_tail[i][0], train_30k_tail[i][1], 'train_30k_' + str(i)) \
        for train_30k_tail in zip(train_seqs_tails_30k, train_clss_tails_30k) for i in range(len(train_seqs_tails_30k))
]

df_test_lst = [
    mk_frame(data_rt_30k, files_30k, test_30k_tail[i][0], test_30k_tail[i][1], 'test_30k_' + str(i)) \
        for test_30k_tail in zip(test_seqs_tails_30k, test_clss_tails_30k) for i in range(len(test_seqs_tails_30k))
]

df_train_30k = concat_frame(df_train_lst, 'train_30k')

df_test_30k = concat_frame(df_test_lst, 'test_30k')

IsADirectoryError: [Errno 21] Is a directory: './../dataset/clip/1_PARCLIP_AGO1234_hg19/'

In [35]:
[x for x in zip([1, 2, 3], ['a', 'b','c'])]

[(1, 'a'), (2, 'b'), (3, 'c')]