In [1]:
import os
import re
import pandas as pd

In [2]:
import general_param as gparams

# Importing the Data

In [3]:
def get_sequences(raw_content: str, is_binding: bool=False) -> list:
    '''
    Extract each sequence, along with its info, from the raw text of a file.

    Records are separated by ``>>``. Within a record, the first line is an
    underscore-separated info header and the second line is the sequence;
    any further lines of the record are ignored.

    Parameters
    ----------
    raw_content : str
        Complete text of a sequence file.
    is_binding : bool
        When False, the header fields are padded with empty strings (one
        inserted at position 1 and three appended at the end). Presumably
        this aligns such rows with the layout of binding rows - confirm
        against the column header used by the caller.

    Returns
    -------
    list
        One list per record: the header fields followed by the sequence
        string. Empty if ``raw_content`` contains no records.

    Raises
    ------
    IndexError
        If a non-blank record consists of a header line only.
    '''
    sequences = []
    # str.strip takes a set of characters, so a single '>' removes the
    # leading (and any trailing) record markers.
    for record in raw_content.strip('>').split('>>'):
        # Empty input, or whitespace in front of the first '>>', produces a
        # blank chunk with neither header nor sequence; skip it instead of
        # crashing or emitting a malformed row.
        if not record.strip():
            continue
        seq_data = record.split('\n')
        _info = seq_data[0].split('_')
        # Some headers contain extra underscores in the middle. Keep the
        # first three and the last three fields and merge everything in
        # between into one '-'-joined field, giving seven fields in total.
        if len(_info) >= 8:
            _info = _info[:3] + ['-'.join(_info[3:-3])] + _info[-3:]
        if not is_binding:
            _info.insert(1, '')
            _info += ['', '', '']
        sequences.append(_info + [seq_data[1]])
    return sequences

In [4]:
# Parse the binding and the control sequence files and collect every record
# in a single list; the boolean marks whether the file holds binding sequences.
sequence_sources = (
    (gparams.binding_seqs, True),
    (gparams.control_seqs, False),
)
sequences = []
for file_name, is_binding in sequence_sources:
    with open(file_name, 'r') as fobj:
        raw_content = fobj.read()
    sequences += get_sequences(raw_content, is_binding)

In [5]:
# Turn the parsed records into one table; the column names are taken from
# the data_header setting in general_param.
sequence_data = pd.DataFrame(data=sequences, columns=gparams.data_header)

# Data Cleaning

We want to make sure the sequences do not contain any special characters, so we convert them to lower case and replace every character other than `a`, `c`, `g` or `t` with `x`.

Also, the sequences should be split up into lists of single characters, which we can then encode as numerical values.

In [6]:
# Lower-case each sequence, mask every character outside a/c/g/t with 'x',
# and break the result up into a list of single characters.
sequence_data["sequence"] = sequence_data["sequence"].apply(
    lambda seq: list(re.sub('[^acgt]', 'x', seq.lower()))
)

In [7]:
# Write the cleaned table as a pickle to the path configured in general_param.
sequence_data.to_pickle(path=gparams.cleanded_data)