In [123]:
import os
import re
import pandas as pd

In [147]:
import general_param as gparams

# Importing the Data

In [117]:
def get_sequences(raw_content: str, is_binding: bool=False) -> list:
    '''
    Extract each sequence, along with its info, from the raw text of a sequence file.

    Records are delimited by ``>>``. The first line of a record is a header of
    ``_``-separated info fields and the second line is the sequence itself;
    any further lines of a record are ignored. Blank chunks (empty input,
    whitespace before the first ``>>``) are skipped.

    Parameters
    ----------
    raw_content : str
        Full text of a sequence file.
    is_binding : bool, default False
        If False, the header is padded with empty strings: one is inserted at
        position 1 and three are appended. Presumably this lines control
        records up with the fields of binding records -- confirm against the
        column header used downstream.

    Returns
    -------
    list
        One list per record: the (possibly padded) header fields followed by
        the sequence string.

    Raises
    ------
    IndexError
        If a non-blank record consists of a header line only (no sequence line).
    '''
    sequences = []
    # str.strip treats its argument as a set of characters, so this removes
    # every leading and trailing '>' before splitting on the record delimiter.
    for record in raw_content.strip('>').split('>>'):
        # A blank chunk carries neither header nor sequence; parsing it would
        # either fail or yield a malformed row, so it is skipped.
        if not record.strip():
            continue
        seq_data = record.split('\n')
        _info = seq_data[0].split('_')
        # some sequences have a weird ending.
        # this is an ugly hack to include them: the surplus middle fields are
        # merged into one '-'-joined field so that the header has 7 fields.
        if len(_info) >= 8:
            _info = _info[:3] + ['-'.join(_info[3:-3])] + _info[-3:]
        if not is_binding:
            _info.insert(1, '')
            _info += ['', '', '']
        sequences.append(_info + [seq_data[1]])
    return sequences

In [118]:
# Parse the binding file first, then the control file, and gather every
# record in a single list; the flag tells the parser which kind it is reading.
sequences = []
for file_name, is_binding in zip(
    (gparams.binding_seqs, gparams.control_seqs),
    (True, False),
):
    with open(file_name, 'r') as fobj:
        raw_content = fobj.read()
    sequences += get_sequences(raw_content, is_binding)

In [141]:
# Build one table from all parsed records.
# NOTE(review): `data_header` is not defined in the cells visible here --
# confirm it is defined before this cell when running on a fresh kernel.
sequence_data = pd.DataFrame(data=sequences, columns=data_header)

# Data Cleaning

We want to make sure the sequences do not contain any special characters, so we lower-case them and replace every character other than `a`, `c`, `g` or `t` with `x`.

Also, the sequences should be split up into lists of single characters, which we can then encode as numerical values.

In [142]:
# Lower-case every raw sequence, mask each character outside a/c/g/t with 'x',
# then break the string into a list of single characters.
sequence_data["sequence_cleaned"] = (
    sequence_data["sequence"]
    .str.lower()
    .str.replace('[^acgt]', 'x', regex=True)
    .map(list)
)

In [143]:
# Preview the cleaned column; pandas abbreviates long Series in the display.
sequence_data["sequence_cleaned"]

0       [c, t, g, a, a, g, c, c, t, t, t, c, c, a, a, ...
1       [g, a, g, c, c, c, c, a, c, c, t, g, g, t, g, ...
2       [t, c, c, a, g, c, t, t, t, c, g, g, c, a, c, ...
3       [t, g, c, a, t, t, c, g, c, a, g, a, g, c, a, ...
4       [a, g, g, c, g, g, g, t, t, c, g, c, g, c, g, ...
                              ...                        
5995    [c, g, a, a, t, g, c, t, c, c, g, c, c, g, a, ...
5996    [c, t, c, c, a, t, g, g, g, g, g, a, g, g, a, ...
5997    [c, g, g, c, a, g, g, g, c, c, g, t, t, c, g, ...
5998    [a, c, g, a, c, g, g, t, a, g, g, c, t, c, c, ...
5999    [t, t, g, t, g, a, g, c, g, t, a, c, c, g, c, ...
Name: sequence_cleaned, Length: 6000, dtype: object

In [145]:
# Persist the table, without the index column, as 'cleaned.csv' in the data directory.
# NOTE(review): `data_path` is not defined in the cells visible here -- confirm
# it is set before this cell on a fresh kernel.
# NOTE(review): `sequence_cleaned` holds Python lists, which a CSV file can only
# store as text -- verify that downstream readers parse them back as intended.
cleaned_csv_path = os.path.join(data_path, 'cleaned.csv')
sequence_data.to_csv(cleaned_csv_path, index=False)