We want to combine the get_data() function with the annotate_regions() function. The annotations exist alongside the amino acid sequence in the raw data. We should aim to keep this information separate so that the classifier does not examine the annotations.

Below are the two methods as they currently exist:

In [11]:
import glob
import os
from Bio import SeqIO
import os

def get_data(data_dir='/home/jonas/peppred/data/training_data'):
    """Load every training record, grouped by example class and tm/non-tm.

    Looks for ``*.faa`` files in ``<data_dir>/<key>/<subkey>/`` for each
    combination of ``key`` in (``'positive_examples'``,
    ``'negative_examples'``) and ``subkey`` in (``'tm'``, ``'non_tm'``) and
    parses each file with ``SeqIO.parse(..., format='fasta')``.

    Args:
        data_dir: Root directory of the training data. Defaults to the
            location originally hard-coded in this function.

    Returns:
        A freshly built dict of the form
        ``{'positive_examples': {'tm': [...], 'non_tm': [...]},
        'negative_examples': {'tm': [...], 'non_tm': [...]}}`` where each
        list holds the records yielded by ``SeqIO.parse``. A directory with
        no matching files (or that does not exist) yields an empty list.
    """
    data = {}
    for key in ('positive_examples', 'negative_examples'):
        data[key] = {}
        for subkey in ('tm', 'non_tm'):
            pattern = os.path.join(data_dir, key, subkey, '*.faa')
            records = []
            # glob gives no ordering guarantee; sort so the record order is
            # reproducible between runs and machines. The pattern contains
            # no '**', so a recursive glob would make no difference here.
            for data_file in sorted(glob.glob(pattern)):
                records.extend(SeqIO.parse(data_file, format='fasta'))
            data[key][subkey] = records
    return data

def annotate_regions(datum):
    """Split a ``sequence#annotation`` datum and collect its n/h/c residues.

    The datum is converted with ``str()`` first, so both a plain string and
    any object whose string form is ``'<sequence>#<annotation>'`` are
    accepted, and every value in the result is a plain ``str`` or a list of
    one-character strings.

    Args:
        datum: The residues and their per-position annotation, joined by a
            single ``'#'``.

    Returns:
        A dict with keys ``'sequence'`` (text before the ``'#'``),
        ``'annotation'`` (text after it), and ``'h'``, ``'c'``, ``'n'``:
        lists of the residues whose annotation character is ``'h'``,
        ``'c'`` or ``'n'`` respectively, in sequence order. Residues with
        any other annotation character appear in none of the lists.

    Raises:
        ValueError: If the datum does not contain exactly one ``'#'``.
        IndexError: If the annotation is shorter than the sequence.
    """
    parts = str(datum).split('#')
    if len(parts) != 2:
        raise ValueError(
            "expected exactly one '#' separating sequence and annotation, "
            "found %d" % (len(parts) - 1)
        )
    sequence, annotation = parts
    if len(annotation) < len(sequence):
        raise IndexError(
            'annotation (%d chars) is shorter than sequence (%d chars)'
            % (len(annotation), len(sequence))
        )

    regions = {'n': [], 'h': [], 'c': []}
    # zip stops at the end of the sequence, so any surplus annotation
    # characters are ignored.
    for residue, label in zip(sequence, annotation):
        bucket = regions.get(label)
        if bucket is not None:
            bucket.append(residue)

    return {
        'sequence': sequence,
        'annotation': annotation,
        'h': regions['h'],
        'c': regions['c'],
        'n': regions['n'],
    }

In [5]:
# Load all training records, then display the first positive TM record.
data = get_data()
data['positive_examples']['tm'][0]

SeqRecord(seq=Seq('MARALCRLPRRGLWLLLAHHLFMTTACQEANYGALLRELCLTQFQVDMEAVGET...iii', SingleLetterAlphabet()), id='RMP1_HUMAN', name='RMP1_HUMAN', description='RMP1_HUMAN O60894 148 AA.', dbxrefs=[])

In [12]:
# Split the first positive TM record into sequence, annotation and n/h/c regions.
annotate_regions(data['positive_examples']['tm'][0].seq)

{'annotation': 'nnnnnnnnnnnhhhhhhcccccccccCooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooMMMMMMMMMMMMMMMMMMMMMMiiiiiiiii',
 'c': ['A', 'H', 'H', 'L', 'F', 'M', 'T', 'T', 'A'],
 'h': ['G', 'L', 'W', 'L', 'L', 'L'],
 'n': ['M', 'A', 'R', 'A', 'L', 'C', 'R', 'L', 'P', 'R', 'R'],
 'sequence': 'MARALCRLPRRGLWLLLAHHLFMTTACQEANYGALLRELCLTQFQVDMEAVGETLWCDWGRTIRSYRELADCTWHMAEKLGCFWPNAEVDRFFLAVHGRYFRSCPISGRAVRDPPGSILYPFIVVPITVTLLVTALVVWQSKRTEGIV'}

In [13]:
# Same split for the first positive non-TM record.
annotate_regions(data['positive_examples']['non_tm'][0].seq)

{'annotation': 'nnnnnnnnnnnhhhhhhhhhhhhhhhhccccccCOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO',
 'c': ['T', 'T', 'S', 'A', 'W', 'G'],
 'h': ['I',
  'L',
  'A',
  'G',
  'A',
  'A',
  'L',
  'A',
  'G',
  'A',
  'L',
  'A',
  'P',
  'V',
  'L',
  'A'],
 'n': ['M', 'A', 'E', 'Q', 'M', 'Q', 'I', 'S', 'R', 'R', 'T'],
 'sequence': 'MAEQMQISRRTILAGAALAGALAPVLATTSAWGQGAVRKATAAEIAALPRQKVELVDPPFVHAHSQVAEGGPKVVEFTMVIEEKKIVIDDAGTEVHAMAFNGTVPGPLMVVHQDDYLELTLINPETNTLMHNIDFHAATGALGGGGLTEINPGEKTILRFKATKPGVFVYHCAPPGMVPWHVVSGMNGAIMVLPREGLHDGKGKALTYDKIYYVGEQDFYVPRDENGKYKKYEAPGDAYEDTVKVMRTLTPTHVVFNGAVGALTGDKAMTAAVGEKVLIVHSQANRDTRPHLIGGHGDYVWATGKFNTPPDVDQETWFIPGGAAGAAFYTFQQPGIYAYVNHNLIEAFELGAAAHFKVTGEWNDDLMTSVLA

In [14]:
# Same split for the first negative non-TM record.
annotate_regions(data['negative_examples']['non_tm'][0].seq)

{'annotation': 'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO',
 'c': [],
 'h': [],
 'n': [],
 'sequence': 'SQKFDVVVIGAGPGGYVAAIRAAQLGLKTACIEKYIGKEGKVALGGTCLNVGCIPSKALLDSSYKYHEAKEAFKVHGIEAKGVTIDVPAMVARKANIVKNLTGGIATLFKANGVTSFEGHGKLLANKQVEVTGLDGKTQVLEAENVIIASGSRPVEIPPAPLSDDIIVDSTGALEFQAVPKKLGVIGAGVIGLELGSVWARLGAEVTVLEALDKFLPAADEQIAKEALKVLTKQGLNIRLGARVTASEVKKKQVTVTFTDANGEQKETFDKLIVAVGRRPVTTDLLAADSGVTLDERGFIYVDDHCKTSVPGVFAIGDVVRGAMLAHKASEEGVMVAERIAGHKAQMNYDLIPSVIYTHPEIAWVGKTEQTLKAEGVEVNVGTFPFAASGRAMAANDTTGLVKVIADAKTDRVLGVHVIGPSAAELVQQGAIGMEFGTSAEDLGMMVFSHPTLSEALHEA

In [15]:
# Same split for the first negative TM record.
annotate_regions(data['negative_examples']['tm'][0].seq)

{'annotation': 'ooooooooooooooooooooooMMMMMMMMMMMMMMMMMMMMMMiiiiiiiiiiiMMMMMMMMMMMMMMMMMMMMMMMMMMooooooooooooooMMMMMMMMMMMMMMMMMMiiiiiiMMMMMMMMMMMMMMMMMMMMooooooooooMMMMMMMMMMMMMMMMMMMMMiiiiiiiiiiiiiiiMMMMMMMMMMMMMMMMMMMMoooooooMMMMMMMMMMMMMMMMMMMMMMMMiiiiiiiiiiiiiiiiiiiiiiii',
 'c': [],
 'h': [],
 'n': [],
 'sequence': 'MDPIALTAAVGADLLGDGRPETLWLGIGTLLMLIGTFYFIVKGWGVTDKEAREYYSITILVPGIASAAYLSMFFGIGLTEVQVGSEMLDIYYARYADWLFTTPLLLLDLALLAKVDRVSIGTLVGVDALMIVTGLVGALSHTPLARYTWWLFSTICMIVVLYFLATSLRAAAKERGPEVASTFNTLTALVLVLWTAYPILWIIGTEGAGVVGLGIETLLFMVLDVTAKVGFGFILLRSRAILGDTEAPEPSAGAEASAAD'}

The output above shows that the negative examples contain no 'c', 'h', or 'n' annotations, so their region lists come back empty. We should therefore rely on the sequences themselves, and bring in the annotations and region information only if a reason presents itself.

In [19]:
# Reload the raw records, then replace each one in place with only its
# amino-acid sequence (the part annotate_regions returns under 'sequence').
data = get_data()
for examples in data.values():
    for subkey, raw_records in examples.items():
        examples[subkey] = [
            annotate_regions(raw_datum.seq)['sequence']
            for raw_datum in raw_records
        ]

In [20]:
data['negative_examples']['tm'][0]

'MDPIALTAAVGADLLGDGRPETLWLGIGTLLMLIGTFYFIVKGWGVTDKEAREYYSITILVPGIASAAYLSMFFGIGLTEVQVGSEMLDIYYARYADWLFTTPLLLLDLALLAKVDRVSIGTLVGVDALMIVTGLVGALSHTPLARYTWWLFSTICMIVVLYFLATSLRAAAKERGPEVASTFNTLTALVLVLWTAYPILWIIGTEGAGVVGLGIETLLFMVLDVTAKVGFGFILLRSRAILGDTEAPEPSAGAEASAAD'

The process above suggests one more step for the preprocessing pipeline: use `annotate_regions` to strip out the annotation information and keep only the amino acid sequence. We'll create a new function, `transform_data(data)`, which will hold the meat of the preprocessing pipeline, so that fetching our data looks something like: `data = transform_data(get_data())`

## Using the updated methods

In [23]:
import sys
# Extend the import search path before importing from `data`
# (presumably the module lives in this src/ directory — confirm).
sys.path.append('/home/jonas/peppred/src/')
from data import get_data, transform_data

# Fetch the raw records and pass them straight through transform_data.
data = transform_data(get_data())
# Print the first negative TM entry of the transformed data.
print(data['negative_examples']['tm'][0])

MDPIALTAAVGADLLGDGRPETLWLGIGTLLMLIGTFYFIVKGWGVTDKEAREYYSITILVPGIASAAYLSMFFGIGLTEVQVGSEMLDIYYARYADWLFTTPLLLLDLALLAKVDRVSIGTLVGVDALMIVTGLVGALSHTPLARYTWWLFSTICMIVVLYFLATSLRAAAKERGPEVASTFNTLTALVLVLWTAYPILWIIGTEGAGVVGLGIETLLFMVLDVTAKVGFGFILLRSRAILGDTEAPEPSAGAEASAAD
