We want to annotate the training data with 'positive' and 'negative' so we can work with them logically.

In [1]:
# This is data.py get_data()
def get_data():
    """Return a list of all training data"""
    data_dir = '/home/jonas/peppred/data/**/*.faa'
    data_files = glob.glob(data_dir, recursive=True)
    data = []
    parsed_items = [SeqIO.parse(data_file, format='fasta') for data_file in data_files]
    for item in parsed_items:
        for subitem in item:
            data.append(subitem)
    return data

It might be better to have data be a hash where with keys for positive and negative

In [2]:
import glob
positive_dir = '/home/jonas/peppred/data/training_data/positive_examples/**/*.faa'
negative_dir = '/home/jonas/peppred/data/training_data/negative_examples/**/*.faa'
negative_files = glob.glob(positive_dir, recursive=True)
positive_files = glob.glob(negative_dir, recursive=True)
data = {
    'positive_examples': [],
    'negative_examples': []
}
from Bio import SeqIO
parsed_items = [SeqIO.parse(data_file, format='fasta') for data_file in negative_files]
for item in parsed_items:
    for subitem in item:
        data['negative_examples'].append(subitem)
print(f"Number of negative examples: {len(data['negative_examples'])}")
parsed_items = [SeqIO.parse(data_file, format='fasta') for data_file in positive_files]
for item in parsed_items:
    for subitem in item:
        data['positive_examples'].append(subitem)
print(f"Number of positive examples: {len(data['positive_examples'])}")

Number of negative examples: 1320
Number of positive examples: 1334


In [3]:
print(data.keys())

dict_keys(['positive_examples', 'negative_examples'])


We may be interested if they are tm or non tm examples as well. So we should make data a dictionary of dictionarys, where the inner dict has lists as values. We can refactor the above code at the same time

In [4]:
data_dir = '/home/jonas/peppred/data/training_data'
import os
files = {
    'positive_examples': {
        'tm': [],
        'non_tm': []
    },
    'negative_examples': {
        'tm': [],
        'non_tm': []
    }
}
data = {
    'positive_examples': {
        'tm': [],
        'non_tm': []
    },
    'negative_examples': {
        'tm': [],
        'non_tm': []
    }
}

for key in files:
    for subkey in files[key]:
        file_path = os.path.join(data_dir, key, subkey, '*.faa')
        files[key][subkey] = glob.glob(file_path, recursive=True)
        for item in [SeqIO.parse(data_file, format='fasta') for data_file in files[key][subkey]]:
            for subitem in item:
                data[key][subkey].append(subitem)
        print(f"{len(data[key][subkey])} examples in {key}/{subkey}")

45 examples in positive_examples/tm
1275 examples in positive_examples/non_tm
247 examples in negative_examples/tm
1087 examples in negative_examples/non_tm


In [5]:
data.keys()

dict_keys(['positive_examples', 'negative_examples'])

get_data() will be updated to return this data from now on

In [7]:
import sys
sys.path.append('/home/jonas/peppred/src/')
from data import get_data
data = get_data()
total = 0
for key in data:
    for subkey in data[key]:
        num_examples = len(data[key][subkey])
        total += num_examples
        print(f"{num_examples} examples for {key}/{subkey}")
print(f"{total} total examples")

45 examples for positive_examples/tm
1275 examples for positive_examples/non_tm
247 examples for negative_examples/tm
1087 examples for negative_examples/non_tm
2654 total examples
