# 1. Load sequences

In [1]:
import pandas as pd
spreadsheet = pd.read_excel('PeptideTower-Viruses.xlsx')
remove_line_breaks = lambda seq : ''.join(seq.split())
given_sequences = list(map(remove_line_breaks, list(spreadsheet['Sequence'])))
len(given_sequences)

185

In [2]:
# any non-standard characters?
standard_amino_acids = set(list('AGILPVFWYDERHKSTCMNQ'))
standard_amino_acids.symmetric_difference(set(''.join(given_sequences)))

{'X', '`'}

In [3]:
# how many sequences contain non-standard characters?
len([seq for seq in given_sequences if ('`' in seq or 'X' in seq)])

3

In [4]:
# let's keep the 182 that don't have non-standard characters...
sequences = [seq for seq in given_sequences if ('`' not in seq and 'X' not in seq)]
len(sequences)

182

# 2. Extract list of unique k-mers

In [5]:
from collections import defaultdict
def extract_unique_k_mers(sequences, k=10):
    """Return a dictionary of the unique k-mers and how many times each k-mer occurred"""
    s = defaultdict(lambda: 0)
    for sequence in sequences:
        for i in range(len(sequence) - k):
            subsequence = sequence[i:i+k]
            s[subsequence] += 1
    return s

In [6]:
%%time
unique_10_mers = extract_unique_k_mers(sequences, k=10)

CPU times: user 41.3 ms, sys: 4.61 ms, total: 45.9 ms
Wall time: 44.9 ms


In [7]:
len(unique_10_mers), set(unique_10_mers.values())

(72678, {1, 2, 3, 4, 5, 6, 7})

# 3. Write to csv

In [8]:
tuples = [(kmer, unique_10_mers[kmer]) for kmer in unique_10_mers.keys()]
tuples = sorted(tuples, key=lambda t:t[1], reverse=True)
lines = ['{},{}'.format(kmer, occurrences) for (kmer, occurrences) in tuples]
with open('unique_10_mers_present', 'w') as f:
    f.writelines(['kmer,occurrences\n'])
    f.writelines(['{}\n'.format(line) for line in lines])