# 1. Compute the Score of a Cyclic Peptide Against a Spectrum

In [1]:
from collections import Counter


amino_acid_weight = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114,
                     'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}


def get_peptide_mass(peptide):
    peptide_sum = 0
    for amino_acid in peptide:
        peptide_sum += amino_acid_weight[amino_acid]
    return peptide_sum


with open('rosalind_ba4f.txt') as file:
    peptide = file.readline().rstrip()
    experimental_spectrum = file.readline().rstrip().split(' ')
    experimental_spectrum = [int(mass) for mass in experimental_spectrum]


def get_score(peptide, experimental_spectrum):
    peptide_len = len(peptide)
    theoreticalSpectrum = [0]
    for currentLen in range(1, peptide_len + 1):
        for i in range(peptide_len):
            endIndex = i + currentLen
            remaining = 0
            if endIndex > peptide_len:
                remaining = endIndex - peptide_len
            currentPeptide = peptide[i:endIndex] + peptide[0:remaining]
            theoreticalSpectrum.append(get_peptide_mass(currentPeptide))
            if currentLen == peptide_len:
                break

    theoreticalSpectrum = sorted(theoreticalSpectrum)
    experimental_spectrum_counter = Counter(experimental_spectrum)
    theoretical_spectrum_counter = Counter(theoreticalSpectrum)
    score = 0
    for item in experimental_spectrum_counter:
        current_count = theoretical_spectrum_counter.get(item)
        if current_count is not None:
            score += min(current_count, experimental_spectrum_counter.get(item))
    return score


score = get_score(peptide, experimental_spectrum)
print(score)
with open('output.txt', 'w') as file:
    file.write(str(score))

520


# 2. Generate the Convolution of a Spectrum

In [2]:
with open('rosalind_ba4h.txt') as file:
    spectrum = sorted([int(mass) for mass in file.readline().rstrip().split(' ')])


def get_convolution(spectrum):
    convolution = []
    for i in range(len(spectrum)):
        for j in range(0, i):
            diff = spectrum[i] - spectrum[j]
            if diff != 0:
                convolution.append(diff)
    return convolution


output = ' '.join([str(mass) for mass in sorted(get_convolution(spectrum))])
print(output)
with open('output.txt', 'w') as file:
    file.write(output)

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 11 11 11

# 3. Implement LeaderboardCyclopeptideSequencing
## Will take really long time to run

In [3]:
from collections import Counter

amino_acid_weight = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114,
                     'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}
acids = list(amino_acid_weight.keys())

with open('rosalind_ba4g.txt') as file:
    limit = int(file.readline().rstrip())
    spectrum = sorted([int(mass) for mass in file.readline().rstrip().split(' ')])


def expand_with_score(leaderboard, spectrum):
    current_len = len(leaderboard)
    for i in range(current_len):
        peptide = leaderboard[i][0]
        for acid in acids:
            new_peptide = peptide + acid
            new_score = get_score(new_peptide, spectrum)
            leaderboard.append((new_peptide, new_score))
    del leaderboard[i:current_len]
    return leaderboard


def get_mass(peptide):
    mass = 0
    for acid in peptide:
        mass += amino_acid_weight[acid]
    return mass


def get_score(peptide, experimental_spectrum):
    peptide_len = len(peptide)
    theoreticalSpectrum = [0]
    for currentLen in range(1, peptide_len + 1):
        for i in range(peptide_len):
            endIndex = i + currentLen
            remaining = 0
            if endIndex > peptide_len:
                remaining = endIndex - peptide_len
            currentPeptide = peptide[i:endIndex] + peptide[0:remaining]
            theoreticalSpectrum.append(get_mass(currentPeptide))
            if currentLen == peptide_len:
                break

    theoreticalSpectrum = sorted(theoreticalSpectrum)
    experimental_spectrum_counter = Counter(experimental_spectrum)
    theoretical_spectrum_counter = Counter(theoreticalSpectrum)
    score = 0
    for item in experimental_spectrum_counter:
        current_count = theoretical_spectrum_counter.get(item)
        if current_count is not None:
            score += min(current_count, experimental_spectrum_counter.get(item))
    return score


def cut_leaderboard(leaderboard, initial_limit):
    leaderboard.sort(key=lambda x: x[1], reverse=True)
    while initial_limit < len(leaderboard) and leaderboard[initial_limit][1] == leaderboard[initial_limit - 1][1]:
        initial_limit += 1
    leaderboard = leaderboard[:initial_limit]
    return leaderboard


def leaderboard_cyclopeptide_sequencing(spectrum, limit):
    leader_peptide = ''
    leader_score = 0
    leaderboard = [(leader_peptide, leader_score)]
    current_try = 0
    while len(leaderboard) != 0:
        leaderboard = expand_with_score(leaderboard, spectrum)
        remove_list = []
        for i in range(len(leaderboard)):
            peptide = leaderboard[i][0]
            score = leaderboard[i][1]
            current_mass = get_mass(peptide)
            if current_mass == spectrum[-1]:
                if score > leader_score:
                    leader_peptide = peptide
                    leader_score = score
            elif current_mass > spectrum[-1]:
                remove_list.append(i)

        for i in remove_list[::-1]:
            del leaderboard[i]

        leaderboard = cut_leaderboard(leaderboard, limit)
        current_try += 1
        print(current_try, leader_peptide, leader_score, len(leaderboard))
        if current_try == 25:
            break
    return leader_peptide


leader_peptide = leaderboard_cyclopeptide_sequencing(spectrum, limit)
print(leader_peptide)
masses = list(map(lambda x: amino_acid_weight[x], leader_peptide))
output = '-'.join(list(map(lambda x: str(x), masses)))
print(output)
with open('output.txt', 'w') as file:
    file.write(output)

1  0 20
2  0 419
3  0 1225
4  0 980
5  0 516
6  0 427
7  0 661
8  0 426
9  0 460
10  0 397
11  0 469
12  0 512
13 SYENHWPRSHKIS 157 396
14 SYENHWPRSHKIS 157 395
15 SYENHWPRSHKIS 157 394
16 SYENHWPRSHKIS 157 393
17 SYENHWPRSHKIS 157 392
18 SYENHWPRSHKIS 157 391
19 SYENHWPRSHKIS 157 390
20 SYENHWPRSHKIS 157 389
21 SYENHWPRSHKIS 157 388
22 SYENHWPRSHKIS 157 387
23 SYENHWPRSHKIS 157 386
24 SYENHWPRSHKIS 157 385
25 SYENHWPRSHKIS 157 384
SYENHWPRSHKIS
87-163-129-114-137-186-97-156-87-137-128-113-87


# 4. Implement ConvolutionCyclopeptideSequencing
## Will take really long time to run

In [5]:
from collections import Counter

amino_acid_weight = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114,
                     'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}
acids = list(amino_acid_weight.keys())

with open('rosalind_ba4i.txt') as file:
    conv_limit = int(file.readline().rstrip())
    leaderboard_limit = int(file.readline().rstrip())
    spectrum = sorted([int(mass) for mass in file.readline().rstrip().split(' ')])


def get_convolution(spectrum):
    convolution = []
    for i in range(len(spectrum)):
        for j in range(0, i):
            diff = spectrum[i] - spectrum[j]
            if diff != 0:
                convolution.append(diff)
    return convolution


def expand_with_score(leaderboard, spectrum):
    current_len = len(leaderboard)
    for i in range(current_len):
        peptide = leaderboard[i][0]
        for acid in acids:
            new_peptide = peptide + acid
            new_score = get_score(new_peptide, spectrum)
            leaderboard.append((new_peptide, new_score))
    del leaderboard[i: current_len]
    return leaderboard


def get_mass(peptide):
    mass = 0
    for acid in peptide:
        mass += amino_acid_weight[acid]
    return mass


def get_score(peptide, experimental_spectrum):
    peptide_len = len(peptide)
    theoreticalSpectrum = [0]
    for currentLen in range(1, peptide_len + 1):
        for i in range(peptide_len):
            endIndex = i + currentLen
            remaining = 0
            if endIndex > peptide_len:
                remaining = endIndex - peptide_len
            currentPeptide = peptide[i:endIndex] + peptide[0:remaining]
            theoreticalSpectrum.append(get_mass(currentPeptide))
            if currentLen == peptide_len:
                break

    theoreticalSpectrum = sorted(theoreticalSpectrum)
    experimental_spectrum_counter = Counter(experimental_spectrum)
    theoretical_spectrum_counter = Counter(theoreticalSpectrum)
    score = 0
    for item in experimental_spectrum_counter:
        current_count = theoretical_spectrum_counter.get(item)
        if current_count is not None:
            score += min(current_count, experimental_spectrum_counter.get(item))
    return score


def cut_leaderboard(leaderboard, limit):
    leaderboard.sort(key=lambda x: x[1], reverse=True)
    while limit < len(leaderboard) and leaderboard[limit][1] == leaderboard[limit - 1][1]:
        limit += 1
    leaderboard = leaderboard[:limit]
    return leaderboard


def cut_convolution(convolution, limit):
    conv_counter = Counter(convolution)
    conv_counter = conv_counter.most_common()
    # print(conv_counter)
    while limit < len(conv_counter) and conv_counter[limit][1] == conv_counter[limit - 1][1]:
        limit += 1
    conv_counter = conv_counter[:limit]
    convolution = []
    for mass, count in conv_counter:
        convolution.extend([mass] * count)
    return convolution


def cut_amino_acid(convolution):
    unique_mass = set(convolution)
    delete_key_list = []
    for acid in amino_acid_weight:
        if amino_acid_weight[acid] not in unique_mass:
            delete_key_list.append(acid)
    for key in delete_key_list:
        del amino_acid_weight[key]
        acids.remove(key)
    return amino_acid_weight


def leaderboard_cyclopeptide_sequencing(spectrum, limit):
    leader_peptide = ''
    leader_score = 0
    leaderboard = [(leader_peptide, leader_score)]
    current_try = 0
    while len(leaderboard) != 0:
        leaderboard = expand_with_score(leaderboard, spectrum)
        remove_list = []
        for i in range(len(leaderboard)):
            peptide = leaderboard[i][0]
            score = leaderboard[i][1]
            current_mass = get_mass(peptide)
            if current_mass == spectrum[-1]:
                if score > leader_score:
                    leader_peptide = peptide
                    leader_score = score
            elif current_mass > spectrum[-1]:
                remove_list.append(i)

        for i in remove_list[::-1]:
            del leaderboard[i]

        leaderboard = cut_leaderboard(leaderboard, limit)
        current_try += 1
        print(current_try, leader_peptide, leader_score, len(leaderboard))
        if current_try == 50:
            break
    return leader_peptide


convolution = get_convolution(spectrum)
convolution = sorted([mass for mass in convolution if 57 <= mass <= 200])
convolution = cut_convolution(convolution, conv_limit)
amino_acid_weight = cut_amino_acid(convolution)
leader_peptide = leaderboard_cyclopeptide_sequencing(spectrum, leaderboard_limit)
print(leader_peptide)
masses = list(map(lambda x: amino_acid_weight[x], leader_peptide))
output = '-'.join(list(map(lambda x: str(x), masses)))
print(output)
with open('output.txt', 'w') as file:
    file.write(output)

1  0 9
2  0 89
3  0 424
4  0 380
5  0 668
6  0 542
7  0 367
8  0 334
9  0 438
10  0 348
11 RTWFPNWWCCW 92 328
12 FPNTRDCCWRTW 134 369
13 FPNTRDCCWRTW 134 340
14 FPNTRDCCWRTW 134 324
15 FPNTRDCCWRTW 134 323
16 FPNTRDCCWRTW 134 322
17 FPNTRDCCWRTW 134 321
18 FPNTRDCCWRTW 134 320
19 FPNTRDCCWRTW 134 319
20 FPNTRDCCWRTW 134 318
21 FPNTRDCCWRTW 134 317
22 FPNTRDCCWRTW 134 316
23 FPNTRDCCWRTW 134 315
24 FPNTRDCCWRTW 134 314
25 FPNTRDCCWRTW 134 313
26 FPNTRDCCWRTW 134 312
27 FPNTRDCCWRTW 134 311
28 FPNTRDCCWRTW 134 310
29 FPNTRDCCWRTW 134 309
30 FPNTRDCCWRTW 134 308
31 FPNTRDCCWRTW 134 307
32 FPNTRDCCWRTW 134 306
33 FPNTRDCCWRTW 134 305
34 FPNTRDCCWRTW 134 304
35 FPNTRDCCWRTW 134 303
36 FPNTRDCCWRTW 134 302
37 FPNTRDCCWRTW 134 301
38 FPNTRDCCWRTW 134 300
39 FPNTRDCCWRTW 134 299
40 FPNTRDCCWRTW 134 298
41 FPNTRDCCWRTW 134 297
42 FPNTRDCCWRTW 134 296
43 FPNTRDCCWRTW 134 295
44 FPNTRDCCWRTW 134 294
45 FPNTRDCCWRTW 134 293
46 FPNTRDCCWRTW 134 292
47 FPNTRDCCWRTW 134 291
48 FPNTRDCCWRTW 134 290
49

# 5. Construct the Suffix Array of a String

In [6]:
with open('rosalind_ba9g.txt') as file:
    text = file.readline().rstrip()


def get_suffix_array(text):
    text_len = len(text)
    suffix_list = [(text[i: text_len], i) for i in range(text_len)]
    suffix_array = [i for _, i in sorted(suffix_list)]
    return suffix_array


suffix_array = get_suffix_array(text)
output = ', '.join(map(lambda x: str(x), suffix_array))
print(output)
with open('output.txt', 'w') as file:
    file.write(output)

10000, 79, 2993, 80, 6645, 2994, 9004, 9527, 8768, 8420, 81, 2013, 4782, 2962, 6646, 4672, 9207, 5552, 2995, 9418, 9005, 9528, 7743, 4239, 3702, 2825, 8769, 4723, 7570, 2788, 5547, 2131, 1322, 8421, 82, 2014, 4783, 3015, 7460, 918, 763, 2963, 4701, 6647, 8786, 4126, 4116, 4673, 9208, 4317, 9178, 9401, 3614, 732, 8590, 5553, 122, 4357, 2996, 5823, 7947, 9419, 9006, 415, 3905, 7513, 1635, 6330, 9529, 1207, 6580, 7744, 4890, 4240, 3747, 690, 4963, 3703, 6304, 480, 2826, 8770, 1726, 2937, 3767, 2715, 9547, 581, 355, 4724, 4943, 7571, 2902, 1708, 2789, 7548, 1623, 2239, 5548, 2132, 3021, 2589, 1323, 7300, 1788, 8422, 9643, 5708, 7749, 83, 3595, 2015, 6598, 7825, 3406, 4784, 4413, 6163, 5409, 1389, 2254, 4487, 4839, 1610, 7432, 3016, 6509, 6565, 8430, 9457, 251, 7461, 5631, 5982, 2218, 919, 2471, 7799, 8643, 764, 3659, 6423, 1216, 1903, 7246, 2964, 3236, 7063, 1225, 4273, 2816, 4702, 7757, 6684, 1753, 6141, 4151, 7423, 8206, 3222, 6648, 8787, 35, 3896, 543, 776, 9650, 7233, 4127, 8601, 54, 1

# 6. Reconstruct a String from its k-mer Composition

In [7]:
with open('rosalind_ba9a.txt') as file:
    pattern_list = [pattern.rstrip() for pattern in file.readlines()]


class Node:
    def __init__(self, val=None, pos=0):
        self.pos = pos
        self.val = val
        self.children_list = set()

    def add_children(self, node):
        self.children_list.add(node)

    def get_children(self, val):
        children = None
        for node in self.children_list:
            if node.val == val:
                children = node
        return children

    def __str__(self):
        if self.val == Node:
            display_text = 'None'
        else:
            display_text = str(self.val)
        children_text = ''
        for child in self.children_list:
            children_text += str(child.val) + ' '
        display_text = f'val: {display_text}, children: {children_text}'
        return display_text


root = Node()
adj_list = []
current_len = 0
for pattern in pattern_list:
    current_node = root
    for symbol in pattern:
        child_node = current_node.get_children(symbol)
        if child_node is None:
            current_len += 1
            child_node = Node(symbol, current_len)
            current_node.add_children(child_node)
            adj_list.append((current_node.pos, current_len, symbol))
        current_node = child_node

adj_list.sort(key=lambda x: (x[0], x[1]))
output = ''
for item in adj_list:
    output += f'{item[0]}->{item[1]}:{item[2]}\n'
output = output.rstrip()
print(output)
with open('output.txt', 'w') as file:
    file.write(output)

0->1:T
0->93:C
0->629:G
0->1430:A
1->2:A
1->174:G
1->258:C
1->3447:T
2->3:T
2->714:C
2->1521:A
2->6968:G
3->4:C
3->6172:A
4->5:A
5->6:T
6->7:G
7->8:G
8->9:A
9->10:G
10->11:T
11->12:G
12->13:C
13->14:A
14->15:A
15->16:C
16->17:T
17->18:G
18->19:G
19->20:T
20->21:C
21->22:C
22->23:C
23->24:A
24->25:G
25->26:G
26->27:G
27->28:A
28->29:G
29->30:C
30->31:T
31->32:T
32->33:A
33->34:G
34->35:T
35->36:T
36->37:G
37->38:C
38->39:G
39->40:C
40->41:A
41->42:A
42->43:A
43->44:G
44->45:A
45->46:T
46->47:A
47->48:A
48->49:T
49->50:C
50->51:C
51->52:A
52->53:G
53->54:A
54->55:A
55->56:C
56->57:A
57->58:A
58->59:A
59->60:T
60->61:G
61->62:G
62->63:C
63->64:G
64->65:A
65->66:C
66->67:T
67->68:G
68->69:A
69->70:G
70->71:C
71->72:C
72->73:G
73->74:C
74->75:A
75->76:G
76->77:C
77->78:C
78->79:A
79->80:T
80->81:G
81->82:C
82->83:T
83->84:C
84->85:A
85->86:C
86->87:G
87->88:T
88->89:G
89->90:G
90->91:T
91->92:C
93->94:C
93->433:A
93->530:G
93->2036:T
94->95:T
94->3624:A
94->4507:C
94->5919:G
95->96:C
95->18

# 7. Implement TrieMatching

In [8]:
with open('rosalind_ba9b.txt') as file:
    text = file.readline().rstrip()
    pattern_list = [pattern.rstrip() for pattern in file.readlines()]


class Node:
    def __init__(self, val=None):
        self.val = val
        self.children_list = set()

    def add_children(self, node):
        self.children_list.add(node)

    def get_children(self, val):
        children = None
        for node in self.children_list:
            if node.val == val:
                children = node
        return children

    def __str__(self):
        if self.val == Node:
            display_text = 'None'
        else:
            display_text = str(self.val)
        children_text = ''
        for child in self.children_list:
            children_text += str(child.val) + ' '
        display_text = f'val: {display_text}, children: {children_text}'
        return display_text


def prefix_trie_construction(pattern_list):
    root = Node()
    for pattern in pattern_list:
        current_node = root
        for symbol in pattern:
            child_node = current_node.get_children(symbol)
            if child_node is None:
                child_node = Node(symbol)
                current_node.add_children(child_node)
            current_node = child_node
    return root


def prefix_trie_matching(text, node):
    for symbol in text:
        if len(node.children_list) == 0:
            return True
        child_node = node.get_children(symbol)
        if child_node is None:
            return False
        node = child_node
    if len(node.children_list) == 0:
        return True
    return False


prefix_trie_root = prefix_trie_construction(pattern_list)
text_len = len(text)
match_list = []
for i in range(len(text)):
    if prefix_trie_matching(text[i:text_len], prefix_trie_root):
        match_list.append(i)

output = ' '.join(map(lambda x: str(x), match_list))
print(output)
with open('output.txt', 'w') as file:
    file.write(output)

70 111 124 137 193 211 218 302 323 330 343 371 389 414 425 441 463 519 526 543 550 598 613 632 639 705 727
