- with the rise of MRSA at hand, developing new antibiotics represents a central challenge to modern medicine
- a difficult problem in antibiotics research is that of sequencing newly discovered antibiotics, or determining the order of amino acids making up the antibiotic peptide

In [149]:
## 4a Translate an RNA string into an amino acid string
table = { 
        'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACU':'T', 
        'AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R',                  
        'CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P', 
        'CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R', 
        'GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A', 
        'GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G', 
        'UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S', 
        'UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L', 
        'UAC':'Y', 'UAU':'Y', 'UAA':'_', 'UAG':'_', 
        'UGC':'C', 'UGU':'C', 'UGA':'_', 'UGG':'W', 
    }

def Translate(rna):
    codons = [rna[i:i+3] for i in range(0, len(rna), 3)]
    amino_acids = ''.join([table[codon] for codon in codons])
    if amino_acids[-1] == '_':
        amino_acids = amino_acids[:-1]
    return amino_acids

In [2]:
Translate('AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA')

'MAMAPRTEINSTRING'

In [None]:
test_file = 'rosalind_ba4a.txt'
with open(test_file, 'r') as reader:
    rna = reader.readline().strip('\n')
Translate(rna)

We say that a DNA string _Pattern_ encodes an amino acid string _Peptide_ if the RNA string transcribed from either _Pattern_ or its reverse complement _Pattern_ translates into Peptide. For example, the DNA string __GAAACT__ is transcribed into __GAAACU__ and translated into __ET__. The reverse complement of this DNA string, __AGTTTC__, is transcribed into __AGUUUC__ and translated into __SF__. Thus, __GAAACT__ encodes both __ET__ and __SF__

In [150]:
# 4b find substrings of a genome encoding a given amino acid string
base_comp = {'A': 'U', 'G': 'C', 'C': 'G', 'U': 'A'}

def ReverseComplement(rna):
    comp = ''.join([base_comp[base] for base in rna])
    return comp[-1::-1]

def FindSubstring(dna, peptide):
    rna = dna.replace('T', 'U')
    
    len_codon = 3*len(peptide)
    
    substrings = []
    def getSubstring(rna, peptide):
        substrings = []
        for i in range(0, len(rna), 3):
            if len(rna[i:i+len_codon]) < len_codon:
                break             
            if Translate(rna[i:i+len_codon]) == peptide:
                substrings.append(rna[i:i+len_codon])
            if Translate(ReverseComplement(rna[i:i+len_codon])) == peptide:
                substrings.append(rna[i:i+len_codon])

        return [substring.replace('U', 'T') for substring in substrings]

    for start in range(3):
        substrings += getSubstring(rna[start:], peptide)
    return substrings


In [4]:
FindSubstring('ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA', 'MA')

['ATGGCC', 'ATGGCC', 'GGCCAT']

In [None]:
test_file = 'rosalind_ba4b.txt'
with open(test_file, 'r') as reader:
    dna = reader.readline().strip('\n')
    peptide = reader.readline().strip('\n')
for string in FindSubstring(dna, peptide):
    print(string)

In [52]:
table['AUG'] + table['GCC']

'MA'

In 1969, Fritz Lipmann (another Nobel laureate) demonstrated that tyrocidines and gramicidins are __non-ribosomal peptides (NRPs)__, synthesized not by the ribosome, but by a giant protein called NRP synthetase. This enzyme pieces together antibiotic peptides without any reliance on RNA or the genetic code

The reason why many NRPs have pharmaceutical applications is that they have been optimized by eons of evolution as "molecular bullets" that bacteria and fungi use to kill their enemies. If these enemies happen to be pathogens, researchers are eager to borrow these bullets as antibacterial drugs. However, NRPs are not limited to antibiotics: many of them represent anti-tumor agents and immunosuppressors, while others are used by bacteria to communicate with other cells.

No of subpeptides in a cyclic peptide of lenth n have $$ n(n-1) + 2 $$

In [151]:
mass_table = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113,
              'L': 113, 'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137,
              'F': 147, 'R': 156, 'Y': 163, 'W': 186}

In [152]:
from functools import reduce

# 4c get theoretical spectrum from a given peptide
def getMass(peptide):
    return reduce(lambda a, b: a+b, [mass_table[amino_acid] for amino_acid in peptide])

def GenerateTheoreticalSpectrum(peptide):

    subpeptides = []
    for i in range(len(peptide)-1):
        string = peptide + peptide[:i]
        for j in range(0, len(string)):
            if len(string[j:i + j + 1]) < i+1:
                break
            subpeptides.append(string[j:i+j+1])
    return [0] + sorted([getMass(peptide) for peptide in subpeptides]) + [getMass(peptide)]
        
print(GenerateTheoreticalSpectrum('LEQN'))
GenerateTheoreticalSpectrum('NQEL')

[0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]


[0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]

In [153]:
extra_test = ' '.join([str(mass) for mass in GenerateTheoreticalSpectrum('IAQMLFYCKVATN')])
output = '0 71 71 99 101 103 113 113 114 128 128 131 147 163 170 172 184 199 215 227 227 231 244 259 260 266 271 286 298 298 310 312 328 330 330 372 385 391 394 399 399 399 401 413 423 426 443 443 470 493 498 502 513 519 526 527 541 554 556 557 564 569 590 598 616 626 640 654 657 658 665 670 682 697 697 703 711 729 729 753 753 771 779 785 785 800 812 817 824 825 828 842 856 866 884 892 913 918 925 926 928 941 955 956 963 969 980 984 989 1012 1039 1039 1056 1059 1069 1081 1083 1083 1083 1088 1091 1097 1110 1152 1152 1154 1170 1172 1184 1184 1196 1211 1216 1222 1223 1238 1251 1255 1255 1267 1283 1298 1310 1312 1319 1335 1351 1354 1354 1368 1369 1369 1379 1381 1383 1411 1411 1482'

extra_test == output

True

In [250]:
test_file = 'rosalind_ba4c.txt'
with open(test_file, 'r') as reader:
    peptide = reader.readline().strip('\n')

' '.join([str(mass) for mass in GenerateTheoreticalSpectrum(peptide)])

'0 57 57 87 87 97 103 113 115 115 128 129 144 147 154 156 172 184 186 202 204 216 228 231 234 241 241 257 259 285 291 301 315 319 328 331 331 342 344 356 358 360 372 388 388 406 413 418 443 445 457 459 469 471 473 475 475 503 505 514 516 532 546 556 560 572 586 588 590 592 599 603 629 643 647 647 659 661 675 689 700 702 703 714 732 744 746 748 760 762 771 787 790 806 815 817 829 831 833 845 863 874 875 877 888 902 916 918 930 930 934 948 974 978 985 987 989 991 1005 1017 1021 1031 1045 1061 1063 1072 1074 1102 1102 1104 1106 1108 1118 1120 1132 1134 1159 1164 1171 1189 1189 1205 1217 1219 1221 1233 1235 1246 1246 1249 1258 1262 1276 1286 1292 1318 1320 1336 1336 1343 1346 1349 1361 1373 1375 1391 1393 1405 1421 1423 1430 1433 1448 1449 1462 1462 1464 1474 1480 1490 1490 1520 1520 1577'

In [154]:
# 4d Compute the no of peptides of given total mass
def cnt(a, k, n):
    if k == 1:
        return 1 if n % a[0] == 0 else 0
    return sum(cnt(a, k - 1, n - i * a[k - 1]) for i in range(0, int(n / a[k - 1]) + 1))

#print(cnt(list(mass_table.values()), len(mass_table), 1024))

def CountPeptide(mass):
    NumPeptides = {}
    masses = list(set(mass_table.values()))
    for i in range(57):
        NumPeptides[i] = 0
    
    for mass in range(57, mass + 1):
        NumPeptides[mass] = masses.count(mass)
        for int_mass in masses:
            if mass >= int_mass:
                if NumPeptides[mass - int_mass] > 0:
                    NumPeptides[mass] += NumPeptides[mass - int_mass]
    return NumPeptides[mass]
        
CountPeptide(1024)
CountPeptide(1475)

3548135554209773

In [175]:
def LinearSpectrum(peptide):
    prefixMass = [0]*(len(peptide))
    for i in range(len(peptide)):
        for amino_acid in mass_table:
            if amino_acid == peptide[i]:
                prefixMass[i] = prefixMass[i-1] + mass_table[amino_acid]
    prefixMass = [0] + prefixMass
    #print(prefixMass)
    linear_spectrum = [0]
    for i in range(0, len(peptide)):
        for j in range(i+1, len(peptide)+1):
            #print(i, j, peptide[i:j])
            linear_spectrum.append(prefixMass[j] - prefixMass[i])
    
    return sorted(linear_spectrum)

In [176]:
LinearSpectrum('NQEL')

[0, 113, 114, 128, 129, 242, 242, 257, 370, 371, 484]

In [None]:
# 4j
test_file = 'rosalind_ba4j.txt'
with open(test_file, 'r') as reader:
    peptide = reader.readline().strip('\n')
' '.join([str(aa) for aa in LinearSpectrum(peptide)])

In [157]:
def CyclicSpectrum(peptide):
    prefixMass = [0]*(len(peptide))
    for i in range(len(peptide)):
        for amino_acid in mass_table:
            if amino_acid == peptide[i]:
                prefixMass[i] = prefixMass[i-1] + mass_table[amino_acid]
    prefixMass = [0] + prefixMass
                
    peptideMass = prefixMass[len(peptide)]
    cyclic_spectrum = [0]
    for i in range(len(peptide)):
        for j in range(i+1, len(peptide)+1):
            cyclic_spectrum.append(prefixMass[j] - prefixMass[i])
            if i > 0 and j < len(peptide):
                cyclic_spectrum.append(peptideMass - (prefixMass[j] - prefixMass[i]))
    
    return sorted(cyclic_spectrum)

In [158]:
CyclicSpectrum('NQEL')

[0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]

__Cyclopeptide Sequencing Problem__: Given an ideal spectrum, find a cyclic peptide whose theoretical spectrum matches the experimental spectrum.

__Input__: A collection of (possibly repeated) integers Spectrum corresponding to an ideal spectrum.

__Output__: An amino acid string Peptide such that Cyclospectrum(Peptide) = Spectrum (if such a string exists).

No of subpeptides a linear peptide of given length $n$ have: $$\frac{n(n+1)}{2} + 1$$

Given an experimental spectrum _Spectrum_ of a cyclic peptide, a linear peptide is __consistent__ with _Spectrum_ if every mass in its theoretical spectrum is contained in _Spectrum_. If a mass appears more than once in the theoretical spectrum of the linear peptide, then it must appear at least that many times in _Spectrum_ in order for the linear peptide to be consistent with _Spectrum_. For example, a linear peptide can still be consistent with the theoretical spectrum of NQEL if the peptide’s spectrum contains 242 twice. But it cannot be consistent with the theoretical spectrum of NQEL if its spectrum contains 113 twice.

The key to our new algorithm is that every linear subpeptide of a cyclic peptide Peptide is consistent with _Cyclospectrum(Peptide)_. Thus, to solve the Cyclopeptide Sequencing Problem for Spectrum, we can safely ban all peptides that are inconsistent with _Spectrum_ from the growing set Peptides, which powers the bounding step that we described above.

What about the branching step? Given the current collection of linear peptides Peptides, define _Expand(Peptides)_ as a new collection containing all possible extensions of peptides in Peptides by a single amino acid mass.

In [159]:
# 4e find a cyclic peptide with theoretical spectrum matching an ideal spectrum

def expand(l):
    lc = l[:] # list copy
    for i in lc:
        l.remove(i)
        for m in mass_table.values():
            temp = i[:] # list copy
            temp.append(m)
            l.append(temp)
            
def cut_peptide(peptide):
    result = []
    for cut in range(1, len(peptide)): # cut size
        for i in range(len(peptide) - cut):
            result.append(peptide[i:i + cut])
            result.append(peptide[i + cut:] + peptide[:i]) # cyclic peptide
    result.append(peptide)
    
    return result

def is_consistent(peptide, spectrum):
    spectrum_copy = spectrum[:]
    for subpeptide in peptide:
        if subpeptide not in spectrum_copy:
            return False
        else:
            spectrum_copy.remove(subpeptide)
    return True

def CycloPeptideSequencing(spectrum):
    peptides = [[]]
    while len(peptides) != 0:
        expand(peptides)
        for peptide in peptides:
            p_spectrum = [0] + [sum(s) for s in cut_peptide(peptide)]
            if max(p_spectrum) == max(spectrum):
                if sorted(p_spectrum) == spectrum:
                    print('-'.join([str(x) for x in peptide]))
                    peptides.remove(peptide)
            else:
                if not is_consistent(peptide, spectrum):
                    peptides.remove(peptide)

In [183]:
CycloPeptideSequencing('0 113 128 186 241 299 314 427'.split(' '))

KeyboardInterrupt: 

In [161]:
# 4f Compute the score of a cyclic peptide against a spectrum
def Score(peptide, spectrum):
    theoretical_spectrum = CyclicSpectrum(peptide)
    spectrum = [int(mass) for mass in spectrum]
    unique_masses = set(theoretical_spectrum + spectrum)
    
    result = 0
    for mass in unique_masses:
        result += min(theoretical_spectrum.count(mass), spectrum.count(mass))
    return result

In [162]:
Score('NQEL', '0 99 113 114 128 227 257 299 355 356 370 371 484'.split(' '))

11

In [163]:
Score('VYYEVDWTMGRQIDPDEYPIAQCTRHRATILTLPDWQM',
'0 71 71 87 87 97 97 99 101 101 101 101 103 113 113 113 113 113 115 115 115 115 128 128 129 129 131 131 137 147 156 156 156 163 163 163 163 172 184 186 186 199 204 210 212 212 214 214 214 216 218 226 227 228 230 231 232 243 244 244 257 259 260 260 262 273 276 278 278 281 285 287 292 293 293 301 302 303 312 314 319 325 326 327 327 327 327 328 331 332 341 358 360 364 373 374 374 375 377 389 390 391 393 394 398 402 403 407 407 409 415 416 418 425 428 429 440 440 441 441 444 445 449 456 464 465 472 475 488 490 492 494 497 499 502 503 503 504 504 505 505 512 516 517 520 521 521 531 540 541 544 550 554 556 559 560 569 570 572 573 577 578 587 592 592 601 603 605 612 613 617 618 618 619 621 621 622 625 628 634 634 653 655 657 659 661 669 671 672 675 678 680 684 688 691 696 701 704 705 707 714 715 716 716 718 721 722 723 724 732 734 735 743 748 749 749 756 768 769 775 776 781 781 785 791 792 799 800 804 806 808 809 816 819 822 822 825 829 829 830 835 836 836 843 846 847 850 852 852 870 872 877 879 887 890 895 900 904 905 905 906 912 913 919 919 921 923 928 931 932 935 937 938 942 942 944 947 948 948 953 959 961 965 976 985 985 985 992 999 1005 1008 1013 1015 1016 1020 1024 1024 1028 1032 1035 1036 1036 1036 1043 1046 1049 1050 1051 1061 1061 1062 1062 1066 1068 1069 1073 1075 1076 1089 1090 1098 1100 1114 1117 1122 1123 1131 1132 1133 1133 1133 1137 1137 1137 1139 1141 1148 1149 1152 1155 1160 1162 1162 1163 1174 1174 1176 1179 1183 1191 1198 1199 1201 1201 1204 1219 1220 1224 1225 1229 1232 1232 1234 1244 1245 1248 1250 1250 1252 1252 1253 1261 1261 1262 1263 1264 1265 1273 1280 1288 1296 1302 1302 1305 1312 1313 1316 1316 1318 1325 1332 1335 1335 1339 1345 1347 1349 1350 1351 1354 1354 1360 1362 1363 1363 1367 1369 1376 1377 1387 1388 1391 1392 1392 1393 1397 1403 1410 1415 1417 1425 1425 1428 1433 1434 1440 1444 1451 1460 1463 1464 1464 1464 1464 1464 1465 1469 1475 1477 1479 1488 1491 1492 1497 1501 1502 1505 1510 1520 1525 1525 1526 1526 1530 1531 1534 1540 1540 1543 1547 1548 1556 1561 1564 1564 1566 1577 1578 1579 1588 1590 1590 1592 1593 1597 1603 1606 1619 1623 1626 1627 1627 1627 1628 1634 1635 1637 1637 1638 1639 1641 1648 1648 1653 1662 1665 1671 1674 1676 1677 1681 1689 1693 1693 1695 1703 1706 1708 1719 1724 1724 1727 1729 1734 1735 1738 1740 1742 1742 1750 1752 1752 1753 1754 1754 1756 1760 1763 1765 1766 1778 1784 1785 1790 1792 1794 1804 1804 1806 1808 1821 1821 1823 1832 1837 1837 1837 1851 1853 1853 1853 1855 1857 1857 1862 1865 1866 1866 1867 1871 1871 1875 1879 1879 1883 1885 1889 1890 1891 1897 1905 1908 1916 1919 1923 1928 1933 1936 1941 1941 1949 1952 1964 1966 1966 1966 1966 1967 1968 1968 1977 1980 1984 1984 1985 1988 1992 2000 2004 2004 2009 2010 2012 2020 2020 2022 2028 2031 2034 2034 2036 2037 2041 2046 2051 2052 2053 2056 2065 2079 2079 2081 2081 2087 2091 2093 2097 2097 2105 2111 2113 2115 2116 2122 2123 2124 2129 2131 2133 2135 2138 2139 2140 2147 2148 2152 2153 2159 2160 2165 2168 2168 2169 2178 2180 2184 2187 2192 2197 2198 2206 2206 2209 2215 2224 2226 2237 2237 2240 2244 2244 2244 2247 2250 2253 2255 2260 2263 2264 2266 2268 2269 2277 2280 2286 2287 2293 2293 2294 2296 2297 2300 2301 2307 2309 2310 2311 2315 2315 2324 2334 2337 2339 2341 2352 2354 2356 2359 2369 2378 2378 2382 2383 2384 2386 2392 2393 2396 2397 2399 2400 2400 2406 2407 2413 2416 2424 2425 2427 2429 2430 2433 2438 2440 2443 2446 2449 2449 2449 2453 2456 2456 2467 2469 2478 2484 2487 2487 2495 2496 2501 2506 2509 2513 2515 2524 2525 2525 2528 2533 2534 2540 2541 2545 2546 2553 2554 2555 2558 2560 2562 2564 2569 2570 2571 2577 2578 2580 2582 2588 2596 2596 2600 2602 2606 2612 2612 2614 2614 2628 2637 2640 2641 2642 2647 2652 2656 2657 2659 2659 2662 2665 2671 2673 2673 2681 2683 2684 2689 2689 2693 2701 2705 2708 2709 2709 2713 2716 2725 2725 2726 2727 2727 2727 2727 2729 2741 2744 2752 2752 2757 2760 2765 2770 2774 2777 2785 2788 2796 2802 2803 2804 2808 2810 2814 2814 2818 2822 2822 2826 2827 2827 2828 2831 2836 2836 2838 2840 2840 2840 2842 2856 2856 2856 2861 2870 2872 2872 2885 2887 2889 2889 2899 2901 2903 2908 2909 2915 2927 2928 2930 2933 2937 2939 2939 2940 2941 2941 2943 2951 2951 2953 2955 2958 2959 2964 2966 2969 2969 2974 2985 2987 2990 2998 3000 3000 3004 3012 3016 3017 3019 3022 3028 3031 3040 3045 3045 3052 3054 3055 3056 3056 3058 3059 3065 3066 3066 3066 3067 3070 3074 3087 3090 3096 3100 3101 3103 3103 3105 3114 3115 3116 3127 3129 3129 3132 3137 3145 3146 3150 3153 3153 3159 3162 3163 3167 3167 3168 3168 3173 3183 3188 3191 3192 3196 3201 3202 3205 3214 3216 3218 3224 3228 3229 3229 3229 3229 3229 3230 3233 3242 3249 3253 3259 3260 3265 3268 3268 3276 3278 3283 3290 3296 3300 3301 3301 3302 3305 3306 3316 3317 3324 3326 3330 3330 3331 3333 3339 3339 3342 3343 3344 3346 3348 3354 3358 3358 3361 3368 3375 3377 3377 3380 3381 3388 3391 3391 3397 3405 3413 3420 3428 3429 3430 3431 3432 3432 3440 3441 3441 3443 3443 3445 3448 3449 3459 3461 3461 3464 3468 3469 3473 3474 3489 3492 3492 3494 3495 3502 3510 3514 3517 3519 3519 3530 3531 3531 3533 3538 3541 3544 3545 3552 3554 3556 3556 3556 3560 3560 3560 3561 3562 3570 3571 3576 3579 3593 3595 3603 3604 3617 3618 3620 3624 3625 3627 3631 3631 3632 3632 3642 3643 3644 3647 3650 3657 3657 3657 3658 3661 3665 3669 3669 3673 3677 3678 3680 3685 3688 3694 3701 3708 3708 3708 3717 3728 3732 3734 3740 3745 3745 3746 3749 3751 3751 3755 3756 3758 3761 3762 3765 3770 3772 3774 3774 3780 3781 3787 3788 3788 3789 3793 3798 3803 3806 3814 3816 3821 3823 3841 3841 3843 3846 3847 3850 3857 3857 3858 3863 3864 3864 3868 3871 3871 3874 3877 3884 3885 3887 3889 3893 3894 3901 3902 3908 3912 3912 3917 3918 3924 3925 3937 3944 3944 3945 3950 3958 3959 3961 3969 3970 3971 3972 3975 3977 3977 3978 3979 3986 3988 3989 3992 3997 4002 4005 4009 4013 4015 4018 4021 4022 4024 4032 4034 4036 4038 4040 4059 4059 4065 4068 4071 4072 4072 4074 4075 4075 4076 4080 4081 4088 4090 4092 4101 4101 4106 4115 4116 4120 4121 4123 4124 4133 4134 4137 4139 4143 4149 4152 4153 4162 4172 4172 4173 4176 4177 4181 4188 4188 4189 4189 4190 4190 4191 4194 4196 4199 4201 4203 4205 4218 4221 4228 4229 4237 4244 4248 4249 4252 4252 4253 4253 4264 4265 4268 4275 4277 4278 4284 4286 4286 4290 4291 4295 4299 4300 4302 4303 4304 4316 4318 4319 4319 4320 4329 4333 4335 4352 4361 4362 4365 4366 4366 4366 4366 4367 4368 4374 4379 4381 4390 4391 4392 4400 4400 4401 4406 4408 4412 4415 4415 4417 4420 4431 4433 4433 4434 4436 4449 4449 4450 4461 4462 4463 4465 4466 4467 4475 4477 4479 4479 4479 4481 4481 4483 4489 4494 4507 4507 4509 4521 4530 4530 4530 4530 4537 4537 4537 4546 4556 4562 4562 4564 4564 4565 4565 4578 4578 4578 4578 4580 4580 4580 4580 4580 4590 4592 4592 4592 4592 4594 4596 4596 4606 4606 4622 4622 4693'.split(' '))

521

In [91]:
test_file = 'rosalind_ba4f.txt'
with open(test_file, 'r') as reader:
    peptide = reader.readline().strip('\n')
    spectrum = reader.readline().strip('\n').split(' ')
Score(peptide, spectrum)

862

To generalize “Find a Cyclic Peptide with Theoretical Spectrum Matching an Ideal Spectrum” to handle “noisy” spectra having false and missing masses, we need to relax the requirement that a candidate peptide’s theoretical spectrum must match the experimental spectrum exactly, and instead incorporate a scoring function that will select the peptide whose theoretical spectrum matches the given experimental spectrum the most closely. Given a cyclic peptide Peptide and a spectrum Spectrum, we define Score(Peptide, Spectrum) as the number of masses shared between Cyclospectrum(Peptide) and Spectrum. Recalling Figure 1, if

$$Spectrum = \{0, 99, 113, 114, 128, 227, 257, 299, 355, 356, 370, 371, 484\},$$
then $Score("NQEL", Spectrum) = 11.$

To limit the number of candidate peptides under consideration, we will use a Leaderboard, which holds the N highest scoring candidate peptides for further extension. At each step, we will expand all candidate peptides found in Leaderboard by adding every possible amino acid to the end. Then, we will eliminate those peptides whose newly calculated scores are not high enough to keep them on the Leaderboard. This idea is similar to the notion of a “cut” in a golf tournament; after the cut, only the top N golfers are allowed to play in the next round, since they are the only players who have a reasonable chance of winning.

To be fair, a cut should include anyone who is tied with the Nth-place competitor. Thus, Leaderboard should be trimmed down to the “N highest-scoring peptides including ties”, which may include more than N peptides. Given a list of peptides Leaderboard, a spectrum Spectrum, and an integer N, Cut(Leaderboard, Spectrum, N) returns the top N highest-scoring peptides in <Leaderboard (including ties) with respect to Spectrum. We now introduce LEADERBOARDCYCLOPEPTIDESEQUENCING. In what follows, the 0-peptide is the peptide "" containing no amino acids.

In [164]:
# 4g Implement LeaderBoardCycloPeptideSequencing
def expand(peptides):
    new_peptides = []
    for pep in peptides:
        for mass in set(mass_table.values()):
            new_peptides.append(pep + [mass])
    return new_peptides

def Trim(leaderboard, spectrum, N):
    if len(leaderboard) <= N:
        return leaderboard

    scores = {}
    for i, peptide in enumerate(leaderboard):
        scores[i] = Score(peptide, spectrum)
    
    sorted_scores = sorted(scores.values(), reverse = True)
    cutoff = sorted_scores[N-1]
    
    return [leaderboard[idx] for idx, score in scores.items() if score >= cutoff]

def LeaderBoardCycloPeptideSequencing(N, spectrum):
    leaderboard = [[]]
    leader_peptide = []
    spectrum = list(map(lambda x: int(x), spectrum))
    while len(leaderboard) >= 0:
        leaderboard = expand(leaderboard)
        for peptide in leaderboard:
            if sum(peptide) == max(spectrum):
                if Score(peptide, spectrum) > Score(leader_peptide, spectrum):
                    leader_peptide = peptide
            elif sum(peptide) > max(spectrum):
                leaderboard = [aa for aa in leaderboard if aa != peptide]
        leaderboard = Trim(leaderboard, spectrum, N)
    
    return leaderboard

In [None]:
LeaderBoardCycloPeptideSequencing(10, '0 71 113 129 147 200 218 260 313 331 347 389 460'.split(' '))

__Spectral Convolution Problem__

Compute the convolution of a spectrum.

__Given__: A collection of integers Spectrum.

__Return__: The list of elements in the convolution of Spectrum in decreasing order of their multiplicities. If an element has multiplicity k, it should appear exactly k times.

In [165]:
# 4h Generate convolution of a spectrum
def convolution(spectrum):
    spectrum = list(map(lambda i: int(i), spectrum))
    convolution = [i-j for i in spectrum for j in spectrum if i > j]
    convolution_mult = {element: convolution.count(element) for element in convolution}
    convolution_mult_sorted = {k: v for k, v in sorted(convolution_mult.items(), key = lambda item: item[1],
                                                      reverse = True)}
    
    output = []
    for element in convolution_mult_sorted:
        output += [element]*convolution_mult_sorted[element]
    
    output = [str(element) for element in output]
    return ' '.join(output)

In [182]:
convolution([2, 7])

'5'

In [145]:
convolution('0 137 186 323'.split(' '))

{137: 2, 186: 2, 49: 1, 323: 1}


'137 137 186 186 49 323'

In [None]:
test_file = 'rosalind_ba4h.txt'
with open(test_file, 'r') as reader:
    spectrum = reader.readline().strip('\n')
f = open('4h.txt', 'w')
f.write(convolution(spectrum.split(' ')))

Given an experimental spectrum, we first compute the convolution of an experimental spectrum. We then select the M most frequent elements between 57 and 200 in the convolution to form an extended alphabet of candidate amino acid masses. In order to be fair, we should include the top M elements of the convolution "with ties". Finally, we run the algorithm __LeaderboardCyclopeptideSequencing__, where the amino acid masses are restricted to this alphabet.

__Implement ConvolutionCyclopeptideSequencing__

__Given__: An integer M, an integer N, and a collection of (possibly repeated) integers Spectrum.

__Return__: A cyclic peptide LeaderPeptide with amino acids taken only from the top M elements (and ties) of the convolution of Spectrum that fall between 57 and 200, and where the size of Leaderboard is restricted to the top N (and ties).

In [173]:
# 4i Implement ConvolutionCycloPeptideSequencing

def expand_masses(peptides, masses):
    new_peptides = []
    for peptide in peptides:
        for mass in masses:
            new_peptides.append(peptide + [mass])
    return new_peptides

def find_masses(spectrum, M):
    convolutions = list(map(lambda i: int(i), convolution(spectrum).split(' ')))
    topM = [conv for conv in convolutions if 57 <= conv <= 200]
    
    freq_dict = {}
    for mass in set(convolutions):
        freq_dict[mass] = convolutions.count(mass)
    
    sorted_freq_dict = sorted(freq_dict.items(), key = lambda item: item[1], reverse = True)
    masses = [mass for mass, freq in sorted_freq_dict if freq >= sorted_freq_dict[M][1]]
    return sorted(masses)

def ConvolutionCycloPeptideSequencing(M, N, spectrum):
    spectrum = list(map(lambda i: int(i), spectrum))
    masses = find_masses(spectrum, M)
    leaderboard = [[]]
    leader_peptide = []
    
    while len(leaderboard) >= 0:
        leaderboard = expand_masses(leaderboard, masses)
        for peptide in leaderboard:
            if sum(peptide) == max(spectrum):
                if Score(peptide, spectrum) > Score(leader_peptide, spectrum):
                    leader_peptide = peptide
            elif sum(peptide) > max(spectrum):
                leaderboard = [aa for aa in leaderboard if aa != peptide]
        leaderboard = Trim(leaderboard, spectrum, N)
    
    return leader_peptide

In [None]:
ConvolutionCycloPeptideSequencing(20, 60,
                                 '57 57 71 99 129 137 170 186 194 208 228 265 285 299 307 323 356 364 394 422 493'.split(' '))