In [1]:
import numpy as np
from collections import Counter

In [2]:
# Integer mass of each amino acid, keyed by its one-letter code.
# I/L (113) and K/Q (128) share a mass, so the 20 keys map to 18 distinct values.
AminoAcidMass = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 
                 'L': 113, 'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 
                 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}

In [3]:
def compare(s, t):
    """Return True when ``s`` and ``t`` contain the same elements with the
    same multiplicities, regardless of order."""
    left_counts = Counter(s)
    right_counts = Counter(t)
    return left_counts == right_counts

In [4]:
def CheckRepeatPeptide(a,b):
    """Tell whether ``a`` is a reordering of any entry of ``b``.

    Parameters
    ----------
    a : iterable of hashable
        The peptide to look for.
    b : iterable of iterables
        Peptides already collected.

    Returns
    -------
    bool
        True if some entry of ``b`` has the same elements with the same
        multiplicities as ``a``; False otherwise (including when ``b`` is
        empty).
    """
    # Count a's elements once instead of once per candidate.
    target = Counter(a)
    return any(target == Counter(candidate) for candidate in b)

In [5]:
def LinearSpectrum(Peptide):
    """Compute the linear spectrum of a peptide given as dash-separated masses.

    The spectrum holds 0 plus the mass of every contiguous sub-peptide,
    obtained as differences of prefix masses, and is returned sorted as a
    float NumPy array.
    """
    residues = Peptide.split('-')
    count = len(residues)

    # prefix[k] is the total mass of the first k residues.
    prefix = np.zeros(count + 1)
    for position, residue in enumerate(residues):
        prefix[position + 1] = prefix[position] + int(residue)

    fragment_masses = [0]
    fragment_masses.extend(
        prefix[end] - prefix[start]
        for start in range(count)
        for end in range(start + 1, count + 1)
    )
    return np.sort(fragment_masses)

In [51]:
def Mass(Peptide):
    """Return the total integer mass of a peptide written as dash-separated
    residue masses (e.g. ``'71-113'`` -> 184)."""
    return sum(int(residue) for residue in Peptide.split('-'))

In [7]:
def ParentMass(Spectrum):
    """Sum the entries of ``Spectrum`` and track the smallest running total.

    Returns a ``(total, smallest)`` pair, where ``smallest`` is the minimum
    of 10000 and every partial sum seen while accumulating.
    """
    running_total = 0
    smallest = 10000
    for value in Spectrum:
        running_total += value
        smallest = min(smallest, running_total)
    return running_total, smallest

In [8]:
def Expand(CandidatePeptides):
    """Extend every candidate peptide by one residue of each distinct mass.

    Parameters
    ----------
    CandidatePeptides : collection of str
        Peptides written as dash-separated masses. An empty collection
        seeds the search.

    Returns
    -------
    list of str
        For an empty input, one single-residue peptide per distinct mass in
        ``AminoAcidMass``. Otherwise every input peptide followed by
        ``'-<mass>'`` for each distinct mass, grouped by input peptide.
    """
    # Built once per call and sorted, so the output order does not depend on
    # set iteration order (which is hash-randomised for strings).
    masses = sorted(set(AminoAcidMass.values()))
    if not CandidatePeptides:
        return [str(mass) for mass in masses]
    return [
        Peptide + '-' + str(mass)
        for Peptide in CandidatePeptides
        for mass in masses
    ]

In [9]:
def Score(Peptide,Experimental):
    """Count the masses shared by the peptide's linear spectrum and
    ``Experimental``, respecting multiplicities.

    An empty peptide scores 0.
    """
    if not Peptide:
        return 0

    # Remaining unmatched copies of each theoretical mass.
    unmatched = Counter(LinearSpectrum(Peptide))
    matches = 0
    for peak in Experimental:
        # Counter yields 0 for an absent key without inserting it.
        if unmatched[peak] > 0:
            unmatched[peak] -= 1
            matches += 1
    return matches

In [10]:
def Trim(Leaderboard, Spectrum, N):
    """Keep the ``N`` highest-scoring peptides of ``Leaderboard``, plus ties.

    Parameters
    ----------
    Leaderboard : iterable of str
        Candidate peptides; duplicates are collapsed.
    Spectrum : sequence
        Experimental spectrum passed through to ``Score``.
    N : int
        Number of places to keep. Peptides beyond the N-th place are kept
        only while their score equals the score of the last peptide kept.

    Returns
    -------
    list of str
        The retained peptides in descending score order. Empty when
        ``N <= 0`` or the leaderboard is empty.
    """
    # Dict keys de-duplicate peptides while keeping first-seen order.
    scores = {Peptide: Score(Peptide, Spectrum) for Peptide in Leaderboard}
    # sorted() is stable, so equal scores keep their leaderboard order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    kept = []
    last_score = None
    for rank, (Peptide, score) in enumerate(ranked):
        # Past the N-th place only ties with the last kept score survive.
        if rank >= N and score != last_score:
            break
        kept.append(Peptide)
        last_score = score
    return kept

In [54]:
def LeaderboardCyclopeptideSequencing(Spectrum, N):
    """Search for the peptide that best explains ``Spectrum``.

    The leaderboard starts from the single-residue peptides and is grown one
    residue at a time. Each round it is trimmed to the top ``N`` scorers
    (with ties), peptides heavier than the parent mass are dropped, and
    peptides whose mass equals the parent mass compete for the leader slot.
    The search stops when no peptide survives the mass filter.

    Parameters
    ----------
    Spectrum : sequence of int
        Experimental masses. The parent mass is taken as the largest entry,
        so the spectrum does not need to be sorted.
    N : int
        Leaderboard size handed to ``Trim``.

    Returns
    -------
    str
        The best peptide as dash-separated masses, or ``''`` when the
        spectrum is empty or no peptide of parent mass scores above 0.
    """
    if len(Spectrum) == 0:
        return ''
    parent_mass = max(Spectrum)

    LeaderPeptide = ''
    leader_score = 0  # Score of the empty peptide.
    Leaderboard = []
    while True:
        # NOTE(review): the board is trimmed both before and after the mass
        # filter, so surviving candidates are scored twice and over-mass
        # peptides compete for leaderboard places in the first pass. The
        # order is kept as in the original so results do not change --
        # confirm it is intended.
        Leaderboard = Trim(Expand(Leaderboard), Spectrum, N)

        survivors = []
        for Peptide in Leaderboard:
            peptide_mass = Mass(Peptide)
            if peptide_mass > parent_mass:
                continue
            if peptide_mass == parent_mass:
                candidate_score = Score(Peptide, Spectrum)
                # Strict comparison: the first peptide to reach a score wins ties.
                if candidate_score > leader_score:
                    LeaderPeptide, leader_score = Peptide, candidate_score
            survivors.append(Peptide)

        if not survivors:
            break
        Leaderboard = Trim(survivors, Spectrum, N)
    return LeaderPeptide

In [55]:
# Sample input: leaderboard size and a space-separated spectrum parsed
# straight into an integer array.
N = 10
Spectrum = np.array('0 71 113 129 147 200 218 260 313 331 347 389 460'.split(' ')).astype(int)

In [56]:
# Run the search with the current Spectrum and N; the result is the cell output.
LeaderboardCyclopeptideSequencing(Spectrum, N)

'129-71-147-113'

In [67]:
# Read the dataset file: line 1 holds the leaderboard size N, line 2 the
# space-separated spectrum. Missing lines leave the defaults below in place.
N = 0
Spectrum = []
# The context manager closes the file even if parsing raises.
with open('dataset_369295_8.txt', 'r') as file:
    for i, line in enumerate(file):
        line = line.rstrip('\n')
        if i == 0:
            N = int(line)
        elif i == 1:
            # split() with no argument tolerates repeated or trailing whitespace.
            Spectrum = np.array(line.split()).astype(int)
       

In [68]:
import datetime
# Time one run of the search on the current Spectrum and N.
a = datetime.datetime.now()
Lead = LeaderboardCyclopeptideSequencing(Spectrum, N)
b = datetime.datetime.now()
c = b - a
# total_seconds() is the whole elapsed time; timedelta.seconds is only the
# seconds component (it drops microseconds and excludes whole days).
c.total_seconds()

23

In [69]:
# Show the peptide returned by the timed search.
Lead

'113-103-113-137-128-101-103-113-115-101-115-97-128-137-163-113-99-186-128-147-147-137-87-99'

In [60]:
N = 325
line = '0 71 71 71 87 97 97 99 101 103 113 113 114 115 128 128 129 137 147 163 163 170 184 184 186 186 190 211 215 226 226 229 231 238 241 244 246 257 257 276 277 278 299 300 312 316 317 318 318 323 328 340 343 344 347 349 356 366 370 373 374 391 401 414 414 415 419 427 427 431 437 441 446 453 462 462 462 470 472 502 503 503 511 515 529 530 533 533 540 543 547 556 559 569 574 575 584 590 600 600 604 612 616 617 630 640 640 643 646 648 660 671 683 684 687 693 703 703 719 719 719 729 730 731 737 740 741 745 747 754 774 780 784 790 797 800 806 818 826 827 832 833 838 846 846 847 850 868 869 877 884 889 893 897 903 908 913 917 930 940 947 956 960 960 961 964 965 966 983 983 985 1002 1009 1010 1011 1021 1031 1031 1036 1053 1054 1058 1059 1062 1063 1074 1076 1084 1092 1103 1113 1122 1124 1130 1133 1134 1145 1146 1146 1149 1150 1155 1156 1171 1173 1174 1187 1191 1193 1200 1212 1221 1233 1240 1242 1246 1259 1260 1262 1277 1278 1283 1284 1287 1287 1288 1299 1300 1303 1309 1311 1320 1330 1341 1349 1357 1359 1370 1371 1374 1375 1379 1380 1397 1402 1402 1412 1422 1423 1424 1431 1448 1450 1450 1467 1468 1469 1472 1473 1473 1477 1486 1493 1503 1516 1520 1525 1530 1536 1540 1544 1549 1556 1564 1565 1583 1586 1587 1587 1595 1600 1601 1606 1607 1615 1627 1633 1636 1643 1649 1653 1659 1679 1686 1688 1692 1693 1696 1702 1703 1704 1714 1714 1714 1730 1730 1740 1746 1749 1750 1762 1773 1785 1787 1790 1793 1793 1803 1816 1817 1821 1829 1833 1833 1843 1849 1858 1859 1864 1877 1886 1890 1893 1900 1900 1903 1904 1918 1922 1930 1930 1931 1961 1963 1971 1971 1971 1980 1987 1992 1996 2002 2006 2006 2014 2018 2019 2019 2032 2042 2059 2060 2063 2067 2077 2084 2086 2089 2090 2093 2105 2110 2115 2115 2116 2117 2121 2133 2134 2155 2156 2157 2176 2176 2187 2189 2192 2195 2202 2204 2207 2207 2218 2222 2243 2247 2247 2249 2249 2263 2270 2270 2286 2296 2304 2305 2305 2318 2319 2320 2320 2330 2332 2334 2336 2336 2346 2362 2362 2362 2433'

In [61]:
# Parse the space-separated masses held in `line` into an integer array.
Spectrum = np.asarray(line.split(' ')).astype(int)

In [62]:
# Run the search with the current Spectrum and N; the result is the cell output.
LeaderboardCyclopeptideSequencing(Spectrum, N)

'99-147-97-129-97-114-163-137-101-128-87-103-71-57-113-71-115-163-113-71-186-71'

In [63]:
# Two candidate peptides written as dash-separated residue masses.
# Peptide is the string returned by the search above; the origin of Peptide1
# is not shown in this notebook -- presumably a reference answer, TODO confirm.
Peptide1 = '97-129-97-147-99-71-186-71-113-163-115-71-113-128-103-87-128-101-137-163-114'
Peptide = '99-147-97-129-97-114-163-137-101-128-87-103-71-57-113-71-115-163-113-71-186-71'

In [64]:
# Score Peptide's linear spectrum against the current Spectrum.
Score(Peptide, Spectrum)

239

In [65]:
# Total mass of Peptide1, for comparison with the mass of Peptide.
Mass(Peptide1)

2433

In [66]:
# Total mass of Peptide, for comparison with the mass of Peptide1.
Mass(Peptide)

2433