In [1]:
import numpy as np
import copy

In [62]:
# Integer residue masses, one "<letter> <mass>" entry per line.
aa_list = """G 57
A 71
S 87
P 97
V 99
T 101
C 103
I 113
L 113
N 114
D 115
K 128
Q 128
E 129
M 131
H 137
F 147
R 156
Y 163
W 186""".strip().split("\n")

# letter -> integer mass
aa_weights = {
    letter: int(mass)
    for letter, mass in (entry.split(" ") for entry in aa_list)
}

# mass -> letter. Letters sharing a mass collapse onto the later entry
# (113 -> "L", 128 -> "Q"), so this table has fewer than 20 keys.
aa_weights_rv = dict(zip(aa_weights.values(), aa_weights.keys()))

# The 20 masses in table order, duplicates included.
aa_weights_20 = list(aa_weights.values())
    
def Graph(full_spectrum, mass_to_aa=None):
    """Build the labelled spectrum graph of a list of masses.

    Two masses are connected when their difference equals the mass of an
    amino acid; the edge is labelled with that amino acid's letter.

    Parameters
    ----------
    full_spectrum : sequence of int
        Masses of the spectrum. Mass 0 is added when missing. The argument
        itself is never modified.
    mass_to_aa : dict, optional
        Maps an integer mass to a one-letter label. Defaults to the
        notebook-level `aa_weights_rv` table.

    Returns
    -------
    dict
        Maps the string "<smaller mass>-><larger mass>" to the amino-acid
        letter, ordered by the smaller mass.
    """
    if mass_to_aa is None:
        mass_to_aa = aa_weights_rv

    # Work on a private copy: the previous version appended 0 to the
    # caller's list as a side effect.
    spectrum = list(full_spectrum)
    if 0 not in spectrum:
        spectrum.append(0)
    spectrum.sort()

    # diffs[i, k] = spectrum[k] - spectrum[i] for i <= k, zero elsewhere.
    masses = np.asarray(spectrum)
    diffs = np.triu(np.abs(masses[None, :] - masses[:, None]))

    graph = {}
    for w in mass_to_aa:
        rows, cols = np.where(diffs == w)
        for x, y in zip(rows, cols):
            graph[f"{spectrum[x]}->{spectrum[y]}"] = mass_to_aa[w]

    # Stable sort by the source mass of each edge.
    return dict(sorted(graph.items(), key=lambda item: int(item[0].split("->")[0])))

def Adjency(full_spectrum, mass_to_aa=None):
    """Build the adjacency list of the spectrum graph, using positions as nodes.

    Nodes are indices into the sorted spectrum (with mass 0 added when
    missing). Node `x` points to node `y` when
    `spectrum[y] - spectrum[x]` equals the mass of an amino acid.

    Parameters
    ----------
    full_spectrum : sequence of int
        Masses of the spectrum. The argument itself is never modified.
    mass_to_aa : dict, optional
        Only its keys (integer masses) are used. Defaults to the
        notebook-level `aa_weights_rv` table.

    Returns
    -------
    dict
        Maps a source index to the list of its successor indices, ordered
        by source index. Indices are the integers produced by `np.where`.
    """
    if mass_to_aa is None:
        mass_to_aa = aa_weights_rv

    # Work on a private copy: the previous version appended 0 to the
    # caller's list as a side effect.
    spectrum = list(full_spectrum)
    if 0 not in spectrum:
        spectrum.append(0)
    spectrum.sort()

    # diffs[i, k] = spectrum[k] - spectrum[i] for i <= k, zero elsewhere.
    masses = np.asarray(spectrum)
    diffs = np.triu(np.abs(masses[None, :] - masses[:, None]))

    adj = {}
    for w in mass_to_aa:
        rows, cols = np.where(diffs == w)
        for x, y in zip(rows, cols):
            adj.setdefault(x, []).append(y)

    return dict(sorted(adj.items(), key=lambda item: int(item[0])))


def FindAllPaths(start_node,end_node,adjency):
    """Collect paths from `start_node` to `end_node` with an iterative depth-first walk.

    Parameters
    ----------
    start_node, end_node
        Node identifiers; they are compared with `==` and used as dict keys.
    adjency : dict
        Maps a node to the list of its successor nodes. It is deep-copied
        first because the walk consumes successor lists with `pop`.

    Returns
    -------
    list of list
        Each inner list is a node sequence that starts at `start_node` and
        ends at `end_node`. The result is returned once `start_node` itself
        has no unexplored successor left.

    Notes
    -----
    NOTE(review): backtracking reads `path[-2]`, so calling this with
    `start_node == end_node` (a path of length one) raises IndexError.
    """
    adjency = copy.deepcopy(adjency)
    # node -> list of known node sequences leading from just after the node to end_node
    succeeded = {}
    # nodes whose successors have all been explored (or that were met again)
    visited = []
    all_paths = []
    path = []

    current_node = start_node


    while True :
        path.append(current_node)
        # Destination reached: store the path and memoise, for every node on it,
        # the remainder of the path that follows that node.
        if current_node == end_node:            
            all_paths.append(path)
            for i in range(len(path) -1):
                node_i = path[i]
                path_i = path[i+1:]
                if node_i not in succeeded:
                    succeeded[node_i] = [path_i]
                elif path_i not in succeeded[node_i]:
                    succeeded[node_i].append(path_i)
            # Backtrack one step: drop the last two nodes; the parent is
            # re-appended at the top of the loop.
            current_node = path[-2]
            path = path[:-2]
            continue
        # Already-explored node that never led to the destination: move back.
        # (The node is already in `visited`; the append only adds a duplicate.)
        if current_node in visited and current_node not in succeeded:
            visited.append(current_node)
            current_node = path[-2]
            path = path[:-2]
            continue

            
        # Already-explored node with memoised continuations: extend the current
        # prefix with each of them instead of walking the subgraph again.
        if current_node in visited and current_node in succeeded:
            for succeeded_path in succeeded[current_node]:
                new_path = path + succeeded_path
                all_paths.append(new_path)
                # Memoise the new continuations for the nodes of the prefix too.
                for i in range(len(path) -1):
                    node_i = path[i]
                    path_i = path[i+1:] + succeeded_path
                    if node_i not in succeeded:
                        succeeded[node_i] = [path_i]
                    elif path_i not in succeeded[node_i]:
                        succeeded[node_i].append(path_i)
            current_node = path[-2]
            path = path[:-2]
            continue
            
        # Dead end (no successor left): mark as visited and move back one node.
        # When the dead end is the start node the search is over.
        if current_node not in adjency or len(adjency[current_node]) == 0:
            visited.append(current_node)
            if current_node == start_node:
                return all_paths
            current_node = path[-2]
            path = path[:-2]
            continue
            
        # Move forward: consume the last remaining successor of the current node.
        if current_node in adjency and len(adjency[current_node]) > 0:
            current_node = adjency[current_node].pop(-1)
            continue

def Weight(peptide):
    """Return the integer mass of `peptide`: the sum of `aa_weights[aa]` over its letters.

    An empty peptide weighs 0; a letter missing from `aa_weights` raises KeyError.
    """
    return sum(aa_weights[aa] for aa in peptide)

def IdealSpectrum(peptide):
    """Return the sorted ideal spectrum of `peptide`.

    The spectrum holds 0, the mass of the whole peptide, and the mass of
    every proper prefix and every proper suffix (computed with `Weight`).
    """
    cuts = range(1, len(peptide))
    masses = [0, Weight(peptide)]
    masses.extend(Weight(peptide[:cut]) for cut in cuts)  # proper prefixes
    masses.extend(Weight(peptide[cut:]) for cut in cuts)  # proper suffixes
    masses.sort()
    return masses


def DecodeIdealSpectrum(full_spectrum):
    """Return the peptides whose ideal spectrum equals `full_spectrum`.

    The spectrum graph is built with `Adjency`, every path from its first
    to its last connected node is enumerated with `FindAllPaths`, each path
    is spelled out with the edge labels from `Graph`, and only peptides
    whose `IdealSpectrum` matches the (sorted, zero-including) input are kept.

    Parameters
    ----------
    full_spectrum : sequence of int
        Masses of the spectrum. Mass 0 is added when missing. The argument
        itself is never modified.

    Returns
    -------
    list of str
        Matching peptides; empty when the spectrum graph has no edge.
    """
    print("Make sure the full spectrum is entered")
    # Private copy: the previous version appended 0 to the caller's list.
    spectrum = list(full_spectrum)
    if 0 not in spectrum:
        spectrum.append(0)
    spectrum.sort()

    adj = Adjency(spectrum)
    if not adj:
        # No pair of masses differs by an amino-acid mass: nothing to decode.
        return []

    nodes = set(adj)
    for successors in adj.values():
        nodes.update(successors)
    # Pick the extremes explicitly; the iteration order of a set is not a
    # sorted order and must not be relied on for "first" and "last".
    start = min(nodes)
    end = max(nodes)
    print(start,end)
    all_paths = FindAllPaths(start,end,adj)

    graph = Graph(spectrum)

    ideal_peptides = []
    for path in all_paths:
        peptide = "".join(
            graph[f"{spectrum[a]}->{spectrum[b]}"]
            for a, b in zip(path, path[1:])
        )
        if IdealSpectrum(peptide) == spectrum:
            ideal_peptides.append(peptide)
    return ideal_peptides

def PepToVector(peptide):
    """Convert `peptide` into its binary prefix-mass vector.

    The vector has one entry per unit of mass of the whole peptide; entry
    `m - 1` is 1 for every prefix mass `m` (including the full peptide)
    and 0 elsewhere.
    """
    prefix_masses = [Weight(peptide[:end]) for end in range(1, len(peptide) + 1)]
    vector = np.zeros(prefix_masses[-1])
    vector[np.array(prefix_masses) - 1] = 1
    return vector

def VectorToPep(binary_mass_vector):
    """Decode a binary prefix-mass vector into the peptides that produce it.

    Parameters
    ----------
    binary_mass_vector : array-like of 0/1
        Positions holding 1 are taken as prefix masses; the last one is the
        mass of the whole peptide. As the printed reminder says, the caller
        is expected to prepend a 0 so that position i stands for mass i.

    Returns
    -------
    list of str
        Peptides returned by `DecodeIdealSpectrum`; empty when no position
        is set.
    """
    print("Make sure to add 0 in the first position in input")
    # Coerce first: with a plain list, `vector == 1` is a single False and
    # no position would ever be found.
    vector = np.asarray(binary_mass_vector)
    spectrum = np.where(vector == 1)[0].tolist()
    if not spectrum:
        return []
    total_mass = spectrum[-1]
    # Complete the prefix masses with the matching suffix masses.
    spectrum += [total_mass - mass for mass in spectrum[:-1]]
    return DecodeIdealSpectrum(spectrum)
def FindAllPathWeightedDAG(start_node,end_node,adjency,poison = None,weights = None):
    """Walk a node-weighted DAG from `start_node` to `end_node`, keeping the best tail per node.

    The walk is an iterative depth-first search. Whenever the destination
    is reached, every node on the path memoises the continuation with the
    highest total weight seen so far; when an already-explored node is met
    again, only that best continuation is reused.

    Parameters
    ----------
    start_node, end_node
        Node identifiers, also used to index `weights`.
    adjency : dict
        Maps a node to the list of its successors. It is deep-copied, so
        the caller's structure is left untouched.
    poison : list, optional
        Nodes that must not be traversed.
    weights : indexable, optional
        `weights[node]` is the score of a node. When omitted, the
        notebook-level `ip` is used, which is what earlier versions always did.

    Returns
    -------
    list of list
        Candidate paths (node sequences from `start_node` to `end_node`) in
        discovery order; empty when the destination is unreachable.
    """
    if poison is None:
        poison = []
    if weights is None:
        # Backward compatibility: scores used to be read from the global `ip`.
        weights = ip
    if start_node == end_node:
        # Trivial path; the backtracking below needs at least two nodes.
        return [[start_node]]
    if start_node in poison:
        return []

    # The walk pops successors, so never consume the caller's adjacency.
    adjency = copy.deepcopy(adjency)

    succeeded = {}   # node -> [best continuation after the node, its total weight]
    visited = set()  # nodes that are exhausted or forbidden
    all_paths = []
    path = []

    def remember_best(prefix, tail):
        # For each node of `prefix` except the last, keep the heaviest known
        # continuation (rest of the prefix followed by `tail`).
        for i in range(len(prefix) - 1):
            node_i = prefix[i]
            path_i = prefix[i+1:] + tail
            new_sum = sum([weights[k] for k in path_i])
            if node_i not in succeeded or new_sum > succeeded[node_i][1]:
                succeeded[node_i] = [path_i, new_sum]

    current_node = start_node
    while True:
        path.append(current_node)

        # Destination reached: store the path and update the memo.
        if current_node == end_node:
            all_paths.append(path)
            remember_best(path, [])
            # Backtrack one step: the parent is re-appended at the loop top.
            current_node = path[-2]
            path = path[:-2]
            continue

        # Explored node that never led to the destination: move back.
        if current_node in visited and current_node not in succeeded:
            current_node = path[-2]
            path = path[:-2]
            continue

        # Poison (restricted) node: mark it visited and move back.
        if current_node in poison:
            visited.add(current_node)
            current_node = path[-2]
            path = path[:-2]
            continue

        # Explored node with a memoised best continuation: reuse it.
        if current_node in visited and current_node in succeeded:
            succeeded_path = succeeded[current_node][0]
            new_path = path + succeeded_path
            all_paths.append(new_path)
            remember_best(path, succeeded_path)
            current_node = path[-2]
            path = path[:-2]
            continue

        # Dead end: mark as visited and move back; at the start node the
        # search is complete.
        if current_node not in adjency or len(adjency[current_node]) == 0:
            visited.add(current_node)
            if current_node == start_node:
                return all_paths
            current_node = path[-2]
            path = path[:-2]
            continue

        # Move forward through the last remaining successor.
        current_node = adjency[current_node].pop(-1)


def FindPeptideRealSpectra(real_spectra):
    """Return the peptide spelled by the highest-scoring path through a spectral vector.

    Node i stands for mass i and carries the score `real_spectra[i]`; two
    nodes are connected when their distance is an amino-acid mass.

    Parameters
    ----------
    real_spectra : sequence of numbers
        Spectral vector whose entry 0 is the mass-0 node (see the printed
        reminder).

    Returns
    -------
    str
        The peptide read along the best path, or "" when the vector is
        empty or no path reaches the last node.
    """
    print("Make sure real spectra has node 0 added in the beginning")

    if len(real_spectra) == 0:
        return ""

    full_spectrum = list(range(len(real_spectra)))
    adj = Adjency(full_spectrum)
    start = full_spectrum[0]
    end = full_spectrum[-1]

    # Score with this spectrum explicitly instead of whatever the global
    # `ip` happens to hold in the kernel.
    all_paths = FindAllPathWeightedDAG(start,end,adj,weights=real_spectra)
    if not all_paths:
        return ""

    graph = Graph(full_spectrum)
    all_sums = [sum([real_spectra[i] for i in path]) for path in all_paths]
    best_path = all_paths[int(np.argmax(all_sums))]

    return "".join(
        graph[f"{full_spectrum[a]}->{full_spectrum[b]}"]
        for a, b in zip(best_path, best_path[1:])
    )
    

In [3]:
def PotentialPeptidesFromProteome(proteome,peptide_mass,weights=None):
    """List every substring of `proteome` whose total mass equals `peptide_mass`.

    A two-pointer sliding window is used, which relies on all residue
    masses being positive.

    Parameters
    ----------
    proteome : str
        Amino-acid string to scan.
    peptide_mass : int
        Target mass.
    weights : dict, optional
        Maps a residue letter to its mass. Defaults to the notebook-level
        `aa_weights` table.

    Returns
    -------
    list of str
        Matching substrings in order of their end position (duplicates
        kept). Empty when `peptide_mass` is not positive.
    """
    if weights is None:
        weights = aa_weights

    potential_peptides = []
    if peptide_mass <= 0:
        # Only the empty window has mass 0; it is not a peptide.
        return potential_peptides

    window_mass = 0
    left = 0
    for right, residue in enumerate(proteome):
        window_mass += weights[residue]
        # Shrink from the left until the window is no heavier than the target.
        while window_mass > peptide_mass:
            window_mass -= weights[proteome[left]]
            left += 1
        # Checked after every extension, including the one that consumes the
        # last residue; the previous loop stopped before examining that window.
        if window_mass == peptide_mass:
            potential_peptides.append(proteome[left:right + 1])
    return potential_peptides
def PeptideIdentification(real_spectra, proteome):
    """Find the proteome peptide that scores highest against a spectral vector.

    The target mass is `len(real_spectra) - 1`. Every proteome substring of
    that mass is scored by summing the entries of `real_spectra[1:]` at the
    positions set in its `PepToVector` vector.

    Returns
    -------
    tuple
        `(peptide, score)` for the best candidate, or `("", 0)` when the
        proteome has no substring of the required mass.
    """
    candidates = PotentialPeptidesFromProteome(proteome, len(real_spectra) - 1)
    if not candidates:
        return "", 0

    amplitudes = real_spectra[1:]

    def score_of(candidate):
        # Positions flagged by the prefix-mass vector select the amplitudes to add up.
        hit_positions = np.where(PepToVector(candidate) == 1)[0]
        return sum([amplitudes[pos] for pos in hit_positions])

    scores = [score_of(candidate) for candidate in candidates]
    return candidates[np.argmax(scores)], np.max(scores)
def PSMSearch(real_spectra_set, proteome, threshold):
    """Collect the best peptide of every spectrum whose score reaches `threshold`.

    Parameters
    ----------
    real_spectra_set : iterable
        Spectral vectors, each in the form `PeptideIdentification` expects.
    proteome : str
        Amino-acid string searched for candidate peptides.
    threshold : number
        Minimum score for a peptide-spectrum match to be kept.

    Returns
    -------
    list of str
        One peptide per retained spectrum, in input order.
    """
    PSMSet = []
    for real_spectra in real_spectra_set:
        peptide, score = PeptideIdentification(real_spectra, proteome)
        # PeptideIdentification answers ("", 0) when no candidate exists;
        # without the `peptide` check a threshold <= 0 would record an
        # empty peptide as a match.
        if peptide and score >= threshold:
            PSMSet.append(peptide)
    return PSMSet

# Problem 1 Peptide Identification 

In [4]:
# aa_weights = {"X":4, "Z":5}
# aa_weights_rv = {4:"X", 5:"Z"}
ip = """0 9 -7 8 28 6 -9 -7 -19 17 -6 -12 5 6 12 -14 -17 -15 -1 -13 14 16 -17 24 8 20 7 -13 -4 29 -15 -3 3 -20 1 30 -20 21 19 -5 28 23 13 7 8 23 19 6 -9 -13 -11 4 -19 23 -15 -7 -6 11 22 11 2 13 -16 2 10 -18 8 3 -20 7 -19 12 25 -9 -13 19 -7 30 -18 15 7 13 -10 -3 10 10 12 14 26 23 -2 10 18 14 7 10 -17 15 -18 23 -7 12 -18 4 -5 11 -6 -12 0 12 -8 8 5 26 1 -15 14 -3 17 11 -18 -14 30 11 0 24 -11 1 7 -19 -20 26 6 13 -7 -8 10 13 3 -19 11 26 19 4 -17 24 19 -20 19 -18 25 16 -4 19 19 -7 1 -8 -15 -3 -3 21 -14 14 -18 7 29 15 -18 4 -4 14 -16 -6 29 0 -9 -15 -11 -19 -12 -8 13 7 -8 6 26 5 -8 21 3 27 -13 29 3 -18 27 11 -2 -15 29 23 8 24 10 -5 16 -3 2 28 24 4 29 -1 -13 -20 8 -6 -5 3 -4 4 1 13 12 -20 -15 20 11 -10 29 25 12 28 -16 29 10 7 12 12 -6 -8 -5 11 -11 24 -17 0 -4 11 -4 16 -4 -12 -6 11 7 7 -12 -6 16 10 13 27 28 26 30 30 -2 2 -18 24 13 0 -2 -16 1 -14 -9 21 10 -13 9 23 3 24 -9 22 -18 2 -16 22 14 6 3 14 -9 20 13 -1 -6 -14 7 30 20 4 15 6 26 -20 13 2 -9 18 0 19 3 -16 -3 12 24 4 -7 20 -14 23 -5 -17 23 0 -7 13 18 -15 21 16 1 27 -4 13 12 -8 -7 -9 13 3 6 -20 5 -7 2 12 12 6 10 -14 -16 4 25 24 29 17 -11 12 -4 19 4 6 4 12 -17 -11 -19 14 -6 -8 5 -5 -17 23 -19 8 -4 25 -18 -4 -2 -16 18 20 2 24 21 26 4 13 -14 -14 10 11 17 19 -13 8 -7 28 17 -11 6 28 27 24 4 3 -20 29 -12 -19 14 -7 5 20 4 19 -8 28 6 -16 -7 4 -1 -15 -19 6 16 5 27 -18 2 19 17 12 -20 14 -20 23 -4 -5 -19 21 7 28 -18 -20 -10 17 20 16 -14 11 -3 18 28 22 9 -14 -7 10 13 -14 18 -2 29 12 -1 -7 15 21 1 -3 -1 17 17 -16 9 -6 22 -12 12 0 24 4 20 22 -3 -14 23 -5 -18 27 3 29 -10 16 6 18 20 -5 26 24 22 -15 30 27 7 17 13 3 7 17 27 29 -1 23 -1 19 8 29 -20 13 5 -16 19 17 21 -9 -9 5 24 12 28 20 -10 -12 6 21 6 12 21 26 -2 26 21 20 -5 0 -11 29 -6 17 0 -17 6 10 19 4 -6 16 7 30 -4 -18 -13 10 -5 26 7 27 -4 27 23 -5 11 5 -2 6 21 17 23 3 -8 -6 -8 4 13 13 21 9 -2 13 1 22 -10 -1 1 13 1 -15 26 -15 -18 30 6 19 -16 -6 20 8 29 -10 -4 14 7 -2 14 28 10 29 18 28 24 11 29 1 18 14 26 -5 -19 -3 3 10 -10 -2 14 -20 -16 -9 12 -4 -16 -13 26 18 -1 13 15 -5 0 17 -13 -15 -16 
-17 -8 -4 -13 -17 -11 -5 9 19 1 18 28 20 21 -10 14 29 16 24 -6 11 -12 25 -5 26 -6 5 -12 14 21 -13 27 12 -14 -20 -15 30 12 28 14 -5 -15 11 -4 -19 8 10 27 16 18 -4 10 -16 -17 -16 -14 29 8 15 9 -19 -13 -3 21 30 23 -14 -15 5 8 15 -19 7 5 18 16 -16 16 9 19 30 3 27 29 1 14 -5 -9 -5 24 20 -1 -3 16 27 0 -11 11 -6 10 16 -7 17 -15 2 -19 -15 -19 -11 13 -9 14 8 1 30 -11 -2 -7 -19 2 17 19 -18 -12 -3 -11 -16 7 3 30 15 14 19 12 -19 19 11 -16 17 -8 29 17 -7 16 -17 6 -7 19 25 29 17 -19 26 24 -12 19 16 16 5 28 -5 -19 -9 -15 24 -7 -17 -14 4 27 15 22 4 -14 23 29 26 -16 28 23 7 12 0 -3 1 -18 -15 17 -5 -4 -19 -12 21 -17 5 -19 23 6 19 -16 1 30 22 -4 -16 -18 3 10 0 16 -2 -13 10 26 -9 -13 21 14 3 8 28 -5 -18 -19 9 8 -13 -18 5 -1 -16 12 -8 25 -7 -15 -6 26 27 -17 14 27 -4 -14 -17 16 23 -10 5 12 -5 -20 25 -5 7 4 -15 3 15 -6 3 -5 -19 -1 -14 12 11 11 4 25 16 17 -12 -15 -6 18 -13 -11 27 13 -7 -19 5 -12 -12 11 21 -11 -19 7 -6 17 -6 -9 10 11 12 4 -8 30 -9 -8 -12 7 26 -5 27 14 25 6 27 -19 16 22 22 19 14 -17 1 -11 -6 -2 -17 4 15 -17 -17 -8 23 8 -1 15 -11 27 -3 12 -17 19 -2 28 26 29 -12 -17 8 10 -5 23 26 12 8 4 30 13 -8 -14 -18 27 -13 -4 28 2 14 -9 30 -16 8 27 -18 -16 -13 -17 9 10 -16 18 23 0 -12 17 17 -4 9 -3 19 21 30 14 26 8 20 -11 24 -4 28 29 30 16 3 11 26 -9 27 29 11 1 3 27 17 -12 27 16 -18 -2 -6 12 30 18 6 23 6 22 29 24 26 9 1 -8 -18 9 -19 -18 17 -18 2 17 25 15 -7 29 -6 -4 2 23 -2 13 7 16 19 6 23 15 30 -7 -20 -13 12 28 28 21 -19 30 11 -13 9 -9 2 -8 -16 30 18 4 20 17 -19 22 -4 -2 -15 12 8 -3 28 -4 30 16 -13 22 21 29 -20 19 -19 7 22 15 -3 5 -7 -4 10 -13 1 10 18 12 7 8 26 -7 -9 -11 22 12 27 -7 -2 19 -18 3 -1 3 22 22 -11 -15 -12 -17 20 13 28 18 0 -14 2 -5 8 14 -8 20 -18 9 17 -18 5 -11 8 19 1 28 -20 -1 29 8 -8 -2 5 -4 18 -5 -17 22 27 28 3 -5 18 -1 27 9 -18 -5 -9 0 -2 24 -1 -14 20 -9 3 -17 17 -6 27 1 -1 -1 14 -16 -12 3 -14 16 -13 -5 -16 15 16 14 15 -20 9 21 -9 -16 19 -15 19 7 -20 7 24 19 26 -12 -17 -13 14 26 -15 -15 -14 21 -15 1 2 12 24 -1 -14 -11 -9 7 -13 24 16 27 1 -5 14 30 14 -18 -3 -10 -3 6 -15 26 
18 -6 28 21 20 3 6 27 -19 -18 12 -12 -6 -7 -17 25 -11 25 1 -9 -17 29 18 17 14 -20 1 -6 19 -7 -11 30 16 28 30 5 -20 -13 -1 -16 -7 -15 -10 -11 -20 1 -14 -7 11 -11 -9 -16 -20 4 27 -16 14 4 17 -11 -17 3 -9 -8 -11 3 29 6 -10 -14 17 -14 12 21 5 14 13 27 20 2 2 15 14 22 -5 13 5 18 -12 -15 -20 -17 14 4 18 4 27 -3 -5 5 -4 14 -7 15 -7 28 -9 -3 -5 -7 21 -6 18 7 30 11 -12 11 23 18 12 -4 -12 27 4 28 -11 -2 5 -19 3 2 18 16 21 13 10 29 -20 -13 13 12 4 0 -17 -3 17 -17 -19 15 19 -19 20 -13 30 3 22 30 22 -1 14 -18 -19 28 -11 20 6 -6 13 28 12 16 -7 25 7 5 -19 25 -12 15 16 -18 -8 8 20 29 3 -3 -19 25 29 19 17 25 -6 21 -14 24 -12 -13 5 -12 -10 19 24 13 25 -20 27 -9 6 19 22 0 11 25 -16 -16 1 -13 -17 26 7 23 -17 13 -13 -11 -13 -18 -4 24 -20 -2 4 -2 -9 19 -17 19 -14 16 8 29 24 8 -1 9 2 4 -18 2 25 17 14 7 17 -12 17 -2 30 -1 -5 25 0 16 -7 -18 23 -16 -13 20 12 22 21 23 15 0 18 -2 26 -12 5 -3 -4 20 7 19 -13 8 5 23 5 27 -10 -14 16 0 25 -13 28 -18 -17 25 0 -16 24 2 12 -13 -10 27 -1 -7 24 -16 8 8 3 24 7 18 1 -18 25 14 0 -15 5 23 12 2 -4 27 -2 10 23 30 10 -4 18 20 5 24 22 -2 -7 16 17 -4 27 -18 -12 11 -5 -18 25 3 -5 6 6 26 -18 -16 -4 24 -3 -20 18 -9 -18 -4 2 30 14 0 4 -3 -14 30 -8 -7 25 -18 15 11 -11 22 -3 2 12 24 27 -9 -16 -9 -20 29 20 26 14"""
# Split on any run of whitespace: the pasted vector wraps over several lines,
# and splitting on single spaces only worked because int() happens to ignore
# the newline left glued to a token.
ip = np.array([int(bit) for bit in ip.split()])
ip2 = "TRKRRFQKQPYRMPFKRFPQDYEAEWPEEHFKYTKLLINWMWYWLWVEWGFEDACASIWDMCDKNMPRGYIQAMPKAVCDHKWRVEVESESFLKIFYLGEFNSYAFQGHSEYPTGRWRKILRCSNMEMVLGNMKEAIGFFTPQDIHKTTKGYIACGTVWCVKMAVENSYHAQGVPMKPVLLWTFQTREDLEVNWEGPDIAEGKDRVAPKCKQVPPPSEGKHMLLPKYINLRHGVDSVFREGLGRQVLGECDQNSQGGQSLHVRGFVRTWLCLTHLPKLRKDVMLWNFEYKMQYVWTQFELFICRCHFFMVFYKLGFPDLMQHPFHTIAIVRDMWEEVIRVPPRVKCRFRCTDDQWQEEMFKTNFDCCPHEIRHDGWRQSNITWQNMNMVYPIKSMVHHWDVDEPIYMPANDDFDPHLFSTWGCGCFWRIKFEEYFMTEKMMCQPSDTHNIVHKIGRCELLNMEEQHPWIPMSSFPAMCITDVQHAYPYIGEFADKKRGVLRHDWVRLLTREYKEEFCYHCMIIMMCWIHTMTTGCEHDNMFQHLEMWERQLMHGVVIRDYSPSEEKDEYPLFIGKHNKAPWEEQMEVTQQDQQSLIMCRGMDRLHHKIYLPVRNRHRKVSQMFLLNYDMVMREMVAHDVYLVQITEKEYAPPVPRKYRINMYWHYDADNWAPSPIATAPKYWQYQLCKGKFDWYGEMKEVTADYSINFDVMSKFSSFTKPQFQRKGKMDANWRGNCIQGWEIKIYWQLKGEKQKKWNNAYPENQTTSVVDDGMFLEAYSPHNGDATFHGPGKARQWGQPCYSRQTRCEFMHKEKFPYAVYGTTVGGENHDGLVLWVPLWWPWASENVKPPGWWSGLYYRNAVFWRQDIYVPVWYTLDIGLVLCSNFYCGRVMSCIWSMPWHLLCTRQPPKTYYVFHGRIWEYNITVHKGVEYKENAFAFNFYAYTEGTHEMFHQCLRVSKGSRHLTPNLTVSDKWNADHIETIEYQGASFKKEVRHMMTTDIAANAQNYYHGCGTRHDYEQQVSFRYVRLVMKEMQRGGRNGIHWAFNVKVMPWSILWFKICILFYHKIPDYEIGCDKKQMMHNPQNGVDCRRCRACYILNHKSFYRTEADNVYYGHVVQKPCRHDVMDKKAGECWSAWRLTNPLRITGMINRHNRADQYDNEFTRLKEIWDGLQHCHEEIRYRFLVFPVRRQEHHFAELKAAYVFMKQRELPKVRQVQKNHKTCNLRVQCINWYQRIMHPNFQHEAELIHIMAHKVYWPEDYQIMESWEQLKDTLDGMFADRVVQGDFFCEHTTFWIKCLFTIFRPTDFFNQGYLALVEACVCQPIALYRRTPGDGYFTSHFVFEVADVGQYQLIQQSAWNRIQYMHEIFGWWIEPSLMLSIWRGDHDYNKWWIRMVERMWNRDLAYSVCKAALEYCRWDMYMLEHTHFVQPTQGPFILWFYIHTLPWRVNTTDWMQFNAGDASRKTRWDSVMHIICDEQLCKEHDFRGYPSHYNAGQKAQGDMGNKLIAGMCKITYAMWDMIQRGMEPEKIHYVKCTRHADKHHDCNHVRCHKDNFLSNDQDWCWSMNAWLHKKLQVCYPIEITEERECTDKMHVWKMPYPIEHHQQQQPYFNRDQSPQQSRKCYICMTILYALPAVMCSQKQNHHYYESLNECPNAHQSFGEPRWEFYDFACNWTCVSFINLQHRGAMTGSYQEIVQAHESGENKHRLGHRHIMYIWEYDWDQRWVDQHTQITESRREQLQCEYKKYMGLDKGSIMFMSAASAGKWYQDLDFDDYHGMHESFKIHTIPLCQPFAGIDQLFQTLFLQSIEIKFELKRQQLVCYSIQFFIYIPKDGRMLEFRHQASMSTGFGTMTPHGDDPEQNDVCFDNICCCYVSFEYWAIAEDFKNGPRFCENSLVECVKTNIQILQDQSMAIAEGYKHEMSKVVGLRHDCTVMTEVKWIEDQDTDNSHVPLVETMWNMVVTLVEECKAKEWEDDSHVGYNCCESCVYR
HRRTWTEKRAVFESRMLHPQKPMRDAYNHTAKCQTDAQLCNILECTMCWYCIDDYVTKYLYAACGYTSHSIKPKGYRMKRYFSFDWCSPLVKKKWDQSIWHNFTSDISYKNSCAYCYDNPEGKSDEQLWWPFTDCMQNYETGGSQYIWGWFVHFSHYCHTRADCQTIRYAKVWYRDFFKKYESWCDKFFQYMDTYPKSRVPRTTQIMYFKLICYARKLNGCLAYSWTDMNSINMADSWVARRNIRRLRLFNQMFDSYPRCIGTSWKSHVRWETQQKGSVDAPESRNNDQQLCICRKCRNSPACWEAHAAITAHHRTVTRDCNEEGKFMGNEKIGCMWHWGVGQLQCKGGERNNVDQHYPWRYTGAFRSGNQAFHPCDGYQTIYMHPRLCDTCCKENFCIEPKKSNRYNWDSMCNKNICPHSNSFFQHSWVQSYMCLVILIFAWCDGSTATYTSTGDIMSFCHGRSQWTEHHPSPNSNHQSWENNAKEKCSVYWDTLYGGLEGSTAMVTWEYDGWFIFEGEWKPERYFMATPCHDPLGVSVITWLVYKHVSQVNEGKEDEGNYEAHGPSFRRMWCVEIVESGHDMNGCKFPTLAHLDDMTILGIINFSRLSRDFSLRHRYWDHNPKVLHQMIYKFVGHAVISHMFNTTNCKKMCFMQHKHIMEMEKWYNAQFGHPGPGTFRKHGLERCMQLFCCGQWNSDMWMVYLVENMHVVAPHFFNQKLPYHFQVSQNDLLAQHCTSFHVQKWKIQQAECCGFMKGWFLGDDGNVKQQDHESDWCHKKKYSECVVEFSMIITLMESNVCKSWCWSHDDGESWRWYVNIMNVCMPYGVDRYITPFFAKLCLWTVVQYMDESPTFKPWTEWWSFCCCTTTRAKMFHLNLCTTETVDVGMAMQNQLISCQQHDSTWFFFVERYVVYGAFKEFNNHWTNKYYEKHDLECQYIQRECCHAQGCKGSHENKGHHTFTTSFGAKCFPTNQVGNRPEFILFVIFWYDFELPLRWVLKDRDTPACGSAENVEKNRNMDHMENKNLLVCYGMVFSFAYLIQQQCMDRGMSPQAPYPFKFFNKGDCSWGSKEQFDWYRRYDTIQWGHNKQEEIDIYFSHCHSNKRGRGDTMHDSSYRYKFHMQKRSKRICGWSHTAAAIVQECSWWMPMLGCMKYYYMLLDKDVMYINICVEWPHFRNIQNNWYGWDMKQEPKERHDVHACTGSAVYMSYFCSDESHIVDWSQICNKWAMQNHLDPQQMWMIEMLFMCINFTQLWIGHGCMHSLFAYPKEKQPHTPKRYNDWNGMTHNYELWSTFKFWAMRIQHMGALHPLTWPKSPQRDFAKFALDFDDGVRKIEHKPPILKVGHLSGKTEHHAGVVRTMWDDWWQDWKFCMTKPMGENSYKNIYSGRKDDGMFYTQVISCYEDHQDLVYECISLFMMQATHFPDNYINKMVPMPNGLSEEGVNQGKCRHGYPPHGYYGECNKKDPWLKGNSNCLEEIHEYDDFCVMEVINAKVRHCTAWGCFYWTTADLCGNTIMDYRLPSREKVNFAHNEIAQCVTNNYRVDLDWHGEVYPPNWQFDLTTLITKDTKACREGSTCLQTINMFETMILDTGVGWIKWDDCMYYWYWHRGECIDYGVALKGWVTNFKTSCGAHYEHGTQCVLELFYDPACERGPGWSYNPCFCIFVRTTDFDQYMILEFIEHEENVWVFLRKPFWDGCPPHSDWIMSINHAHKCCTSPNCGHHARQGMLRFDVGYSNKYMFQCEWTLNSFVRIRWREWHNESVGIMHHWSKCWYKMNDRCRAMPGVTEPKRTGCRGCTLMWRINSMTEVSEWPWRLGAQIDGGAQEYCPDLRQMFRQHWPTCVTWRWLDRTPHPKWNLSLSCKDEPMSDECEFELPMWKEKHLHSVNPCTDQYYANCHWKQMWTSQMDPQEMRMLYDTFNIEDLDWCWWVQGSNWSKHMDNKPMDNDGEDHHVNMFKCRSHCFDTLHMTPLNKKQWQERSMPGPSDWFPRISWLCKMG
HVCQTPPNLENDYDIQHGEGAWWSYLNVQPLWHMGWFDFTTYKLTDICPYPGGHSCMISCELIRNWDHRIAFMSPAVIVVDHRQWARAWMLLANGYPRPDTNGPLEGIHTVPTYGLGWVDSHVWMIWPADLQLIKMRRKPWCCKWIDPQDQVWYQNGQSHVWTQGHAWWIQLIHIFFYLPTGACNTMVMMKGSGGRGCTHNKFCVSEAKVQKMKTYRSNCWERDIDGDPPGHDKWRVGNCDNQHEPLRWMQEEVCLQESVASFYRELRWYGHGALRDCDPNVLCITWFFIPKTTPVPRPSLIGEQTIAHIMNKGWYERQAWWPECTFVNFTWHMQYYCQVPWTIFTPAYMHLEKWYIINPTSAHVFLHMELWEYIDIYMAIIDQMAFSFVEFEVNICAWPWKIEGECHDRLGCHLIYNYKSQSPQSVRCLGRCEGCTMNRMHADCNNPIILMASIKNFCEQVAYITVKETVNVHMVAPKFYHVFPMTITLFWIFKRKYRFYWWWNTHLQLEIYWTLQEPGFWADPCRKEFPCCPMWTHRREVLPPFCQDWEIRCWPCDLYLVTWALVCNEWPDNFQLWVGIAAKGPRAQYPAHCALSGTKFIGVWCFIFHMARFNKKCHYMQRMIGPGMECSQCSNKRKFAQVGVHSFDNDLMKCQKTSNSKQEYKKDNPIYDEGMMNLEPKYPPLRYMSGPRRHKTMEREMYHRPWYGDHTGQAAEVDRYKIMIQMAFFMFISDEWNLQARFIQTWQRNICKDACQRQWGTKGYSFPNQVQCARTMSMDHRTNRERTWCHWCGYYHKIWPYGHNPWVLYLTKELSQTTFIPPKHCLWHDQSNIYLSQLTMMQDKPLCPSPHTLVGCWQIEYVWMQNRTVHIPLNHRVYGGKMNSVRQSFRQHYIKWSMNTQHCKPDHQYPDHWKRYRSWTFFKYRPKDAIGWAYCHEIGIKWHVMDRQMNKRKAQDVAECCVRCPLMWCKIPFYGMRYAQPCLGWDGRHHSIYIKLENIWKRGPFP"
# Bind the parsed inputs to the names passed to PeptideIdentification in the next cell.
real_spectra = ip
proteome = ip2

In [5]:
# Displays the (peptide, score) pair of the best-scoring proteome peptide.
PeptideIdentification(real_spectra, proteome)

('PFILWFYIHTLPW', 171)

# Problem 2 PSMSearch

``` pseudocode
PSMSearch(SpectralVectors, Proteome, threshold).
    PSMSet ← an empty set
    for each vector Spectrum' in SpectralVectors
          Peptide ← PeptideIdentification(Spectrum', Proteome)
          if Score(Peptide, Spectrum') ≥ threshold
              add the PSM (Peptide, Spectrum') to PSMSet
    return PSMSet
```

In [6]:
# aa_weights = {"X":4, "Z":5}
# aa_weights_rv = {4:"X", 5:"Z"}
# Problem 2 input. Layout read by the parsing below: one spectral vector per
# line, then the proteome, then the integer threshold on the last line.
# NOTE(review): the literal is empty in this copy of the notebook, so
# `int(ip.pop(-1))` raises ValueError until the dataset is pasted in.
ip = """"""
ip = ip.strip()
ip = ip.split("\n")
# The last two lines are consumed from the end: threshold first, then proteome.
threshold = int(ip.pop(-1))
proteome = ip.pop(-1)
real_spectra_set = []
for line in ip:
    real_spectra = line.strip().split(" ")
    # Prepend an entry for mass 0 so that len(vector) - 1 is the peptide mass
    # PeptideIdentification derives from the vector length.
    real_spectra = [0] + real_spectra
    real_spectra = np.array([int(bit) for bit in real_spectra])
    real_spectra_set.append(real_spectra)

In [7]:
# Run the search over every spectral vector and report how many matches were kept.
PSMSet = PSMSearch(real_spectra_set,proteome,threshold)
print(len(PSMSet))

28


In [8]:
# Print the retained peptides, one per line.
for i in PSMSet:
    print(i)

MNWITGGHRPWML
IAGMDGMQGALEVYKR
RMGIPRGAYTWGVH
ALGGMWVEIMDRTCQ
HFKQYASGKTKSPTVD
QETTSVIMMVYTQSR
NAPMEREPNSDRLF
MSCEEFEWRDDP
HAIDFLCYFRGIARG
TFPMETCQDYQTAF
KYMCEKFPGWIWL
EQHCFFCMWPWNF
ANPLDPLHREANCY
HERQKGNPNPSFHS
TSNTAIWYGAKEQH
FSCGKFKVTHDPIR
AYQGCIREDIRIGQW
GQQYDMDYGQFYI
PGWIWLGLVPRMGIP
TSIYWRHAFIFGTA
KPDTVYNFYCQGEQN
SYDECLYKKGHLP
RHMHSVMLPCCFGFH
GPTCIICNYTPPMEQ
EYTPTFPMETCQD
VNDLKTTRMCELN
YIWFLTEINQEYA
GATVGIKHSSIYVVKDS


# Problem 3 Size of Spectral Dictionary Problem

In [130]:
# aa_weights = {"X":4, "Z":5}
# aa_weights_rv = {4:"X", 5:"Z"}
ip = """0 1 -9 0 -2 1 -3 10 -3 1 -8 14 -9 -10 12 -7 -9 2 4 4 11 -3 -7 -7 14 -9 12 10 -8 12 3 9 -6 -6 10 -4 13 7 -1 12 12 -5 -10 4 -6 -5 15 10 10 -2 -3 4 -3 1 12 0 15 1 7 -3 12 11 -9 3 3 -5 -8 11 -3 8 0 -6 15 15 11 4 8 2 -1 3 0 0 11 11 -3 10 0 10 10 -1 -4 0 4 -4 7 -10 9 -8 5 -9 -5 9 -1 5 -4 13 -7 5 11 -4 -6 5 -3 5 8 13 9 -10 4 0 1 -1 11 -9 -3 -6 8 -9 9 5 -7 6 -8 10 -8 -1 13 11 -6 -5 8 3 -2 -7 -1 10 3 2 -4 2 12 11 6 5 15 6 9 1 -7 -6 10 -7 -2 4 5 11 5 -5 15 9 6 0 5 -7 11 2 1 1 14 -10 -2 10 7 -3 7 -5 -1 -4 12 15 5 -1 -4 -1 12 -9 1 -9 2 15 15 9 2 1 13 0 15 2 5 -5 -6 12 -10 3 -10 13 -4 8 6 11 5 -2 6 8 10 5 -6 6 -1 -3 -6 -3 0 12 -8 7 2 -9 15 13 -8 13 8 4 7 -5 10 1 8 1 -4 5 3 3 4 9 7 -8 9 -5 11 -9 -4 4 2 4 3 12 -5 6 8 -7 -5 1 -10 15 15 6 11 14 3 -5 -8 3 9 3 12 12 -8 4 -3 0 14 0 0 12 2 9 -5 15 -3 -5 -6 11 -4 6 7 -9 15 4 11 7 -4 15 15 13 7 14 7 3 -5 -8 5 11 15 15 0 3 -6 13 8 15 -6 -6 -9 -9 -1 -6 -6 1 3 -2 5 14 15 -3 5 -8 -2 -5 2 5 -1 -2 10 -6 -10 15 11 -10 5 -8 3 11 13 -1 1 4 4 12 6 1 4 2 8 14 -3 8 -4 1 2 3 -7 6 2 5 7 -7 2 -4 9 10 7 2 -5 -8 -7 -8 -5 -2 4 15 -8 -6 15 2 7 -4 12 -2 13 7 9 -8 3 -5 -1 -10 -6 10 -4 9 5 8 11 8 -5 4 -6 -5 6 6 13 3 -3 -4 -10 0 -3 3 -8 -8 0 15 -8 6 12 13 14 -8 -9 -2 4 -5 -5 8 6 5 0 -9 13 11 -4 8 -10 14 3 -7 0 -3 4 -1 -5 7 15 -9 5 0 -4 14 3 -4 11 -7 5 4 4 -7 15 -6 14 8 11 7 3 7 6 10 8 4 9 6 -6 8 3 -1 2 -4 8 -9 11 13 6"""
# Score window of the spectral dictionary: peptides scoring from t_min to
# t_max (inclusive) against the vector are counted.
t_min = 33
t_max = 200
# Split on any run of whitespace so wrapped or double-spaced input still parses;
# splitting on single spaces yields empty tokens that int() rejects.
ip = np.array([int(bit) for bit in ip.split()])

In [131]:
# Dynamic-programming table for the spectral-dictionary size.
# Columns are masses 0..len(real_spectra)-1; rows are scores shifted by
# `downward`, so score t is stored in row t + downward.
real_spectra = ip
# Padding rows on both sides of the 0..t_max band so that score-shifted row
# indices stay inside the table (the DP cell reports when they do not).
downward = t_max*2
upward = t_max*2
n_row = t_max + 1 + downward + upward
n_column = len(real_spectra)
size = np.zeros([n_row,n_column])
# Base case: exactly one peptide (the empty one) has mass 0 and score 0.
size[0+downward,0] = 1
size.shape

(1001, 513)

In [132]:
# Fill the table column by column:
#   size[t, i] += size[t - score_i, i - mass]   for each of the 20 amino-acid masses.
# 57 is the smallest mass in aa_weights_20, so no lighter column can be reached.
for i_current in range(57,n_column):
    for mass in aa_weights_20:
        i_back = i_current - mass
        if i_back < 0:
            continue
        score_i = real_spectra[i_current]
        # Source rows: scores from -t_max to 2*t_max, shifted down by score_i
        # (all row indices carry the `downward` offset).
        last_row_back =  t_max +t_max + downward - score_i
        first_row_back = 0 + -t_max + downward - score_i
        rows_back = [row for row in range(first_row_back,last_row_back+1)]
        # Keep only the source rows that exist in the table; the print flags
        # any truncation.
        eligible_rows_back = [row for row in rows_back if row >= 0 and row <n_row]
        if len(rows_back) != len(eligible_rows_back):
            print("row back and eligible row back", len(rows_back), len(eligible_rows_back))
        # Destination rows are the source rows moved up by the score of this mass.
        eligible_rows_current =[row+score_i for row in eligible_rows_back]
        size[eligible_rows_current,i_current] += size[eligible_rows_back, i_back]
# Number of peptides of full mass whose score lies in [t_min, t_max].
sum(size[t_min+downward:t_max+1+downward,n_column-1])

3456.0

In [116]:
# Show the 20 masses iterated by the DP; 113 and 128 each appear twice.
aa_weights_20

[57,
 71,
 87,
 97,
 99,
 101,
 103,
 113,
 113,
 114,
 115,
 128,
 128,
 129,
 131,
 137,
 147,
 156,
 163,
 186]