In [1]:
import numpy as np
import copy

# Week 1: Introduction to Evolutionary Tree Construction

In [2]:
'''
Distances Between Leaves Problem: Compute the distances between leaves in a weighted tree.
Input:  An integer n followed by the adjacency list of a weighted tree with n leaves.
Output: An n x n matrix (di,j), where di,j is the length of the path between leaves i and j.
'''

def Distances_Matrix(n, adjacency_list):
    if type(adjacency_list) == str:
        adjacency_list = adjacency_list.split('\n')
        
    graph        = dict()
    graph_weight = dict()
    for adjacency in adjacency_list:
        adjacency = adjacency.split(':')
        graph_weight[adjacency[0]] = int(adjacency[1])
        adjacency = adjacency[0].split('->')
        adjacency[0], adjacency[1] = int(adjacency[0]), int(adjacency[1])
        if adjacency[0] in graph:
            graph[adjacency[0]].append(adjacency[1])
        else:
            graph[adjacency[0]] = [(adjacency[1])]
    
    length_matrix = np.full([n,n], 0, int)

    for from_ in range(n):
        weight_row = [0] * (max(graph.keys()) + 1)
        froms      = [from_]
        
        while len(froms) != 0:
            
            next_froms = []
            for node in froms:
                tos = graph[node]
                
                for to in tos:
                    if to != from_:
                        if (to not in range(n)) & (weight_row[to] == 0):
                            next_froms.append(to)
                        
                        graph_weight_key = str(node) + '->' + str(to)
                        weight           = graph_weight[graph_weight_key]
                        weight_row[to]   = weight_row[node] + weight
                        
            froms = next_froms
        length_matrix[from_, :] = weight_row[: n]

    return(length_matrix)

# Test
n = 4
adjacency_list = '''0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6'''

Distances_Matrix(n, adjacency_list)

array([[ 0, 13, 21, 22],
       [13,  0, 12, 13],
       [21, 12,  0, 13],
       [22, 13, 13,  0]])

In [3]:
'''
Code Challenge: Solve the Limb Length Problem.
Input: An integer n, followed by an integer j between 0 and n - 1, followed by a space-separated additive distance matrix D (whose elements are integers).
Output: The limb length of the leaf in Tree(D) corresponding to row j of this distance matrix (use 0-based indexing).
'''

def Limb_Length(j, length_matrix):
    if type(length_matrix) == str:
        length_matrix = length_matrix.replace('\n', ' ')
        length_matrix = length_matrix.split(' ')
        length_matrix = list(map(int, length_matrix))
        length_matrix = np.array(length_matrix).reshape(n, n)

    min_length = float('Inf')
    for i in range(length_matrix.shape[0]):
        if i != j:
            for k in range(length_matrix.shape[0]):
                if (k != j) & (k != i):
                    length = (length_matrix[i, j] + length_matrix[j, k] - length_matrix[i, k]) / 2
                    if length < min_length:
                        min_length = int(length)
    
    return(min_length)

# Test
n = 4
j = 1
length_matrix = '''0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0'''

Limb_Length(j, length_matrix)

2

In [4]:
'''
Code Challenge: Implement AdditivePhylogeny to solve the Distance-Based Phylogeny Problem.
Input: An integer n followed by a space-separated n x n distance matrix.
Output: A weighted adjacency list for the simple tree fitting this matrix.
'''

def Attached_Limb(length_matrix, j):
    for i in range(length_matrix.shape[0]):
        for k in range(length_matrix.shape[0]):
            if (i != j) & (k != j) :
                if length_matrix[i, k] == length_matrix[i, j - 1] + length_matrix[j - 1, k]:
                    return(i, k)

def Find_Path(nodes, current, final, path, visited, final_path):
    path = path + [current]
    visited.append(current)
    neighbor_nodes = nodes[current].keys()
    if current == final:
        final_path.extend(path) 
        return

    unvisited_neighbor_nodes = set(neighbor_nodes) - set(visited)
    if len(unvisited_neighbor_nodes) == 0:
        return
    
    for unvisited_neighbor_node in list(unvisited_neighbor_nodes):
        Find_Path(nodes,int(unvisited_neighbor_node),final,path,visited, final_path)

    return final_path
 
def add_to_graph(length_matrix,nodes,n,m,i,k,x):
    visited = []
    final_path = []
    
    i_k_path = Find_Path(nodes,i,k,[],visited,final_path)   
    total_length = 0

    for index in range(len(i_k_path) - 1):
        current_node   = i_k_path[index]
        next_node      = i_k_path[index + 1]
        length_between = nodes[current_node][next_node]
        total_length   = total_length + length_between

        if total_length == x:
            limb_length = Limb_Length(n, length_matrix)
            nodes[next_node][n] = limb_length
            nodes[n] = {next_node:limb_length}
            return nodes
        
        elif total_length > x:
            length1 = x - (total_length - length_between)
            length2 = total_length - x

            limb_length = Limb_Length(n , length_matrix)

            nodes[current_node].pop(next_node)
            nodes[next_node]   .pop(current_node)

            nodes[current_node][m[0]] = length1
            nodes[next_node][m[0]]    = length2
            nodes[m[0]] = {current_node:length1, next_node:length2}

            nodes[m[0]][n] = limb_length
            nodes[n]       = {m[0]:limb_length}
            m[0]           = m[0] + 1
            return nodes
    return nodes

def AdditivePhylogeny(length_matrix,n,m):
    '''if type(length_matrix) == str:
        length_matrix = length_matrix.replace('\n', ' ')
        length_matrix = length_matrix.split(' ')
        length_matrix = list(map(int, length_matrix))
        length_matrix = np.array(length_matrix).reshape(n, n)'''
    
    if n == 1:
        nodes = {}
        nodes[1] = {0:length_matrix[0, 1]}
        nodes[0] = {1:length_matrix[0, 1]}
        return nodes
    
    limb_length = Limb_Length(n , length_matrix)

    sub_matrix = copy.deepcopy(length_matrix)

    for j in range(n):

        sub_matrix[j, n] = sub_matrix[j, n] - limb_length
        sub_matrix[n, j] = sub_matrix[j, n]   

    (i,k) = Attached_Limb(sub_matrix, n)

    x = sub_matrix[i, n]

    sub_matrix = sub_matrix[: -1, : -1]
        
    nodes = AdditivePhylogeny(sub_matrix,n-1,m)

    nodes = add_to_graph(length_matrix, nodes, n, m, i, k, x)
    return nodes

#Test
n = 4
length_matrix = '''0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0'''
if type(length_matrix) == str:
    length_matrix = length_matrix.replace('\n', ' ')
    length_matrix = length_matrix.split(' ')
    length_matrix = list(map(int, length_matrix))
    length_matrix = np.array(length_matrix).reshape(n, n)

tmp = AdditivePhylogeny(length_matrix, n - 1,[n])    
for key1, values1 in tmp.items():
    for key2, value2 in values1.items():
        print(str(key1) + '->' + str(key2) + ':' + str(value2))

1->4:2
0->4:11
4->0:11
4->1:2
4->5:4
2->5:6
5->4:4
5->2:6
5->3:7
3->5:7


# Week 2: More Algorithms for Constructing Trees from Distance Matrices

In [5]:
'''
Code Challenge: Implement UPGMA.
Input: An integer n followed by a space separated n x n distance matrix.
Output: An adjacency list for the ultrametric tree returned by UPGMA. 
'''

def UPGMA(n, length_matrix):
    if type(length_matrix) == str:
        length_matrix = length_matrix.replace('\n', ' ')
        length_matrix = length_matrix.split(' ')
        while '' in length_matrix:
            length_matrix.remove('')
        length_matrix = list(map(int, length_matrix))
        length_matrix = np.array(length_matrix).reshape(n, n)
        
    
    clusters = list(range(n))
    ages = dict()
    graph = dict()
    n_node_cluster = dict()
    
    for node in clusters:
        ages[node] = 0
        n_node_cluster[node] = 1
        
    while len(clusters) != 1:
        
        n = length_matrix.shape[0]
        
        # find minimun length
        min_length   = float('Inf')
        for i in range(n):
            for j in range(n):
                if (length_matrix[i, j] != 0) & (length_matrix[i, j] < min_length) & (i < j):
                    min_length   = length_matrix[i, j]
                    min_location = [i, j]
                    
        i_index, j_index = min_location
        
        i = clusters[i_index]
        j = clusters[j_index]
        
        #make new cluster
        new_cluster = max(clusters) + 1
        
        #update new cluster's n_node
        n_node_cluster[new_cluster] = n_node_cluster[i] + n_node_cluster[j]
        
        #update cluster list
        clusters.remove(i)
        clusters.remove(j)
        clusters.append(new_cluster)
        
        #update ages
        ages[new_cluster] = length_matrix[i_index, j_index] / 2
        
        #add new cluster to graph
        if new_cluster in graph :
            graph[new_cluster][i] = ages[new_cluster] - ages[i]
            graph[new_cluster][j] = ages[new_cluster] - ages[j]
        else:
            graph[new_cluster] = {i : (ages[new_cluster] - ages[i]), j : (ages[new_cluster] - ages[j])}
        if i in graph :
            graph[i][new_cluster] = ages[new_cluster] - ages[i]
        else: 
            graph[i] = {new_cluster : (ages[new_cluster] - ages[i])}
        if j in graph :
            graph[j][new_cluster] = ages[new_cluster] - ages[j]
        else: 
            graph[j] = {new_cluster : (ages[new_cluster] - ages[j])}
        
        new_col = []
        for vi, vj in zip(length_matrix[:,i_index], length_matrix[:,j_index]):
            if (vi * vj) != 0:
                new_value = (vi * n_node_cluster[i] + vj * n_node_cluster[j]) / (n_node_cluster[i] + n_node_cluster[j])
                new_col.append(new_value)
        
        #update matrix
        length_matrix = np.delete(length_matrix, [i_index, j_index], 0)
        length_matrix = np.delete(length_matrix, [i_index, j_index], 1)
        
        length_matrix = np.vstack((length_matrix,np.array(new_col).reshape(1, len(new_col))))
        new_col.append(0)
        length_matrix = np.hstack((length_matrix,np.array(new_col).reshape(len(new_col), 1)))
            
    return(graph)

# Test
n = 4
length_matrix = '''0 20 17 11
20 0 20 13
17 20 0 10
11 13 10 0'''

tmp = UPGMA(n, length_matrix)

for key1, values1 in tmp.items():
    for key2, value2 in values1.items():
        print(str(key1) + '->' + str(key2) + ':' + str(value2))

4->2:5.0
4->3:5.0
4->5:2.0
2->4:5.0
3->4:5.0
5->0:7.0
5->4:2.0
5->6:1.83333333333
0->5:7.0
6->1:8.83333333333
6->5:1.83333333333
1->6:8.83333333333


In [6]:
'''
Code Challenge: Implement NeighborJoining.
Input: An integer n, followed by an n x n distance matrix.
Output: An adjacency list for the tree resulting from applying the neighbor-joining algorithm. 
'''

def NeighborJoining(n, length_matrix, nodes, m):

    graph = dict()
    limb_length = dict()
    if n == 2:
        graph[nodes[0]] = {nodes[1]: length_matrix[0, 1]}
        graph[nodes[1]] = {nodes[0]: length_matrix[1, 0]}
        return(graph)
    
    total_distance = {}
    for i in range(n):
        total_distance[i] = np.sum(length_matrix[i,:])
    
    # generate joining matrix 
    matrix_star = np.full([n,n], 0, float)
    for i in range(n):
        for j in range(n):
            if i != j:
                matrix_star[i, j] = (n - 2) * length_matrix[i, j] - total_distance[i] - total_distance[j]
    
    # find minimun value in joining matrix
    min_length   = float('Inf')
    for i in range(n):
        for j in range(n):
            if (matrix_star[i, j] != 0) & (matrix_star[i, j] < min_length) & (i < j):
                min_length   = matrix_star[i, j]
                min_location = [i, j]
    i_index, j_index = min_location
    
    i = nodes[i_index]
    j = nodes[j_index]
    
    # calculate delta for limb length
    delta = (total_distance[i_index] - total_distance[j_index]) / (n - 2)
    
    limb_length[i_index] = (length_matrix[i_index, j_index] + delta) / 2
    limb_length[j_index] = (length_matrix[i_index, j_index] - delta) / 2
    
    #make new node
    new_node = m 
    m = m + 1
    
    # update node list
    nodes.append(new_node)
    nodes.remove(i)
    nodes.remove(j)
    
    # update length matrix
    new_col = []
    for k in range(n):
        new_value = (length_matrix[k, i_index] + length_matrix[k, j_index] - length_matrix[i_index, j_index]) / 2
        new_col.append(new_value)
    length_matrix = np.vstack((length_matrix,np.array(new_col).reshape(1, len(new_col))))
    new_col.append(0)
    length_matrix = np.hstack((length_matrix,np.array(new_col).reshape(len(new_col), 1)))
    
    length_matrix = np.delete(length_matrix, [i_index, j_index], 0)
    length_matrix = np.delete(length_matrix, [i_index, j_index], 1)
    
    # iteration
    graph = NeighborJoining(n - 1, length_matrix, nodes, m)
    
    # update graph
    if new_node in graph :
        graph[new_node][i] = limb_length[i_index]
        graph[new_node][j] = limb_length[j_index]
    else:
        graph[new_node] = {i : limb_length[i_index], j : limb_length[j_index]}
    if i in graph :
        graph[i][new_node] = limb_length[i_index]
    else: 
        graph[i] = {new_node : limb_length[i_index]}
    if j in graph :
        graph[j][new_node] = limb_length[j_index]
    else: 
        graph[j] = {new_node : limb_length[j_index]}  
    
    return(graph)

# Test
n = 4
length_matrix = '''0 23 27 20
23 0 30 28
27 30 0 30
20 28 30 0'''

nodes = list(range(n))
m = n
if type(length_matrix) == str:
    length_matrix = length_matrix.replace('\n', ' ')
    length_matrix = length_matrix.split(' ')
    while '' in length_matrix:
        length_matrix.remove('')
    length_matrix = list(map(int, length_matrix))
    length_matrix = np.array(length_matrix).reshape(n, n)

    
tmp = NeighborJoining(n, length_matrix, nodes, m)
for key1, values1 in tmp.items():
    for key2, value2 in values1.items():
        print(str(key1) + '->' + str(key2) + ':' + str(value2))

4->5:2.0
4->0:8.0
4->3:12.0
5->4:2.0
5->1:13.5
5->2:16.5
1->5:13.5
2->5:16.5
0->4:8.0
3->4:12.0


# Week 3: Constructing Evolutionary Trees from Characters

In [7]:
'''
Code Challenge: Implement SmallParsimony to solve the Small Parsimony Problem.
Input: An integer n followed by an adjacency list for a rooted binary tree with n leaves labeled by DNA strings.
Output: The minimum parsimony score of this tree, followed by the adjacency list of a tree corresponding to labeling
     internal nodes by DNA strings in order to minimize the parsimony score of the tree. 
'''

def SmallParsimony(tree, n_leaf):
    tree_graph  = dict()
    tags   = dict()
    n_character = 0
    current_node = 0
    node_character = dict()
    
    
    if type(tree) == str:
        tree = tree.split('\n')
        for line in tree:
            line  = line.split('->')
            upper = int(line[0])
            
            # tag 1 for processed(have character), 0 for un-processed
            try:
                bottom       = int(line[1])
                tags[bottom] = 0
                
                if upper in tree_graph:
                    tree_graph[upper].append(bottom)
                else:
                    tree_graph[upper] = [bottom]
                    
            except:
                bottom       = line[1]
                node_character[current_node] = bottom
                tags[current_node] = 1
                
                if upper in tree_graph:
                    tree_graph[upper].append(current_node)
                else:
                    tree_graph[upper] = [current_node]
                    
                current_node = current_node + 1
                n_character  = len(bottom)
                
                
    # find root
    for node in tree_graph.keys():
        if node not in tags:
            root = node
            break
    tags[root] = 0
    
    # make a matrix to record scores, character_score[node, character_index(ACGT), position of character]. 
    character_score = np.full((root + 1, 4, n_character), float('Inf'), float)
    
    # initalize the character_score matrix
    for node, tag in tags.items():
        if tag == 1:
            characters = node_character[node]
            for index in range(len(characters)):
                character       = characters[index]
                character_index = 'ACGT'.index(character)
                character_score[node, character_index, index] = 0
    
    # fill the character_score matrix
    while 0 in tags.values():
        for node, tag in tags.items():
            if tag == 0:
                down_node_1, down_node_2 = tree_graph[node]
                if (tags[down_node_1] * tags[down_node_2]) == 1 :
                    for position_index in range(n_character):
                        for character_index in range(4):
                            min_score = float('Inf')
                            for character_index_1 in range(4):
                                for character_index_2 in range(4):
                                    if character_index_1 == character_index:
                                        delta_1 = 0
                                    else:
                                        delta_1 = 1

                                    if character_index_2 == character_index:
                                        delta_2 = 0
                                    else:
                                        delta_2 = 1

                                    score = character_score[down_node_1, character_index_1, position_index] + character_score[down_node_2, character_index_2, position_index] + delta_1 + delta_2
                                    if score < min_score:
                                        min_score = score
                            character_score[node, character_index, position_index] = min_score
                    tags[node] = 1
                        
    # find root characters                    
    root_character = ''
    for position_index in range(n_character):
        score_list      = list(character_score[root, :, position_index])
        character_index = score_list.index(min(score_list))
        character       = 'ACGT'[character_index]
        root_character  = root_character + character
    node_character[root] = root_character
    
    # trackback to find all other characters
    while len(node_character) != root + 1:
        for node, down_nodes in tree_graph.items():
            if (node in node_character):
                if (down_nodes[0] not in node_character) & (down_nodes[1] not in node_character):
                    characters_1 = ''
                    characters_2 = ''
                    
                    for position_index in range(n_character):
                        character       = node_character[node][position_index]
                        character_index = 'ACGT'.index(character)
                        socre           = character_score[node, character_index, position_index]
                        
                        for character_index_1 in range(4):
                            for character_index_2 in range(4):
                                if character_index_1 == character_index:
                                    delta_1 = 0
                                else:
                                    delta_1 = 1

                                if character_index_2 == character_index:
                                    delta_2 = 0
                                else:
                                    delta_2 = 1
                                
                                if socre == (character_score[down_nodes[0], character_index_1, position_index] + character_score[down_nodes[1], character_index_2, position_index] + delta_1 + delta_2) :
                                    min_index_1, min_index_2 = character_index_1, character_index_2
                                    
                        character_1  = 'ACGT'[min_index_1]
                        character_2  = 'ACGT'[min_index_2]
                        characters_1 = characters_1 + character_1
                        characters_2 = characters_2 + character_2
                        
                    node_character[down_nodes[0]] = characters_1
                    node_character[down_nodes[1]] = characters_2
                
                if (down_nodes[0] not in node_character) & (down_nodes[1] in node_character):
                    characters_1 = ''
                    
                    for position_index in range(n_character):
                        character       = node_character[node][position_index]
                        character_index = 'ACGT'.index(character)
                        socre           = character_score[node, character_index, position_index]
                        
                        character_2       = node_character[down_nodes[1]][position_index]
                        character_index_2 = 'ACGT'.index(character_2)
                        
                        for character_index_1 in range(4):
                            if character_index_1 == character_index:
                                delta_1 = 0
                            else:
                                delta_1 = 1

                            if character_index_2 == character_index:
                                delta_2 = 0
                            else:
                                delta_2 = 1
                            
                            if socre == (character_score[down_nodes[0], character_index_1, position_index] + character_score[down_nodes[1], character_index_2, position_index] + delta_1 + delta_2) :
                                min_index_1 = character_index_1
                        
                        character_1  = 'ACGT'[min_index_1]
                        characters_1 = characters_1 + character_1
                    node_character[down_nodes[0]] = characters_1
                
                if (down_nodes[0] in node_character) & (down_nodes[1] not in node_character):
                    characters_2 = ''
                    
                    for position_index in range(n_character):
                        character       = node_character[node][position_index]
                        character_index = 'ACGT'.index(character)
                        socre           = character_score[node, character_index, position_index]
                        
                        character_1       = node_character[down_nodes[0]][position_index]
                        character_index_1 = 'ACGT'.index(character_1)
                        
                        for character_index_2 in range(4):
                            if character_index_1 == character_index:
                                delta_1 = 0
                            else:
                                delta_1 = 1

                            if character_index_2 == character_index:
                                delta_2 = 0
                            else:
                                delta_2 = 1
                            
                            if socre == (character_score[down_nodes[0], character_index_1, position_index] + character_score[down_nodes[1], character_index_2, position_index] + delta_1 + delta_2) :
                                min_index_2 = character_index_2
                        
                        character_2  = 'ACGT'[min_index_2]
                        characters_2 = characters_2 + character_2
                    node_character[down_nodes[1]] = characters_2
    
    # connect node to each other
    new_graph = dict()
    for node, down_nodes in tree_graph.items():
        if down_nodes[0] not in tree_graph:
            new_graph[down_nodes[0]] = [node]
        else:
            if node not in tree_graph[down_nodes[0]]:
                tree_graph[down_nodes[0]].append(node)
                
        if down_nodes[1] not in tree_graph:
            new_graph[down_nodes[1]] = [node]
        else:
            if node not in tree_graph[down_nodes[1]]:
                tree_graph[down_nodes[1]].append(node)
    
    tree_graph = {**tree_graph, **new_graph}
    
    # calculate score for root
    final_score = 0
    for position_index in range(n_character):
        character       = node_character[root][position_index]
        character_index = 'ACGT'.index(character)
        position_score  = character_score[root, character_index, position_index]
        final_score     = final_score + position_score
    

    return(tree_graph, node_character, final_score)

def HammingDistance(string1, string2):
    mismatch = 0
    for base1, base2 in zip (string1, string2):
        if base1 != base2: 
            mismatch = mismatch + 1
    return (mismatch)


# Test
n_leaf = 4
tree   = '''4->CAAATCCC
4->ATTGCGAC
5->CTGCGCTG
5->ATGGACGA
6->4
6->5'''

tree_graph, node_character, final_score = SmallParsimony(tree, n_leaf)

print(int(final_score))
for node_index, down_node_indexs in tree_graph.items():
    node = node_character[node_index]
    for down_node_index in down_node_indexs:
        down_node = node_character[down_node_index]
        print(str(node) + '->' + str(down_node) + ':' + str(HammingDistance(node, down_node)))

16
ATAGTCAC->CAAATCCC:4
ATAGTCAC->ATTGCGAC:3
ATAGTCAC->ATAGACAA:2
ATGGACTA->CTGCGCTG:4
ATGGACTA->ATGGACGA:1
ATGGACTA->ATAGACAA:2
ATAGACAA->ATAGTCAC:2
ATAGACAA->ATGGACTA:2
CAAATCCC->ATAGTCAC:4
ATTGCGAC->ATAGTCAC:3
CTGCGCTG->ATGGACTA:4
ATGGACGA->ATGGACTA:1


In [8]:
'''
Code Challenge: Solve the Small Parsimony in an Unrooted Tree Problem.
Input: An integer n followed by an adjacency list for an unrooted binary tree with n leaves labeled by DNA strings.
Output: The minimum parsimony score of this tree, followed by the adjacency list of the tree corresponding to labeling
     internal nodes by DNA strings in order to minimize the parsimony score of the tree.
'''

def SmallParsimony_Unrooted(tree, n_leaf):
    tree_graph  = dict()
    tags   = dict()
    n_character = 0
    current_node = 0
    node_character = dict()
    
    
    if type(tree) == str:
        tree = tree.split('\n')
        for line in tree:
            line  = line.split('->')
            if line[0].isdigit() :
                upper = int(line[0])

                # tag 1 for processed(have character), 0 for un-processed
                try:
                    bottom       = int(line[1])
                    tags[bottom] = 0

                    if upper in tree_graph:
                        tree_graph[upper].append(bottom)
                    else:
                        tree_graph[upper] = [bottom]

                except:
                    bottom       = line[1]
                    node_character[current_node] = bottom
                    tags[current_node] = 1

                    if upper in tree_graph:
                        tree_graph[upper].append(current_node)
                    else:
                        tree_graph[upper] = [current_node]

                    current_node = current_node + 1
                    n_character  = len(bottom)
    
    # make a fake root
    root = max(tree_graph.keys()) + 1
    tree_graph[root] = [max(tree_graph.keys())]
    tree_graph[root].append(max(tree_graph[root - 1]))
    tree_graph[root - 1].remove(max(tree_graph[root - 1]))
    tags[root] = 0
    
    # make the tree have same pattern as rooted tree
    remove_check       = copy.deepcopy(tags)
    remove_check[root]     = 1

    while sum(remove_check.values()) != (len(remove_check.values()) - 1):
        for node, down_nodes in tree_graph.items():
            if len(down_nodes) == 3:
                if remove_check[node] == 0:
                    if (remove_check[down_nodes[0]] + remove_check[down_nodes[1]] + remove_check[down_nodes[2]]) == 2:
                        for down_node in down_nodes:
                            if remove_check[down_node] == 0:
                                tree_graph[node].remove(down_node)
                                break
                        remove_check[node]      = 1

    
    
    # make a matrix to record scores, character_score[node, character_index(ACGT), position of character]. 
    character_score = np.full((root + 1, 4, n_character), float('Inf'), float)
    
    # initalize the character_score matrix
    for node, tag in tags.items():
        if tag == 1:
            characters = node_character[node]
            for index in range(len(characters)):
                character       = characters[index]
                character_index = 'ACGT'.index(character)
                character_score[node, character_index, index] = 0
    
    #print(tree_graph, tags, character_score)
    # fill the character_score matrix
    while 0 in tags.values():
        for node, tag in tags.items():
            if tag == 0:
                down_node_1, down_node_2 = tree_graph[node]
                if (tags[down_node_1] * tags[down_node_2]) == 1 :
                    for position_index in range(n_character):
                        for character_index in range(4):
                            min_score = float('Inf')
                            for character_index_1 in range(4):
                                for character_index_2 in range(4):
                                    if character_index_1 == character_index:
                                        delta_1 = 0
                                    else:
                                        delta_1 = 1

                                    if character_index_2 == character_index:
                                        delta_2 = 0
                                    else:
                                        delta_2 = 1

                                    score = character_score[down_node_1, character_index_1, position_index] + character_score[down_node_2, character_index_2, position_index] + delta_1 + delta_2
                                    if score < min_score:
                                        min_score = score
                            character_score[node, character_index, position_index] = min_score
                    tags[node] = 1
                        
    # find root characters                    
    root_character = ''
    for position_index in range(n_character):
        score_list      = list(character_score[root, :, position_index])
        character_index = score_list.index(min(score_list))
        character       = 'ACGT'[character_index]
        root_character  = root_character + character
    node_character[root] = root_character
    
    # trackback to find all other characters
    while len(node_character) != root + 1:
        for node, down_nodes in tree_graph.items():
            if (node in node_character):
                if (down_nodes[0] not in node_character) & (down_nodes[1] not in node_character):
                    characters_1 = ''
                    characters_2 = ''
                    
                    for position_index in range(n_character):
                        character       = node_character[node][position_index]
                        character_index = 'ACGT'.index(character)
                        socre           = character_score[node, character_index, position_index]
                        
                        for character_index_1 in range(4):
                            for character_index_2 in range(4):
                                if character_index_1 == character_index:
                                    delta_1 = 0
                                else:
                                    delta_1 = 1

                                if character_index_2 == character_index:
                                    delta_2 = 0
                                else:
                                    delta_2 = 1
                                
                                if socre == (character_score[down_nodes[0], character_index_1, position_index] + character_score[down_nodes[1], character_index_2, position_index] + delta_1 + delta_2) :
                                    min_index_1, min_index_2 = character_index_1, character_index_2
                                    
                        character_1  = 'ACGT'[min_index_1]
                        character_2  = 'ACGT'[min_index_2]
                        characters_1 = characters_1 + character_1
                        characters_2 = characters_2 + character_2
                        
                    node_character[down_nodes[0]] = characters_1
                    node_character[down_nodes[1]] = characters_2
                
                if (down_nodes[0] not in node_character) & (down_nodes[1] in node_character):
                    characters_1 = ''
                    
                    for position_index in range(n_character):
                        character       = node_character[node][position_index]
                        character_index = 'ACGT'.index(character)
                        socre           = character_score[node, character_index, position_index]
                        
                        character_2       = node_character[down_nodes[1]][position_index]
                        character_index_2 = 'ACGT'.index(character_2)
                        
                        for character_index_1 in range(4):
                            if character_index_1 == character_index:
                                delta_1 = 0
                            else:
                                delta_1 = 1

                            if character_index_2 == character_index:
                                delta_2 = 0
                            else:
                                delta_2 = 1
                            
                            if socre == (character_score[down_nodes[0], character_index_1, position_index] + character_score[down_nodes[1], character_index_2, position_index] + delta_1 + delta_2) :
                                min_index_1 = character_index_1
                        
                        character_1  = 'ACGT'[min_index_1]
                        characters_1 = characters_1 + character_1
                    node_character[down_nodes[0]] = characters_1
                
                if (down_nodes[0] in node_character) & (down_nodes[1] not in node_character):
                    characters_2 = ''
                    
                    for position_index in range(n_character):
                        character       = node_character[node][position_index]
                        character_index = 'ACGT'.index(character)
                        socre           = character_score[node, character_index, position_index]
                        
                        character_1       = node_character[down_nodes[0]][position_index]
                        character_index_1 = 'ACGT'.index(character_1)
                        
                        for character_index_2 in range(4):
                            if character_index_1 == character_index:
                                delta_1 = 0
                            else:
                                delta_1 = 1

                            if character_index_2 == character_index:
                                delta_2 = 0
                            else:
                                delta_2 = 1
                            
                            if socre == (character_score[down_nodes[0], character_index_1, position_index] + character_score[down_nodes[1], character_index_2, position_index] + delta_1 + delta_2) :
                                min_index_2 = character_index_2
                        
                        character_2  = 'ACGT'[min_index_2]
                        characters_2 = characters_2 + character_2
                    node_character[down_nodes[1]] = characters_2
    
    # connect node to each other
    new_graph = dict()
    for node, down_nodes in tree_graph.items():
        if down_nodes[0] not in tree_graph:
            new_graph[down_nodes[0]] = [node]
        else:
            if node not in tree_graph[down_nodes[0]]:
                tree_graph[down_nodes[0]].append(node)
                
        if down_nodes[1] not in tree_graph:
            new_graph[down_nodes[1]] = [node]
        else:
            if node not in tree_graph[down_nodes[1]]:
                tree_graph[down_nodes[1]].append(node)
    
    tree_graph = {**tree_graph, **new_graph}
    
    # calculate score for root
    final_score = 0
    for position_index in range(n_character):
        character       = node_character[root][position_index]
        character_index = 'ACGT'.index(character)
        position_score  = character_score[root, character_index, position_index]
        final_score     = final_score + position_score
    
    # remove the fake root and re-connect the node
    node_1, node_2 = tree_graph[root]
    tree_graph[node_1].remove(root)
    tree_graph[node_1].append(node_2)
    tree_graph[node_2].remove(root)
    tree_graph[node_2].append(node_1)
    tree_graph.pop(root)
    
    return(tree_graph, node_character, final_score)

# Test
n_leaf = 4
tree = '''TCGGCCAA->4
4->TCGGCCAA
CCTGGCTG->4
4->CCTGGCTG
CACAGGAT->5
5->CACAGGAT
TGAGTACC->5
5->TGAGTACC
4->5
5->4'''

tree_graph, node_character, final_score = SmallParsimony_Unrooted(tree, n_leaf)

print(int(final_score))
for node_index, down_node_indexs in tree_graph.items():
    node = node_character[node_index]
    for down_node_index in down_node_indexs:
        down_node = node_character[down_node_index]
        print(str(node) + '->' + str(down_node) + ':' + str(HammingDistance(node, down_node)))

17
CCTGGCAA->TCGGCCAA:3
CCTGGCAA->CCTGGCTG:2
CCTGGCAA->CAAGGAAT:4
CAAGGAAT->CACAGGAT:3
CAAGGAAT->TGAGTACC:5
CAAGGAAT->CCTGGCAA:4
TCGGCCAA->CCTGGCAA:3
CCTGGCTG->CCTGGCAA:2
CACAGGAT->CAAGGAAT:3
TGAGTACC->CAAGGAAT:5


In [9]:
'''
Code Challenge: Solve the Nearest Neighbors of a Tree Problem.
Input: Two internal nodes a and b specifying an edge e, followed by an adjacency list of an unrooted binary tree.
Output: Two adjacency lists representing the nearest neighbors of the tree with respect to e. Separate the
     adjacency lists with a blank line.
'''
def Nearest_Neighbors_Tree(a, b, tree):
    
    a=str(a)
    b=str(b)
    
    graph_tree = dict()
    if type(tree) == str:
        tree = tree.split('\n')
    for line in tree:
        line = line.split('->')
        if line[0] in graph_tree:
            graph_tree[line[0]].append(line[1])
        else:
            graph_tree[line[0]] = [line[1]]

    graph_tree[a].remove(b)
    graph_tree[b].remove(a)

    w, x = graph_tree[a]
    y, z = graph_tree[b]
    
    graph_tree[a].append(b)
    graph_tree[b].append(a)
    
    new_tree = copy.deepcopy(graph_tree)
    new_tree[a].remove(x)
    new_tree[x].remove(a)
    new_tree[b].remove(y)
    new_tree[y].remove(b)
    
    new_tree[a].append(y)
    new_tree[y].append(a)
    new_tree[b].append(x)
    new_tree[x].append(b)
    
    neighbor_tree = copy.deepcopy(graph_tree)
    neighbor_tree[a].remove(x)
    neighbor_tree[x].remove(a)
    neighbor_tree[b].remove(z)
    neighbor_tree[z].remove(b)
    
    neighbor_tree[a].append(z)
    neighbor_tree[z].append(a)
    neighbor_tree[b].append(x)
    neighbor_tree[x].append(b)
    
    return(new_tree, neighbor_tree)       
    
# Test
a, b = 5, 4
tree = '''0->4
4->0
1->4
4->1
2->5
5->2
3->5
5->3
4->5
5->4'''

new_tree, neighbor_tree = Nearest_Neighbors_Tree(a, b, tree)

for from_, tos in new_tree.items():
    for to in tos:
        print(str(from_) + '->' + str(to))
print('')
for from_, tos in neighbor_tree.items():
    for to in tos:
        print(str(from_) + '->' + str(to))  

0->5
4->1
4->5
4->3
1->4
2->5
5->2
5->4
5->0
3->4

0->4
4->0
4->5
4->3
1->5
2->5
5->2
5->4
5->1
3->4


In [10]:
'''
Code Challenge: Implement the nearest neighbor interchange heuristic for the Large Parsimony Problem.
Input: An integer n, followed by an adjacency list for an unrooted binary tree whose n leaves are labeled by DNA strings and
     whose internal nodes are labeled by integers.
Output: The parsimony score and unrooted labeled tree obtained after every step of the nearest neighbor interchange heuristic.
     Each step should be separated by a blank line.
'''

def Graph_to_str(tree_graph, node_character, leafs):
    list_tree = []
    for node_index, down_node_indexs in tree_graph.items():
        if node_index in leafs:
            node = node_character[node_index]
        else: node = str(node_index)
        for down_node_index in down_node_indexs:
            if down_node_index in leafs:
                down_node = node_character[down_node_index]
            else:
                down_node = str(down_node_index)
            list_tree.append(node + '->' + down_node)
    return('\n'.join(list_tree))

def NearestNeighborInterchange(tree, n_leaf):
    score = float('Inf')
    tree, node_character, new_score = SmallParsimony_Unrooted(tree, n_leaf)
    leafs          = list(range(n_leaf))
    internal_nodes = list(range(n_leaf, max(tree.keys()) + 1))
    new_tree       = tree
    new_char       = node_character
    #Graph_to_str(tree, node_character, leafs)
    while new_score < score:

        score = new_score
        tree  = new_tree
        node_character = new_char
        for internal_node in internal_nodes:
            down_nodes = tree[internal_node]
            for down_node in down_nodes:
                if down_node in internal_nodes:

                    neighbor_1, neighbor_2 = Nearest_Neighbors_Tree(internal_node, down_node, Graph_to_str(tree, node_character, leafs))
                    neighbor_1_str = Graph_to_str(neighbor_1, node_character, leafs)
                    neighbor_2_str = Graph_to_str(neighbor_2, node_character, leafs)
                    tree1, node_character1, neighbor_score1 = SmallParsimony_Unrooted(neighbor_1_str, n_leaf)
                    tree2, node_character2, neighbor_score2 = SmallParsimony_Unrooted(neighbor_2_str, n_leaf)
                    
                    if neighbor_score1 < new_score:
                        new_score = neighbor_score1
                        new_tree  = tree1
                        new_char  = node_character1
                    if neighbor_score2 < new_score:
                        new_score = neighbor_score2
                        new_tree  = tree2
                        new_char  = node_character2
    
        print(int(new_score))
        for node_index, down_node_indexs in new_tree.items():
            node = node_character[node_index]
            for down_node_index in down_node_indexs:
                down_node = node_character[down_node_index]
                print(str(node) + '->' + str(down_node) + ':' + str(HammingDistance(node, down_node)))
        print('')

# Test
n_leaf = 5
tree = '''GCAGGGTA->5
TTTACGCG->5
CGACCTGA->6
GATTCCAC->6
5->TTTACGCG
5->GCAGGGTA
5->7
TCCGTAGT->7
7->5
7->6
7->TCCGTAGT
6->GATTCCAC
6->CGACCTGA
6->7'''

NearestNeighborInterchange(tree, n_leaf)

22
TCAGCGTA->TTTACGCG:5
TCAGCGTA->GCAGGGTA:2
TCAGCGTA->TCAGCAGA:2
TCAGCAGA->TCAGCGTA:2
TCAGCAGA->TCCGTAGT:3
TCAGCAGA->CAACCTGA:4
CAACCTGA->GATTCCAC:6
CAACCTGA->CGACCTGA:1
CAACCTGA->TCAGCAGA:4
TTTACGCG->TCAGCGTA:5
GCAGGGTA->TCAGCGTA:2
TCCGTAGT->TCAGCAGA:3
GATTCCAC->CAACCTGA:6
CGACCTGA->CAACCTGA:1

21
TCTGCGGT->TTTACGCG:4
TCTGCGGT->TCCGTAGT:3
TCTGCGGT->GAACCCGA:6
GCAGCGGA->GCAGGGTA:2
GCAGCGGA->GATTCCAC:6
GCAGCGGA->GAACCCGA:3
GAACCCGA->CGACCTGA:3
GAACCCGA->TCTGCGGT:6
GAACCCGA->GCAGCGGA:3
TTTACGCG->TCTGCGGT:4
TCCGTAGT->TCTGCGGT:3
GCAGGGTA->GCAGCGGA:2
GATTCCAC->GCAGCGGA:6
CGACCTGA->GAACCCGA:3

21
TCTGCGGT->TTTACGCG:4
TCTGCGGT->TCCGTAGT:3
TCTGCGGT->GCTGCGGT:1
GCAGCGGA->GCAGGGTA:2
GCAGCGGA->CGACCTGA:4
GCAGCGGA->GCTGCGGT:2
GCTGCGGT->GATTCCAC:5
GCTGCGGT->TCTGCGGT:1
GCTGCGGT->GCAGCGGA:2
TTTACGCG->TCTGCGGT:4
TCCGTAGT->TCTGCGGT:3
GCAGGGTA->GCAGCGGA:2
CGACCTGA->GCAGCGGA:4
GATTCCAC->GCTGCGGT:5



# Week 4

In [11]:
'''
CODE CHALLENGE: Construct the graph of a spectrum.
Given: A space-delimited list of integers Spectrum.
Return: Graph(Spectrum).
'''

mass_AA_table = open('data/integer_mass_table.txt').read()
mass_AA_table = mass_AA_table.split('\n')

mass_AA = dict()
for line in mass_AA_table:
    line = line.split(' ')
    mass_AA[int(line[1])] = line[0]

def Spectrum_2_Graph(spectrum, mass_AA):
    if type(spectrum) == str:
        spectrum = spectrum.split(' ')
        spectrum.append(0)
        spectrum = list(map(int, spectrum))
    
    graph = dict()
    for mass1 in spectrum:
        for mass2 in spectrum:
            delta = mass1 - mass2
            if delta in mass_AA:
                if mass2 in graph:
                    graph[mass2][mass1] = mass_AA[delta]
                else:  
                    graph[mass2] = {mass1 : mass_AA[delta]}
    return(graph)

spectrum = '57 71 154 185 301 332 415 429 486'

graph = Spectrum_2_Graph(spectrum, mass_AA)

for key1, values1 in graph.items():
    for key2, value2 in values1.items():    
        print(str(key1) + '->' + str(key2) + ':' + value2)

0->57:G
0->71:A
57->154:P
57->185:Q
71->185:N
154->301:F
185->332:F
301->415:N
301->429:Q
332->429:P
415->486:A
429->486:G


In [12]:
'''
CODE CHALLENGE: Solve the Decoding an Ideal Spectrum Problem.
Given: A space-delimited list of integers Spectrum.
Return: An amino acid string that explains Spectrum.
'''

table = open('data/integer_mass_table.txt').read()
table = table.split('\n')
AA_mass = {}
for line in table:
    AA, mass    = line.split(' ')
    AA_mass[AA] = int(mass)

def LinearSpectrum(peptide_string, AA_mass):
    prefix_weight = [0]
    for AA in peptide_string:
        prefix_weight.append(prefix_weight[-1] + AA_mass[AA])
        
    linear_spectrum = [0]
    for i in range(len(peptide_string)):
        for j in range(i + 1, len(peptide_string) + 1):
            linear_spectrum.append(prefix_weight[j] - prefix_weight[i])
    linear_spectrum.sort()
    return(linear_spectrum)

def DecodingIdealSpectrum(spectrum, mass_AA, AA_mass):
    if type(spectrum) == str:
        spectrum = spectrum.split(' ')
        spectrum = [0] + spectrum
        spectrum = list(map(int, spectrum))
    
    graph = Spectrum_2_Graph(spectrum, mass_AA)
    
    for start in graph.keys():
        peptide_string = ''
        Find_Graph_Path(graph, start, peptide_string, spectrum)

def Find_Graph_Path(graph, start, peptide_string, target_spectrum):
    
    current_spectrum = LinearSpectrum(peptide_string, AA_mass)
    if current_spectrum[-1] == target_spectrum[-1]:
        
        check = 1
        
        for item in target_spectrum:
            if item not in current_spectrum:
                check = check * 0
                break
        if check == 1:
            print(peptide_string)

    if start in graph:
        mass2_graph = graph[start]
        for mass2, AA in mass2_graph.items():
            Find_Graph_Path(graph, mass2, peptide_string + AA, target_spectrum)

# Test
spectrum = '57 71 154 185 301 332 415 429 486'

DecodingIdealSpectrum(spectrum, mass_AA, AA_mass)

GPFNA
ANFPG


In [13]:
'''
CODE CHALLENGE: Solve the Converting a Peptide into a Peptide Vector Problem.
Given: An amino acid string P.
Return: The peptide vector of P (in the form of space-separated integers).
'''

def Peptide_2_Vector(peptide_string, AA_mass):
    peptide_vector = [0] * LinearSpectrum(peptide_string, AA_mass)[-1]
    
    prefix_mass = 0
    for AA in peptide_string:
        
        prefix_mass = prefix_mass + AA_mass[AA]
        peptide_vector[prefix_mass - 1] = 1
        
    return(peptide_vector)

# Test
peptide_string = 'NADA'

peptide_vector = Peptide_2_Vector(peptide_string, AA_mass)
peptide_vector = list(map(str, peptide_vector))

print(' '.join(peptide_vector))

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1


In [14]:
'''
CODE CHALLENGE: Solve the Converting a Peptide Vector into a Peptide Problem.
Given: A space-delimited binary vector P.
Return: An amino acid string whose binary peptide vector matches P. For masses with more than one amino acid, any choice may be used.
'''

def Vector_2_Peptide(peptide_vector, mass_AA):
    if type(peptide_vector) == str:
        peptide_vector = peptide_vector.split(' ')
        
    peptide_vector = list(map(int, peptide_vector))
    peptide_vector = np.array(peptide_vector)
    
    positions = list(np.where(peptide_vector == 1)[0] + 1)
    positions = [0] + positions
    
    peptide_string = ''
    for i in range(1, len(positions)):
        mass = positions[i] - positions[i - 1]
        AA   = mass_AA[mass]
        
        peptide_string = peptide_string + AA
        
    return(peptide_string)

# Test
peptide_vector = '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1'

Vector_2_Peptide(peptide_vector, mass_AA)

'NADA'

In [15]:
'''
CODE CHALLENGE: Solve the Peptide Sequencing Problem.
Given: A space-delimited spectral vector Spectrum'.
Return: An amino acid string with maximum score against Spectrum'. For masses with more than one amino acid, any choice may be used.
'''

def Spectral_Vector_2_Peptide(spectral_vector, mass_AA, AA_mass):
    if type(spectral_vector) == str:
        spectral_vector = spectral_vector.split(' ')
        spectral_vector = list(map(int, spectral_vector))
    spectral_vector = [0] + spectral_vector
    
    score_vector    = [-float('Inf')] * len(spectral_vector)
    score_vector[0] = 0
    
    # find max score
    for position in range(len(score_vector)):
        for mass in mass_AA.keys():
            if position >= mass:
                if score_vector[position - mass] + spectral_vector[position] > score_vector[position]:
                    score_vector[position] = score_vector[position - mass] + spectral_vector[position]
    
    peptide_string = ''
    
    position = len(spectral_vector) - 1
    
    # backtrack
    while position != 0:
        current_score = score_vector[position]
        delta_score   = spectral_vector[position]
        last_score    = current_score - delta_score
        last_position = np.where(np.array(score_vector) == last_score)[0]
        masss         = list(position - last_position)
        mass = 0
        for m in masss:
            if m in mass_AA:
                if m > mass:
                    mass = m
        AA            = mass_AA[mass]
        
        peptide_string = AA + peptide_string
        position       = position - mass
        
    return(peptide_string)

# Test
spectral_vector = '-19 -4 12 19 22 22 -8 29 21 9 -20 -2 -20 3 -20 9 15 7 -8 -4 27 29 -11 4 -5 10 -8 0 -19 -13 26 1 14 -18 -20 -2 -5 25 9 21 2 14 11 -11 -3 -6 13 -2 16 10 -12 27 11 -20 17 -20 4 11 -7 30 -4 22 17 -12 0 3 18 -13 9 8 16 -17 -9 12 11 30 18 -3 -1 30 -15 16 13 25 -10 29 9 -12 28 11 -4 -16 -6 16 2 -2 19 -12 -19 -1 13 -16 19 -19 -18 21 23 13 -2 22 -12 -8 -17 -11 30 5 23 -5 8 25 -9 23 -15 19 18 8 15 7 14 13 25 20 0 5 18 7 11 4 23 -19 -11 28 -14 6 14 21 -13 4 -8 10 8 -13 22 -13 17 -8 -7 -3 27 2 10 5 -4 27 16 -16 3 11 20 1 17 -20 -6 30 26 8 26 -9 -20 -6 10 4 -1 -7 -15 18 24 16 -14 -4 -13 -11 -17 15 11 -2 -8 5 22 4 -9 5 30 -9 5 9 -9 26 9 25 9 -12 30 16 5 -4 -19 -4 21 -13 27 -18 20 -11 -4 -4 -3 3 11 -19 29 -7 0 24 -1 5 -10 30 6 -9 -12 5 23 26 6 2 20 4 27 27 9 4 17 28 19 -8 -11 -9 -5 -2 -16 12 -6 -9 -14 29 -5 30 23 -8 13 12 -17 -14 12 29 -8 17 28 6 -15 25 -19 -20 3 -10 -3 -12 19 18 -2 -3 -18 17 13 4 11 27 5 12 22 3 -6 -15 4 22 24 -11 -5 25 -12 -12 -5 -18 -1 -2 25 18 -4 10 -8 -20 6 1 17 -4 24 -18 6 -6 11 -18 26 5 13 30 -9 -19 -17 -15 24 2 -13 -18 13 -13 -15 4 7 -5 2 -11 -8 -3 -14 2 30 -1 14 -1 19 -4 -16 29 10 13 -10 -1 -15 15 1 -12 24 22 -13 -19 10 2 -7 4 -2 18 -3 -14 22 -1 2 13 16 17 4 28 18 -13 10 -16 14 5 -15 6 18 6 6 18 0 21 -8 -2 2 18 -5 -20 -3 0 1 6 -5 8 22 14 9 18 7 5 9 23 -11 -13 -12 11 19 9 0 24 -1 24 23 -5 5 22 21 26 -8 27 8 22 -10 3 28 -7 -3 -16 17 29 17 26 -18 18 -19 -13 -14 -10 10 -15 -20 12 -16 23 14 -6 2 1 25 -4 4 -9 -8 14 8 -3 17 -19 -6 18 18 14 8 21 -17 -13 21 24 12 11 22 9 8 -14 -20 6 -2 -7 28 3 -12 25 -6 21 -19 21 -20 5 20 -20 15 25 2 21 27 -7 -19 26 -2 7 30 -14 30 -20 0 -12 -16 -17 -10 -7 27 -19 25 16 -12 6 -9 20 19 25 12 20 30 27 -10 11 -6 22 3 -10 -3 0 -4 -11 23 12 16 25 5 -13 -12 -17 9 -17 -12 -17 27 14 12 10 25 -16 10 -13 28 24 28 1 13 -13 7 -9 -2 -18 8 16 15 4 -14 4 22 -19 -1 17 25 15 -3 6 -14 -16 29 27 3 18 -19 -14 12 -10 19 9 -14 29 -12 7 5 -11 24 9 3 16 0 23 7 14 2 6 -8 3 -1 9 30 -12 12 29 30 6 27 1 28 27 -17 17 -16 25 17 7 15 2 0 8 14 22 12 12 -5 12 9 -7 16 -17 -1 -20 29 -14 16 -19 26 24 -16 4 3 -7 4 -8 16 29 -6 23 14 -14 5 -20 0 1 -4 -1 -7 24 14 -11 12 18 12 21 -10 -18 2 -6 5 -13 10 11 29 2 21 -9 -17 2 29 23 22 0 -4 12 -12 -3 0 -2 2 9 -19 -10 1 11 -16 -13 30 -20 -15 30 2 -1 26 18 29 6 -11 4 -14 -12 -17 -19 26 -19 -10 -7 26 23 -19 20 29 -17 25 23 16 6'

Spectral_Vector_2_Peptide(spectral_vector, mass_AA, AA_mass)

'CAGSAGGCGP'

# Week 5: Resolving the T. rex Peptides Mystery?

In [16]:
'''
CODE CHALLENGE: Solve the Peptide Identification Problem.
Given: A space-delimited spectral vector Spectrum' and an amino acid string Proteome.
Return: A substring of Proteome with maximum score against Spectrum'.
'''

def peptide_mass(peptide_string, AA_mass):
    total_mass = 0
    for AA in peptide_string:
        total_mass = total_mass + AA_mass[AA]
    return(total_mass)

def peptide_identification(spectral_vector, long_peptide_string, AA_mass):
    
    if type(spectral_vector) == str:
        spectral_vector = spectral_vector.split(' ')
        spectral_vector = list(map(int, spectral_vector))
    #spectral_vector = [0] + spectral_vector

    min_peptide_len = int(len(spectral_vector) / max(AA_mass.values()))
    max_peptide_len = int(len(spectral_vector) / min(AA_mass.values())) + 1

    max_score  = -float('Inf')
    max_string = ''
    for length in range(min_peptide_len, max_peptide_len + 1):
        
        for position in range(len(long_peptide_string) - length + 1):
            
            short_peptide_string = long_peptide_string[position: position + length]

            if peptide_mass(short_peptide_string, AA_mass) == len(spectral_vector):
                
                peptide_vector = Peptide_2_Vector(short_peptide_string, AA_mass)

                
                score          = np.dot(peptide_vector, spectral_vector)
                if score > max_score:
                    max_score  = score
                    max_string = short_peptide_string
    
    return(max_string)

# Test
spectral_vector = '28 -2 -20 4 16 28 20 -2 -11 -1 7 1 -15 12 -10 -9 6 -17 27 10 4 30 5 -17 -8 -14 25 21 -5 -7 26 10 26 -3 24 17 12 -4 4 15 29 -7 17 -16 -13 20 -2 3 -9 -17 2 -18 -19 0 -17 26 -16 23 9 -9 5 6 18 -13 -20 -10 9 -15 -10 4 29 -2 -10 8 0 -20 -8 27 22 1 -7 25 6 16 6 -13 28 -16 -6 15 -14 28 22 20 16 7 -12 -19 30 8 6 29 -9 13 28 27 -16 -16 -16 -3 19 -10 -12 -9 -20 -3 -10 -13 22 30 28 6 11 12 -10 -12 8 26 24 26 18 -5 25 9 -3 26 23 -4 -7 17 -14 5 1 -11 10 -19 0 -20 -17 10 -13 -10 -5 -12 -17 27 27 13 28 4 26 9 25 -10 20 -14 10 6 2 -14 5 17 23 -1 29 28 -2 8 -13 14 8 -6 -3 -18 -12 -16 23 18 11 -11 4 -4 25 16 -17 21 25 27 -1 -12 21 12 6 15 20 3 26 -4 29 16 8 -15 15 11 21 17 -6 27 28 12 18 25 15 25 8 -5 23 23 -16 14 20 9 -6 25 11 -3 -15 -14 4 29 5 -20 2 -1 -5 14 -9 -20 0 -7 14 17 -9 17 -18 9 10 -20 -2 -10 5 15 -16 -6 -16 18 30 22 -17 -11 0 -7 21 10 -3 -1 -8 3 20 -3 -10 14 8 17 -11 -7 -11 -20 28 16 -12 2 22 -2 5 5 -20 4 24 -9 -15 -18 -1 -19 8 -9 -17 18 -17 14 13 9 9 9 -14 0 -13 -5 8 -15 0 13 20 14 3 26 -7 -12 -3 26 10 -16 -14 13 10 -16 12 11 2 -10 -8 -13 5 -3 -14 10 -9 13 16 -20 -9 -12 22 -4 -14 24 -17 0 2 -5 -15 -5 -11 -9 -10 -3 -14 4 15 20 11 18 -15 21 28 -5 9 11 -5 -7 14 -5 28 26 -20 17 12 -18 -7 6 1 5 8 -18 -19 -3 17 13 -11 6 -18 -14 -14 20 19 5 -10 -7 -20 6 -8 2 21 19 -8 -8 21 7 -1 2 -8 0 19 3 -2 -14 13 18 18 -5 -9 -8 21 -18 -18 -18 19 -5 -18 -10 -3 -7 30 -13 20 3 20 -16 1 -13 26 -20 23 13 -16 -9 26 17 23 -16 -13 -9 13 -18 -6 -19 13 -7 -16 -15 -1 -13 1 16 1 9 28 -5 3 23 9 11 28 16 30 -1 6 -6 18 -12 21 29 26 29 3 -7 12 27 28 -11 28 6 10 13 12 10 22 -17 -17 7 -11 30 0 -20 13 8 -7 -3 23 -19 -14 -13 3 12 4 21 1 23 6 -4 18 14 -20 -7 3 22 -8 24 -7 2 29 26 24 -10 7 6 -6 -16 -6 24 -3 -11 -12 3 15 29 16 1 -4 8 0 -18 -16 8 -2 -2 12 21 2 28 -8 -5 -4 21 23 4 -8 -18 -7 13 12 18 12 2 13 2 -2 25 11 28 -4 17 16 28 14 27 15 -12 27 -11 -1 3 -3 6 -18 -3 13 -4 14 3 -1 -4 28 -14 -20 7 -1 28 -1 -19 -12 -9 21 9 9 -14 30 24 -10 25 -9 -10 -16 -13 -3 4 11 7 -3 16 22 8 -9 16 20 -18 14 -13 4 -16 -5 17 -17 -4 20 -7 -14 13 14 -15 -14 5 -17 -18 -15 5 -10 11 -9 21 -5 10 -3 5 -3 -15 1 -18 6 2 21 23 11 5 26 -3 -8 3 -13 -7 -17 1 27 4 19 20 -2 2 -15 -17 -18 -10 -15 9 11 25 -6 7 15 24 19 21 -15 -6 21 8 13 15 20 9 8 -13 19 14 11 10 -2 2 16 -5 -6 25 -5 -6 27 16 11 18 -17 -8 10 14 24 -11 -10 -9 -19 27 22 -1 23 19 -4 -12 -18 21 27 -17 27 -10 15 28 -11 -10 -10 -16 28 1 23 -2 18 1 -10 -15 -15 -13 12 27 -16 12 7 15 -1 28 -5 -2 -18 13 25 28 -11 -6 -18 -16 -8 16 9 -12 -5 10 -16 2 29 16 -16 -7 25 13 -19 -8 19 3 26 14 2 -2 -1 25 -14 -13 -17 10 5 6 -3 18 -10 15 8 -11 19 12 9 14 -1 22 17 -4 -17 10 27 22 16 -10 -1 19 -9 14 -2 -1 -15 -9 23 28 -12 18 30 24 28 28 13 20 21 -1 -16 26 2 8 13 24 -13 4 25 9 26 29 -16 11 -9 30 8 20 16 26 -20 -2 30 4 -7 13 22 -5 -4 -14 6 9 24 -19 11 -16 -1 -18 -2 -5 -20 -5 25 30 -6 -1 19 11 -8 -9 25 0 -13 -13 14 21 26 -5 -2 23 14 5 28 -9 14 29 5 0 5 -15 29 17 13 1 12 14 23 -9 18 22 24 -10 13 20 27 2 20 -14 16 -18 25 -15 16 -2 -16 23 23 -10 27 2 -19 30 20 -15 27 -2 -2 -8 -5 4 -20 -1 -3 -15 -20 -20 -9 26 4 3 14 -3 -14 1 -14 4 0 -16 12 20 -20 -1 8 -15 22 18 -6 14 13 -16 -7 1 -12 10 0 15 -10 17 2 -17 6 -7 28 12 -16 -4 18 4 15 28 0 4 8 -13 17 -15 9 14 8 24 -3 -9 10 -2 8 7 10 4 6 3 -14 19 11 -15 8 4 29 21 -16 23 20 2 -6 29 15 24 3 26 10 -20 0 15 20 3 20 25 9 19 -2 -15 4 -19 7 7 -13 12 20 3 17 -18 29 -8 -17 1 3 -16 -3 -12 26 15 -17 15 16 17 23 -3 16 14 -3 9 3 -14 -8 -2 -19 -4 29 -7 -2 9 -16 22 29 5 19 2 6 -14 -12 -13 -18 14 -17 1 20 -18 19 23 -1 2 -20 -5 7 -20 -14 26 7 14 -13 11 -1 27 11 13 8 19 18 24 3 9 11 22 5 16 -19 21 14 0 -19 0 3 26 15 30 12 -17 18 3 -5 -18 7 -11 27 -5 -18 23 -16 16 12 -18 -9 13 -1 -5 -6 17 -17 -15 -2 -18 1 -1 2 -14 -2 11 28 -3 26 -12 18 4 -6 22 3 6 -4 -11 -13 19 3 -14 -7 4 23 3 13 29 27 -2 -17 17 25 -15 25 30 11 18 1 27 -14 -7 12 -12 -6 14 -17 22 -15 -17 -2 -1 11 10 18 15 -14 26 17 9 6 -11 22 18 16 -1 23 11 15 -19 -11 -4 28 -8 17 20 21 -1 10 -7 -2 16 -9 30 -12 9 -4 -17 1 22 21 7 -9 19 -9 23 15 26 24 27 -5 18 -8 23 0 -12 -10 2 1 -14 0 23 8 -5 21 11 -2 14 12 8 28 23 25 -9 8 -1 -11 21 11 -16 23 15 26 -9 24 -1 20 -18 3 -3 -14 2 5 13 -15 -14 -17 21 28 2 23 21 5 11 21 20 17 -16 -9 -17 -13 13 -19 -7 16 -12 13 6 25 15 -16 -13 6 -19 11 -15 25 18 18 -15 -13 -13 -9 3 -15 19 9 -15 -12 21 28 -4 17 -8 13 30 16 -17 11 0 -17 -7 1 16 29 4 11 -7 -13 15 -1 7 23 -12 6 -4 25 12 30 -9 9 -11 6 -10 25 21 14 -17 3 17 -1 -13 26 -17 25 -3 11 25 -17 8 15 26 23 25 -14 3 18 14 5 8 -15 29 -17 29 27 -1 -3 13 -2 29 13 12 -18 -6 2 -15 -20 6 3 -20 11 -17 13 -19 -5 22 -2 -20 7 -6 4 22 -20 -3 -10 2 0 9 -7 11 2 27 -11 -1 -7 30 12 12 -10 -15 -6 18 23 17 -16 -3 10 -18 -19 26 -2 30 21 6 1 22 27 17 13 14 -16 -5 0 10 7 -7 28 11 16 -2 26 28 16 25 27 11 29 2 16 -18 -14 18 5 14 20 19 0 26 5 -10 11 9 -3 29 21 21 3 21 -5 -5 13 -7 26 0 -7 28 -17 -11 12 1 -11 -14 -19 13 20 -8 -5 14 28 -12 12 -18 4 10 8 13 18 7 -17 30 5 12 5 29 17 26 -7 8 11 9 6 3 7 -16 -3 24 9 26 10 15 -15 3 -8 -2 -3 -11 4 16 -4 -11 11 -9 8 5 13 21 20 26 26 27 -8 -11 -10 22 27 -19 16 16 -17 7 -17 10 -9 22 -5 9 12 17 -7 11 -14 -15 -13 8 30 -16 14 -18 23 23 17 -6 29 12 17 -4 -11 2 28 29 -12 -12 -3 -11 -2 -7 12 11 4 6 -4 28 -5 16 -10 6 12 20 -16 -8 30 -9 -10 27 23 16 29 0 17 8 13 0 -10 5 -12 26 19 -11 -7 -1 -6 -1 0 15 25 7 -13 -10 -5 5 -18 27 28 -15 -1 -20 10 5 28 27 2 -14 17 28 22 3 -18 3 23 17 26 20 30 -7 -4 22 -3 -11 -8 -15 29 -5 -7 13 30 -9 21 -18 -12 -18 25 27 -1 6 2 -2 1 7 28 13 16 1 -6 26 30 -19 -8 18 -18 14 20 -11 28 15 -14 14 6 15 -10 -17 1 14 21 11 13 1 1 -6 3 -14 4 25 -3 15 15'
long_peptide_string = 'FQNQPTAGLSGQQAMRGNNYTTHLMLIKYRQNTYARWYFEWGSTAMLSTRFTMHYPELCNTYERWPQHTTMWDHKAENSIPSEYAGCFELWVDKWGMLTFSDYRHRGGSFIYTIPTQMIPIGECEFIWTSWYILLLIEIRYQPETHWTEPQLHDDMREYSKNMPRKCVSFCGKRIYIPHPFGAYSCITFVNEYEANTLHDKSNSSTPNVGAYIFKKKHHTYMNANTCQTQEGSIMHIRYDACFGFGQQPTNVDEVRKYWWVENCQGVICYYYCWGGDRDPYARNIHNEERCRAVLDDGALGPYRVKKYTFLPPYCPCWQHDVFYCSEHSMFTRYYNQINPYWGGWANKFQRKRNFMHCAKPWHGWKTRWWRETMCEPDPMCGSREWCKAFALWECLKFARHHRNLVMVMTYPEIDPIPADNGCNDCARYSCYIDIWYCVDTIPDGGCCWTDIWLTLNENATIKYPSVKFMDMFPSHEFVIWIPHTNIRRRYRNLTLEEVQKWFRMNTFRRTTVRLRWFCNILWFTQEDFENGTDVSDQMVYWNQGNYSTGWEPYIPLIFAIFECQRYYPSTCIAFPNWVQCGESAMYSYKRASRYYTIANATAIRVWIMQAHLWILQWEPQDTNMYMHIFNIGQYPILKSLGEIRVSPSLCTFAHFRRAYYALEGPRIHKRRFVAKMWVFHEYLMGKEEIIAEMSNASVPHFHFYGRTQIMEYDLRGGGGTQWMYSMPNYRIPHCVDPDNKCIENVQPFIENEMSRWMHLVAPAFHSWHIFSPQMKEPFQAGASVQTNTGYNHCSVEVHHVMEINAMHNDETVGFRSEEGPQLVQVTAVNFFGHQPSIMFLDFVQTDYAVTSKWTMKIYQDSYQNQFYEHDKISAQRQNGMVRYFICYPGNARPTTIVMHCSVYRKTCQSGIDDEETYAMNFYIDTKLCSYIQFHIKKNMMAQRLDHRWEQTDHIRIDHVTKCSNAALINENDQQTGTNMDAIGAVTNSAHSVYRRCAMVPYIYCEMGGDPNGIEFMKGGERVENPFMYDEKWFFWIWATLIIHHFDWRGSNKDAPTWIPRHSQRVHCVINHQQIDQWDDGDSDEPADVGFRTEWQHHSIGNEAANLQKEWWRRNVCLIYGKYESFYSCDDPYRSLTWWCIGIRMEVPEQTVNIAIFIGVYGCTRGGKTMLGHSKGNGCQEQMGELIMPYPQQVYAVLGNIMDVHGYVKVWLITYHGLRKAGDVLSIMWPHAQHVMHNVFLKCPIAKVNQANYDEATKAMNGFPMHLHMRFFNYDWFCWCGVGHRTPRNKAEKKTNAHSHAFPQHEVDLFKNYGSVMVVLTQCVLNMPWYKRERISVHVHKTQHKMWYCYYFTPTIPCYNMAWSYFPYYYKHWMEHPGELLFCEFHVMKEHASKYGNNCREAHEQPAIVDSSRMMPSIQAPPWYDFVIMVFYVIKCHHIWYLTQTEGTYVVIHIWLTPAGHRDVSTFWKAYVEPGFITTPARCFQKHWMRKVDLTCCQQHENKFENKDANRYADHKEMELFHRTFFRCNIGWPLIRQYAMGQGVNSENSHGPFLHPSDGETWMPCKAYDIVILMASKDSCPIQKYYIMGSSEFASDMWFAVCAHFSDWKLRFNSLIRQDVECSRIAVDEYPDFPWSKIKRHQYMETIADLLSHNTQDQGGFMFPLKHSVQMSRFSMYLDHWECHVPWVEPDTRDPTCNQACMMVDFEHMSIKNERDLYDRYIMILTHQNHKVDILMFNTTAVVLPDNDMHKRESRHCVNYSMNQAPQYEHAYHQIWTERLDISYQDVFLLFSPYIHLTECTKQLWIHQKPIAQAKPPTDFSCPRTDYWQHNALTMMGALYFWRGMMLPKPHCLVDEHWMAHMDQDLDHAYIRWRETHSMHGFVGDMDIIRIDEWSWGFWKCVNVGKLALHHWASWSFFPHYLYMVHCKSQHRCDLKSAYGLHEVVKWFYFPGAMQCMFWKCQHYYMEAPYIDMQMQRLWQDKMMPPNFQNCFSKFQLKYGYCYRHHECEVHTKAHCRPDRWFWNVKRIHSERNSWGLVGFQKAIQGSSTGTMLSFNCLWIYNGFSHREHRTCWYLPWNDNTAPRAETRPWGEEYKTMETGHPEVIDPKKVWYAHMMIHDAPHQRRKGKANYLMGLMKENRCDLTEEHHE'

peptide_identification(spectral_vector, long_peptide_string, AA_mass)

'IWTSWYILLLIEIR'

In [17]:
'''
CODE CHALLENGE: Implement PSMSearch to solve the Peptide Search Problem.
Given: A set of space-delimited spectral vectors SpectralVectors, an amino acid string Proteome, and an integer threshold.
Return: The set PSMthreshold(Proteome, SpectralVectors).
'''

def PSMSearch(spectral_vectors, long_peptide_string, threshold):
    if type(spectral_vectors) == str:
        spectral_vectors = spectral_vectors.split('\n')
        for i in range(len(spectral_vectors)):
            spectral_vectors[i] = spectral_vectors[i].split(' ')
            spectral_vectors[i] = list(map(int, spectral_vectors[i]))
    
    min_peptide_len = int(len(spectral_vectors[0]) / max(AA_mass.values()))
    max_peptide_len = int(len(spectral_vectors[0]) / min(AA_mass.values())) + 1
    
    short_peptides = []
    
    for spectral_vector in spectral_vectors:
        short_peptide_string = peptide_identification(spectral_vector, long_peptide_string, AA_mass)
        peptide_vector       = Peptide_2_Vector(short_peptide_string, AA_mass)
        score          = np.dot(peptide_vector, spectral_vector)
        if score >= threshold:
            short_peptides.append(short_peptide_string)
    return(short_peptides)

# Test
spectral_vectors = '''20 -15 20 5 -10 -10 18 26 23 -1 20 10 -10 -7 -15 22 1 -15 23 9 11 1 9 14 -20 25 12 -15 -7 24 -10 -2 -6 23 -14 5 -8 -12 -9 12 -7 -7 11 23 -12 9 -10 9 17 -5 22 -14 2 28 -19 9 -13 4 18 -18 28 4 18 -20 -5 19 11 11 17 -10 -10 5 22 14 -1 -7 -11 23 -7 21 23 13 18 -2 21 20 8 5 28 -11 -9 -12 9 27 -19 16 -7 6 5 27 7 -2 -7 3 -5 -14 20 28 22 13 5 -1 3 2 1 -16 3 28 -12 23 -6 29 13 22 -8 1 -20 -6 -6 20 0 11 30 3 -8 15 18 -15 -5 -9 17 13 10 -6 29 13 16 -5 9 0 2 -5 4 -14 16 11 21 -16 29 8 -19 -10 -20 -17 6 23 -4 -15 3 -12 5 30 30 26 -1 19 -7 -11 9 19 26 6 15 15 -16 11 18 22 25 5 -6 -1 0 15 15 -6 -20 4 10 18 7 25 3 -10 12 15 -4 6 19 26 -14 20 5 29 -9 3 13 10 -3 23 1 -15 3 -6 -6 -2 -13 -1 -12 17 -20 -11 13 -9 23 21 -10 1 9 17 3 9 15 16 20 14 16 15 8 5 -6 -8 18 6 11 11 7 8 29 18 12 18 -1 15 -16 7 -16 -2 -3 -4 5 -10 12 10 1 30 16 -19 6 2 12 -1 -14 27 11 -3 17 13 28 4 -1 -15 -9 4 29 1 -9 26 3 -13 13 -7 30 23 26 0 16 -5 0 12 8 26 12 5 -2 13 12 3 14 15 -8 -11 9 -5 23 2 5 23 -7 22 -4 16 -19 26 0 -1 0 20 -3 -19 11 27 7 13 -11 27 5 17 28 -13 21 18 -10 25 27 18 2 1 15 -19 20 -20 6 -17 -20 23 -7 -6 -1 -8 5 26 -10 14 3 -1 29 27 14 -15 21 -1 29 -8 -7 -18 2 15 -9 9 19 -5 23 15 -16 8 11 28 -7 6 -6 -3 3 7 21 13 6 19 0 28 4 22 14 12 30 23 20 -3 2 -14 18 -2 12 -7 -13 29 -12 10 16 -1 -4 5 -19 14 1 8 -8 3 -16 9 10 -3 17 5 -17 16 30 7 0 -18 25 12 1 17 -11 -4 6 1 25 -7 -7 18 9 18 22 7 18 -2 -3 30 28 7 -10 -6 -11 -12 18 -10 15 -15 -11 -16 -11 12 -19 -12 25 -9 -17 22 -3 26 -9 3 23 13 19 12 -20 24 27 23 14 20 -8 25 -4 21 9 21 0 5 29 7 -1 -6 10 28 -2 23 -19 2 13 -14 15 14 8 4 30 25 -11 1 15 12 10 18 13 27 -9 24 -4 -17 -20 -8 -4 11 6 -8 -6 3 19 19 -2 -2 -9 3 8 7 -7 23 29 -11 23 0 -7 -4 9 -4 11 16 5 -12 -9 -12 15 11 -17 21 26 -8 19 16 4 18 14 -1 23 -15 18 12 17 -15 -18 -17 17 7 -7 22 -19 22 21 1 -10 18 28 14 20 -6 -5 -12 1 -7 -15 -2 23 7 15 -7 7 -8 26 -3 9 22 19 -5 17 9 -2 -19 13 -20 15 28 20 6 -10 7 29 -9 -9 5 -11 -10 0 -15 -13 -6 -9 8 -9 20 27 -16 3 -6 25 26 -4 21 -9 -9 -14 30 -19 28 -11 14 -13 -6 -8 -20 -2 25 9 12 9 -11 -15 14 -11 22 21 17 -3 -3 5 26 -10 -18 26 11 3 -11 -3 12 14 23 29 -9 22 10 -5 -3 -12 24 3 28 29 4 22 -5 -5 -19 -13 10 -15 12 15 17 17 2 21 5 -18 -5 10 29 -19 -2 -15 30 14 6 -1 7 27 -13 3 14 -10 10 18 -11 16 10 -13 15 25 4 -13 -15 -13 5 16 9 15 -5 21 19 -12 15 23 -17 6 22 17 -3 0 27 26 11 4 -12 15 -11 12 30 -15 3 -1 28 -13 -16 -4 17 18 29 -15 5 28 4 16 8 13 -8 13 1 -7 18 14 -10 -5 20 13 -8 -1 3 -5 4 23 -12 22 -18 27 23 12 14 -4 -17 23 -9 27 -13 16 -9 12 19 -11 30 24 -11 -9 19 10 12 6 -13 21 9 -5 13 14 30 13 -5 -9 19 7 -4 -14 -3 2 17 23 -11 -13 -4 25 23 -9 9 -14 -8 1 8 -10 7 4 -13 -18 -10 -13 29 13 21 8 -10 26 30 -11 11 -7 -6 1 17 26 5 22 15 -4 24 23 17 -17 15 -9 -13 12 -7 9 2 -1 -13 -8 20 11 -4 25 -4 20 2 4 -16 -2 -10 -20 -7 19 4 -6 11 -12 7 -10 -12 -20 4 -3 25 10 12 9 17 1 -4 2 20 24 14 -17 5 -7 10 -1 -14 -19 3 30 19 15 28 14 12 13 9 -13 28 -12 20 5 21 20 -2 7 -3 8 2 -9 -3 26 -14 -1 17 -5 6 -3 23 -8 0 4 11 2 -17 -12 -9 5 15 13 20 4 -11 18 19 -12 12 26 -7 -20 -15 14 -7 12 17 8 -14 -5 -18 -4 10 4 0 12 18 12 11 30 -19 -1 8 -9 24 5 -4 9 1 2 -15 -18 -13 -8 9 22 6 5 22 -2 1 8 7 -10 14 -10 30 14 27 27 -19 -3 4 -18 -4 -12 21 -3 -11 -19 8 18 16 14 11 -19 -13 -18 -19 4 26 -10 -7 -10 -11 -1 16 -12 21 5 -18 8 25 -7 6 25 -14 22 29 -13 17 2 -6 -19 -18 23 18 6 6 27 23 -12 18 29 -20 13 -2 7 15 -8 -10 10 2 -14 12 -12 -3 3 16 9 9 -12 18 19 29 3 -11 29 -8 -19 -6 10 14 -4 27 26 -16 -6 -12 29 10 -10 -6 -19 23 -12 -18 -17 -7 27 -15 25 25 -12 0 10 -6 15 5 17 -4 -20 -9 20 -2 -6 -10 -13 -7 14 -6 12 2 17 6 -17 26 -14 -19 4 -8 -15 19 26 28 -3 22 22 -16 -12 25 10 -13 19 11 21 10 3 -4 5 9 -2 6 -4 -19 -15 -1 19 -7 -17 30 -18 -18 -13 26 -7 26 25 9 3 4 19 13 26 -4 -10 -16 27 6 30 -5 10 -18 -1 -5 18 -19 23 4 23 -2 13 3 16 3 30 -10 7 22 17 17 16 21 24 25 -10 -13 -13 -18 -20 -1 -6 27 18 -9 12 10 -2 -14 22 -5 -4 26 21 28 -7 26 26 9 -6 -16 11 -18 0 23 24 9 -18 17 10 14 22 -7 7 27 14 26 23 9 16 27 18 11 -11 14 11 -11 -9 -12 13 2 1 3 -1 -18 14 -5 23 15 -9 30 12 28 2 3 29 -6 10 28 23 21 29 27 30 20 -16 3 -15 21 -7 4 -5 26 23 5 -19 -16 -19 26 15 6 8 1 -10 -3 23 12 -19 20 15 25 18 16 -14 -17 -1 4 17 24 14 -10 22 28 -19 -9 19 24 22 6 4 17 -13 23 -10 25 14 -12 4 12 9 29 4 7 20 7 -11 26 28 -8 28 3 -7 28 -5 1 1 24 7 27 4 -10 15 -8 -1 30 20 4 -12 14 -14 21 5 10 -4 4 13 -7 27 21 -13 -4 -1 -8 0 29 -12 -9 24 13 -18 22 27 -17 12 10 -11 -5 10 2 20 -12 30 2 -3 -18 5 5 4 7 -19 2 20 13 -8 -7 -8 -1 -12 16 10 -17 27 12 20 26 8 27 12 -1 -5 8 5 2 10 -12 4 11 20 16 -11 6 12 25 1 28 -6 -6 -1 21 23 -18 26 29 3 -1 14 -6 -18 26 -1 24 0 0 -4 14 21 2 -1 11 29 18 10 6 25 -10 9 11 17 10 -2 -13 29 24 16 -19 -18 17 -17 10 6 21 21 10 8 -4 15 15 -14 -6 -16 4 11 1 -18 17 -11 -3 -15 18 0 -18 -5 14 8 4 13 -15 -7 4 -2 23 28 8 1 10 -6 -7 4 22 -20 11 -17 -10 -18 22 12 1 1 3 -14 30 -18 26 27 19 28 -10 23 23 18 -12 6 20 -9 22 -17 2 -12 -3 0 28 14 -4 -1 -3 -5 28 8 19 22 16 -8 -8 7 3 17 19 23 17 -12 -16 -5 -19 15 -11 25 22 6 -7 0 26 29 22 14 -7 -15 -20 18 21 -19 26 27 27 14 21 -6 -9 6 9 -2 13 28 -16 23 12 -18 15 -19 12 15 18 6 9 -1 12 -10 17 -14 26 -16 -7 -6 7 19 -15 1 -8 23 -7 28 22 -17 15 22 -19 3 -9 -2 3 -7 15 -6 12 17 6 -11 20 -20 19 -1 19 -20 19 5 -3 26 16 0 0 17 22 -7 8 -2 -9 2 30 18 19 -3 -15 21 -12 14 29 17 8 5 -1 19 7 -8 -13 -16 29 -7 -12 20 16 13 -11
-18 -7 -9 -1 27 -15 -13 -16 19 5 11 26 4 -19 -11 -3 -6 12 6 10 9 -2 9 -19 12 -8 -13 -12 20 20 15 -5 -15 22 -10 -15 20 30 30 -6 4 -3 10 20 -17 0 -7 6 2 28 -3 25 23 -19 7 -12 25 24 10 14 2 26 -18 -18 -13 -3 -16 -2 -2 28 28 18 -2 6 3 29 13 -20 1 -10 21 -5 8 10 23 23 -11 22 -3 11 14 -2 26 21 -16 24 -19 11 17 10 24 0 -14 27 26 10 -3 -12 5 21 -5 16 -3 -2 -17 28 -16 19 -3 16 -9 -14 30 15 -1 -1 29 25 24 3 23 30 28 23 -1 21 12 0 -20 22 -4 -5 -3 -11 25 4 28 -16 23 25 -9 13 12 -9 2 -12 -6 -19 -2 -5 17 -16 -14 -8 -15 0 11 -13 21 -1 -12 12 23 11 5 -11 -12 8 14 -12 10 15 17 5 7 18 -18 26 25 19 22 -15 18 -8 -8 1 20 7 18 27 21 -10 1 0 3 24 21 -10 6 1 4 7 4 5 -3 22 14 -12 -3 5 -9 15 1 -1 -18 -14 12 1 0 14 -2 11 -1 -17 -9 -19 14 -9 30 -4 1 -4 20 10 13 21 -14 -10 5 9 7 5 -4 -7 -16 19 -6 13 -11 16 -3 -18 -5 28 13 -20 18 30 25 3 -14 14 -20 -5 2 11 17 -1 9 -5 -20 20 26 1 7 2 12 17 -6 22 -2 19 2 18 4 -19 2 -15 -6 20 9 -8 -19 12 26 18 12 -13 8 4 7 2 0 -1 -3 11 24 -5 3 29 -6 -2 -18 1 -15 3 10 23 -4 -17 -20 11 -14 -13 4 6 30 -9 16 12 12 1 12 17 -15 -19 25 17 -9 5 14 7 -12 19 27 28 -4 28 28 -20 8 -2 9 -20 16 27 9 -15 -18 -20 27 -6 14 24 20 4 -14 -6 -10 15 -7 -16 -6 -17 28 -12 10 -3 3 24 -10 -15 13 18 29 -15 -4 -16 6 28 6 21 28 25 5 2 5 -3 -4 6 4 29 -16 15 30 -13 6 -18 16 12 28 25 2 26 14 6 -6 -12 -7 -10 -20 12 14 18 26 -3 -17 27 -2 8 25 -6 23 9 19 7 16 1 -11 -17 -15 1 1 -6 -6 -10 -14 -6 8 3 -19 -13 23 5 17 2 11 23 -18 -4 11 -10 1 21 5 29 25 18 17 21 0 17 29 -16 18 23 23 -9 5 -20 -15 25 -3 -20 27 5 -13 -13 -4 -6 7 -19 -1 -6 9 -18 24 5 -8 -10 25 7 -12 -16 11 -19 24 25 -16 20 26 -11 -6 -20 30 -10 -9 26 2 -5 -6 1 10 -6 3 -18 -20 -4 23 3 -12 -7 -20 6 -2 24 4 5 -2 29 26 -5 30 18 -5 15 16 29 -13 -10 -13 -12 19 22 -2 7 27 8 8 10 13 15 7 -8 5 30 26 20 11 18 -20 -19 -4 -17 -17 -9 19 -16 19 4 28 26 17 30 16 -14 -2 0 -11 -12 -19 24 13 6 -8 13 2 20 28 -4 2 16 0 12 5 -4 -18 9 26 5 -19 -7 -19 -1 17 -1 -10 13 8 -3 30 -2 19 24 6 -13 -5 9 14 22 10 24 -5 28 20 -3 -5 11 27 10 13 27 -12 28 -19 -7 20 -8 -16 16 -20 -8 27 -18 22 9 -16 -8 28 6 -4 -5 -10 17 9 -16 -2 -14 -12 -19 27 -18 21 25 -7 -18 29 2 -7 13 9 -18 29 24 -6 30 -17 21 16 -10 -18 23 4 -12 -3 5 4 19 -19 -18 5 25 -4 7 1 9 -5 14 27 14 -11 -6 -14 18 -19 -16 -4 24 -17 27 1 17 19 18 15 20 -2 8 -6 6 1 -11 30 1 -7 -12 26 26 17 17 -18 -19 10 2 -10 -10 -5 -10 20 30 0 12 -17 -13 15 23 2 30 10 14 22 -12 -7 24 8 20 14 -14 6 18 5 -6 -3 25 4 3 4 21 2 -18 7 12 18 -1 -18 0 -1 -19 8 -2 3 11 0 -2 -2 -20 -16 12 -15 20 -8 7 -4 21 -12 24 16 5 23 10 -14 -15 26 14 -9 23 -14 24 -6 6 10 -7 25 -4 24 9 15 -16 -4 28 -1 23 16 -9 -16 13 30 -13 27 21 -20 -14 -13 -3 -6 -18 -12 27 -12 10 -20 12 29 14 15 24 1 -13 1 -19 2 -6 -7 7 -1 14 -12 -6 -13 -13 -5 12 9 22 -15 -18 7 -13 19 18 -13 -5 -14 5 4 -16 -10 22 23 27 -4 -2 7 -10 11 -7 11 -9 -2 -6 25 0 28 17 19 0 24 -8 21 1 29 -7 22 29 -16 -12 1 -1 20 -12 6 4 19 2 11 -17 13 -1 10 30 21 0 23 26 14 -18 11 3 11 -10 -17 9 -19 11 -20 -8 -7 -1 -9 15 -3 -2 12 24 -10 27 -17 -13 16 9 -18 -16 -7 -9 27 9 -7 29 -8 -20 14 7 10 -1 -13 17 -4 -4 12 1 -19 -8 -8 26 -10 0 -5 3 14 10 -4 -18 11 -1 -3 9 12 -11 -18 -6 4 10 -13 15 22 6 -5 10 6 -4 1 6 -18 27 30 -9 10 -8 5 -13 27 1 1 17 0 27 -10 5 21 3 29 -2 -6 26 -13 7 4 27 28 -14 -7 8 11 14 19 10 19 -16 -7 -8 9 2 21 14 -17 14 27 24 -15 4 5 -8 30 -8 3 -12 6 13 18 -1 12 5 24 8 24 28 -12 23 4 30 27 -3 9 29 -16 7 5 -14 -18 21 -15 12 2 10 28 10 -11 11 -3 -12 9 16 0 9 -15 2 -2 29 24 30 13 8 -5 12 -13 -4 11 20 25 8 3 21 1 29 22 -16 10 -17 -20 3 20 24 14 26 -20 24 -10 -7 18 23 29 4 13 -13 28 25 13 -5 22 3 1 21 17 9 1 -19 -20 -14 13 9 23 15 -17 6 -18 23 10 5 2 15 -5 -2 0 -9 19 16 -17 8 17 -11 6 -10 24 -11 2 0 -12 7 -1 7 3 21 11 18 19 -11 -14 -19 19 -16 17 2 7 22 -13 -17 11 7 23 -15 -19 5 12 -5 16 5 17 -2 -9 -15 -13 14 12 -2 -9 -14 19 2 6 16 30 24 12 -4 9 13 10 -7 -5 19 20 -14 13 -16 30 20 0 12 8 -18 11 20 4 26 20 17 14 -10 0 -11 28 7 15 27 23 -2 -2 16 -1 15 12 -11 -18 11 -11 16 -5 -3 -12 -8 23 15 24 19 -16 13 12 -9 21 -11 -19 -14 18 29 -12 23 4 -6 -18 19 -7 23 15 -14 27 19 -8 28 -10 -19 16 -15 -14 -8 22 -9 -1 14 5 -4 3 -8 15 26 -4 13 5 -4 -14 -15 1 15 12 -14 -1 12 -9 15 14 -4 2 11 -10 -15 7 -1 -3 -17 -16 -20 11 21 26 23 -1 12 19 -3 20 27 22 -12 30 -10 -14 18 -5 -1 28 29 -20 7 6 -8 -10 20 -18 4 -19 2 7 -8 19 -19 -12 -11 -5 -10 5 -4 0 24 21 -15 10 -5 -14 8 13 16 24 4 -13 -15 15 17 25 4 6 28 13 17 7 -17 28 -4 -16 -11 15 13 2 -9 18 -17 -16 -13 17 -13 -15 30 -13 -19 7 -14 -16 21 0 20 -11 18 30 16 -14 15 9 -13 -19 23 4 -13 22 -18 -6 19 2 -3 -7 -13 16 -8 0 18 -17 -3 -9 -16 25 21 -9 17 -1 -4 -5 -9 12 5 20 -12 9 3 -20 30 7 1 -4 15 -7 8 28 11 -5 7 28 -20 -6 -20 4 -15 11 11 -16 9 21 13 -15 5 23 17 13 9 3 7 3 3 -15 -11 1 28 -12 14 -6 11 4 -5 8 21 1 -16 -17 -9 24 13 -18 29 17 0 -11 -12 25 -3 28 3 -16 2 -20 29 24 -17 30 -13 19 -2 -7 -17 8 6 21 11 0 28 23 -2 -10 27 -7 9 0 11 4 23 -11 16 16 9 22 -14 25 6 22 17 22 9 -2 -13 -9 -10 -6 -10 4 11 29 -18 10 29 7 29 13 -18 28 30 13 -3 -12 10 23 9 7 29 -10 1 0 15 16 -18 -11 -17 8 3 7 9 -8 6 17 11 -14 -2 25 3 29 4 1 12 -1 23 27 23 -1 11 -5 27 -6 14 2 14 18 -12 17 -19 25 29 -16 2 23 0 -2 12 -12 -2 1 24 4 16 -4 8 26 23 25 -16 20 5 14 0 29 18 25 -15 -1 13 13 -6 11 -17 11 -20 -14 13 6 6 6 11 22 14 0 27 -9 29 27 15 -15 6 23 16 0 14 28 -3 2 -5 -14'''
long_peptide_string = 'FQNQPTAGLSGQQAMRGNNYTTHLMLIKYRQNTYARWYFEWGSTAMLSTRFTMHYPELCNTYERWPQHTTMWDHKAENSIPSEYAGCFELWVDKWGMLTFSDYRHRGGSFIYTIPTQMIPIGECEFIWTSWYILLLIEIRYQPETHWTEPQLHDDMREYSKNMPRKCVSFCGKRIYIPHPFGAYSCITFVNEYEANTLHDKSNSSTPNVGAYIFKKKHHTYMNANTCQTQEGSIMHIRYDACFGFGQQPTNVDEVRKYWWVENCQGVICYYYCWGGDRDPYARNIHNEERCRAVLDDGALGPYRVKKYTFLPPYCPCWQHDVFYCSEHSMFTRYYNQINPYWGGWANKFQRKRNFMHCAKPWHGWKTRWWRETMCEPDPMCGSREWCKAFALWECLKFARHHRNLVMVMTYPEIDPIPADNGCNDCARYSCYIDIWYCVDTIPDGGCCWTDIWLTLNENATIKYPSVKFMDMFPSHEFVIWIPHTNIRRRYRNLTLEEVQKWFRMNTFRRTTVRLRWFCNILWFTQEDFENGTDVSDQMVYWNQGNYSTGWEPYIPLIFAIFECQRYYPSTCIAFPNWVQCGESAMYSYKRASRYYTIANATAIRVWIMQAHLWILQWEPQDTNMYMHIFNIGQYPILKSLGEIRVSPSLCTFAHFRRAYYALEGPRIHKRRFVAKMWVFHEYLMGKEEIIAEMSNASVPHFHFYGRTQIMEYDLRGGGGTQWMYSMPNYRIPHCVDPDNKCIENVQPFIENEMSRWMHLVAPAFHSWHIFSPQMKEPFQAGASVQTNTGYNHCSVEVHHVMEINAMHNDETVGFRSEEGPQLVQVTAVNFFGHQPSIMFLDFVQTDYAVTSKWTMKIYQDSYQNQFYEHDKISAQRQNGMVRYFICYPGNARPTTIVMHCSVYRKTCQSGIDDEETYAMNFYIDTKLCSYIQFHIKKNMMAQRLDHRWEQTDHIRIDHVTKCSNAALINENDQQTGTNMDAIGAVTNSAHSVYRRCAMVPYIYCEMGGDPNGIEFMKGGERVENPFMYDEKWFFWIWATLIIHHFDWRGSNKDAPTWIPRHSQRVHCVINHQQIDQWDDGDSDEPADVGFRTEWQHHSIGNEAANLQKEWWRRNVCLIYGKYESFYSCDDPYRSLTWWCIGIRMEVPEQTVNIAIFIGVYGCTRGGKTMLGHSKGNGCQEQMGELIMPYPQQVYAVLGNIMDVHGYVKVWLITYHGLRKAGDVLSIMWPHAQHVMHNVFLKCPIAKVNQANYDEATKAMNGFPMHLHMRFFNYDWFCWCGVGHRTPRNKAEKKTNAHSHAFPQHEVDLFKNYGSVMVVLTQCVLNMPWYKRERISVHVHKTQHKMWYCYYFTPTIPCYNMAWSYFPYYYKHWMEHPGELLFCEFHVMKEHASKYGNNCREAHEQPAIVDSSRMMPSIQAPPWYDFVIMVFYVIKCHHIWYLTQTEGTYVVIHIWLTPAGHRDVSTFWKAYVEPGFITTPARCFQKHWMRKVDLTCCQQHENKFENKDANRYADHKEMELFHRTFFRCNIGWPLIRQYAMGQGVNSENSHGPFLHPSDGETWMPCKAYDIVILMASKDSCPIQKYYIMGSSEFASDMWFAVCAHFSDWKLRFNSLIRQDVECSRIAVDEYPDFPWSKIKRHQYMETIADLLSHNTQDQGGFMFPLKHSVQMSRFSMYLDHWECHVPWVEPDTRDPTCNQACMMVDFEHMSIKNERDLYDRYIMILTHQNHKVDILMFNTTAVVLPDNDMHKRESRHCVNYSMNQAPQYEHAYHQIWTERLDISYQDVFLLFSPYIHLTECTKQLWIHQKPIAQAKPPTDFSCPRTDYWQHNALTMMGALYFWRGMMLPKPHCLVDEHWMAHMDQDLDHAYIRWRETHSMHGFVGDMDIIRIDEWSWGFWKCVNVGKLALHHWASWSFFPHYLYMVHCKSQHRCDLKSAYGLHEVVKWFYFPGAMQCMFWKCQHYYMEAPYIDMQMQRLWQDKMMPPNFQNCFSKFQLKYGYCYRHHECEVHTKAHCRPDRWFWNVKRIHSERNSWGLVGFQKAIQGSSTGTMLSFNCLWIYNGFSHREHRTCWYLPWNDNTAPRAETRPWGEEYKTMETGHPEVIDPKKVWYAHMMIHDAPHQRRKGKANYLMGLMKENRCDLTEEHHE'
threshold = 154


PSMSearch(spectral_vectors, long_peptide_string, threshold)

['MGELIMPYPQQVYAV']

In [18]:
'''
CODE CHALLENGE: Solve the Size of Spectral Dictionary Problem.
Given: A spectral vector Spectrum', an integer threshold, and an integer max_score.
Return: The size of the dictionary Dictionarythreshold(Spectrum').
'''

def Spectral_Dict_Size(spectral_vector, min_threshold, max_threshold, AA_mass):
    if type(spectral_vector) == str:
        spectral_vector = spectral_vector.split(' ')
        spectral_vector = list(map(int, spectral_vector))
        spectral_vector = [0] + spectral_vector
    
    size       = np.full([len(spectral_vector), max_threshold + 1], 0, int)
    size[0, 0] = 1
    
    for i in range(1, len(spectral_vector)):
        for t in range(max_threshold + 1):
            new_size = 0
            for mass in AA_mass.values():
                last_i = i - mass
                last_t = t - spectral_vector[i]
                if (last_i >= 0) & (last_t >= 0):
                    if last_t <= max_threshold:
                        new_size = new_size + size[last_i, last_t]
            size[i, t] = new_size
            
    dict_size = np.sum(size[-1, min_threshold:])
    
    return(dict_size)

# Test
spectral_vector = '-9 14 15 -4 13 -6 4 0 1 -6 14 2 -4 6 15 0 6 -9 -2 6 12 15 -5 -1 -10 6 10 -2 10 12 8 11 5 8 11 1 -2 1 0 3 3 -9 -9 -3 12 14 -4 2 -10 11 2 10 -9 -6 15 -4 -7 -7 -4 -8 15 3 -9 11 14 0 0 14 -3 -3 8 -10 -6 -5 -4 -1 2 4 -1 -8 6 13 -8 -7 1 0 1 12 1 -3 9 -1 6 2 -10 4 13 9 -9 4 -2 10 -8 -2 6 11 6 9 -2 -4 -4 -1 1 2 9 0 9 -5 1 -3 -4 15 -2 2 8 2 1 -4 -1 4 -6 -5 -3 -1 11 -10 9 3 4 -6 -9 -1 -4 14 -10 -2 12 -4 -3 4 6 10 -3 1 -8 8 -4 -5 15 13 0 8 14 13 0 -5 -7 5 3 7 -3 7 -5 14 -1 11 3 15 4 13 0 -4 0 11 2 6 6 11 -5 4 -10 -9 -10 -10 -10 -7 0 4 -6 -2 -4 9 10 8 -5 6 13 10 -1 13 -9 9 -6 -6 -3 -4 12 14 8 15 -1 8 2 -4 7 -10 8 -7 15 -2 -1 4 -7 8 2 10 10 4 -7 0 -7 8 14 -5 8 2 4 9 11 -5 -6 13 2 -2 1 5 2 0 11 -2 -8 10 15 -4 -2 0 -5 0 5 9 7 2 7 -7 9 -3 3 6 4 10 2 14 10 -2 14 10 2 6 9 -1 -8 12 5 4 7 -4 -9 1 -8 13 12 -3 -2 7 2 14 2 1 -8 2 8 4 -2 4 0 10 12 13 14 7 0 -8 -2 9 -5 4 -5 6 -1 15 14 15 -1 -5 -6 0 9 9 -4 1 11 2 5 7 -8 10 0 3 7 -2 4 -6 -6 -7 -9 4 6 10 -8 -2 11 2 7 -9 8 8 0 4 -2 -5 -3 13 -7 10 1 -5 9 10 15 -1 3 -9 13 -3 9 -6 5 -5 2 -8 -5 3 -1 -8 7 8 3 11 -2 6 -5 10 9 13 10 -4 12 7 1 -5 -3 -4 -8 -2 -9 13 6 -10 -8 -4 -9 -2 8 2 -10 -7 -5 -5 -2 1 -6 -7 -4 4 1 11 -3 8 4 13 3 13 0 9 7 1 -1 -3 7 6 -4 -10 10 12 10 5 13 13 0 10 -7 5 3 -1 4 -7 15 9 13 -8 -6 14 14 7 -7 -1 9 14 3 6 -9 1 8 6 -7 -3 15 -10 -7 2'
min_threshold, max_threshold = 38, 200

Spectral_Dict_Size(spectral_vector, min_threshold, max_threshold, AA_mass)

337

In [19]:
'''
CODE CHALLENGE: Solve the Probability of Spectral Dictionary Problem.
Given: A spectral vector Spectrum', an integer threshold, and an integer max_score.
Return: The probability of the dictionary Dictionarythreshold(Spectrum').
'''
def Probability_Spectral_Dict(spectral_vector, min_threshold, max_threshold, AA_mass):
    if type(spectral_vector) == str:
        spectral_vector = spectral_vector.split(' ')
        spectral_vector = list(map(int, spectral_vector))
        spectral_vector = [0] + spectral_vector
    
    prob       = np.full([len(spectral_vector), max_threshold + 1], 0, float)
    prob[0, 0] = 1
    
    for i in range(1, len(spectral_vector)):
        for t in range(max_threshold + 1):
            new_prob = 0
            for mass in AA_mass.values():
                last_i = i - mass
                last_t = t - spectral_vector[i]
                
                if (last_i >= 0) & (last_t >= 0):
                    if last_t <= max_threshold:
                        new_prob = new_prob + prob[last_i, last_t]

            prob[i, t] = (new_prob) / len(AA_mass.values())
            
            
    dict_prob = np.sum(prob[-1, min_threshold:])
    
    return(dict_prob)

# Test
spectral_vector = '2 15 11 15 12 13 -3 12 5 -4 -1 -2 8 -6 6 -1 9 11 0 1 -2 -9 -7 -8 1 -7 -10 -5 2 15 14 5 -4 4 10 -4 -6 3 5 4 10 4 14 4 -2 6 13 0 13 -1 3 7 14 0 -10 8 6 1 -5 10 -7 3 -2 5 -2 13 12 -9 -6 0 11 -1 11 15 10 7 -10 -2 2 -6 -3 4 7 -7 -1 -3 12 4 14 14 10 2 7 11 -1 6 3 -9 11 6 -5 2 1 -9 -10 -5 15 -6 10 11 -1 -5 3 0 4 -4 -1 -6 -1 13 5 2 -2 -4 -8 6 13 4 -1 -5 5 5 -2 9 6 -6 -1 3 15 7 7 -8 0 -6 14 7 2 -1 -6 9 12 7 -8 10 -9 11 8 -1 -1 -2 10 4 -8 -9 -5 -7 1 8 -3 13 10 -10 -9 12 -4 -10 -2 -2 4 -10 10 -8 0 5 8 1 5 12 -9 4 -6 -4 -9 -10 -2 3 -4 -5 5 6 14 6 10 5 0 8 3 -9 -7 -2 14 0 12 12 3 13 11 4 6 13 -4 7 7 5 -7 10 0 -5 3 7 2 8 6 4 12 10 -8 5 7 0 9 6 0 10 7 3 -5 -2 -7 -9 10 -10 -9 5 -4 -10 13 14 -9 4 9 12 6 -2 13 -4 2 13 9 -10 -8 14 15 6 13 8 9 6 -8 -6 12 6 -8 2 -8 5 13 5 5 -9 8 -2 9 -2 9 14 -3 -5 4 4 11 -10 0 13 11 14 -7 -7 11 3 -6 4 -1 13 -4 10 -1 -3 15 3 4 -6 6 -5 -6 5 15 14 9 -7 8 2 -10 1 8 -4 6 -4 11 8 -9 3 -7 -7 12 -2 12 9 14 4 7 4 8 9 4 11 -10 -5 -5 0 9 -10 -10 6 2 -5 9 -7 14 -6 5 9 8 -8 2 -8 5 3 -4 9 7 -8 5 -4 -2 -6 6 -9 9 -3 7 -4 8 3 12 11 10 -3 -6 7 8 0 -2 -8 2 -4 -6 8 -10 -5 8 6 10 6 7 9 -6 0 3 -4 1 12 -5 -10 10 2 7 7 0 5 -8 8 14 -2 -10 15 15 7 1 3 12 14 15 -1 -6 -7 9 1 10 -8 4 3 5 15 -4 11 -8 -8 1 -5 13 -2 12 8 5 -5 9 13 -10 11 0 -8 15 15 -3 -1'
min_threshold, max_threshold = 36, 200

Probability_Spectral_Dict(spectral_vector, min_threshold, max_threshold, AA_mass)

4.7484375000000001e-05

In [20]:
'''
CODE CHALLENGE: Solve the Spectral Alignment Problem.
Given: A peptide Peptide, a spectral vector Spectrum', and an integer k.
Return: A peptide Peptide' related to Peptide by up to k modifications with maximal score against Spectrum' out of all possibilities.
'''

def Spectral_Alignment(peptide_string, spectral_vector, k_modif, AA_mass):
    if type(spectral_vector) == str:
        spectral_vector = spectral_vector.split(' ')
        spectral_vector = list(map(int, spectral_vector))
        spectral_vector = [0] + spectral_vector
    
    score_matrix = np.full([k_modif + 1, len(peptide_string) + 1, len(spectral_vector)], -float('Inf'), float)
    score_matrix[0, 0, 0] = 0
    
    AA_mass_list = [0]
    prefix_peptide_mass = [0]
    current_mass = 0
    for AA in peptide_string:
        AA_mass_list.append(AA_mass[AA])
        current_mass = current_mass + AA_mass[AA]
        prefix_peptide_mass.append(current_mass)
    
    # dynamic programming
    for t in range(k_modif + 1):
        for i in range(1, len(peptide_string) + 1):
            for j in range(len(spectral_vector)):
                max_score = -float('Inf')
                if j >= AA_mass_list[i]:
                    max_score = score_matrix[t, i - 1, j - AA_mass_list[i]]
                for jj in range(j):
                    if max_score < score_matrix[t - 1, i - 1, jj]:
                        max_score = score_matrix[t - 1,i - 1, jj]
                score_matrix[t, i, j] = spectral_vector[j] + max_score
        
    # backtrack
    i = len(prefix_peptide_mass) - 1
    j = len(spectral_vector) - 1
    t = list(score_matrix[:, -1, -1]).index(max(score_matrix[:, -1, -1]))
    modifications = []

    while t != 0:
        score = score_matrix[t, i, j] - spectral_vector[j]
        
        if score == score_matrix[t, i - 1, j - AA_mass_list[i]]:
            j = j - AA_mass_list[i]
            i = i - 1
            
        else:
            for jj in range(j - 1):
                if score == score_matrix[t - 1, i - 1, jj]:
                    modifications.append([i, j - prefix_peptide_mass[i]])
                    i = i - 1
                    j = jj
                    t = t - 1
                    break
                    
    modifications = sorted(modifications)
    
    # insert modification note
    peptide_string = list(peptide_string)
    shift = 0
    for modification in modifications:
        position = modification[0] - 1
        score    = modification[1]
        
        shift = score - shift
        
        if shift > 0:
            insertion = '(+' + str(shift) + ')'
        else : 
            insertion = '(' + str(shift) + ')'
            
        peptide_string[position] = peptide_string[position] + insertion
        
    peptide_string = ''.join(peptide_string)

    return(peptide_string)

spectral_vector = '-10 11 2 6 9 10 12 13 -8 15 -7 7 -9 9 9 10 13 -3 -8 4 15 -10 -7 4 14 -1 8 15 13 1 14 15 1 -8 1 -6 15 10 -4 15 -4 4 14 12 12 7 14 6 -6 -6 9 2 -6 -2 -1 2 13 3 -8 -4 -5 2 3 -5 13 8 -7 14 2 -8 9 -3 0 8 14 3 -5 10 6 -9 14 -1 -5 -10 -9 9 -6 6 2 -1 -1 7 -10 -3 -4 3 3 -3 7 14 -5 13 -8 15 7 -1 5 -9 -3 -2 -2 -8 -3 -9 -8 -4 -9 5 14 9 -5 10 6 4 -7 -6 -3 13 7 8 7 4 -5 -6 -6 -8 2 5 1 8 4 2 13 4 1 -4 12 -8 -1 -5 0 4 -10 8 11 13 0 11 10 -3 12 -2 5 14 9 -2 12 -9 -5 -2 9 -8 -10 1 -3 -3 -5 2 -9 -2 -2 -9 -4 1 6 -8 15 14 -1 9 3 6 -9 5 -7 14 8 -10 8 6 -10 12 0 -1 -5 9 -5 -2 3 2 -3 7 10 14 8 -4 15 15 12 15 0 -6 10 9 8 -7 -6 -10 3 -5 -9 -6 -5 14 15 12 11 4 2 10 -7 -2 9 1 11 5 1 3 6 3 -3 9 8 2 7 -7 -9 9 11 11 4 -1 -10 -8 6 10 -6 9 -3 13 10 -9 -10 5 3 -9 -2 3 -7 3 3 3 -9 -3 -8 13 14 6 15 -2 -8 -1 8 -1 -6 -7 -3 9 -10 11 1 6 -7 14 -8 -2 10 13 -2 -2 9 5 15 2 8 11 5 7 -9 -3 2 7 -3 11 -5 3 0 8 10 -4 0 15 2 1 0 4 -6 -8 -6 -10 -7 0 -1 4 12 -9 10 -8 2 3 9 4 -4 11 6 10 -1 -3 -8 5 2 6 2 4 9 -10 9 -9 14 10 -5 10 11 -9 -7 -10 -1 5 2 -9 11 6 -3 -2 -8 15 2 7 -9 1 11 10 -9 -5 13 -10 -6 6 5 11 5 -10 7 -9 7 7 12 -8 9 -9 -5 2 -3 6 12 -2 12 -6 -8 10 -4 6 -3 6 -7 -10 -7 2 -4 5 -7 13 2 11 -7 8 12 13 2 1 7 -1 6 15 12 5 -7 -1 10 -1 3 12 6 -3 4 -7 -2 -1 14 1 2 14 -9 -1 -5 -5 -4 -6 6 -4 2 -4 8 8 5 1 1 -7 10 14 -1 -3 -9 0 -5 13 4 -7 15 14 9 -5 -3 -6 -5 -5 -9 -6 -2 6 15 11 3 9 11 9 11 -7 -5 -4 -6 6 5 6 5 14 14 0 -7 -5 2 -7 -4 0 4 5 2 11 -8 13 14 11 9 8 6 11 8 -10 -4 15 10 13 11 -10 5 -10 -9 10 0 3 -5 -3 11 1 6 15 -8 -2 -9 4 -7 -4 0 -5 -1 14 12 -6 14 14 14 7 -5 -7 -7 14 4 1 7 2 -10 15 5 2 11 5 4 1 2 14 -6 -7 -6 4 -8 -1 11 9 14 4 -2 -7 -10 -6 1 -1 0 9 6 7 -9 -4 2 13 10 -6 12 -10 -3 15 6 -5 -1 -1 13 3 -5 8 7 -4 4 2 15 -2 12 6 14 15 8 8 11 13 8 5 -7 1 -10 -8 -2 -1 -5 -6 4 -9 11 -1 -8 13 8 11 0 0 14 -3 8 -2 5 -10 -9 -1 10 13 7 12 13 12 -8 5 -10 6 -9 14 3 10 -5 15 2 6 15 11 -9 -8 8 4 -5 0 -4 -2 -9 -10 13 13 -2 10 -1 -6 8 -5 13 9 4 -1 1 9 5 8 9 -8 -7 -5 3 14 6 4 -6 -10 3 13 -2 7 -4 6 2 -1 -4 14 11 13 -5 6 10 -7 7 -9 -7 -7 -6 -10 -3 7 -1 13 -4 -4 9 6 12 1 12 12 9 11 12 6 -10 0 2 -9 -2 -1 -10 15 -6 -3 9 -3 9 -2 -2 7 8 1 3 12 -2 -2 -7 -5 10 13 -2 -4 -4 14 -6 2 -1 -2 8 8 5 -6 5 -4 15 -10 7 2 8 2 10 -1 4 -6 0 14 3 0 15 4 13 6 13 13 15 11 -6 -3 -10 15 14 -2 8 -1 -8 4 2 -3 13 12 7 -4'
peptide_string  = 'YNQNWH'
k_modif = 2

Spectral_Alignment(peptide_string, spectral_vector, k_modif, AA_mass)

'YN(+12)QNWH(-12)'