In [1]:
import random
import copy
import numpy as np
from collections import Counter

# Week 1

In [2]:
'''
Code Challenge: Solve the String Composition Problem.
Input: An integer k and a string Text.
Output: Composition_k(Text) (the k-mers can be provided in any order).
'''

def Composition(string, k):
    """Return every length-``k`` window of ``string``, left to right.

    Windows overlap and start at consecutive indices. When ``k`` is larger
    than ``len(string)`` there is no window and the result is an empty list.
    """
    window_count = len(string) - k + 1
    return [string[offset:offset + k] for offset in range(window_count)]

# Test
k = 5
string = 'CAATCCAAC'

Composition(string, k)

['CAATC', 'AATCC', 'ATCCA', 'TCCAA', 'CCAAC']

In [3]:
'''
String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.
Input: A sequence of k-mers Pattern_1, …, Pattern_n such that the last k - 1 symbols of Pattern_i are equal to the first k - 1 symbols of Pattern_{i+1} for 1 ≤ i ≤ n - 1.
Output: A string Text of length k + n - 1 such that the i-th k-mer in Text is equal to Pattern_i (for 1 ≤ i ≤ n).
'''

def NaiveLink(patterns):
    """Spell the string described by a chain of overlapping patterns.

    ``patterns`` is either a list of strings or a single newline-separated
    string. The result is the first pattern followed by the last character
    of every later pattern; the overlap itself is not checked.
    """
    if type(patterns) is str:
        patterns = patterns.split('\n')
    head, tail = patterns[0], patterns[1:]
    return head + ''.join(kmer[-1] for kmer in tail)

# Test
patterns = '''ACCGA
CCGAA
CGAAG
GAAGC
AAGCT'''

NaiveLink(patterns)

'ACCGAAGCT'

In [4]:
'''
Code Challenge: Solve the Overlap Graph Problem (restated below).
Input: A collection Patterns of k-mers.
Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. 
'''

def FindOverlap(patterns):
    """Build the overlap graph of a collection of k-mers.

    Parameters
    ----------
    patterns : list of str, or str
        The k-mers. A single string is split into lines; surrounding
        whitespace is stripped and blank lines (e.g. a trailing newline)
        are ignored.

    Returns
    -------
    dict
        Maps each pattern to the list of patterns whose prefix (all but the
        last character) equals its suffix (all but the first character).
        Patterns with no successor are omitted. Every value is an
        independent list, so mutating one entry never affects another.
    """
    if isinstance(patterns, str):
        patterns = [line.strip() for line in patterns.splitlines() if line.strip()]

    # Index the patterns by prefix so each suffix lookup is a dict access.
    by_prefix = {}
    for pattern in patterns:
        by_prefix.setdefault(pattern[:-1], []).append(pattern)

    adjacency_list = {}
    for pattern in patterns:
        successors = by_prefix.get(pattern[1:])
        if successors:
            # Copy: several patterns can share a suffix and must not end up
            # sharing the same list object.
            adjacency_list[pattern] = list(successors)

    return adjacency_list

# Test
patterns = '''ATGCG
GCATG
CATGC
AGGCA
GGCAT
GGCAC'''

FindOverlap(patterns)

{'AGGCA': ['GGCAT', 'GGCAC'],
 'CATGC': ['ATGCG'],
 'GCATG': ['CATGC'],
 'GGCAT': ['GCATG']}

In [5]:
'''
DeBruijn Graph from k-mers Problem: Construct the de Bruijn graph from a set of k-mers.
Input: A collection of k-mers Patterns.
Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).
'''

def PathGraph(patterns):
    """Build the de Bruijn adjacency list of a collection of k-mers.

    ``patterns`` is a list of strings or one newline-separated string.
    Each k-mer contributes one edge from its prefix (all but the last
    character) to its suffix (all but the first character); repeated k-mers
    yield repeated edges.
    """
    if type(patterns) is str:
        patterns = patterns.split('\n')

    graph = {}
    for kmer in patterns:
        graph.setdefault(kmer[:-1], []).append(kmer[1:])
    return graph

# Test
patterns = '''GAGG
CAGG
GGGG
GGGA
CAGG
AGGG
GGAG'''

PathGraph(patterns)

{'AGG': ['GGG'],
 'CAG': ['AGG', 'AGG'],
 'GAG': ['AGG'],
 'GGA': ['GAG'],
 'GGG': ['GGG', 'GGA']}

In [6]:
'''
Code Challenge: Solve the De Bruijn Graph from a String Problem.
Input: An integer k and a string Text.
Output: DeBruijn_k(Text), in the form of an adjacency list.
'''

def DeBruijn(string, k):
    """Return the de Bruijn graph of the k-mers of ``string``.

    The k-mer composition is produced by ``Composition`` and turned into an
    adjacency list by ``PathGraph``.
    """
    return PathGraph(Composition(string, k))

# Test
k = 4
string = 'AAGATTCTCTAAGA'

DeBruijn(string, k)

{'AAG': ['AGA', 'AGA'],
 'AGA': ['GAT'],
 'ATT': ['TTC'],
 'CTA': ['TAA'],
 'CTC': ['TCT'],
 'GAT': ['ATT'],
 'TAA': ['AAG'],
 'TCT': ['CTC', 'CTA'],
 'TTC': ['TCT']}

# Week 2

In [7]:
'''
Code Challenge: Solve the Eulerian Cycle Problem.
Input: The adjacency list of an Eulerian directed graph.
Output: An Eulerian cycle in this graph.
'''

def GraphString2Dict(graph):
    """Parse a textual adjacency list into a dictionary.

    Parameters
    ----------
    graph : str
        One ``source -> target[,target...]`` entry per line. Whitespace
        inside a line is ignored and blank lines are skipped.

    Returns
    -------
    dict
        Maps each source label to the list of its target labels. If a
        source appears on several lines, the last line wins.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly one ``->``.
    """
    graph_dict = {}
    for line in graph.split('\n'):
        # Drop every whitespace character (indentation, '\r', inner spaces).
        line = ''.join(line.split())
        if not line:
            continue
        from_, to_ = line.split('->')
        # str.split(',') already yields a one-element list without commas.
        graph_dict[from_] = to_.split(',')
    return graph_dict

def EulerianCycle(graph):
    """Find an Eulerian cycle of a directed graph.

    The previous implementation walked randomly and, whenever it got stuck,
    threw the walk away and rebuilt the graph copy from scratch. This
    version uses a stack-based Hierholzer traversal, which visits every
    edge exactly once and therefore always terminates.

    Parameters
    ----------
    graph : dict or str
        Adjacency list mapping a node to the list of its successors, or the
        textual form accepted by ``GraphString2Dict``. The input is not
        modified.

    Returns
    -------
    list
        Nodes of the cycle in visiting order; the first and last node are
        equal. The start node and the order in which edges are taken are
        randomised, as before. A graph without edges yields ``[]``.

    Raises
    ------
    ValueError
        If the graph has no Eulerian cycle (unbalanced or disconnected).
    """
    if type(graph) == str:
        graph = GraphString2Dict(graph)

    # Private, shuffled copy of the adjacency lists: edges are consumed by
    # popping, and shuffling keeps the choice of cycle random.
    remaining = {}
    edge_count = 0
    for node, targets in graph.items():
        targets = list(targets)
        random.shuffle(targets)
        remaining[node] = targets
        edge_count += len(targets)

    if edge_count == 0:
        return []

    start = random.choice([node for node, targets in remaining.items() if targets])

    # Hierholzer: extend the walk while the top of the stack has unused
    # edges; when it has none, that node is final and moves to the output.
    stack = [start]
    cycle = []
    while stack:
        targets = remaining.get(stack[-1])
        if targets:
            stack.append(targets.pop())
        else:
            cycle.append(stack.pop())
    cycle.reverse()

    if len(cycle) != edge_count + 1 or cycle[0] != cycle[-1]:
        raise ValueError('graph has no Eulerian cycle')
    return cycle

# Test
graph = '''0 -> 3
     1 -> 0
     2 -> 1,6
     3 -> 2
     4 -> 2
     5 -> 4
     6 -> 5,8
     7 -> 9
     8 -> 7
     9 -> 6'''

EulerianCycle(graph)

['6', '8', '7', '9', '6', '5', '4', '2', '1', '0', '3', '2', '6']

In [8]:
'''
Code Challenge: Solve the Eulerian Path Problem.
Input: The adjacency list of a directed graph that has an Eulerian path.
Output: An Eulerian path in this graph.
'''

def CheckBalance(graph):
    """Return out-degree minus in-degree for every node of ``graph``.

    ``graph`` is an adjacency dict or the textual form understood by
    ``GraphString2Dict``. Nodes that only ever appear as targets are
    included, with a negative value.
    """
    if type(graph) is str:
        graph = GraphString2Dict(graph)

    balance = {}
    for source, targets in graph.items():
        # +1 per outgoing edge of the source ...
        balance[source] = balance.get(source, 0) + len(targets)
        # ... and -1 per incoming edge of each target.
        for target in targets:
            balance[target] = balance.get(target, 0) - 1
    return balance

def EulerianPath(graph):
    """Find an Eulerian path of a directed graph.

    The walk starts at the first node whose out-degree exceeds its
    in-degree. If every node is balanced (the graph has an Eulerian cycle
    instead), it starts at the first node that has outgoing edges; the old
    code raised ``UnboundLocalError`` in that case. Edges are traversed
    with a stack-based Hierholzer walk, so every edge is visited exactly
    once and the function always terminates.

    Parameters
    ----------
    graph : dict or str
        Adjacency list mapping a node to the list of its successors, or the
        textual form accepted by ``GraphString2Dict``. The input is not
        modified.

    Returns
    -------
    list
        Nodes of the path in visiting order. The order in which parallel
        choices are taken is randomised. A graph without edges yields
        ``[]``.

    Raises
    ------
    ValueError
        If no walk from the chosen start uses every edge exactly once.
    """
    if type(graph) == str:
        graph = GraphString2Dict(graph)

    remaining = {}
    surplus = {}        # out-degree minus in-degree per node
    edge_count = 0
    for node, targets in graph.items():
        targets = list(targets)
        surplus[node] = surplus.get(node, 0) + len(targets)
        for target in targets:
            surplus[target] = surplus.get(target, 0) - 1
        edge_count += len(targets)
        # Shuffle the private copy so the chosen path stays random.
        random.shuffle(targets)
        remaining[node] = targets

    if edge_count == 0:
        return []

    start = next((node for node, value in surplus.items() if value > 0), None)
    if start is None:
        # Balanced graph: any node with an outgoing edge is a valid start.
        start = next(node for node, targets in remaining.items() if targets)

    # Hierholzer: extend while the top of the stack has unused edges; a
    # node without unused edges is final and moves to the output.
    stack = [start]
    path = []
    while stack:
        targets = remaining.get(stack[-1])
        if targets:
            stack.append(targets.pop())
        else:
            path.append(stack.pop())
    path.reverse()

    if len(path) != edge_count + 1:
        raise ValueError('graph has no Eulerian path')
    return path

# Test
graph='''0 -> 2
     1 -> 3
     2 -> 1
     3 -> 0,4
     6 -> 3,7
     7 -> 8
     8 -> 9
     9 -> 6'''

EulerianPath(graph)

['6', '7', '8', '9', '6', '3', '0', '2', '1', '3', '4']

In [9]:
'''
Code Challenge: Solve the String Reconstruction Problem.
Input: An integer k followed by a list of k-mers Patterns.
Output: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.)
'''

def Reconstruction(k, strings):
    """Reconstruct a string from its k-mers.

    ``strings`` is a list of k-mers or one newline-separated string. The
    k-mers become a de Bruijn graph (``PathGraph``), an Eulerian path
    through it is found (``EulerianPath``) and spelled out (``NaiveLink``).
    ``k`` is accepted for interface compatibility but is not used; the
    k-mer length is implied by the patterns themselves.
    """
    kmers = strings.split('\n') if type(strings) is str else strings
    return NaiveLink(EulerianPath(PathGraph(kmers)))

# Test
k = 4
strings = '''CTTA
ACCA
TACC
GGCT
GCTT
TTAC'''

Reconstruction(k, strings)

'GGCTTACCA'

In [10]:
'''
Code Challenge: Solve the k-Universal Circular String Problem.
Input: An integer k.
Output: A k-universal circular string.
'''

def BinaryStrings(k):
    """List all binary strings of length ``k``.

    Starting from ``['0', '1']``, each of the ``k - 1`` rounds replaces the
    list by every word with ``'0'`` appended followed by every word with
    ``'1'`` appended. For ``k <= 1`` no round runs and ``['0', '1']`` is
    returned.
    """
    words = ['0', '1']
    for _ in range(k - 1):
        words = [word + '0' for word in words] + [word + '1' for word in words]
    return words

def KUniversal(k):
    """Return a k-universal circular binary string.

    Parameters
    ----------
    k : int
        Word length; must be at least 1.

    Returns
    -------
    str
        A circular string of length ``2 ** k`` that contains every binary
        k-mer exactly once.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    if k == 1:
        # The de Bruijn nodes would be empty strings, which NaiveLink
        # cannot spell; the answer is trivial anyway.
        return '01'

    strings        = BinaryStrings(k)
    graph          = PathGraph(strings)
    cycle          = EulerianCycle(graph)
    reconstruction = NaiveLink(cycle)
    # The spelled cycle repeats its first k - 1 symbols at the end; drop
    # them because the string is circular.
    return reconstruction[: -(k - 1)]

# Test
k = 4

KUniversal(k)

'1111011000010100'

In [11]:
'''
Code Challenge: Solve the String Reconstruction from Read-Pairs Problem.
Input: Integers k and d followed by a collection of paired k-mers PairedReads.
Output: A string Text with (k, d)-mer composition equal to PairedReads.
'''

def PairedPathGraph(gapped_patterns, k):
    """Build the paired de Bruijn adjacency list of ``read1|read2`` pairs.

    ``gapped_patterns`` is a list of strings or one newline-separated
    string. Each entry is sliced by position, with one separator character
    expected at index ``k``: the source node joins the first ``k - 1``
    characters of both reads, the target node joins their last ``k - 1``
    characters. The separator is kept in both node labels.
    """
    if type(gapped_patterns) is str:
        gapped_patterns = gapped_patterns.split('\n')

    graph = {}
    for read_pair in gapped_patterns:
        source = read_pair[:k - 1] + read_pair[k:2 * k]
        target = read_pair[1:k + 1] + read_pair[k + 2:2 * k + 1]
        graph.setdefault(source, []).append(target)
    return graph

def PairedReconstruction(gapped_patterns, k, d):
    """Reconstruct a string from (k, d)-mer read pairs.

    An Eulerian path through the paired de Bruijn graph is split into its
    first-read and second-read node labels, and each sequence is spelled
    with ``NaiveLink``. The second string must agree with the first one
    shifted by ``k + d`` positions. If it does, the two are merged;
    otherwise a message string is returned instead of a reconstruction.
    Only the single path returned by ``EulerianPath`` is tried.
    """
    path = EulerianPath(PairedPathGraph(gapped_patterns, k))

    first_parts = []
    second_parts = []
    for node in path:
        left, right = node.split('|')
        first_parts.append(left)
        second_parts.append(right)

    prefix_string = NaiveLink(first_parts)
    suffix_string = NaiveLink(second_parts)

    shift = k + d
    overlap_length = len(prefix_string) - shift
    # Inside the overlap both spelled strings describe the same text.
    inconsistent = any(
        suffix_string[index] != prefix_string[index + shift]
        for index in range(overlap_length)
    )
    if inconsistent:
        return 'there is no string spelled by the gapped patterns'
    return prefix_string + suffix_string[overlap_length:]

# Test
k = 4
d = 2
gapped_patterns = '''GAGA|TTGA
TCGT|GATG
CGTG|ATGT
TGGT|TGAG
GTGA|TGTT
GTGG|GTGA
TGAG|GTTG
GGTC|GAGA
GTCG|AGAT'''

PairedReconstruction(gapped_patterns, k, d)

'GTGGTCGTGAGATGTTGA'

In [12]:
'''
Contig Generation Problem: Generate the contigs from a collection of reads (with imperfect coverage).
Input: A collection of k-mers Patterns. 
Output: All contigs in DeBruijn(Patterns).
'''

def CountInOut(graph):
    """Count incoming and outgoing edges per node.

    ``graph`` is an adjacency dict or the textual form understood by
    ``GraphString2Dict``. Returns ``(in_dict, out_dict)``: ``in_dict`` only
    has entries for nodes that are the target of at least one edge, and
    ``out_dict`` only for nodes that are keys of ``graph``.
    """
    if type(graph) is str:
        graph = GraphString2Dict(graph)

    in_dict, out_dict = {}, {}
    for source, targets in graph.items():
        out_dict[source] = len(targets)
        for target in targets:
            in_dict[target] = in_dict.get(target, 0) + 1
    return (in_dict, out_dict)

def MaximalNonBranchingPaths(strings):
    """Return the contigs (maximal non-branching paths) of a k-mer set.

    Parameters
    ----------
    strings : list of str, or str
        The k-mers, in any form accepted by ``PathGraph``.

    Returns
    -------
    list of str
        One spelled string per maximal non-branching path of the de Bruijn
        graph. Paths starting at branching nodes come first, in order of
        first appearance of the start node; isolated cycles follow. The old
        code skipped isolated cycles and iterated a ``set``, so its output
        order varied between runs.
    """
    graph = PathGraph(strings)
    in_count, out_count = CountInOut(graph)

    def is_one_in_one_out(node):
        return in_count.get(node, 0) == 1 and out_count.get(node, 0) == 1

    # Every node once, in first-seen order, so the result is reproducible.
    nodes = list(dict.fromkeys(list(out_count) + list(in_count)))

    paths = []
    visited = set()     # 1-in-1-out nodes already placed on some path

    # Paths that start at a branching (or source) node.
    for node in nodes:
        if is_one_in_one_out(node):
            continue
        for out_node in graph.get(node, []):
            non_branching_path = [node, out_node]
            while is_one_in_one_out(out_node):
                visited.add(out_node)
                out_node = graph[out_node][0]
                non_branching_path.append(out_node)
            paths.append(NaiveLink(non_branching_path))

    # A 1-in-1-out node that no path above reached can only lie on an
    # isolated cycle made of 1-in-1-out nodes.
    for node in nodes:
        if not is_one_in_one_out(node) or node in visited:
            continue
        visited.add(node)
        cycle = [node]
        current = graph[node][0]
        while current != node:
            visited.add(current)
            cycle.append(current)
            current = graph[current][0]
        cycle.append(node)
        paths.append(NaiveLink(cycle))

    return paths

# Test
strings = '''ATG
ATG
TGT
TGG
CAT
GGA
GAT
AGA'''

MaximalNonBranchingPaths(strings)

['TGT', 'TGGA', 'GAT', 'ATG', 'ATG', 'CAT', 'AGA']

# Week 3

In [13]:
'''
Protein Translation Problem: Translate an RNA string into an amino acid string.
Input: An RNA string Pattern and the array GeneticCode.
Output: The translation of Pattern into an amino acid string Peptide.
'''


# Load the RNA codon table into `translation_rule` (codon -> amino acid).
# Each non-blank line is expected to hold two fields separated by a single
# space. The file is read inside a context manager so the handle is closed
# right away, and blank lines (such as a trailing newline) are skipped
# instead of crashing the unpacking below.
with open('data/RNA_codon_table.txt') as codon_file:
    table = codon_file.read()
table = table.split('\n')
translation_rule = {}
for line in table:
    if not line.strip():
        continue
    RNA, peptide = line.split(' ')
    translation_rule[RNA] = peptide

def RNA2peptide(RNA_string):
    """Translate an RNA string into a peptide, stopping at a stop codon.

    Codons are read in frame from position 0 and looked up in the
    module-level ``translation_rule``. Translation ends at the first codon
    that maps to ``'*'`` (not included in the result) or when fewer than
    three bases remain. The old code always removed the last character, so
    a transcript without a stop codon lost its final amino acid.

    Raises
    ------
    KeyError
        If a codon is missing from ``translation_rule``.
    """
    residues = []
    for position in range(0, len(RNA_string) - 2, 3):
        AA = translation_rule[RNA_string[position: position + 3]]
        if AA == '*':
            break
        residues.append(AA)
    return ''.join(residues)

# Test
RNA_string = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'

RNA2peptide(RNA_string)

'MAMAPRTEINSTRING'

In [14]:
'''
Peptide Encoding Problem: Find substrings of a genome encoding a given amino acid sequence.
Input: A DNA string Text, an amino acid string Peptide, and the array GeneticCode.
Output: All substrings of Text encoding Peptide (if any such substrings exist).
'''

def RNA2peptideNoStop(RNA_string):
    """Translate every complete codon of ``RNA_string``, stops included.

    Codons are read in frame from position 0 and looked up in the
    module-level ``translation_rule``; whatever a stop codon maps to is
    kept in the output. Trailing bases that do not fill a codon are
    ignored.
    """
    codon_starts = range(0, len(RNA_string) - 2, 3)
    return ''.join(
        translation_rule[RNA_string[begin:begin + 3]] for begin in codon_starts
    )

def ReverseComplement(DNA_string):
    """Return the reverse complement of a DNA string.

    Only the upper-case bases ``A``, ``T``, ``C`` and ``G`` are known; any
    other character raises ``KeyError``.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(pairing[base] for base in reversed(DNA_string))

def DNA2RNA(DNA_string):
    """Return the reverse-complement RNA of a DNA string.

    Each base is replaced by its RNA complement (``A`` becomes ``U``) and
    the result is reversed, so the output is the RNA counterpart of the
    opposite strand rather than a plain ``T`` to ``U`` substitution.
    Characters other than ``A``, ``T``, ``C``, ``G`` raise ``KeyError``.
    """
    pairing = {'A': 'U', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(pairing[base] for base in reversed(DNA_string))

def RNA2DNA(RNA_string):
    """Return the reverse-complement DNA of an RNA string.

    Each base is replaced by its DNA complement (``U`` becomes ``A``) and
    the result is reversed, which undoes ``DNA2RNA``. Characters other than
    ``A``, ``U``, ``C``, ``G`` raise ``KeyError``.
    """
    pairing = {'A': 'T', 'U': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(pairing[base] for base in reversed(RNA_string))

def PatternMatching(pattern, genome):
    """Return every 0-based start index where ``pattern`` occurs in ``genome``.

    Overlapping occurrences are all reported, in increasing order.
    """
    width = len(pattern)
    return [
        start
        for start in range(len(genome) - width + 1)
        if genome[start:start + width] == pattern
    ]

def PeptideEncoding(DNA_string, peptide_pattern):
    """Find the substrings of ``DNA_string`` that encode ``peptide_pattern``.

    Both strands are translated in all three reading frames. Hits are
    returned as substrings of the given strand, regardless of which strand
    encodes the peptide. Matches from the first pass (transcript of the
    input itself) come first, then those from the second pass (transcript
    of the reverse complement); within a pass they are ordered by frame and
    then by position.
    """
    matched_DNAs = []
    span = len(peptide_pattern) * 3

    # Each pass pairs a template strand with the function that maps a
    # matched piece of its transcript back onto the input strand.
    passes = (
        (DNA_string, RNA2DNA),
        (ReverseComplement(DNA_string),
         lambda fragment: ReverseComplement(RNA2DNA(fragment))),
    )
    for template, to_input_strand in passes:
        transcript = DNA2RNA(template)
        for frame in range(3):
            RNA_string = transcript[frame:]
            peptide_string = RNA2peptideNoStop(RNA_string)
            for hit in PatternMatching(peptide_pattern, peptide_string):
                begin = hit * 3
                matched_DNAs.append(to_input_strand(RNA_string[begin:begin + span]))

    return matched_DNAs

# Test
DNA_string = 'ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA'
peptide_pattern = 'MA'

PeptideEncoding(DNA_string, peptide_pattern)

['GGCCAT', 'ATGGCC', 'ATGGCC']

In [15]:
'''
Code Challenge: Implement LinearSpectrum.
Input: An amino acid string Peptide.
Output: The linear spectrum of Peptide.
'''

# Load the integer mass table into `AA_weitght` (amino acid -> integer mass).
# Each non-blank line is expected to hold two fields separated by a single
# space. The file is read inside a context manager so the handle is closed
# right away, and blank lines (such as a trailing newline) are skipped
# instead of crashing the unpacking below.
with open('data/integer_mass_table.txt') as mass_file:
    table = mass_file.read()
table = table.split('\n')
AA_weitght = {}
for line in table:
    if not line.strip():
        continue
    AA, weight = line.split(' ')
    AA_weitght[AA] = int(weight)

def LinearSpectrum(peptide_string):
    """Return the sorted linear spectrum of a peptide.

    The spectrum holds 0 plus the mass of every contiguous sub-peptide.
    Masses come from the module-level ``AA_weitght`` table and are derived
    from differences of cumulative prefix masses.
    """
    cumulative = [0]
    running = 0
    for residue in peptide_string:
        running += AA_weitght[residue]
        cumulative.append(running)

    size = len(peptide_string)
    masses = [
        cumulative[stop] - cumulative[start]
        for start in range(size)
        for stop in range(start + 1, size + 1)
    ]
    return sorted([0] + masses)

# Test
peptide_string = 'NQEL'

LinearSpectrum(peptide_string)

[0, 113, 114, 128, 129, 242, 242, 257, 370, 371, 484]

In [16]:
'''
Generating Theoretical Spectrum Problem: Generate the theoretical spectrum of a cyclic peptide.
Input: An amino acid string Peptide.
Output: Cyclospectrum(Peptide).
'''

# Load the integer mass table into `AA_weitght` (amino acid -> integer mass).
# Each non-blank line is expected to hold two fields separated by a single
# space. The file is read inside a context manager so the handle is closed
# right away, and blank lines (such as a trailing newline) are skipped
# instead of crashing the unpacking below.
with open('data/integer_mass_table.txt') as mass_file:
    table = mass_file.read()
table = table.split('\n')
AA_weitght = {}
for line in table:
    if not line.strip():
        continue
    AA, weight = line.split(' ')
    AA_weitght[AA] = int(weight)

def CyclicSpectrum(peptide_string):
    """Return the sorted theoretical spectrum of a cyclic peptide.

    Besides 0 and the mass of every contiguous sub-peptide, each
    sub-peptide that touches neither end of the string also contributes
    the mass of its complement, which is the fragment that wraps around
    the ends. Masses come from the module-level ``AA_weitght`` table.
    """
    cumulative = [0]
    running = 0
    for residue in peptide_string:
        running += AA_weitght[residue]
        cumulative.append(running)

    whole = cumulative[-1]
    size = len(peptide_string)
    masses = [0]
    for start in range(size):
        for stop in range(start + 1, size + 1):
            fragment = cumulative[stop] - cumulative[start]
            masses.append(fragment)
            is_interior = start > 0 and stop < size
            if is_interior:
                masses.append(whole - fragment)
    return sorted(masses)

# Test
peptide_string = 'LEQN'

CyclicSpectrum(peptide_string)

[0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]

In [17]:
'''
Code Challenge: Implement CyclopeptideSequencing
'''

def CyclopeptideSequencing(spectrum):
    """Find all cyclic peptides whose theoretical spectrum equals ``spectrum``.

    Branch and bound: candidates grow by one amino acid per round. A
    candidate whose mass reaches the parent mass is checked with
    ``CyclicSpectrum`` and then dropped; any other candidate survives only
    if every mass of its linear spectrum occurs in ``spectrum``.

    Parameters
    ----------
    spectrum : str or iterable
        Masses as a whitespace-separated string or as an iterable of values
        convertible to ``int``. The masses may be in any order; they are
        sorted here because both the parent mass and the final comparison
        rely on sorted order.

    Returns
    -------
    list of str
        Distinct mass strings such as ``'113-128-186'``, in order of first
        discovery. Empty when nothing matches or ``spectrum`` is empty.
    """
    if type(spectrum) == str:
        spectrum = spectrum.split()
    spectrum = sorted(map(int, spectrum))
    if not spectrum:
        return []

    parent_mass = spectrum[-1]
    observed = set(spectrum)        # constant-time membership for pruning
    AAs = ['G','A','S','P','V','T','C','I','L','N','D','K','Q','E','M','H','F','R','Y','W']

    output_peptides = []

    peptides = ['']
    while len(peptides) != 0:
        survivors = []
        for peptide in peptides:
            for AA in AAs:
                extend_peptide = peptide + AA
                peptide_spectrum = LinearSpectrum(extend_peptide)
                if peptide_spectrum[-1] == parent_mass:
                    # Full-mass candidate: report on an exact match and
                    # never extend it further.
                    if CyclicSpectrum(extend_peptide) == spectrum:
                        output_peptides.append(extend_peptide)
                elif all(weight in observed for weight in peptide_spectrum):
                    survivors.append(extend_peptide)
        peptides = survivors

    # Kept from the original: the amino-acid strings are printed before
    # being converted to mass strings.
    print(output_peptides)

    output_weight_string = []
    for output_peptide in output_peptides:
        weight_string = [str(AA_weitght[AA]) for AA in output_peptide]
        output_weight_string.append('-'.join(weight_string))

    # Order-preserving de-duplication (I/L and K/Q share a mass), so the
    # result is reproducible, unlike list(set(...)).
    output_weight_string = list(dict.fromkeys(output_weight_string))

    return output_weight_string

# Test
spectrum = '0 113 128 186 241 299 314 427'

CyclopeptideSequencing(spectrum)

['IKW', 'IQW', 'IWK', 'IWQ', 'LKW', 'LQW', 'LWK', 'LWQ', 'KIW', 'KLW', 'KWI', 'KWL', 'QIW', 'QLW', 'QWI', 'QWL', 'WIK', 'WIQ', 'WLK', 'WLQ', 'WKI', 'WKL', 'WQI', 'WQL']


['186-113-128',
 '186-128-113',
 '113-128-186',
 '113-186-128',
 '128-186-113',
 '128-113-186']

# Week 4

In [18]:
'''
Cyclopeptide Scoring Problem: Compute the score of a cyclic peptide against a spectrum.
Input: An amino acid string Peptide and a collection of integers Spectrum. 
Output: The score of Peptide against Spectrum, Score(Peptide, Spectrum).
'''

def Cyclopeptide_Scoring(peptide_string, spectrum):
    """Score a cyclic peptide against an experimental spectrum.

    The score is the number of masses shared between the peptide's
    theoretical cyclic spectrum and ``spectrum``, counted with
    multiplicity. ``spectrum`` is a space-separated string or an iterable
    of values convertible to ``int``; it is not modified.
    """
    if type(spectrum) is str:
        spectrum = spectrum.split(' ')
    observed = Counter(map(int, spectrum))
    expected = Counter(CyclicSpectrum(peptide_string))
    # Multiset intersection keeps the smaller count of every mass.
    return sum((expected & observed).values())

# Test
peptide_string = 'NQEL'
spectrum = '0 99 113 114 128 227 257 299 355 356 370 371 484'

Cyclopeptide_Scoring(peptide_string, spectrum)

11

In [19]:
'''
Code Challenge: Implement Trim (reproduced below).
Input: A collection of peptides Leaderboard, a collection of integers Spectrum, and an integer N.
Output: The N highest-scoring linear peptides on Leaderboard with respect to Spectrum.
'''

def Linearpeptide_Scoring(peptide_string, spectrum):
    """Score a linear peptide against an experimental spectrum.

    The score is the number of masses shared between the peptide's
    theoretical linear spectrum and ``spectrum``, counted with
    multiplicity. ``spectrum`` is a space-separated string or an iterable
    of values convertible to ``int``; it is not modified.
    """
    if type(spectrum) is str:
        spectrum = spectrum.split(' ')
    observed = Counter(map(int, spectrum))
    expected = Counter(LinearSpectrum(peptide_string))
    # Multiset intersection keeps the smaller count of every mass.
    return sum((expected & observed).values())

def Trim(leaderboard, spectrum, N):
    """Keep the ``N`` best-scoring peptides of ``leaderboard``, plus ties.

    Parameters
    ----------
    leaderboard : list of str, or str
        Candidate peptides; a string is split on single spaces.
    spectrum : str or iterable
        Experimental spectrum as a space-separated string or an iterable of
        values convertible to ``int``.
    N : int
        Number of places to keep. Peptides tied with the N-th place are
        kept as well. ``N <= 0`` yields an empty list; the old code indexed
        ``score_sorted[N - 1]`` from the end and returned everything.

    Returns
    -------
    list of str
        Surviving peptides sorted by descending ``(score, peptide)``, where
        the score comes from ``Linearpeptide_Scoring``.
    """
    if type(spectrum) == str:
        spectrum = spectrum.split(' ')
    spectrum = list(map(int, spectrum))

    if type(leaderboard) == str:
        leaderboard = leaderboard.split(' ')

    if N <= 0:
        return []

    ranked = sorted(
        ((Linearpeptide_Scoring(peptide, spectrum), peptide) for peptide in leaderboard),
        reverse=True,
    )
    if len(ranked) <= N:
        return [peptide for _, peptide in ranked]

    # Everything that scores at least as well as the N-th entry survives.
    cutoff = ranked[N - 1][0]
    return [peptide for score, peptide in ranked if score >= cutoff]

# Test
leaderboard = 'LAST ALST TLLT TQAS'
spectrum = '0 71 87 101 113 158 184 188 259 271 372'
N = 2

Trim(leaderboard, spectrum, N)

['LAST', 'ALST']

In [20]:
'''
Code Challenge: Implement LeaderboardCyclopeptideSequencing.
Input: An integer N and a collection of integers Spectrum.
Output: LeaderPeptide after running LeaderboardCyclopeptideSequencing(Spectrum, N).
'''

def LeaderboardCyclopeptideSequencing(spectrum, N):
    """Leaderboard search for the peptide that best explains ``spectrum``.

    Each round extends every peptide on the board by one amino acid, drops
    candidates heavier than the parent mass, lets candidates of exactly the
    parent mass compete for the lead, and trims the board with ``Trim``.
    Scoring uses ``Linearpeptide_Scoring`` throughout.

    Parameters
    ----------
    spectrum : str or iterable
        Masses as a space-separated string or an iterable of values
        convertible to ``int``. The parent mass is the largest value, so
        the masses need not be sorted.
    N : int
        Leaderboard size passed to ``Trim``.

    Returns
    -------
    str
        The leader peptide as dash-separated masses, e.g.
        ``'113-147-71-129'``; an empty string if no candidate reaches the
        parent mass or ``spectrum`` is empty.
    """
    if type(spectrum) == str:
        spectrum = spectrum.split(' ')
    spectrum = list(map(int, spectrum))
    if not spectrum:
        return ''

    parent_mass = max(spectrum)
    AAs = ['G','A','S','P','V','T','C','I','L','N','D','K','Q','E','M','H','F','R','Y','W']

    leaderboard   = ['']
    leaderpeptide = ''
    # Cached so the leader is not re-scored on every comparison.
    leader_score  = Linearpeptide_Scoring(leaderpeptide, spectrum)

    while len(leaderboard) != 0:
        survivors = []
        for peptide in leaderboard:
            for AA in AAs:
                extend_peptide = peptide + AA
                # The last entry of the sorted linear spectrum is the mass
                # of the whole peptide.
                mass = LinearSpectrum(extend_peptide)[-1]
                if mass > parent_mass:
                    continue
                if mass == parent_mass:
                    score = Linearpeptide_Scoring(extend_peptide, spectrum)
                    if score > leader_score:
                        leaderpeptide, leader_score = extend_peptide, score
                survivors.append(extend_peptide)

        leaderboard = Trim(survivors, spectrum, N)

    return '-'.join(str(AA_weitght[AA]) for AA in leaderpeptide)

# Test
N = 10
spectrum = '0 71 113 129 147 200 218 260 313 331 347 389 460'

LeaderboardCyclopeptideSequencing(spectrum, N)

'113-147-71-129'

In [21]:
'''
Spectral Convolution Problem: Compute the convolution of a spectrum.
Input: A collection of integers Spectrum.
Output: The list of elements in the convolution of Spectrum. If an element has multiplicity k, it should appear exactly k times.
'''

def SpectralConvolution(spectrum, min_mass=57, max_mass=199):
    """Return the pairwise differences of ``spectrum`` inside a mass window.

    Parameters
    ----------
    spectrum : str or iterable
        Masses as a space-separated string or an iterable of values
        convertible to ``int``.
    min_mass, max_mass : int or None, optional
        Inclusive bounds on the differences that are kept; ``None``
        disables a bound. The defaults reproduce the previous hard-coded
        filter ``56 < difference < 200``. Pass ``min_mass=1,
        max_mass=None`` for the full convolution (all positive
        differences).

    Returns
    -------
    list of int
        Every ``a - b`` for ``a``, ``b`` in ``spectrum`` that lies in the
        window, repeated according to its multiplicity, in iteration order
        (``a`` outer, ``b`` inner).
    """
    if type(spectrum) == str:
        spectrum = spectrum.split(' ')
    spectrum = list(map(int, spectrum))

    convolution_spectrum = []
    for i in spectrum:
        for j in spectrum:
            difference = i - j
            if min_mass is not None and difference < min_mass:
                continue
            if max_mass is not None and difference > max_mass:
                continue
            convolution_spectrum.append(difference)

    return convolution_spectrum

spectrum = '0 137 186 323'

SpectralConvolution(spectrum)

[137, 186, 186, 137]

In [22]:
'''
Code Challenge: Implement ConvolutionCyclopeptideSequencing.
Input: An integer M, an integer N, and a collection of (possibly repeated) integers Spectrum.
Output: A cyclic peptide LeaderPeptide with amino acids taken only from the top M elements (and ties) of the convolution of Spectrum that fall between 57 and 200, and where the size of Leaderboard is restricted to the top N (and ties).
'''

def LinearSpectrum_Scoring(weights, spectrum):
    """Score a mass sequence against an experimental spectrum.

    Counts, with multiplicity, how many masses of the linear spectrum of
    ``weights`` (from ``LinearWeightSpectrum``) also occur in ``spectrum``.
    The caller's ``spectrum`` is left untouched; matching is done on a
    copy.
    """
    expected_masses = LinearWeightSpectrum(weights)
    unmatched = copy.deepcopy(spectrum)

    hits = 0
    for mass in expected_masses:
        try:
            # Consume one occurrence so repeated masses match at most as
            # often as they appear in the spectrum.
            unmatched.remove(mass)
        except ValueError:
            continue
        hits += 1
    return hits

def LinearWeightSpectrum(weights):
    """Return the sorted linear spectrum of a sequence of residue masses.

    ``weights`` is a dash-separated string such as ``'114-128'`` or an
    iterable of values convertible to ``int``. The spectrum holds 0 plus
    the total mass of every contiguous run of residues.
    """
    if type(weights) is str:
        weights = weights.split('-')
    masses = [int(value) for value in weights]

    cumulative = [0]
    running = 0
    for mass in masses:
        running += mass
        cumulative.append(running)

    size = len(masses)
    fragments = [
        cumulative[stop] - cumulative[start]
        for start in range(size)
        for stop in range(start + 1, size + 1)
    ]
    return sorted([0] + fragments)

def Trim_spectrum(leaderboard, spectrum, N):
    """Keep the ``N`` best-scoring mass sequences of ``leaderboard``, plus ties.

    Parameters
    ----------
    leaderboard : list, or str
        Candidate mass sequences in any form accepted by
        ``LinearSpectrum_Scoring``; a string is split on single spaces.
    spectrum : str or iterable
        Experimental spectrum as a space-separated string or an iterable of
        values convertible to ``int``.
    N : int
        Number of places to keep. Candidates tied with the N-th place are
        kept as well. ``N <= 0`` yields an empty list; the old code indexed
        ``score_sorted[N - 1]`` from the end and returned everything.

    Returns
    -------
    list
        Surviving candidates sorted by descending ``(score, candidate)``.
    """
    if type(spectrum) == str:
        spectrum = spectrum.split(' ')
    spectrum = list(map(int, spectrum))

    if type(leaderboard) == str:
        leaderboard = leaderboard.split(' ')

    if N <= 0:
        return []

    ranked = sorted(
        ((LinearSpectrum_Scoring(candidate, spectrum), candidate) for candidate in leaderboard),
        reverse=True,
    )
    if len(ranked) <= N:
        return [candidate for _, candidate in ranked]

    # Everything that scores at least as well as the N-th entry survives.
    cutoff = ranked[N - 1][0]
    return [candidate for score, candidate in ranked if score >= cutoff]

def ConvolutionCyclopeptideSequencing(M, N, spectrum):
    """Leaderboard sequencing over masses taken from the spectral convolution.

    The candidate alphabet is the ``M`` most frequent values (plus ties)
    returned by ``SpectralConvolution``. A leaderboard search then extends
    mass sequences one residue at a time, discards sequences heavier than
    the parent mass, lets sequences of exactly the parent mass compete for
    the lead, and trims the board with ``Trim_spectrum``. Scoring uses
    ``LinearSpectrum_Scoring``.

    Parameters
    ----------
    M : int
        Number of convolution masses to keep (ties included). ``M <= 0``
        keeps none; the old code then kept every mass because it indexed
        the sorted counts from the end.
    N : int
        Leaderboard size passed to ``Trim_spectrum``.
    spectrum : str or iterable
        Masses as a space-separated string or an iterable of values
        convertible to ``int``, in any order.

    Returns
    -------
    list of int
        The best mass sequence found, or ``[0]`` if no sequence reaches the
        parent mass.
    """
    if type(spectrum) == str:
        spectrum = spectrum.split(' ')
    spectrum = sorted(map(int, spectrum))

    # Pick the top M masses (and ties) by multiplicity in the convolution.
    convolution_count = Counter(SpectralConvolution(spectrum))
    counts_weights = sorted(
        ((count, weight) for weight, count in convolution_count.items()),
        reverse=True,
    )
    if M <= 0:
        weight_candidates = []
    elif len(counts_weights) > M:
        cutoff = counts_weights[M - 1][0]
        weight_candidates = [weight for count, weight in counts_weights if count >= cutoff]
    else:
        weight_candidates = [weight for _, weight in counts_weights]

    # Initial leaderboard: one single-residue sequence per candidate mass.
    leaderboard   = [[weight_candidate] for weight_candidate in weight_candidates]
    leader_weight = [0]
    # Cached so the leader is not re-scored on every comparison.
    leader_score  = LinearSpectrum_Scoring(leader_weight, spectrum)
    parent_mass   = spectrum[-1] if spectrum else 0

    while len(leaderboard) != 0:
        survivors = []
        for weight_string in leaderboard:
            for weight in weight_candidates:
                extend_weight = weight_string + [weight]
                # Candidate masses are positive, so the total mass equals
                # the largest entry of the sequence's linear spectrum.
                total_mass = sum(extend_weight)
                if total_mass > parent_mass:
                    continue
                if total_mass == parent_mass:
                    score = LinearSpectrum_Scoring(extend_weight, spectrum)
                    if score > leader_score:
                        leader_weight, leader_score = extend_weight, score
                survivors.append(extend_weight)

        # Select the top N sequences (and ties) for the next round.
        leaderboard = Trim_spectrum(survivors, spectrum, N)

    return leader_weight

M = 20
N = 60
spectrum = '57 57 71 99 129 137 170 186 194 208 228 265 285 299 307 323 356 364 394 422 493'

ConvolutionCyclopeptideSequencing(M, N, spectrum)

[71, 99, 129, 57, 79, 58]