In [1]:
import random
import copy
import numpy as np

# Week 1

In [2]:
'''
Code Challenge: Solve the String Composition Problem.
Input: An integer k and a string Text.
Output: Compositionk(Text) (the k-mers can be provided in any order).
'''

def Composition(string, k):
    composition = []
    for start in range(len(string) - k +1):
        end       = start + k
        substring = string[start: end]
        composition.append(substring)
    return (composition)

k = 5
string = 'CAATCCAAC'

Composition(string, k)

['CAATC', 'AATCC', 'ATCCA', 'TCCAA', 'CCAAC']

In [3]:
'''
String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.
Input: A sequence of k-mers Pattern1, … ,Patternn such that the last k - 1 symbols of Patterni are equal to the first k-1 symbolsof Patterni+1 for 1 ≤ i ≤ n-1.
Output: A string Text of length k+n-1 such that the i-th k-mer in Text is equal to Patterni  (for 1 ≤ i ≤ n).
'''

def NaiveLink(patterns):
    if type(patterns) == str:
        patterns = patterns.split('\n')
    string = patterns[0]
    for pattern in patterns[1:]:
        string = string + pattern[-1]
    return (string)

patterns = '''ACCGA
CCGAA
CGAAG
GAAGC
AAGCT'''

NaiveLink(patterns)

'ACCGAAGCT'

In [4]:
'''
Code Challenge: Solve the Overlap Graph Problem (restated below).
Input: A collection Patterns of k-mers.
Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. 
'''

def FindOverlap(patterns):
    if type(patterns) == str:
        patterns = patterns.split('\n')
    
    adjacency_list = {}
    prefixs = {}
    suffixs = {}
    for pattern in patterns:
        suffixs[pattern] = pattern[1:]
        if pattern[:-1] in prefixs:
            prefixs[pattern[:-1]].append(pattern)
        else:
            prefixs[pattern[:-1]] = [pattern]
    
    for pattern,suffix in suffixs.items():
        if suffix in prefixs:
            adjacency_list[pattern] = prefixs[suffix]
            
    return(adjacency_list)

patterns = '''ATGCG
GCATG
CATGC
AGGCA
GGCAT
GGCAC'''

FindOverlap(patterns)

{'AGGCA': ['GGCAT', 'GGCAC'],
 'CATGC': ['ATGCG'],
 'GCATG': ['CATGC'],
 'GGCAT': ['GCATG']}

In [5]:
'''
DeBruijn Graph from k-mers Problem: Construct the de Bruijn graph from a set of k-mers.
Input: A collection of k-mers Patterns.
Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).
'''

def PathGraph(patterns):
    if type(patterns) == str:
        patterns = patterns.split('\n')
    
    graph = {}
    for pattern in patterns:
        if pattern[:-1] in graph:
            graph[pattern[:-1]].append(pattern[1:])
        else:
            graph[pattern[:-1]] = [pattern[1:]]
    return(graph)

patterns = '''GAGG
CAGG
GGGG
GGGA
CAGG
AGGG
GGAG'''

PathGraph(patterns)

{'AGG': ['GGG'],
 'CAG': ['AGG', 'AGG'],
 'GAG': ['AGG'],
 'GGA': ['GAG'],
 'GGG': ['GGG', 'GGA']}

In [6]:
'''
Code Challenge: Solve the De Bruijn Graph from a String Problem.
Input: An integer k and a string Text.
Output: DeBruijnk(Text), in the form of an adjacency list.
'''

def DeBruijn(string, k):
    composition  = Composition(string, k)
    output_graph = PathGraph(composition)
    
    return (output_graph)

k = 4
string = 'AAGATTCTCTAAGA'

DeBruijn(string, k)

{'AAG': ['AGA', 'AGA'],
 'AGA': ['GAT'],
 'ATT': ['TTC'],
 'CTA': ['TAA'],
 'CTC': ['TCT'],
 'GAT': ['ATT'],
 'TAA': ['AAG'],
 'TCT': ['CTC', 'CTA'],
 'TTC': ['TCT']}

# Week 2

In [7]:
'''
Code Challenge: Solve the Eulerian Cycle Problem.
Input: The adjacency list of an Eulerian directed graph.
Output: An Eulerian cycle in this graph.
'''

def GraphString2Dict(graph):
    graph = graph.split('\n')
    graph_dict = {}
    for path in graph:
        path       = path.replace(' ','')
        from_, to_ = path.split('->')
        if ',' in to_:
            to_              = to_.split(',')
            graph_dict[from_] = to_
        else:
            graph_dict[from_] = [to_]
    return(graph_dict)

def EulerianCycle(graph):
    if type(graph) == str:
        graph = GraphString2Dict(graph)
        
    tmp_graph = copy.deepcopy(graph)
    from_ = random.choice(list(tmp_graph.keys()))
    cycle = [from_]
    unexplored_nodes = {}
    
    while len(tmp_graph) != 0:
        if from_ in tmp_graph:
            to = random.choice(tmp_graph[from_])
            cycle.append(to)
            tmp_graph[from_].remove(to)
            
            if len(tmp_graph[from_]) == 0:
                tmp_graph.pop(from_)
                unexplored_nodes.pop(from_, None)
            else:
                unexplored_nodes[from_] = copy.deepcopy(tmp_graph[from_])
            from_ = to
        
        else:
            # find a random new start point from unexplored_nodes
            unexplored_node_index = np.where(np.array(cycle) == random.choice(list(unexplored_nodes)))[0][-1]
            unexplored_node       = cycle[unexplored_node_index]
            next_node             = random.choice(unexplored_nodes[unexplored_node])
            tmp_graph[unexplored_node].remove(next_node)
            
            # make the cycle go back to new start point
            recover_cycle = cycle[unexplored_node_index:]
            cycle         = cycle[:unexplored_node_index + 1]
            from_         = next_node
            cycle.append(from_)
            
            # reconstruct the tmp_graph and unexplored_nodes according to current cycle
            unexplored_nodes = {}
            tmp_graph        = copy.deepcopy(graph)
            for i in range(len(cycle) - 1):
                node    = cycle[i]
                to_node = cycle[i + 1]
                tmp_graph[node].remove(to_node)
                if len(tmp_graph[node]) == 0:
                    tmp_graph.pop(node)
                    unexplored_nodes.pop(node, None)
                else : 
                    unexplored_nodes[node] = copy.deepcopy(tmp_graph[node])
    return(cycle)

graph = '''0 -> 3
     1 -> 0
     2 -> 1,6
     3 -> 2
     4 -> 2
     5 -> 4
     6 -> 5,8
     7 -> 9
     8 -> 7
     9 -> 6'''

EulerianCycle(graph)

['4', '2', '1', '0', '3', '2', '6', '8', '7', '9', '6', '5', '4']

In [8]:
'''
Code Challenge: Solve the Eulerian Path Problem.
Input: The adjacency list of a directed graph that has an Eulerian path.
Output: An Eulerian path in this graph.
'''

def CheckBalance(graph):
    if type(graph) == str:
        graph = GraphString2Dict(graph)
    balance = {}
    
    for key, values in graph.items():
        balance[key] = 0
        for value in values:
            balance[value] = 0
    
    for from_, tos in graph.items():
        balance[from_] = balance[from_] + len(tos)
        for to in tos:
            balance[to] = balance[to] - 1
    return (balance)

def EulerianPath(graph):
    if type(graph) == str:
        graph = GraphString2Dict(graph)
    
    balance = CheckBalance(graph)
    # find start point by balance > 0
    for key, values in balance.items():
        if values > 0 :
            from_ = key
            break
            
    tmp_graph = copy.deepcopy(graph)
    cycle = [from_]
    unexplored_nodes = {}
    
    while len(tmp_graph) != 0:
        if from_ in tmp_graph:
            to = random.choice(tmp_graph[from_])
            cycle.append(to)
            tmp_graph[from_].remove(to)
            if len(tmp_graph[from_]) == 0:
                tmp_graph.pop(from_)
                unexplored_nodes.pop(from_, None)
            else:
                unexplored_nodes[from_] = copy.deepcopy(tmp_graph[from_])
            from_ = to
        
        else:
            unexplored_node_index = np.where(np.array(cycle) == random.choice(list(unexplored_nodes)))[0][-1]
            unexplored_node       = cycle[unexplored_node_index]
            next_node             = random.choice(unexplored_nodes[unexplored_node])
            tmp_graph[unexplored_node].remove(next_node)

            recover_cycle = cycle[unexplored_node_index:]
            cycle         = cycle[:unexplored_node_index + 1]
            from_         = next_node
            cycle.append(from_)

            unexplored_nodes = {}
            tmp_graph        = copy.deepcopy(graph)
            for i in range(len(cycle) - 1):
                node    = cycle[i]
                to_node = cycle[i + 1]
                tmp_graph[node].remove(to_node)
                if len(tmp_graph[node]) == 0:
                    tmp_graph.pop(node)
                    unexplored_nodes.pop(node, None)
                else : 
                    unexplored_nodes[node] = copy.deepcopy(tmp_graph[node])
    return(cycle)

graph='''0 -> 2
     1 -> 3
     2 -> 1
     3 -> 0,4
     6 -> 3,7
     7 -> 8
     8 -> 9
     9 -> 6'''

EulerianPath(graph)

['6', '7', '8', '9', '6', '3', '0', '2', '1', '3', '4']

In [9]:
'''
Code Challenge: Solve the String Reconstruction Problem.
Input: An integer k followed by a list of k-mers Patterns.
Output: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.)
'''

def Reconstruction(k, strings):
    if type(strings) == str:
        strings = strings.split('\n')
    graph          = PathGraph(strings)
    cycle          = EulerianPath(graph)
    reconstruction = NaiveLink(cycle)
    return(reconstruction)

k = 4
strings = '''CTTA
ACCA
TACC
GGCT
GCTT
TTAC'''

Reconstruction(k, strings)

'GGCTTACCA'

In [10]:
'''
Code Challenge: Solve the k-Universal Circular String Problem.
Input: An integer k.
Output: A k-universal circular string.
'''

def BinaryStrings(k):
    strings = ['0','1']
    for i in range(k - 1):
        strings = strings + strings
        half    = len(strings) / 2
        for index in range(len(strings)):
            if index < half:
                strings[index] = strings[index] + '0'
            else :
                strings[index] = strings[index] + '1'
    return(strings)

def KUniversal(k):
    strings        = BinaryStrings(k) 
    graph          = PathGraph(strings)
    cycle          = EulerianCycle(graph)
    reconstruction = NaiveLink(cycle)
    # delete last k - 1 number, because it is a cycle 
    reconstruction = reconstruction[: -k + 1]
    return(reconstruction)

k = 4



In [11]:
'''
Code Challenge: Solve the String Reconstruction from Read-Pairs Problem.
Input: Integers k and d followed by a collection of paired k-mers PairedReads.
Output: A string Text with (k, d)-mer composition equal to PairedReads.
'''

def PairedPathGraph(gapped_patterns, k):
    if type(gapped_patterns) == str:
        gapped_patterns = gapped_patterns.split('\n')

    graph = {}
    for pattern in gapped_patterns:
        prefix = pattern[ : k - 1] + pattern[k    : (k * 2)]
        suffix = pattern[1: k + 1] + pattern[k + 2: (k * 2) + 1]
        if prefix in graph:
            graph[prefix].append(suffix)
        else:
            graph[prefix] = [suffix]

    return(graph)

def PairedReconstruction(gapped_patterns, k, d):
    graph = PairedPathGraph(gapped_patterns, k)
    path  = EulerianPath(graph)  
    
    gapped_patterns_1 = []
    gapped_patterns_2 = []
    
    for gapped_pattern in path:
        pattern_1, pattern_2 = gapped_pattern.split('|')
        gapped_patterns_1.append(pattern_1)
        gapped_patterns_2.append(pattern_2)
        
    prefix_string = NaiveLink(gapped_patterns_1)
    suffix_string = NaiveLink(gapped_patterns_2)
    
    overlap_length = len(prefix_string) - k - d
    for i in range(overlap_length):
        if suffix_string[i] != prefix_string[i + k + d]:
            return('there is no string spelled by the gapped patterns')
    whole_string = prefix_string + suffix_string[overlap_length:]
    return(whole_string)

k = 4
d = 2
gapped_patterns = '''GAGA|TTGA
TCGT|GATG
CGTG|ATGT
TGGT|TGAG
GTGA|TGTT
GTGG|GTGA
TGAG|GTTG
GGTC|GAGA
GTCG|AGAT'''

PairedReconstruction(gapped_patterns, k, d)

'GTGGTCGTGAGATGTTGA'

In [12]:
'''
Contig Generation Problem: Generate the contigs from a collection of reads (with imperfect coverage).
Input: A collection of k-mers Patterns. 
Output: All contigs in DeBruijn(Patterns).
'''

def CountInOut(graph):
    if type(graph) == str:
        graph = GraphString2Dict(graph)
        
    in_dict  = {}
    out_dict = {}
    
    for key, values in graph.items():
        out_dict[key] = 0
        for value in values:
            in_dict[value] = 0
    
    for prefix, suffixs in graph.items():
        out_dict[prefix] = out_dict[prefix] + len(suffixs)
        for suffix in suffixs:
            in_dict[suffix] = in_dict[suffix] + 1
            
    return (in_dict,out_dict)

def MaximalNonBranchingPaths(strings):
    graph = PathGraph(strings)

    in_count, out_count = CountInOut(graph)

    nodes = set()
    for _ in in_count.keys():
        nodes.add(_)
    for _ in out_count.keys():    
        nodes.add(_)
    nodes = list(nodes)

    '''
    #This is for isolated cycle
    one_one_graph = {}
    for key, value in graph.items():
        if (in_count.get(key, -1) == out_count.get(key, -1) == 1):
            one_one_graph[key] = value
    '''
    
    paths = []
    for node in nodes:
        if (in_count.get(node) != 1) | (out_count.get(node) != 1):
            if out_count.get(node, -1) > 0:
                for out_node in graph[node]:
                    non_branching_path = []
                    non_branching_path.append(node)
                    non_branching_path.append(out_node)
                    while (in_count.get(out_node, -1) == out_count.get(out_node, -1) == 1):
                        #one_one_graph.pop(out_node)
                        non_branching_path.extend(graph[out_node])
                        out_node = graph[out_node][0]
                    paths.append(NaiveLink(non_branching_path))
                    
    return(paths)

strings = '''ATG
ATG
TGT
TGG
CAT
GGA
GAT
AGA'''

MaximalNonBranchingPaths(strings)

['ATG', 'ATG', 'AGA', 'TGT', 'TGGA', 'CAT', 'GAT']