# Week 1

In [1]:
'''
Code Challenge: Solve the String Composition Problem.
Input: An integer k and a string Text.
Output: Composition_k(Text), the collection of all k-mers of Text (the k-mers can be provided in any order).
'''

def Composition(string, k):
    '''
    Return every length-k window of string, in left-to-right order.

    Windows overlap and repeated k-mers are kept. If k is larger than
    len(string) there is no window to take and the result is an empty list.
    '''
    window_count = len(string) - k + 1
    return [string[offset:offset + k] for offset in range(window_count)]

# Sample input for Composition: list the 5-mers of a 9-symbol string.
k = 5
string = 'CAATCCAAC'

# Bare call as the cell's last expression; its return value is the cell output.
Composition(string, k)

['CAATC', 'AATCC', 'ATCCA', 'TCCAA', 'CCAAC']

In [2]:
'''
String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.
Input: A sequence of k-mers Pattern_1, …, Pattern_n such that the last k - 1 symbols of Pattern_i are equal to the first k - 1 symbols of Pattern_{i+1} for 1 ≤ i ≤ n - 1.
Output: A string Text of length k + n - 1 such that the i-th k-mer in Text is equal to Pattern_i (for 1 ≤ i ≤ n).
'''

def Reconstruction(patterns):
    '''
    Spell the string traced out by a genome path of overlapping k-mers.

    Args:
        patterns: The k-mers in path order, given either as an iterable of
            strings or as a single string holding one k-mer per line.

    Returns:
        The first k-mer followed by the last symbol of every later k-mer,
        or '' when no k-mers are supplied.

    The overlap between consecutive k-mers is assumed, not checked.
    '''
    if isinstance(patterns, str):
        # splitlines() copes with '\n' and '\r\n'; blank lines (for example
        # the one produced by a trailing newline) are dropped so they cannot
        # reach the pattern[-1] lookup below.
        patterns = [line.strip() for line in patterns.splitlines() if line.strip()]
    else:
        # Materialise so indexing and slicing work for any iterable.
        patterns = list(patterns)

    if not patterns:
        return ''

    # Each later k-mer contributes exactly one new symbol: its last one.
    return patterns[0] + ''.join(pattern[-1] for pattern in patterns[1:])

# Sample genome path: five 5-mers, one per line, each overlapping the next
# by four symbols.
patterns = '''ACCGA
CCGAA
CGAAG
GAAGC
AAGCT'''

# Bare call as the cell's last expression; its return value is the cell output.
Reconstruction(patterns)

'ACCGAAGCT'

In [3]:
'''
Code Challenge: Solve the Overlap Graph Problem (restated below).
Input: A collection Patterns of k-mers.
Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. 
'''

def FindOverlap(patterns):
    '''
    Build the overlap graph of a collection of k-mers as an adjacency list.

    There is an edge p -> q whenever the suffix of p (all but its first
    symbol) equals the prefix of q (all but its last symbol).

    Args:
        patterns: The k-mers, given either as an iterable of strings or as a
            single string holding one k-mer per line.

    Returns:
        A dict mapping each k-mer that has at least one successor to the
        list of its successors, in input order. K-mers without a successor
        are omitted. A k-mer repeated in the input appears once as a key but
        once per occurrence in successor lists.
    '''
    if isinstance(patterns, str):
        # splitlines() copes with '\n' and '\r\n'; blank lines are dropped so
        # a trailing newline does not create a spurious '' node.
        patterns = [line.strip() for line in patterns.splitlines() if line.strip()]

    patterns_by_prefix = {}
    suffix_of = {}
    for pattern in patterns:
        suffix_of[pattern] = pattern[1:]
        patterns_by_prefix.setdefault(pattern[:-1], []).append(pattern)

    adjacency_list = {}
    for pattern, suffix in suffix_of.items():
        if suffix in patterns_by_prefix:
            # Copy so that k-mers sharing a suffix do not share one list
            # object in the returned graph.
            adjacency_list[pattern] = list(patterns_by_prefix[suffix])

    return adjacency_list

# Sample collection of six 5-mers, one per line.
patterns = '''ATGCG
GCATG
CATGC
AGGCA
GGCAT
GGCAC'''

# Bare call as the cell's last expression; its return value is the cell output.
FindOverlap(patterns)

{'AGGCA': ['GGCAT', 'GGCAC'],
 'CATGC': ['ATGCG'],
 'GCATG': ['CATGC'],
 'GGCAT': ['GCATG']}

In [4]:
'''
DeBruijn Graph from k-mers Problem: Construct the de Bruijn graph from a set of k-mers.
Input: A collection of k-mers Patterns.
Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).
'''

def PathGraph(patterns):
    '''
    Build the de Bruijn graph of a collection of k-mers as an adjacency list.

    Every k-mer contributes one edge from its prefix (all but the last
    symbol) to its suffix (all but the first symbol).

    Args:
        patterns: The k-mers, given either as an iterable of strings or as a
            single string holding one k-mer per line.

    Returns:
        A dict mapping each prefix to the list of suffixes it leads to, in
        input order. A k-mer that occurs several times yields a repeated
        entry, so edge multiplicity is preserved.
    '''
    if isinstance(patterns, str):
        # splitlines() copes with '\n' and '\r\n'; blank lines are dropped so
        # a trailing newline does not add a spurious '' -> '' edge.
        patterns = [line.strip() for line in patterns.splitlines() if line.strip()]

    graph = {}
    for pattern in patterns:
        graph.setdefault(pattern[:-1], []).append(pattern[1:])
    return graph

# Sample collection of seven 4-mers, one per line. CAGG is listed twice, so
# the edge CAG -> AGG shows up twice in the result.
patterns = '''GAGG
CAGG
GGGG
GGGA
CAGG
AGGG
GGAG'''

# Bare call as the cell's last expression; its return value is the cell output.
PathGraph(patterns)

{'AGG': ['GGG'],
 'CAG': ['AGG', 'AGG'],
 'GAG': ['AGG'],
 'GGA': ['GAG'],
 'GGG': ['GGG', 'GGA']}

In [5]:
'''
Code Challenge: Solve the De Bruijn Graph from a String Problem.
Input: An integer k and a string Text.
Output: DeBruijn_k(Text), in the form of an adjacency list.
'''

def DeBruijn(string, k):
    '''
    Build the de Bruijn graph of string for k-mers of length k.

    The k-mers produced by Composition(string, k) are passed to PathGraph,
    whose adjacency-list result is returned as is.
    '''
    kmers = Composition(string, k)
    return PathGraph(kmers)

# Sample input for DeBruijn: 4-mers of a 14-symbol string.
k = 4
string = 'AAGATTCTCTAAGA'

# Bare call as the cell's last expression; its return value is the cell output.
DeBruijn(string, k)

{'AAG': ['AGA', 'AGA'],
 'AGA': ['GAT'],
 'ATT': ['TTC'],
 'CTA': ['TAA'],
 'CTC': ['TCT'],
 'GAT': ['ATT'],
 'TAA': ['AAG'],
 'TCT': ['CTC', 'CTA'],
 'TTC': ['TCT']}