In [1]:
from typing import List, Dict, Iterable

In [12]:
def kmer_composition(text: str, k: int) -> Iterable[str]:
    kmers = []
    for i in range(len(text) - k + 1):
        kmer = text[i:i + k]
        kmers.append(kmer)
    return kmers

In [13]:
text = 'CAATCCAAC'

print(kmer_composition(text, 5))

['CAATC', 'AATCC', 'ATCCA', 'TCCAA']


In [2]:
def genome_path(path: list[str]) -> str:
    genome = path[0]
    
    for i in range(1, len(path)):
        genome += path[i][-1]
    
    return genome

sample_path = ["ACCGA", "CCGAA", "CGAAG", "GAAGC", "AAGCT"]
print(genome_path(sample_path))

ACCGAAGCT


In [None]:
from typing import List, Dict, Iterable

def overlap_graph(patterns: List[str]) -> Dict[str, List[str]]:
    adjacencyDict = {}

    patterns = set(patterns)
    patterns = list(patterns)
    for read in sorted(patterns):
        adjacencyDict[read] = []
        suffix = read[1:]
        for pattern in patterns:
            prefix = pattern[:-1]
            if suffix == prefix:
                adjacencyDict[read].append(pattern)

    removeKeys = [item for item in adjacencyDict if adjacencyDict[item] == []]

    for key in removeKeys:
        del adjacencyDict[key]

    return adjacencyDict

In [20]:
from typing import List, Dict, Iterable

def de_bruijn_string(text: str, k: int) -> Dict[str, List[str]]:
    adjacencyDict = {}
    nodeList = []

    for i in range(len(text) - k + 1):
        node = text[i:i + k - 1]
        nodeList.append(node)
    nodeList = sorted(list(set(nodeList)))

    for node in nodeList:
        adjacencyDict[node] = []
    
    for i in range(len(text) - k + 1):
        node = text[i:i + k - 1]
        if i + 2*k - (k - 1) <= len(text) + 1:
            nextNode = text[i + k - (k - 1):i + k]
        else:
            nextNode = None
        if nextNode != None:
            adjacencyDict[node].append(nextNode)
            adjacencyDict[node].sort()
    return adjacencyDict


k = 3
text = 'ACGTGTATA'
print(de_bruijn_string(text, k))

{'AC': ['CG'], 'AT': ['TA'], 'CG': ['GT'], 'GT': ['TA', 'TG'], 'TA': ['AT'], 'TG': ['GT']}


In [18]:
k = 3
text = 'ACGTGTATA'

for i in range(len(text) - k + 1):
    node = text[i:i + k - 1]
    if i + 2*k - (k - 1) <= len(text) + 1:
        nextNode = text[i + k - (k - 1):i + k]
    else:
        nextNode = None
    print(node)
    print(nextNode)


AC
CG
CG
GT
GT
TG
TG
GT
GT
TA
TA
AT
AT
TA


In [None]:
def overlap_graph(patterns: List[str]) -> Dict[str, List[str]]:
    adjacencyDict = {}

    patterns = set(patterns)
    patterns = list(patterns)
    for read in sorted(patterns):
        adjacencyDict[read] = []
        suffix = read[1:]
        for pattern in patterns:
            prefix = pattern[:-1]
            if suffix == prefix:
                adjacencyDict[read].append(pattern)

    removeKeys = [item for item in adjacencyDict if adjacencyDict[item] == []]

    for key in removeKeys:
        del adjacencyDict[key]

    return adjacencyDict

In [4]:
from typing import List, Dict, Iterable
import random

# Please do not remove package declarations because these are used by the autograder.

def de_bruijn_kmers(k_mers: List[str]) -> tuple[Dict[str, List[str]], str, str]:
    """Forms the de Bruijn graph of a collection of k-mers."""
    adjacency = {}
    in_out = {}
    for i in range(len(k_mers)):
        kmer = k_mers[i]
        prefix = kmer[:len(kmer)-1]
        suffix = kmer[1:]
        if prefix not in adjacency.keys():
            adjacency[prefix] = [suffix]
        else:
            adjacency[prefix].append(suffix)
        if suffix not in in_out.keys():
            in_out[suffix] = [1, 0]
        else:
            in_out[suffix][0] = in_out[suffix][0] + 1
        if prefix not in in_out.keys():
            in_out[prefix] = [0, 1]
        else:
            in_out[prefix][1] = in_out[prefix][1] + 1
    start = ""
    end = ""
    for key in in_out.keys():
        num_in = in_out[key][0]
        num_out = in_out[key][1]
        if num_out - num_in > 0:
            start = key
        elif num_out - num_in < 0:
            end = key
    if end != "":
        if end not in adjacency.keys():
            adjacency[end] = [start]
        else:
            adjacency[end].append(start)
    return adjacency, start, end

def create_cycle(g: Dict[str, List[str]], ordered: bool, cycle=[], start = "") -> List[str]:
    if cycle == []:
        start = random.choice(list(g.keys()))
    cycle.append(start)
    while True:
        if len(g[start]) > 0:
            next = random.choice(g[start])
            g[start].remove(next)
            cycle.append(next)
            start = next
        else:
            for key in list(g.keys()):
                if len(g[key]) == 0:
                    del g[key]
            if ordered:
                return cycle[:-1]
            else:
                return cycle

# Insert your string_reconstruction function here, along with any subroutines you need
def string_reconstruction(patterns: List[str], k: int) -> str:
    """Reconstructs a string from its k-mer composition."""
    g, start, end = de_bruijn_kmers(patterns)
    if start == "":
        ordered = False
    else:
        ordered = True
    cycle = create_cycle(g, ordered)
    while len(g) > 0:
        for _ in range(len(cycle)):
            if cycle[0] not in g.keys():
                rotation = cycle.pop(0)
                cycle.append(rotation)
            else:
                next_start = cycle[0]
                break
        
        cycle = create_cycle(g, ordered, cycle, next_start)
    if start != "":
        for _ in range(len(cycle)):
            if cycle[0] != start or cycle[-1] != end:
                rotation = cycle.pop(0)
                cycle.append(rotation)
    reconstructed_string = cycle[0]
    for i in range(1, len(cycle)):
        reconstructed_string = reconstructed_string+cycle[i][-1]
    return reconstructed_string

test = ["ACG", "CGT", "GTG", "TGT", "GTA", "TAT", "ATA"]
k = 3

print(string_reconstruction(test, k))

ACGTGTATA


In [117]:
import sys
from typing import List, Dict, Iterable, Tuple
import random

# Please do not remove package declarations because these are used by the autograder.

def de_bruijn_kmers(PairedReads: List[Tuple[str, str]]) -> Tuple[Dict[Tuple[str, str], List[Tuple[str, str]]], str, str]:
    """Forms the de Bruijn graph of a collection of k-mers."""
    adjacency = {}
    in_out = {}
    for i in range(len(PairedReads)):
        read1 = PairedReads[i][0]
        read2 = PairedReads[i][1]
        prefix1 = read1[:len(read1)-1]
        suffix1 = read1[1:]
        prefix2 = read2[:len(read2)-1]
        suffix2 = read2[1:]
        prefixTuple = (prefix1, prefix2)
        suffixTuple = (suffix1, suffix2)
        if prefixTuple not in adjacency.keys():
            adjacency[prefixTuple] = [suffixTuple]
        else:
            adjacency[prefixTuple].append(suffixTuple)
        if suffixTuple not in in_out.keys():
            in_out[suffixTuple] = [1, 0]
        else:
            in_out[suffixTuple][0] = in_out[suffixTuple][0] + 1
        if prefixTuple not in in_out.keys():
            in_out[prefixTuple] = [0, 1]
        else:
            in_out[prefixTuple][1] = in_out[prefixTuple][1] + 1
    start = ""
    end = ""
    for key in in_out.keys():
        num_in = in_out[key][0]
        num_out = in_out[key][1]
        if num_out - num_in > 0:
            start = key
        elif num_out - num_in < 0:
            end = key
    if end != "":
        if end not in adjacency.keys():
            adjacency[end] = [start]
        else:
            adjacency[end].append(start)
    return adjacency, start, end

def create_cycle(g: Dict[str, List[str]], ordered: bool, cycle=[], start = "") -> List[str]:
    if cycle == []:
        start = random.choice(list(g.keys()))
    cycle.append(start)
    while True:
        if len(g[start]) > 0:
            next = random.choice(g[start])
            g[start].remove(next)
            cycle.append(next)
            start = next
        else:
            for key in list(g.keys()):
                if len(g[key]) == 0:
                    del g[key]
            if ordered:
                return cycle[:-1]
            else:
                return cycle

# Insert your string_reconstruction function here, along with any subroutines you need
def StringReconstructionReadPairs(PairedReads: List[Tuple[str, str]], k: int, d: int) -> str:
    """Reconstructs a string from its k-mer composition."""
    is_correct = False
    
    while is_correct == False:    
        g, start, end = de_bruijn_kmers(PairedReads)
        if start == "":
            ordered = False
        else:
            ordered = True
    
        cycle = create_cycle(g, ordered)
        while len(g) > 0:
            for _ in range(len(cycle)):
                if cycle[0] not in g.keys():
                    rotation = cycle.pop(0)
                    cycle.append(rotation)
                else:
                    next_start = cycle[0]
                    break
            
            cycle = create_cycle(g, ordered, cycle, next_start)
        if start != "":
            for _ in range(len(cycle)):
                if cycle[0] != start or cycle[-1] != end:
                    rotation = cycle.pop(0)
                    cycle.append(rotation)

        read1 = cycle[0][0]
        read2 = cycle[0][1]
        for i in range(1, len(cycle)):
            read1 += cycle[i][0][-1]
            read2 += cycle[i][1][-1]

        if read1[k+d:] == read2[:-k-d]:
            is_correct = True
    
    reconstructed_string = read1 + read2[-k-d:]
    return reconstructed_string


inputReads1 = [('ACAC', 'CTCT'), ('ACAT', 'CTCA'), ('CACA', 'TCTC'), ('GACA', 'TCTC')]

inputReads2 = [('TCA', 'GCA'), ('TTC', 'TGC'), ('AAT', 'CAT'), ('ATT', 'ATG')]

inputReads3 = [('GG', 'GA'), ('GT', 'AT'), ('TG', 'TA'), ('GA', 'AC'), ('AT', 'CT')]

inputReads4 = [('GTTT', 'ATTT'), ('TTTA', 'TTTG'), ('TTAC', 'TTGT'), ('TACG', 'TGTA'), ('ACGT', 'GTAT'), ('CGTT', 'TATT')]

inputReads5 = [('GGG', 'GGG'), ('AGG', 'GGG'), ('GGG', 'GGT'), ('GGG', 'GGG'), ('GGG', 'GGG')]


# print(StringReconstructionReadPairs(inputReads1, 4, 2))
# print('GACACATCTCTCA : Correct')
print(StringReconstructionReadPairs(inputReads2, 3, 1))
print('AATTCATGCA : Correct')
# print(StringReconstructionReadPairs(inputReads3, 2, 1))
# print('GGTGATACT : Correct')
# print(StringReconstructionReadPairs(inputReads4, 4, 2))
# print('TTTACGTTTGTATTT : Correct')
print(StringReconstructionReadPairs(inputReads5, 3, 2))
print('AGGGGGGGGGGT : Correct')

AATTCATGCA
AATTCATGCA : Correct


KeyError: ''