# Assembly

Humberto Ortiz Zuazaga

Example code for chapter 3.

# ba3a

Compute the kmer composition of text


In [2]:
def Composition(k, text):
    kmers = []
    for i in range(len(text) - k + 1):
        kmers.append(text[i:i+k])
    return sorted(kmers)

In [3]:
# prueba del libro
Composition(3, "TATGGGGTGC")

['ATG', 'GGG', 'GGG', 'GGT', 'GTG', 'TAT', 'TGC', 'TGG']

# Representar grafos

Vamos a usar objetos de python para representar grafos usando listas de adyacencia.

Input: una lista de k-mers.   
Output: el grafo de solape (overlap) (con flechitas)

In [4]:
class OverlapGraph():
    # para crear un OverlapGraph
    def __init__(self, pattern):
        self.pattern = pattern
        self.index = 0
        self.G = {}
        k = len(pattern[0])

        for primero in pattern:
            if primero not in self.G:
                self.G[primero] = set()
            for segundo in pattern:
                # check if end of primero is same as begining of segundo
                if primero[-(k-1):] == segundo[:k-1]:
                    self.G[primero].add(segundo)
    def print(self):
        "Para imprimir un overlap graph como lo quiere rosalind.info"
        for kmer, edges in sorted(self):
            for edge in edges:
                print(kmer, "->", edge)

    # metodos especiales de python para iteracion (for loop)
    def __iter__(self):
        return self
    
    # pasar al proximo elemento
    def __next__(self):
        if self.index == len(self.pattern):
            self.index = 0
            raise StopIteration
            
        kmer = self.pattern[self.index]
        edges = list(self.G[kmer])
        self.index = self.index + 1
        return (kmer, edges)
    
    # extraer la lista de adyacencia de un elemento
    def __getitem__(self, kmer):
        return list(self.G[kmer])

In [5]:
G = OverlapGraph(Composition(3, "TAATGCCATGGGATGTT"))

In [6]:
list(G)

[('AAT', ['ATG']),
 ('ATG', ['TGT', 'TGG', 'TGC']),
 ('ATG', ['TGT', 'TGG', 'TGC']),
 ('ATG', ['TGT', 'TGG', 'TGC']),
 ('CAT', ['ATG']),
 ('CCA', ['CAT']),
 ('GAT', ['ATG']),
 ('GCC', ['CCA']),
 ('GGA', ['GAT']),
 ('GGG', ['GGG', 'GGA']),
 ('GTT', []),
 ('TAA', ['AAT']),
 ('TGC', ['GCC']),
 ('TGG', ['GGG', 'GGA']),
 ('TGT', ['GTT'])]

In [7]:
G.print()

AAT -> ATG
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
CAT -> ATG
CCA -> CAT
GAT -> ATG
GCC -> CCA
GGA -> GAT
GGG -> GGG
GGG -> GGA
TAA -> AAT
TGC -> GCC
TGG -> GGG
TGG -> GGA
TGT -> GTT


In [8]:
pattern = Composition(3, "TAATGCCATGGGATGTT")
OverlapGraph(pattern).print()

AAT -> ATG
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
CAT -> ATG
CCA -> CAT
GAT -> ATG
GCC -> CCA
GGA -> GAT
GGG -> GGG
GGG -> GGA
TAA -> AAT
TGC -> GCC
TGG -> GGG
TGG -> GGA
TGT -> GTT


# De Bruijn graph

Try solving problem BA3D from rosalind.info.

In [9]:
def debruijn(filename):
    with open(filename) as f:
        k = int(f.readline())
        #print(k)
        secuencia = f.readline().strip()
        G = {}
        for i in range(len(secuencia)-k+1):
            kmero = secuencia[i:i+k]
            principio = kmero[:-1]
            final = kmero[1:]
            if principio not in G:
                G[principio] = [final]
            else:
                G[principio].append(final)
    # el grafo esta en G
    # imprimir bonito al archivo "results.txt"
    
    with open("results.txt", mode="w") as f:
        keys = G.keys()
    
        for kmer in sorted(keys):
            edges = G[kmer]
            print(kmer, "->", ",".join(sorted(edges)), file=f)

In [44]:
# cambia el nombre del archivo
debruijn("foo.txt")

# Pruebas

Probando como seleccionar partes de un kmero.

In [11]:
kmero = "AAGA"

In [12]:
kmero[:-1]

'AAG'

In [13]:
kmero[1:]

'AGA'

In [14]:
edges = ['CTC', 'CTA']

In [15]:
edges

['CTC', 'CTA']

In [16]:
",".join(edges)

'CTC,CTA'

In [17]:
# Ejemplo de rosalind.info
pattern = ["ATGCG",
"GCATG",
"CATGC",
"AGGCA",
"GGCAT"]

In [18]:
OverlapGraph(pattern).print()

AGGCA -> GGCAT
CATGC -> ATGCG
GCATG -> CATGC
GGCAT -> GCATG


In [19]:
a = [5, 4, 1]

In [20]:
a[-2:]

[4, 1]

In [21]:
a[0:1]

[5]

In [22]:
[4, 1] + [5]

[4, 1, 5]

# Eulerian Cycle

Take a stab at BA3F from rosalind.info.

Read a file like:

```
0 -> 3
1 -> 0
2 -> 1,3
3 -> 2
```

In [23]:
def read_ba3f(filename):
    G = {}
    with open(filename) as f:
        lines = f.readlines()
        for line in lines:
            fields = line.strip().split(" -> ")
            head = fields[0]
            edges = fields[1].split(",")
            G[head] = edges
    return G

In [24]:
read_ba3f("ba3f.txt")

{'0': ['3'],
 '1': ['0'],
 '2': ['1', '6'],
 '3': ['2'],
 '4': ['2'],
 '5': ['4'],
 '6': ['5', '8'],
 '7': ['9'],
 '8': ['7'],
 '9': ['6']}

In [25]:
import random

In [32]:
def unexplored_edges(G):
    for elt in (G.values()):
        if len(elt) > 0:
            return True
    return False


In [48]:
def EulerianCycle(G):
    cycle = []
    start = random.choice(list(G.keys()))
    cycle.append(start)
    next = G[start].pop()
    while next != start:
        cycle.append(next)
        next = G[next].pop()
    # falta chequear si hay edges sin usar y construir Cycle'
    while unexplored_edges(G):
        # encontrar newStart
        for elt in cycle:
            if len(G[elt]) > 0:
                newStart = elt
                break
        splice = cycle.index(newStart)
        newcycle = cycle[splice:] + cycle[:splice] + [newStart]
        next = G[newStart].pop()
        while next != newStart:
            newcycle.append(next)
            next = G[next].pop()
            cycle = newcycle
            
    return G, cycle

In [49]:
G = read_ba3f("ba3f.txt")

newG, cycle = EulerianCycle(G)

In [41]:
newG

{'0': [],
 '1': [],
 '2': [],
 '3': [],
 '4': [],
 '5': [],
 '6': [],
 '7': [],
 '8': [],
 '9': []}

In [30]:
list(newG.values())

[['3'], ['0'], ['1', '6'], ['2'], ['2'], ['4'], ['5'], [], [], []]

In [50]:
"->".join(cycle)

'1->0->3->2->6->8->7->9->6->5->4->2'

In [34]:
cycle.index('6')

3

In [37]:
cycle[:cycle.index('6')]

['8', '7', '9']

In [38]:
cycle[cycle.index('6'):]

['6']

In [43]:
pattern = Composition(3, "TAATGCCATGGGATGTTTAA")
OverlapGraph(pattern).print()

AAT -> ATG
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
CAT -> ATG
CCA -> CAT
GAT -> ATG
GCC -> CCA
GGA -> GAT
GGG -> GGG
GGG -> GGA
GTT -> TTA
GTT -> TTT
TAA -> AAT
TAA -> AAT
TGC -> GCC
TGG -> GGG
TGG -> GGA
TGT -> GTT
TTA -> TAA
TTT -> TTA
TTT -> TTT


In [54]:
G = read_ba3f("/Users/humberto/Downloads/rosalind_ba3f(1).txt")

newG, cycle = EulerianCycle(G)

In [55]:
"->".join(cycle+[cycle[0]])

'1337->352->353->354->111->535->536->537->1576->1577->1578->537->1483->1485->2124->2123->2122->1485->1484->537->111->52->97->947->948->946->97->129->1080->1078->1079->129->128->127->97->98->204->514->516->800->799->801->1861->1862->1863->801->516->515->2007->2005->2006->2369->2368->2370->2006->515->204->203->202->945->1402->1404->1403->945->943->1261->1263->1262->943->944->202->1758->1757->2611->2612->2613->1757->1756->2544->2543->2542->1756->202->98->99->2385->2384->2383->99->177->412->457->458->459->412->413->414->2279->2278->2280->414->177->175->176->539->538->540->176->99->52->565->1731->1729->1793->1794->1792->1729->1730->565->566->1900->1902->1901->566->567->1781->1782->2223->2222->2221->1782->1780->567->52->54->255->956->2869->2870->2871->956->957->955->1165->1166->1167->955->255->760->1561->1563->1690->1691->1692->2958->2957->2956->1692->1563->1562->760->762->761->255->589->1151->1150->1152->2515->2516->2517->1152->589->651->649->650->850->852->1185->1183->1184->852->851->650->