# Assembly

Humberto Ortiz Zuazaga

Example code for chapter 3.

# ba3a

Compute the k-mer composition of a text: all of its length-k substrings, in lexicographic order (repeats included).


In [1]:
def Composition(k, text):
    """Return every length-k substring (k-mer) of text in lexicographic order.

    Repeated k-mers are kept, one entry per occurrence.  If k is larger
    than len(text) there is no window to take and the result is empty.
    """
    last_start = len(text) - k
    return sorted(text[start:start + k] for start in range(last_start + 1))

In [2]:
# test case from the book
Composition(3, "TATGGGGTGC")

['ATG', 'GGG', 'GGG', 'GGT', 'GTG', 'TAT', 'TGC', 'TGG']

# Representar grafos

Vamos a usar objetos de Python para representar grafos mediante listas de adyacencia.

Input: una lista de k-mers.   
Output: el grafo de solape (overlap), impreso como lista de adyacencia con flechas (`kmer -> vecino`).

In [3]:
class OverlapGraph():
    """Overlap graph of a collection of k-mers, stored as adjacency sets.

    There is an edge a -> b whenever the last k-1 characters of a equal the
    first k-1 characters of b, where k is the length of the first k-mer.
    Repeated k-mers share a single node in ``self.G`` but are visited once
    per occurrence when iterating.  The order of a node's edges is
    unspecified because neighbours are kept in a set.
    """

    # to create an OverlapGraph
    def __init__(self, pattern):
        """Build the graph from ``pattern``, an iterable of equal-length k-mers."""
        # Private copy: later changes to the caller's list cannot desynchronise
        # self.pattern from the keys of self.G.
        self.pattern = list(pattern)
        self.index = 0  # cursor used only by the legacy __next__ method
        self.G = {}
        # An empty pattern gives an empty graph instead of an IndexError.
        k = len(self.pattern[0]) if self.pattern else 0
        overlap = max(k - 1, 0)

        # Index the k-mers by prefix so every node finds its successors with a
        # single lookup instead of scanning the whole pattern again.
        by_prefix = {}
        for kmer in self.pattern:
            by_prefix.setdefault(kmer[:overlap], set()).add(kmer)

        for kmer in self.pattern:
            # kmer[-0:] is the whole string, not the empty suffix, so the
            # zero-overlap case (k == 1) has to be spelled out.
            suffix = kmer[-overlap:] if overlap else ""
            self.G[kmer] = set(by_prefix.get(suffix, ()))

    def print(self):
        "Print the overlap graph in the format rosalind.info expects."
        for kmer, edges in sorted(self):
            for edge in edges:
                print(kmer, "->", edge)

    # special python methods for iteration (for loop)
    def __iter__(self):
        """Yield a ``(kmer, edges)`` tuple for every entry of the pattern.

        A fresh generator is created on every call, so nested loops and
        loops abandoned with ``break`` do not interfere with each other.
        """
        for kmer in self.pattern:
            yield (kmer, list(self.G[kmer]))

    # move on to the next element
    def __next__(self):
        """Legacy stateful cursor, kept so existing ``next(graph)`` calls work.

        Advances ``self.index`` through the pattern; when the end is reached
        the cursor is reset to 0 and StopIteration is raised.  ``for`` loops
        no longer go through this method.
        """
        if self.index == len(self.pattern):
            self.index = 0
            raise StopIteration

        kmer = self.pattern[self.index]
        edges = list(self.G[kmer])
        self.index = self.index + 1
        return (kmer, edges)

    # extract the adjacency list of one element
    def __getitem__(self, kmer):
        """Return the successors of ``kmer`` as a list; KeyError if it is not a node."""
        return list(self.G[kmer])

In [4]:
# Build the overlap graph of the sorted 3-mers of a sample string
G = OverlapGraph(Composition(3, "TAATGCCATGGGATGTT"))

In [5]:
# Iterating yields one (kmer, edges) tuple per entry of the input list,
# so a k-mer that occurs several times in the input shows up several times
list(G)

[('AAT', ['ATG']),
 ('ATG', ['TGT', 'TGG', 'TGC']),
 ('ATG', ['TGT', 'TGG', 'TGC']),
 ('ATG', ['TGT', 'TGG', 'TGC']),
 ('CAT', ['ATG']),
 ('CCA', ['CAT']),
 ('GAT', ['ATG']),
 ('GCC', ['CCA']),
 ('GGA', ['GAT']),
 ('GGG', ['GGA', 'GGG']),
 ('GTT', []),
 ('TAA', ['AAT']),
 ('TGC', ['GCC']),
 ('TGG', ['GGA', 'GGG']),
 ('TGT', ['GTT'])]

In [6]:
# Print every edge on its own "kmer -> edge" line
G.print()

AAT -> ATG
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
CAT -> ATG
CCA -> CAT
GAT -> ATG
GCC -> CCA
GGA -> GAT
GGG -> GGA
GGG -> GGG
TAA -> AAT
TGC -> GCC
TGG -> GGA
TGG -> GGG
TGT -> GTT


In [7]:
# Same construction in two steps: compute the 3-mer list, then build and
# print its overlap graph without keeping a reference to the graph
pattern = Composition(3, "TAATGCCATGGGATGTT")
OverlapGraph(pattern).print()

AAT -> ATG
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
ATG -> TGT
ATG -> TGG
ATG -> TGC
CAT -> ATG
CCA -> CAT
GAT -> ATG
GCC -> CCA
GGA -> GAT
GGG -> GGA
GGG -> GGG
TAA -> AAT
TGC -> GCC
TGG -> GGA
TGG -> GGG
TGT -> GTT


# De Bruijn graph

Try solving problem BA3D from rosalind.info.

In [8]:
def _debruijn_edges(k, secuencia):
    """Return the de Bruijn adjacency lists built from the k-mers of secuencia.

    Every k-mer contributes one edge from its first k-1 characters to its
    last k-1 characters; repeated k-mers contribute repeated edges.
    """
    G = {}
    for i in range(len(secuencia) - k + 1):
        kmero = secuencia[i:i + k]
        G.setdefault(kmero[:-1], []).append(kmero[1:])
    return G


def debruijn(filename, outfile="results.txt"):
    """Read a BA3D dataset and write its de Bruijn graph as an adjacency list.

    Parameters:
        filename: input path; the first line holds the integer k and the
            second line holds the text.
        outfile: output path, overwritten if it exists.  Defaults to
            "results.txt" in the current directory.

    Each output line has the form ``node -> succ1,succ2``; nodes and their
    successors are both written in sorted order.

    Raises:
        ValueError: if the first line of the input is not an integer.
        OSError: if either file cannot be opened.
    """
    with open(filename) as f:
        k = int(f.readline())
        secuencia = f.readline().strip()

    G = _debruijn_edges(k, secuencia)

    # pretty-print the graph to the output file
    with open(outfile, mode="w") as f:
        for kmer in sorted(G):
            print(kmer, "->", ",".join(sorted(G[kmer])), file=f)

In [9]:
# change the file name to your own dataset; the result is written to results.txt
debruijn("/Users/humberto/Downloads/rosalind_ba3d(1).txt")

# Pruebas

Probando cómo seleccionar partes de un k-mero.

In [10]:
# sample 4-mer used to try out the slices below
kmero = "AAGA"

In [11]:
# prefix: everything except the last character
kmero[:-1]

'AAG'

In [12]:
# suffix: everything except the first character
kmero[1:]

'AGA'

In [13]:
# sample list of successor nodes
edges = ['CTC', 'CTA']

In [14]:
# display the list as stored
edges

['CTC', 'CTA']

In [15]:
# join the successors with commas, as done for each line of the adjacency list
",".join(edges)

'CTC,CTA'

In [16]:
# Example from rosalind.info
pattern = ["ATGCG",
"GCATG",
"CATGC",
"AGGCA",
"GGCAT"]

In [17]:
# Build the overlap graph of the example k-mers and print its edges
OverlapGraph(pattern).print()

AGGCA -> GGCAT
CATGC -> ATGCG
GCATG -> CATGC
GGCAT -> GCATG
