# Assembling Reads Into Genomes

## MyGraph

In [21]:
class MyGraph:
    ''' Directed graph kept as an adjacency list.

    self.graph is a dictionary mapping each node to the list of its
    successors; a node without outgoing edges maps to an empty list.
    Nodes are used as dictionary keys, so they must be hashable. '''

    def __init__(self, g=None):
        ''' Constructor - takes dictionary to fill the graph as input; default is empty dictionary.

        When g is omitted each instance gets its own new dictionary (a literal
        {} default would be one object shared by every instance built without
        arguments). A dictionary passed in is stored as is, not copied. '''
        self.graph = {} if g is None else g

    def print_graph(self):
        ''' Prints the content of the graph as adjacency list '''
        for v in self.graph:
            print (v, " -> ", self.graph[v])

    ## get basic info

    def get_nodes(self):
        ''' Returns list of nodes in the graph '''
        return list(self.graph)

    def get_edges(self):
        ''' Returns edges in the graph as a list of tuples (origin, destination) '''
        return [(v, d) for v in self.graph for d in self.graph[v]]

    def size(self):
        ''' Returns size of the graph : number of nodes, number of edges '''
        return len(self.get_nodes()), len(self.get_edges())

    ## add nodes and edges

    def add_vertex(self, v):
        ''' Add a vertex to the graph; tests if vertex exists not adding if it does '''
        if v not in self.graph:
            self.graph[v] = []

    def add_edge(self, o, d):
        ''' Add edge to the graph; if vertices do not exist, they are added to the graph.
        An edge that is already present is not added a second time. '''
        if o not in self.graph:
            self.add_vertex(o)
        if d not in self.graph:
            self.add_vertex(d)
        if d not in self.graph[o]:
            self.graph[o].append(d)

    ## successors, predecessors, adjacent nodes

    def get_successors(self, v):
        ''' Returns a copy of the successor list of v '''
        # a copy is returned so that callers changing the result do not change the graph
        return list(self.graph[v])

    def get_predecessors(self, v):
        ''' Returns the nodes that have v among their successors (each listed once) '''
        return [k for k in self.graph if v in self.graph[k]]

    def get_adjacents(self, v):
        ''' Returns predecessors of v followed by the successors of v not already listed '''
        res = self.get_predecessors(v)
        for p in self.get_successors(v):
            if p not in res:
                res.append(p)
        return res

    ## degrees

    def out_degree(self, v):
        ''' Number of entries in the successor list of v '''
        return len(self.graph[v])

    def in_degree(self, v):
        ''' Number of predecessors of v '''
        return len(self.get_predecessors(v))

    def degree(self, v):
        ''' Number of distinct nodes adjacent to v (predecessors or successors) '''
        return len(self.get_adjacents(v))

    def all_degrees(self, deg_type = "inout"):
        ''' Computes the degree (of a given type) for all nodes.
        deg_type can be "in", "out", or "inout".
        Returns a dictionary node -> degree. '''
        degs = {}
        for v in self.graph:
            if deg_type == "out" or deg_type == "inout":
                degs[v] = len(self.graph[v])
            else:
                degs[v] = 0
        if deg_type == "in" or deg_type == "inout":
            for v in self.graph:
                for d in self.graph[v]:
                    # for "inout", an incoming edge v -> d is skipped when d -> v
                    # exists as well, since v was already counted as a successor of d
                    if deg_type == "in" or v not in self.graph[d]:
                        degs[d] = degs[d] + 1
        return degs

    def highest_degrees(self, all_deg= None, deg_type = "inout", top= 10):
        ''' Returns the (at most) top nodes with the highest degree, highest first.
        all_deg is a precomputed node -> degree dictionary; when None it is
        computed with all_degrees(deg_type). '''
        if all_deg is None:
            all_deg = self.all_degrees(deg_type)
        ord_deg = sorted(all_deg.items(), key=lambda x: x[1], reverse=True)
        return [node for node, _ in ord_deg[:top]]

    ## topological metrics over degrees

    def mean_degree(self, deg_type = "inout"):
        ''' Mean of the degrees of all nodes; 0.0 for a graph without nodes '''
        degs = self.all_degrees(deg_type)
        if not degs:
            return 0.0
        return sum(degs.values()) / float(len(degs))

    def prob_degree(self, deg_type = "inout"):
        ''' Returns a dictionary degree -> fraction of the nodes having that degree '''
        degs = self.all_degrees(deg_type)
        res = {}
        for k in degs:
            if degs[k] in res:
                res[degs[k]] += 1
            else:
                res[degs[k]] = 1
        for k in res:
            res[k] /= float(len(degs))
        return res

    ## BFS and DFS searches

    def reachable_bfs(self, v):
        ''' Returns the nodes reachable from v (v excluded) in breadth-first order '''
        l = [v]
        res = []
        while len(l) > 0:
            node = l.pop(0)
            if node != v: res.append(node)
            for elem in self.graph[node]:
                if elem not in res and elem not in l and elem != node:
                    l.append(elem)
        return res

    def reachable_dfs(self, v):
        ''' Returns the nodes reachable from v (v excluded) in depth-first order '''
        l = [v]
        res = []
        while len(l) > 0:
            node = l.pop(0)
            if node != v: res.append(node)
            s = 0
            for elem in self.graph[node]:
                # v itself is never put back on the stack: revisiting it adds
                # nothing, and with a self-loop on v it would never terminate
                if elem != v and elem not in res and elem not in l:
                    l.insert(s, elem)
                    s += 1
        return res

    def distance(self, s, d):
        ''' Number of edges on a shortest path from s to d (breadth-first search).
        Returns 0 when s == d and None when d cannot be reached from s. '''
        if s == d: return 0
        l = [(s,0)]
        visited = [s]
        while len(l) > 0:
            node, dist = l.pop(0)
            for elem in self.graph[node]:
                if elem == d: return dist + 1
                elif elem not in visited:
                    l.append((elem,dist+1))
                    visited.append(elem)
        return None

    def shortest_path(self, s, d):
        ''' Returns a shortest path from s to d as a list of nodes (both ends included).
        Returns [s] when s == d and None when d cannot be reached from s. '''
        # the path from a node to itself is that single node (not the integer 0)
        if s == d: return [s]
        l = [(s,[])]
        visited = [s]
        while len(l) > 0:
            node, preds = l.pop(0)
            for elem in self.graph[node]:
                if elem == d: return preds+[node,elem]
                elif elem not in visited:
                    l.append((elem,preds+[node]))
                    visited.append(elem)
        return None

    def reachable_with_dist(self, s):
        ''' Returns a list of tuples (node, distance) for the nodes reachable
        from s (s excluded), in breadth-first order '''
        res = []
        l = [(s,0)]
        # every node ever queued, s included, so no node is queued twice
        # (this also keeps a self-loop on s from being followed forever)
        seen = {s}
        while len(l) > 0:
            node, dist = l.pop(0)
            if node != s: res.append((node,dist))
            for elem in self.graph[node]:
                if elem not in seen:
                    seen.add(elem)
                    l.append((elem,dist+1))
        return res

    ## mean distances ignoring unreachable nodes
    def mean_distances(self):
        ''' Returns a tuple (mean distance over all ordered pairs of distinct
        nodes where the second is reachable from the first, fraction of the
        n*(n-1) ordered pairs that are reachable).
        Returns (0.0, 0.0) when no node can reach any other node. '''
        tot = 0
        num_reachable = 0
        for k in self.graph:
            distsk = self.reachable_with_dist(k)
            for _, dist in distsk:
                tot += dist
            num_reachable += len(distsk)
        if num_reachable == 0:
            return 0.0, 0.0
        meandist = float(tot) / num_reachable
        n = len(self.get_nodes())
        return meandist, float(num_reachable)/((n-1)*n)

    def closeness_centrality(self, node):
        ''' Number of nodes reachable from node divided by the sum of their
        distances; 0.0 when nothing is reachable '''
        dist = self.reachable_with_dist(node)
        if len(dist)==0: return 0.0
        s = 0.0
        for d in dist: s += d[1]
        return len(dist) / s

    def highest_closeness(self, top = 10):
        ''' Returns the (at most) top nodes with the highest closeness centrality, highest first '''
        cc = {}
        for k in self.graph:
            cc[k] = self.closeness_centrality(k)
        ord_cl = sorted(cc.items(), key=lambda x: x[1], reverse=True)
        return [node for node, _ in ord_cl[:top]]

    def betweenness_centrality(self, node):
        ''' Fraction of the ordered pairs (s, t), both different from node and
        with t reachable from s, whose path returned by shortest_path goes
        through node. Returns 0.0 when there is no such pair. '''
        total_sp = 0
        sps_with_node = 0
        for s in self.graph:
            for t in self.graph:
                if s != t and s != node and t != node:
                    sp = self.shortest_path(s, t)
                    if sp is not None:
                        total_sp += 1
                        if node in sp: sps_with_node += 1
        if total_sp == 0:
            return 0.0
        return sps_with_node / total_sp

    ## cycles
    def node_has_cycle (self, v):
        ''' Tests if a path starting at v leads back to v '''
        l = [v]
        visited = [v]
        while len(l) > 0:
            node = l.pop(0)
            for elem in self.graph[node]:
                if elem == v: return True
                elif elem not in visited:
                    l.append(elem)
                    visited.append(elem)
        return False

    def has_cycle(self):
        ''' Tests if any node of the graph lies on a cycle '''
        return any(self.node_has_cycle(v) for v in self.graph)

    ## clustering

    def clustering_coef(self, v):
        ''' Fraction of the ordered pairs of distinct nodes adjacent to v that
        are linked by an edge (in either direction); 0.0 with fewer than two
        adjacent nodes '''
        adjs = self.get_adjacents(v)
        if len(adjs) <=1: return 0.0
        ligs = 0
        for i in adjs:
            for j in adjs:
                if i != j:
                    if j in self.graph[i] or i in self.graph[j]:
                        ligs = ligs + 1
        return float(ligs)/(len(adjs)*(len(adjs)-1))

    def all_clustering_coefs(self):
        ''' Returns a dictionary node -> clustering coefficient '''
        return {k: self.clustering_coef(k) for k in self.graph}

    def mean_clustering_coef(self):
        ''' Mean clustering coefficient over all nodes; 0.0 for a graph without nodes '''
        ccs = self.all_clustering_coefs()
        if not ccs:
            return 0.0
        return sum(ccs.values()) / float(len(ccs))

    def mean_clustering_perdegree(self, deg_type = "inout"):
        ''' Returns a dictionary degree -> mean clustering coefficient of the
        nodes having that degree '''
        degs = self.all_degrees(deg_type)
        ccs = self.all_clustering_coefs()
        degs_k = {}
        for k in degs:
            degs_k.setdefault(degs[k], []).append(k)
        ck = {}
        for k in degs_k:
            tot = 0
            for v in degs_k[k]: tot += ccs[v]
            ck[k] = float(tot) / len(degs_k[k])
        return ck

    ## Hamiltonian

    def check_if_valid_path(self, p):
        ''' Tests if p is a non-empty list of nodes of the graph in which each
        node is a successor of the previous one. An empty or None path is not valid. '''
        if not p: return False
        if p[0] not in self.graph: return False
        for i in range(1,len(p)):
            if p[i] not in self.graph or p[i] not in self.graph[p[i-1]]:
                return False
        return True

    def check_if_hamiltonian_path(self, p):
        ''' Tests if p is a valid path that goes through every node exactly once '''
        if not self.check_if_valid_path(p): return False
        to_visit = list(self.get_nodes())
        if len(p) != len(to_visit): return False
        for i in range(len(p)):
            if p[i] in to_visit: to_visit.remove(p[i])
            else: return False
        if not to_visit: return True
        else: return False

    def search_hamiltonian_path(self):
        ''' Tries every node as starting point; returns the first Hamiltonian
        path found, or None if there is none '''
        for ke in self.graph:
            p = self.search_hamiltonian_path_from_node(ke)
            if p is not None:
                return p
        return None

    def search_hamiltonian_path_from_node(self, start):
        ''' Backtracking search for a Hamiltonian path beginning at start;
        returns the path as a list of nodes, or None if there is none '''
        current = start
        # for each node on the current path: index of its next successor to try
        visited = {start:0}
        path = [start]
        while len(path) < len(self.get_nodes()):
            nxt_index = visited[current]
            if len(self.graph[current]) > nxt_index:
                nxtnode = self.graph[current][nxt_index]
                visited[current] += 1
                if nxtnode not in path:
                    path.append(nxtnode)
                    visited[nxtnode] = 0
                    current = nxtnode
            else:
                # every successor of current was tried: step back one node
                if len(path) > 1:
                    rmvnode = path.pop()
                    del visited[rmvnode]
                    current = path[-1]
                else: return None
        return path

    # Eulerian
    def check_balanced_node(self, node):
        ''' Tests if node has the same in-degree and out-degree '''
        return self.in_degree(node) == self.out_degree(node)

    def check_balanced_graph(self):
        ''' Tests if every node of the graph is balanced '''
        for n in self.graph:
            if not self.check_balanced_node(n): return False
        return True

    def check_nearly_balanced_graph(self):
        ''' Returns a tuple (a, b) where a is the single node with out-degree =
        in-degree + 1 and b the single node with in-degree = out-degree + 1,
        every other node being balanced. A position stays None when no such
        node exists; (None, None) is returned as soon as a node breaks the pattern. '''
        res = None, None
        for n in self.graph:
            indeg= self.in_degree(n)
            outdeg= self.out_degree(n)
            if indeg - outdeg == 1 and res[1] is None: res = res[0], n
            elif indeg - outdeg == -1 and res[0] is None: res = n, res[1]
            elif indeg == outdeg: pass
            else: return None, None
        return res

    def is_connected(self):
        ''' Tests if every node can reach all the other nodes '''
        total = len(self.graph) - 1
        for v in self.graph:
            reachable_v = self.reachable_bfs(v)
            if (len(reachable_v) < total): return False
        return True

    def eulerian_cycle(self):
        ''' Returns a cycle that uses every edge once, as a list of nodes whose
        first and last elements are the same node; None when the graph is not
        connected (is_connected) or not balanced (check_balanced_graph).
        The graph itself is not changed. '''
        if not self.is_connected() or not self.check_balanced_graph(): return None
        edges_visit = list(self.get_edges())
        res = []
        while edges_visit:
            # pick an unused edge; once part of the result exists, take one
            # leaving a node already in it so the new cycle can be spliced in
            pair = edges_visit[0]
            i = 1
            if res != []:
                while pair[0] not in res:
                    pair = edges_visit[i]
                    i = i + 1
            edges_visit.remove(pair)
            start, nxt = pair
            cycle = [start, nxt]
            # NOTE: nxt is rebound inside the for loop while the loop keeps
            # iterating the successor list it started with; the exact order of
            # these statements decides which cycle is produced, so it is kept.
            while nxt != start:
                for suc in self.graph[nxt]:
                    if (nxt, suc) in edges_visit:
                        pair = (nxt,suc)
                        nxt = suc
                        cycle.append(nxt)
                        edges_visit.remove(pair)
            if not res: res = cycle
            else:
                # insert the new cycle right after the first occurrence of its start node
                pos = res.index(cycle[0])
                res[pos+1:pos+1] = cycle[1:]
        return res

    def eulerian_path(self):
        ''' Returns a path that uses every edge once, as a list of nodes.
        Returns None when check_nearly_balanced_graph does not give both a
        start and an end node, or when no Eulerian cycle exists after joining
        them. The graph is left unchanged. '''
        unb = self.check_nearly_balanced_graph()
        if unb[0] is None or unb[1] is None: return None
        # temporarily close the path with an edge end -> start, then look for a cycle
        self.graph[unb[1]].append(unb[0])
        try:
            cycle = self.eulerian_cycle()
        finally:
            # take the temporary edge (last entry of that list) out again
            self.graph[unb[1]].pop()
        if cycle is None: return None
        # rotate the cycle so that it is cut at the temporary edge
        for i in range(len(cycle)-1):
            if cycle[i] == unb[1] and cycle[i+1] ==  unb[0]:
                break
        path = cycle[i+1:] + cycle[1:i+1]
        return path


def is_in_tuple_list(tl, val):
    ''' Tests if val is the first component of any pair in the list tl '''
    return any(val == first for first, _ in tl)

    
if __name__ == "__main__":
    # Demo: build a small directed graph and print its basic properties.
    gr = MyGraph()
    for vertex in (1, 2, 3, 4):
        gr.add_vertex(vertex)
    for origin, dest in ((1, 2), (2, 3), (3, 2), (3, 4), (4, 2)):
        gr.add_edge(origin, dest)
    print('Graph 1: ')
    print()
    gr.print_graph()
    print('size: ',gr.size())

    # neighbourhood of node 2
    print()
    print('get_successors: ',gr.get_successors(2))
    print('get_predecessors: ', gr.get_predecessors(2))
    print('get_adjacents: ',gr.get_adjacents(2))

    # degrees of node 2
    print()
    print ('in_degree: ', gr.in_degree(2))
    print ('out_degree: ', gr.out_degree(2))
    print ('degree: ',gr.degree(2))

    # degrees of every node, one line per degree type
    print()
    for label, deg_type in (('inout: ', "inout"), ('in: ', "in"), ('out: ', "out")):
        print(label, gr.all_degrees(deg_type))

Graph 1: 

1  ->  [2]
2  ->  [3]
3  ->  [2, 4]
4  ->  [2]
size:  (4, 5)

get_successors:  [3]
get_predecessors:  [1, 3, 4]
get_adjacents:  [1, 3, 4]

in_degree:  3
out_degree:  1
degree:  3

inout:  {1: 1, 2: 3, 3: 2, 4: 2}
in:  {1: 0, 2: 3, 3: 1, 4: 1}
out:  {1: 1, 2: 1, 3: 2, 4: 1}


In [25]:
def Graph2():
    ''' Demo of searches, distances, cycles and metrics.

    Builds a second graph (gr2) for the search and distance examples; the
    cycle test also uses, and the metrics use only, the module-level graph gr,
    which must already exist when this function is called. '''
    adjacency = {1:[2,3,4], 2:[5,6],3:[6,8],4:[8],5:[7],6:[],7:[],8:[]}
    gr2 = MyGraph(adjacency)
    print('Graph2: ')
    print()
    gr2.print_graph()
    print()
    print('bfs: ',gr2.reachable_bfs(1))
    print('dfs: ',gr2.reachable_dfs(1))

    print()
    print('Distance shortest_path')
    # distance, then path, for each (origin, destination) pair
    for origin, dest in ((1, 7), (1, 8), (6, 1)):
        print(gr2.distance(origin, dest))
        print(gr2.shortest_path(origin, dest))

    print()
    print(gr2.reachable_with_dist(1))

    print()
    print('cycle')
    for graph in (gr, gr2):
        print(graph.has_cycle())

    print()
    print('mean_degree: ',gr.mean_degree())
    print('prob_degree: ',gr.prob_degree())
    print('mean_distances: ',gr.mean_distances())
    for node in (1, 2):
        print('clustering_coef %d : ' % node,gr.clustering_coef(node))

Graph2()

Graph2: 

1  ->  [2, 3, 4]
2  ->  [5, 6]
3  ->  [6, 8]
4  ->  [8]
5  ->  [7]
6  ->  []
7  ->  []
8  ->  []

bfs:  [2, 3, 4, 5, 6, 8, 7]
dfs:  [2, 5, 7, 6, 3, 8, 4]

Distance shortest_path
3
[1, 2, 5, 7]
2
[1, 3, 8]
None
None

[(2, 1), (3, 1), (4, 1), (5, 2), (6, 2), (8, 2), (7, 3)]

cycle
True
False

mean_degree:  2.0
prob_degree:  {1: 0.25, 3: 0.25, 2: 0.5}
mean_distances:  (1.5555555555555556, 0.75)
clustering_coef 1 :  0.0
clustering_coef 2 :  0.3333333333333333


## OverlapGraph

In [3]:
class OverlapGraph(MyGraph):
    ''' Overlap graph of a list of sequences (fragments).

    There is an edge from one fragment to another when the suffix of the first
    (all but its first symbol) equals the prefix of the second (all but its
    last symbol). With reps = True each fragment becomes a node labelled
    "<sequence>-<position in the list, starting at 1>", so that repeated
    fragments get distinct nodes; with reps = False the sequences themselves
    are the nodes. '''

    def __init__(self, frags, reps = True):
        ''' Builds the graph from the list of fragments frags; reps selects
        the node labelling described on the class '''
        MyGraph.__init__(self, {})
        if reps: self.create_overlap_graph_with_reps(frags)
        else: self.create_overlap_graph(frags)
        self.reps = reps

    ## create overlap graph from list of sequences (fragments)
    def create_overlap_graph(self, frags):
        ''' Fills the graph using the sequences themselves as nodes '''
        for seq in frags:
            self.add_vertex(seq)
        for seq in frags:
            suf = suffix(seq)
            for seq2 in frags:
                if prefix(seq2) == suf:
                    self.add_edge(seq, seq2)

    def create_overlap_graph_with_reps(self, frags):
        ''' Fills the graph using "<sequence>-<number>" labels as nodes, one
        node per fragment, numbered from 1 in the order of frags '''
        for idnum, seq in enumerate(frags, 1):
            self.add_vertex(seq + "-" + str(idnum))
        for idnum, seq in enumerate(frags, 1):
            suf = suffix(seq)
            for seq2 in frags:
                if prefix(seq2) == suf:
                    # link to every node holding a copy of the overlapping sequence
                    for x in self.get_instances(seq2):
                        self.add_edge(seq + "-" + str(idnum), x)

    def get_instances(self, seq):
        ''' Returns the nodes whose sequence is exactly seq '''
        # compare with the part of the label before the last "-"; a substring
        # test would also return nodes that merely contain seq
        # (e.g. "AT" is contained in "CAT-2")
        return [k for k in self.graph if k.rsplit("-", 1)[0] == seq]

    def get_seq(self, node):
        ''' Returns the sequence held by node (its label without the "-<number>"
        ending when reps is set); None if node is not in the graph '''
        if node not in self.graph: return None
        if self.reps: return node.rsplit("-", 1)[0]
        else: return node

    def seq_from_path(self, path):
        ''' Returns the sequence spelled by a Hamiltonian path: the sequence of
        the first node followed by the last symbol of each following node.
        Returns None if path is not a Hamiltonian path of the graph. '''
        if not self.check_if_hamiltonian_path(path): return None
        seq = self.get_seq(path[0])
        for node in path[1:]:
            seq += self.get_seq(node)[-1]
        return seq
                    
# auxiliary
def composition(k, seq):
    ''' Returns the sorted list of all substrings of length k of seq (repeats included) '''
    kmers = [seq[start:start + k] for start in range(len(seq) - k + 1)]
    return sorted(kmers)
    
def suffix (seq):
    ''' Returns seq without its first element '''
    tail = seq[1:]
    return tail
    
def prefix(seq):
    ''' Returns seq without its last element '''
    head = seq[:-1]
    return head

## Test

In [26]:
def test1():
    ''' Prints a sequence and its composition in substrings of length 3 '''
    seq, k = "CAATCATGATG", 3
    print('seq: ', seq)
    kmers = composition(k, seq)
    print('comp: ',kmers)

test1()

seq:  CAATCATGATG
comp:  ['AAT', 'ATC', 'ATG', 'ATG', 'CAA', 'CAT', 'GAT', 'TCA', 'TGA']


In [30]:
def test2():
    ''' Builds the overlap graph of five fragments, with the sequences
    themselves as nodes, and prints it '''
    reads = ["ACC", "ATA", "CAT", "CCA", "TAA"]
    graph = OverlapGraph(reads, reps=False)
    print('OverlapGraph: False')
    print()
    graph.print_graph()

test2()

OverlapGraph: False

ACC  ->  ['CCA']
ATA  ->  ['TAA']
CAT  ->  ['ATA']
CCA  ->  ['CAT']
TAA  ->  []


In [44]:
def test3():
    ''' Builds an overlap graph with numbered nodes (repeated fragments
    allowed), prints it, and checks a given path against it '''
    reads = [ "ATA", "ACC", "ATG", "ATT", "CAT", "CAT", "CAT", "CCA", "GCA", "GGC", "TAA", "TCA", "TGG", "TTC", "TTT"]
    graph = OverlapGraph(reads, reps=True)
    print('OverlapGraph: True')
    print()
    graph.print_graph()
    print()
    path = ['ACC-2', 'CCA-8', 'CAT-5', 'ATG-3', 'TGG-13', 'GGC-10', 'GCA-9', 'CAT-6', 'ATT-4', 'TTT-15', 'TTC-14', 'TCA-12', 'CAT-7', 'ATA-1', 'TAA-11']

    print('\033[2;31;43m path: \033[0;0m',path)
    print()
    # sequence spelled by the path, then the two path checks
    print(graph.seq_from_path(path))

    is_valid = graph.check_if_valid_path(path)
    print('\033[2;31;43m valid_path: \033[0;0m ',is_valid)
    is_hamiltonian = graph.check_if_hamiltonian_path(path)
    print ('\033[2;31;43m hamiltonian_path: \033[0;0m ',is_hamiltonian)

test3()

OverlapGraph: True

ATA-1  ->  ['TAA-11']
ACC-2  ->  ['CCA-8']
ATG-3  ->  ['TGG-13']
ATT-4  ->  ['TTC-14', 'TTT-15']
CAT-5  ->  ['ATA-1', 'ATG-3', 'ATT-4']
CAT-6  ->  ['ATA-1', 'ATG-3', 'ATT-4']
CAT-7  ->  ['ATA-1', 'ATG-3', 'ATT-4']
CCA-8  ->  ['CAT-5', 'CAT-6', 'CAT-7']
GCA-9  ->  ['CAT-5', 'CAT-6', 'CAT-7']
GGC-10  ->  ['GCA-9']
TAA-11  ->  []
TCA-12  ->  ['CAT-5', 'CAT-6', 'CAT-7']
TGG-13  ->  ['GGC-10']
TTC-14  ->  ['TCA-12']
TTT-15  ->  ['TTC-14', 'TTT-15']

[2;31;43m path: [0;0m ['ACC-2', 'CCA-8', 'CAT-5', 'ATG-3', 'TGG-13', 'GGC-10', 'GCA-9', 'CAT-6', 'ATT-4', 'TTT-15', 'TTC-14', 'TCA-12', 'CAT-7', 'ATA-1', 'TAA-11']

ACCATGGCATTTCATAA
[2;31;43m valid_path: [0;0m  True
[2;31;43m hamiltonian_path: [0;0m  True


In [49]:
def test4():
    ''' Builds an overlap graph with numbered nodes, searches it for a
    Hamiltonian path and prints the path and the sequence it spells '''
    reads = [ "ATA", "ACC", "ATG", "ATT", "CAT", "CAT", "CAT", "CCA", "GCA", "GGC", "TAA", "TCA", "TGG", "TTC", "TTT"]
    graph = OverlapGraph(reads, reps=True)
    graph.print_graph()
    print()
    found = graph.search_hamiltonian_path()
    print('\033[2;31;43m path: \033[0;0m', found)
    is_hamiltonian = graph.check_if_hamiltonian_path(found)
    print ('\033[2;31;43m  hamilton path: \033[0;0m ',is_hamiltonian)
    print (graph.seq_from_path(found))

test4()

ATA-1  ->  ['TAA-11']
ACC-2  ->  ['CCA-8']
ATG-3  ->  ['TGG-13']
ATT-4  ->  ['TTC-14', 'TTT-15']
CAT-5  ->  ['ATA-1', 'ATG-3', 'ATT-4']
CAT-6  ->  ['ATA-1', 'ATG-3', 'ATT-4']
CAT-7  ->  ['ATA-1', 'ATG-3', 'ATT-4']
CCA-8  ->  ['CAT-5', 'CAT-6', 'CAT-7']
GCA-9  ->  ['CAT-5', 'CAT-6', 'CAT-7']
GGC-10  ->  ['GCA-9']
TAA-11  ->  []
TCA-12  ->  ['CAT-5', 'CAT-6', 'CAT-7']
TGG-13  ->  ['GGC-10']
TTC-14  ->  ['TCA-12']
TTT-15  ->  ['TTC-14', 'TTT-15']

[2;31;43m path: [0;0m ['ACC-2', 'CCA-8', 'CAT-5', 'ATG-3', 'TGG-13', 'GGC-10', 'GCA-9', 'CAT-6', 'ATT-4', 'TTT-15', 'TTC-14', 'TCA-12', 'CAT-7', 'ATA-1', 'TAA-11']
[2;31;43m  hamilton path: [0;0m  True
ACCATGGCATTTCATAA


In [55]:
def test5():
    ''' Splits a sequence into its fragments of length 3, builds their overlap
    graph and prints the sequence spelled by a Hamiltonian path of it '''
    original = "CAATCATGATGATGATC"
    reads = composition(3, original)
    print ('\033[2;31;43m frags: \033[0;0m', reads)
    graph = OverlapGraph(reads, reps=True)
    print()
    graph.print_graph()
    print()
    found = graph.search_hamiltonian_path()
    print('\033[2;31;43m hamilton path: \033[0;0m ',found)
    print()
    rebuilt = graph.seq_from_path(found)
    print(rebuilt)

test5()

[2;31;43m frags: [0;0m ['AAT', 'ATC', 'ATC', 'ATG', 'ATG', 'ATG', 'CAA', 'CAT', 'GAT', 'GAT', 'GAT', 'TCA', 'TGA', 'TGA', 'TGA']

AAT-1  ->  ['ATC-2', 'ATC-3', 'ATG-4', 'ATG-5', 'ATG-6']
ATC-2  ->  ['TCA-12']
ATC-3  ->  ['TCA-12']
ATG-4  ->  ['TGA-13', 'TGA-14', 'TGA-15']
ATG-5  ->  ['TGA-13', 'TGA-14', 'TGA-15']
ATG-6  ->  ['TGA-13', 'TGA-14', 'TGA-15']
CAA-7  ->  ['AAT-1']
CAT-8  ->  ['ATC-2', 'ATC-3', 'ATG-4', 'ATG-5', 'ATG-6']
GAT-9  ->  ['ATC-2', 'ATC-3', 'ATG-4', 'ATG-5', 'ATG-6']
GAT-10  ->  ['ATC-2', 'ATC-3', 'ATG-4', 'ATG-5', 'ATG-6']
GAT-11  ->  ['ATC-2', 'ATC-3', 'ATG-4', 'ATG-5', 'ATG-6']
TCA-12  ->  ['CAA-7', 'CAT-8']
TGA-13  ->  ['GAT-9', 'GAT-10', 'GAT-11']
TGA-14  ->  ['GAT-9', 'GAT-10', 'GAT-11']
TGA-15  ->  ['GAT-9', 'GAT-10', 'GAT-11']

[2;31;43m hamilton path: [0;0m  ['CAA-7', 'AAT-1', 'ATC-2', 'TCA-12', 'CAT-8', 'ATG-4', 'TGA-13', 'GAT-9', 'ATG-5', 'TGA-14', 'GAT-10', 'ATG-6', 'TGA-15', 'GAT-11', 'ATC-3']

CAATCATGATGATGATC


## DeBruijnGraph

In [11]:
class DeBruijnGraph (MyGraph):
    ''' Graph built from a list of fragments: each fragment adds one edge from
    its prefix (all but the last symbol) to its suffix (all but the first
    symbol). The same edge may be present several times. '''

    def __init__(self, frags):
        ''' Starts from an empty graph and adds one edge per fragment in frags '''
        super().__init__({})
        self.create_deBruijn_graph(frags)

    def add_edge(self, o, d):
        ''' Adds the edge o -> d, creating missing nodes; unlike the base
        class, an edge already present is added again '''
        for node in (o, d):
            if node not in self.graph:
                self.add_vertex(node)
        self.graph[o].append(d)

    def in_degree(self, v):
        ''' Number of edges arriving at v, repeated edges counted each time '''
        return sum(successors.count(v) for successors in self.graph.values())

    def create_deBruijn_graph(self, frags):
        ''' Adds, for each fragment, its suffix and its prefix as nodes (in
        that order) and the edge prefix -> suffix '''
        for fragment in frags:
            tail = suffix(fragment)
            self.add_vertex(tail)
            head = prefix(fragment)
            self.add_vertex(head)
            self.add_edge(head, tail)

    def seq_from_path(self, path):
        ''' Returns the first node of path followed by the last symbol of each
        of the following nodes '''
        return path[0] + "".join(node[-1] for node in path[1:])
    
def suffix (seq):
    ''' Returns seq without its first element '''
    tail = seq[1:]
    return tail
    
def prefix(seq):
    ''' Returns seq without its last element '''
    head = seq[:-1]
    return head

def composition(k, seq):
    ''' Returns the sorted list of all substrings of length k of seq (repeats included) '''
    last_start = len(seq) - k
    return sorted(seq[start:start + k] for start in range(last_start + 1))

In [58]:
def test6():
    ''' Builds the DeBruijn graph of a list of fragments, prints it, and
    prints its unbalanced node pair and its Eulerian path '''
    reads = [ "ATA", "ACC", "ATG", "ATT", "CAT", "CAT", "CAT", "CCA", "GCA", "GGC", "TAA", "TCA", "TGG", "TTC", "TTT"]
    graph = DeBruijnGraph(reads)
    graph.print_graph()
    print()
    unbalanced = graph.check_nearly_balanced_graph()
    print('\033[2;31;43m balanced_graph: \033[0;0m ',unbalanced)
    euler = graph.eulerian_path()
    print('\033[2;31;43m eulerian_path:  \033[0;0m ',euler)

test6()

TA  ->  ['AA']
AT  ->  ['TA', 'TG', 'TT']
CC  ->  ['CA']
AC  ->  ['CC']
TG  ->  ['GG']
TT  ->  ['TC', 'TT']
CA  ->  ['AT', 'AT', 'AT']
GC  ->  ['CA']
GG  ->  ['GC']
AA  ->  []
TC  ->  ['CA']

[2;31;43m balanced_graph: [0;0m  ('AC', 'AA')
[2;31;43m eulerian_path:  [0;0m  ['AC', 'CC', 'CA', 'AT', 'TT', 'TT', 'TC', 'CA', 'AT', 'TG', 'GG', 'GC', 'CA', 'AT', 'TA', 'AA']


In [62]:
def test7():
    ''' Splits a sequence into its fragments of length 3, builds their
    DeBruijn graph and prints the sequence spelled by its Eulerian path '''
    original = "ATGCAATGGTCTG"
    reads = composition(3, original)
    graph = DeBruijnGraph(reads)
    graph.print_graph()
    print()
    unbalanced = graph.check_nearly_balanced_graph()
    print('balanced_graph: ',unbalanced)
    print()
    euler = graph.eulerian_path()
    print('eulerian_path: ',euler)
    print (graph.seq_from_path(euler))

test7()

AT  ->  ['TG', 'TG']
AA  ->  ['AT']
TG  ->  ['GC', 'GG']
CA  ->  ['AA']
CT  ->  ['TG']
GC  ->  ['CA']
GT  ->  ['TC']
GG  ->  ['GT']
TC  ->  ['CT']

balanced_graph:  ('AT', 'TG')

eulerian_path:  ['AT', 'TG', 'GC', 'CA', 'AA', 'AT', 'TG', 'GG', 'GT', 'TC', 'CT', 'TG']
ATGCAATGGTCTG
