In [33]:
from Bio import SeqIO
import Bio
from collections import defaultdict

class Vertex:
    """A graph node identified by a sequence string.

    A freshly created vertex counts as having been seen once; every further
    sighting is recorded through ``increase_coverage``.
    """

    def __init__(self, seq):
        """Create a vertex for ``seq`` with coverage 1 and no edges."""
        self.seq = seq
        # Number of times this vertex has been observed (creation counts).
        self.coverage = 1
        # Edge containers, filled in by the code that builds the graph.
        self.in_edges = dict()
        self.out_edges = dict()

    def increase_coverage(self):
        """Record one more observation of this vertex."""
        self.coverage = self.coverage + 1

class Edge:
    """A directed link between two sequences ``k1`` -> ``k2``."""

    def __init__(self,k1,k2):
        """Build the edge sequence from ``k1`` plus the last symbol of ``k2``.

        Coverage starts at 0 and is filled in later by ``calc_coverage``.
        """
        last_symbol = k2[-1]
        self.seq = k1 + last_symbol
        # NOTE(review): fixed at 2 and never read in the visible code;
        # presumably the number of vertices merged into this edge -- confirm.
        self.n = 2
        self.coverage = 0

    def calc_coverage(self,c1,c2):
        """Set the edge coverage to the arithmetic mean of ``c1`` and ``c2``."""
        total = c1 + c2
        self.coverage = total / 2


class Graph:
    """Graph of overlapping k-mers (de Bruijn-style) built from reads.

    Vertices are the k-mers observed in the reads; an edge links every pair
    of k-mers that appear at consecutive positions of some read.
    """

    def __init__(self,k):
        """Create an empty graph that splits reads into k-mers of length ``k``."""
        # Maps k-mer string -> Vertex.
        self.vertices = {}
        self.k = k

    def _touch_vertex(self, kmer):
        """Count one observation of ``kmer``, creating its vertex on first sight."""
        vertex = self.vertices.get(kmer)
        if vertex is None:
            # Vertex starts with coverage 1, so creation is the first count.
            self.vertices[kmer] = Vertex(kmer)
        else:
            vertex.increase_coverage()

    def add_read(self,read):
        """Add every k-mer of ``read`` and the edges between neighbours.

        Reads shorter than ``self.k`` are ignored.  When the same pair of
        consecutive k-mers is seen again, the stored edge is replaced by a
        fresh one (edges carry no per-read state at this stage).
        """
        # Use the instance's k, not a module-level global, so the graph works
        # regardless of what (if anything) the caller named its variable.
        k = self.k
        read_lng = len(read)
        if read_lng < k:
            return

        kmer = read[:k]
        self._touch_vertex(kmer)

        for start in range(1, read_lng - k + 1):
            next_kmer = read[start:start + k]
            self._touch_vertex(next_kmer)

            new_edge = Edge(kmer, next_kmer)
            # The same Edge object is registered on both endpoints, wrapped
            # in a one-element list (readers access it as ``[0]``).
            self.vertices[next_kmer].in_edges[kmer] = [new_edge]
            self.vertices[kmer].out_edges[next_kmer] = [new_edge]

            kmer = next_kmer

    def calc_init_edge_coverage(self):
        """Set each edge's coverage from the coverage of its two end vertices."""
        for vertex in self.vertices.values():
            for next_kmer, edges in vertex.out_edges.items():
                edges[0].calc_coverage(
                    vertex.coverage, self.vertices[next_kmer].coverage
                )

    def Graph_viz(self, PathToWrite, full):
        """Print the graph as DOT source and write it to ``PathToWrite``.

        With ``full`` true, nodes are labelled with their k-mer and edges with
        their sequence; otherwise nodes show coverage and edges show sequence
        length and coverage.  The built graph object is kept in ``self.graph``.
        """
        # NOTE(review): ``Digraph`` is not imported in the visible code;
        # presumably ``graphviz.Digraph`` imported elsewhere -- confirm.
        self.graph = Digraph(comment='assembly')

        for kmer, vertex in self.vertices.items():
            if full:
                node_label = '{}'.format(kmer)
            else:
                # Read coverage from this graph's own vertex rather than
                # from a global graph variable.
                node_label = 'coverage={}'.format(vertex.coverage)
            self.graph.node(kmer, label=node_label)

            for next_kmer, edges in vertex.out_edges.items():
                edge = edges[0]
                if full:
                    edge_label = '{}'.format(edge.seq)
                else:
                    edge_label = 'len={},cov={}'.format(len(edge.seq), edge.coverage)
                self.graph.edge(kmer, next_kmer, label=edge_label)

        print(self.graph.source)
        with open(PathToWrite, 'w') as handle:
            handle.write(self.graph.source)
        
       

if __name__ == '__main__':
    # Script entry point: build the k-mer graph from a FASTA file, print a
    # per-vertex summary, then dump the graph in DOT form.

    # Run settings.
    dataset = 'Загрузки/hw_4_5_dataset.fasta'
    k = 5
    direction = 'reverse'

    my_graph = Graph(k)

    with open(dataset, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # In 'reverse' mode the reverse complement of each record is
            # fed to the graph instead of the record itself.
            source = record.reverse_complement() if direction == 'reverse' else record
            read = str(source.seq)
            my_graph.add_read(read)

    my_graph.calc_init_edge_coverage()

    # Text summary: each vertex followed by its outgoing, then incoming,
    # neighbour k-mers.
    for v, vertex in my_graph.vertices.items():
        print('Vertex: {}, coverage: {}'.format(v, vertex.coverage))
        for e in vertex.out_edges:
            print('-> Out edge: {}'.format(e))
        for e in vertex.in_edges:
            print('-> In edge: {}'.format(e))

    my_graph.Graph_viz('Graph.dot', full=False)

Vertex: GTGTA, coverage: 286
-> Out edge: TGTAG
-> Out edge: TGTAT
-> Out edge: TGTAC
-> Out edge: TGTAA
-> In edge: GGTGT
-> In edge: TGTGT
-> In edge: AGTGT
-> In edge: CGTGT
Vertex: GCTCC, coverage: 457
-> Out edge: CTCCA
-> Out edge: CTCCC
-> Out edge: CTCCG
-> Out edge: CTCCT
-> In edge: TGCTC
-> In edge: GGCTC
-> In edge: AGCTC
-> In edge: CGCTC
Vertex: AGTGC, coverage: 286
-> Out edge: GTGCT
-> Out edge: GTGCG
-> Out edge: GTGCC
-> Out edge: GTGCA
-> In edge: TAGTG
-> In edge: CAGTG
-> In edge: GAGTG
-> In edge: AAGTG
Vertex: AACTC, coverage: 199
-> Out edge: ACTCG
-> Out edge: ACTCA
-> Out edge: ACTCT
-> Out edge: ACTCC
-> In edge: AAACT
-> In edge: TAACT
-> In edge: CAACT
-> In edge: GAACT
Vertex: ATATC, coverage: 6
-> Out edge: TATCT
-> Out edge: TATCC
-> In edge: AATAT
-> In edge: CATAT
-> In edge: TATAT
-> In edge: GATAT
Vertex: ACGCG, coverage: 101
-> Out edge: CGCGA
-> Out edge: CGCGG
-> Out edge: CGCGT
-> In edge: GACGC
-> In edge: CACGC
-> In edge: TACGC
-> In edge: AAC

-> In edge: GATCC
Vertex: TCGTA, coverage: 104
-> Out edge: CGTAC
-> Out edge: CGTAG
-> Out edge: CGTAA
-> In edge: CTCGT
-> In edge: GTCGT
-> In edge: TTCGT
-> In edge: ATCGT
Vertex: TAAAC, coverage: 293
-> Out edge: AAACC
-> Out edge: AAACT
-> Out edge: AAACA
-> Out edge: AAACG
-> In edge: TTAAA
-> In edge: ATAAA
-> In edge: GTAAA
-> In edge: CTAAA
Vertex: GTCCC, coverage: 560
-> Out edge: TCCCA
-> Out edge: TCCCC
-> Out edge: TCCCG
-> Out edge: TCCCT
-> In edge: CGTCC
-> In edge: TGTCC
-> In edge: AGTCC
-> In edge: GGTCC
Vertex: TTTCA, coverage: 11
-> Out edge: TTCAG
-> Out edge: TTCAA
-> Out edge: TTCAT
-> Out edge: TTCAC
-> In edge: GTTTC
-> In edge: TTTTC
-> In edge: CTTTC
Vertex: TCCCA, coverage: 565
-> Out edge: CCCAC
-> Out edge: CCCAT
-> Out edge: CCCAG
-> Out edge: CCCAA
-> In edge: CTCCC
-> In edge: ATCCC
-> In edge: GTCCC
-> In edge: TTCCC
Vertex: AGGAC, coverage: 658
-> Out edge: GGACC
-> Out edge: GGACA
-> Out edge: GGACT
-> Out edge: GGACG
-> In edge: TAGGA
-> In edge: 

-> In edge: GCAGA
Vertex: CTACA, coverage: 197
-> Out edge: TACAC
-> Out edge: TACAT
-> Out edge: TACAG
-> Out edge: TACAA
-> In edge: CCTAC
-> In edge: TCTAC
-> In edge: ACTAC
-> In edge: GCTAC
Vertex: ACAAT, coverage: 208
-> Out edge: CAATC
-> Out edge: CAATT
-> Out edge: CAATG
-> Out edge: CAATA
-> In edge: CACAA
-> In edge: AACAA
-> In edge: TACAA
-> In edge: GACAA
Vertex: GCGAC, coverage: 274
-> Out edge: CGACA
-> Out edge: CGACG
-> Out edge: CGACT
-> Out edge: CGACC
-> In edge: TGCGA
-> In edge: CGCGA
-> In edge: GGCGA
-> In edge: AGCGA
Vertex: TCAGG, coverage: 195
-> Out edge: CAGGC
-> Out edge: CAGGT
-> Out edge: CAGGG
-> Out edge: CAGGA
-> In edge: TTCAG
-> In edge: CTCAG
-> In edge: GTCAG
-> In edge: ATCAG
Vertex: ACGTC, coverage: 98
-> Out edge: CGTCT
-> Out edge: CGTCG
-> Out edge: CGTCC
-> Out edge: CGTCA
-> In edge: CACGT
-> In edge: AACGT
-> In edge: TACGT
-> In edge: GACGT
Vertex: TGAAC, coverage: 526
-> Out edge: GAACT
-> Out edge: GAACC
-> Out edge: GAACG
-> Out edge:

-> In edge: AGAGA
Vertex: TAGTC, coverage: 376
-> Out edge: AGTCG
-> Out edge: AGTCC
-> Out edge: AGTCA
-> Out edge: AGTCT
-> In edge: GTAGT
-> In edge: CTAGT
-> In edge: TTAGT
-> In edge: ATAGT
Vertex: GTGCT, coverage: 204
-> Out edge: TGCTC
-> Out edge: TGCTT
-> Out edge: TGCTA
-> Out edge: TGCTG
-> In edge: CGTGC
-> In edge: AGTGC
-> In edge: TGTGC
-> In edge: GGTGC
Vertex: AGGAG, coverage: 661
-> Out edge: GGAGC
-> Out edge: GGAGT
-> Out edge: GGAGG
-> Out edge: GGAGA
-> In edge: TAGGA
-> In edge: AAGGA
-> In edge: GAGGA
-> In edge: CAGGA
Vertex: GGCGA, coverage: 356
-> Out edge: GCGAT
-> Out edge: GCGAG
-> Out edge: GCGAA
-> Out edge: GCGAC
-> In edge: GGGCG
-> In edge: TGGCG
-> In edge: AGGCG
-> In edge: CGGCG
Vertex: CGGCG, coverage: 103
-> Out edge: GGCGG
-> Out edge: GGCGT
-> Out edge: GGCGC
-> Out edge: GGCGA
-> In edge: CCGGC
-> In edge: TCGGC
-> In edge: ACGGC
-> In edge: GCGGC
Vertex: AATGC, coverage: 194
-> Out edge: ATGCG
-> Out edge: ATGCC
-> Out edge: ATGCT
-> Out edge

Vertex: GTTAC, coverage: 8
-> Out edge: TTACG
-> Out edge: TTACC
-> Out edge: TTACA
-> In edge: TGTTA
-> In edge: GGTTA
-> In edge: CGTTA
-> In edge: AGTTA
Vertex: GTGAG, coverage: 479
-> Out edge: TGAGG
-> Out edge: TGAGA
-> Out edge: TGAGT
-> Out edge: TGAGC
-> In edge: CGTGA
-> In edge: AGTGA
-> In edge: GGTGA
-> In edge: TGTGA
Vertex: TTTAA, coverage: 386
-> Out edge: TTAAC
-> Out edge: TTAAT
-> Out edge: TTAAG
-> Out edge: TTAAA
-> In edge: GTTTA
-> In edge: CTTTA
-> In edge: ATTTA
-> In edge: TTTTA
Vertex: GTAAA, coverage: 929
-> Out edge: TAAAT
-> Out edge: TAAAG
-> Out edge: TAAAA
-> Out edge: TAAAC
-> In edge: GGTAA
-> In edge: CGTAA
-> In edge: TGTAA
-> In edge: AGTAA
Vertex: TACTA, coverage: 184
-> Out edge: ACTAA
-> Out edge: ACTAG
-> Out edge: ACTAC
-> In edge: CTACT
-> In edge: TTACT
-> In edge: ATACT
-> In edge: GTACT
Vertex: ATCGC, coverage: 7
-> Out edge: TCGCT
-> Out edge: TCGCA
-> In edge: AATCG
-> In edge: CATCG
-> In edge: GATCG
Vertex: TACAT, coverage: 197
-> Out 

// assembly
digraph {
	GTGTA [label="coverage=286"]
	GTGTA -> TGTAG [label="len=6,cov=235.5"]
	GTGTA -> TGTAT [label="len=6,cov=333.5"]
	GTGTA -> TGTAC [label="len=6,cov=189.0"]
	GTGTA -> TGTAA [label="len=6,cov=423.5"]
	GCTCC [label="coverage=457"]
	GCTCC -> CTCCA [label="len=6,cov=552.0"]
	GCTCC -> CTCCC [label="len=6,cov=373.5"]
	GCTCC -> CTCCG [label="len=6,cov=369.5"]
	GCTCC -> CTCCT [label="len=6,cov=415.5"]
	AGTGC [label="coverage=286"]
	AGTGC -> GTGCT [label="len=6,cov=245.0"]
	AGTGC -> GTGCG [label="len=6,cov=333.5"]
	AGTGC -> GTGCC [label="len=6,cov=151.0"]
	AGTGC -> GTGCA [label="len=6,cov=330.0"]
	AACTC [label="coverage=199"]
	AACTC -> ACTCG [label="len=6,cov=103.0"]
	AACTC -> ACTCA [label="len=6,cov=151.0"]
	AACTC -> ACTCT [label="len=6,cov=264.5"]
	AACTC -> ACTCC [label="len=6,cov=289.0"]
	ATATC [label="coverage=6"]
	ATATC -> TATCT [label="len=6,cov=54.0"]
	ATATC -> TATCC [label="len=6,cov=53.0"]
	ACGCG [label="coverage=101"]
	ACGCG -> CGCGA [label="len=6,cov=56.5"]
	ACGC