In [1]:
import numpy as np
from Bio import SeqIO
import os

In [2]:
class DBG:
    """De Bruijn graph whose nodes are k-mers and whose edges are (k+1)-mers.

    ``self.nodes`` maps a k-mer string to its ``Node``; ``self.edges`` maps the
    (k+1)-mer an edge was created from to its ``Edge``. After ``condense`` an
    edge keeps that original key while its ``seq`` may be much longer.
    """

    def __init__(self, k, file=None, strategy=None):
        """Create an empty graph, or build one from a sequence file.

        Args:
            k: node (k-mer) length.
            file: optional path to a FASTA/FASTQ file. When given, the graph
                is built from it and condensed.
            strategy: optional simplification applied after the first
                condensation: ``"tails"`` runs ``remove_tails``, ``"lowcov"``
                runs ``remove_lowcov``; each is followed by another
                ``condense``. Any other value applies no simplification.
        """
        self.k = k
        self.nodes = dict()  # kmer -> Node
        self.edges = dict()  # kkmer -> Edge (kk == k+1)

        if file:
            seqs = self.load_data(file)
            self.create(seqs)
            self.condense()
            if strategy == "tails":
                self.remove_tails()
                self.condense()
            elif strategy == "lowcov":
                self.remove_lowcov()
                self.condense()

    def load_data(self, file):
        """Read every record of ``file`` plus its reverse complement.

        The format is chosen from the file extension (matched
        case-insensitively): ``.fasta``/``.fa``/``.fna`` or ``.fastq``/``.fq``.

        Returns:
            A set of sequence strings.

        Raises:
            TypeError: if the extension is not one of the supported ones.
        """
        _, fileext = os.path.splitext(file)
        normalized_ext = fileext.lower()
        if normalized_ext in (".fasta", ".fa", ".fna"):
            ext = 'fasta'
        elif normalized_ext in (".fastq", ".fq"):
            ext = 'fastq'
        else:
            raise TypeError("Unsupported file type:" + fileext)

        # NOTE(review): a set collapses identical reads, so duplicated reads
        # contribute to node coverage only once -- confirm this is intended.
        seqs = set()
        for record in SeqIO.parse(file, ext):
            seqs.add(str(record.seq))
            seqs.add(str(record.seq.reverse_complement()))
        return seqs

    def create(self, seqs):
        """Add every (k+1)-mer of every sequence in ``seqs`` to the graph.

        Each (k+1)-mer becomes an edge from its k-long prefix node to its
        k-long suffix node; nodes are created on first sight and updated
        afterwards.
        """
        for seq in seqs:
            for kkmer in kmerIterator(self.k + 1, seq):
                a, b = kkmer[:-1], kkmer[1:]
                if kkmer not in self.edges:
                    self.edges[kkmer] = Edge(self.k, kkmer, self)
                if a not in self.nodes:
                    self.nodes[a] = Node(self.k, a, self, outcome=kkmer)
                else:
                    self.nodes[a].update(outcome=kkmer)
                if b not in self.nodes:
                    self.nodes[b] = Node(self.k, b, self, income=kkmer)
                else:
                    self.nodes[b].update(income=kkmer)

    def condense(self):
        """Merge the two edges around every node with exactly one in and one out edge.

        The incoming edge absorbs the outgoing edge's sequence and takes its
        place in the ``ins`` set of the node the outgoing edge ended at; the
        outgoing edge is then deleted. The middle node stays in ``self.nodes``
        because edge coverage is computed from the nodes of every k-mer on the
        edge's sequence.
        """
        for key in list(self.nodes.keys()):
            node = self.nodes[key]
            if len(node.ins) != 1 or len(node.outs) != 1:
                continue
            a = next(iter(node.ins))
            b = next(iter(node.outs))
            if a == b:
                # Self-loop, or a circular path already merged into a single
                # edge: merging it with itself would delete the edge.
                continue
            # Nodes merged in an earlier pass still name edges that no longer exist.
            if a in self.edges and b in self.edges:
                absorbed = self.edges[b]
                self.edges[a].seq += absorbed.seq[self.k:]
                next_node = self.nodes[absorbed.seq[-self.k:]]
                next_node.ins.remove(b)
                next_node.ins.add(a)
                del self.edges[b]

    def getMeanCov(self):
        """Return the mean of ``Edge.getCov()`` over all edges (NaN for an empty graph)."""
        return np.mean([e.getCov() for e in self.edges.values()])

    def _drop_edge(self, key):
        """Delete edge ``key`` and detach it from both of its endpoint nodes."""
        edge = self.edges.pop(key)
        self.nodes[edge.seq[:self.k]].outs.discard(key)
        self.nodes[edge.seq[-self.k:]].ins.discard(key)

    def remove_tails(self, coef=2):
        """Repeatedly delete weak dead-end edges until a pass deletes nothing.

        An edge is weak when ``coef`` times its coverage is below the mean
        edge coverage (recomputed at the start of each pass) or its sequence
        is shorter than ``2 * k``. A weak edge is deleted only when its end
        node has no outgoing edges or its start node has no incoming edges.
        """
        changed = True
        while changed:
            changed = False
            meancov = self.getMeanCov()
            for key in list(self.edges.keys()):
                edge = self.edges[key]
                if edge.getCov() * coef < meancov or len(edge.seq) < self.k * 2:
                    dead_end = len(self.nodes[edge.seq[-self.k:]].outs) == 0
                    dead_start = len(self.nodes[edge.seq[:self.k]].ins) == 0
                    if dead_end or dead_start:
                        self._drop_edge(key)
                        changed = True

    def remove_lowcov(self, coef=2):
        """Repeatedly delete every weak edge until a pass deletes nothing.

        Uses the same weakness test as ``remove_tails`` (low coverage relative
        to the per-pass mean, or sequence shorter than ``2 * k``) but does not
        require the edge to be a dead end.
        """
        changed = True
        while changed:
            changed = False
            meancov = self.getMeanCov()
            for key in list(self.edges.keys()):
                edge = self.edges[key]
                if edge.getCov() * coef < meancov or len(edge.seq) < self.k * 2:
                    self._drop_edge(key)
                    changed = True

    def toFasta(self, file):
        """Write all edge sequences to ``file + ".fasta"``, named 1, 2, ... in dict order."""
        # The context manager closes the handle even if a write fails.
        with open(file + ".fasta", 'w') as f:
            for i, e in enumerate(self.edges.values()):
                print(f">{i+1}", file=f)
                print(e.seq, file=f)

    def toGFA(self, file):
        """Write the graph to ``file + ".gfa"`` as segment (S) and link (L) lines.

        Every edge becomes a segment and receives an ``id`` attribute (1-based,
        dict order). For each node that starts or ends a segment, one link is
        written for every (incoming edge, outgoing edge) pair.
        """
        with open(file + ".gfa", 'w') as f:
            links = set()
            for i, e in enumerate(self.edges.values()):
                e.id = i+1
                print("S", e.id, e.seq, f"LN:i:{len(e.seq)}", f"KC:f:{e.getKC()/(len(e.seq)-self.k+1)*len(e.seq)}", sep="\t", file=f)
                links.add(e.seq[:self.k])
                links.add(e.seq[-self.k:])

            for link in links:
                v = self.nodes[link]
                for a in v.ins:
                    for b in v.outs:
                        # The incoming edge ends with, and the outgoing edge
                        # starts with, the node's whole k-mer: overlap is k.
                        print("L", self.edges[a].id, "+", self.edges[b].id, "+", f"{self.k}M", sep="\t", file=f)
        
    
    
class Node:
    """A k-mer vertex: tracks its coverage and the keys of adjacent edges."""

    def __init__(self, k, kmer, dbg, income=None, outcome=None):
        """Create the node with coverage 1 and at most one initial edge per side.

        Args:
            k: k-mer length.
            kmer: the k-mer string this node stands for.
            dbg: the owning graph object.
            income: optional key of an incoming edge.
            outcome: optional key of an outgoing edge.
        """
        self.dbg = dbg
        self.k = k
        self.kmer = kmer
        self.cov = 1
        # Keys of incoming / outgoing edges; a falsy argument leaves the set empty.
        self.ins = {income} if income else set()
        self.outs = {outcome} if outcome else set()

    def update(self, income=None, outcome=None):
        """Register further adjacent edges; only an incoming edge raises coverage."""
        if income:
            self.cov += 1
            self.ins.add(income)
        if outcome:
            self.outs.add(outcome)

    def __str__(self):
        """Return a one-line summary: length, coverage, k-mer, and both edge sets."""
        return f"len={len(self.kmer)}, cov={self.cov}, kmer={self.kmer}, {self.ins}, {self.outs}"
        
        
class Edge:
    """A graph edge holding a sequence; coverage is derived from the graph's nodes."""

    def __init__(self, k, kkmer, dbg):
        """Store the node length ``k``, the initial sequence ``kkmer`` and the owning graph."""
        self.seq = kkmer
        self.k = k
        self.dbg = dbg

    def _kmer_covs(self):
        """Return the ``cov`` of the graph node for every k-mer along ``seq``, in order."""
        nodes = self.dbg.nodes
        return [nodes[kmer].cov for kmer in kmerIterator(self.k, self.seq)]

    def getCov(self):
        """Mean node coverage over the k-mers of this edge's sequence."""
        return np.mean(self._kmer_covs())

    def getKC(self):
        """Sum of node coverage over the k-mers of this edge's sequence."""
        return np.sum(self._kmer_covs())

    def __str__(self):
        """Return a one-line summary: sequence length, mean coverage, sequence."""
        return f"len={len(self.seq)}, cov={self.getCov()}, seq={self.seq}"
    
        
        
class kmerIterator:
    """Iterator over every length-``k`` window of a sequence, left to right."""

    def __init__(self, k, kmer):
        """Remember the window length ``k`` and the sequence ``kmer`` to slide over."""
        self.k = k
        self.kmer = kmer
        self.n = len(kmer)
        self.pos = 0

    def __iter__(self):
        """Rewind to the first window and return the iterator itself."""
        self.pos = 0
        return self

    def __next__(self):
        """Return the window at the current position and advance by one."""
        start = self.pos
        stop = start + self.k
        # A window that would run past the end of the sequence ends iteration.
        if stop > self.n:
            raise StopIteration
        self.pos = start + 1
        return self.kmer[start:stop]

In [3]:
# Build the k=5 graph from the small test reads and export it in both formats.
dbg = DBG(k=5, file="data/test.fq")
dbg.toFasta(file="out/edges")
dbg.toGFA(file="out/edges")

In [4]:
# Build a k=55 graph for each read subset, with no simplification strategy, and export it.
for data in ("s_6.first1000", "s_6.first10000", "s_6.first100000"):
    dbg = DBG(55, f"data/{data}.fastq")
    dbg.toFasta(f"out/{data}")
    dbg.toGFA(f"out/{data}")
    print(data, "done")

s_6.first1000 done
s_6.first10000 done
s_6.first100000 done


In [5]:
# Same export as the plain run, but with the "tails" strategy applied after the first condensation.
for data in ("s_6.first1000", "s_6.first10000", "s_6.first100000"):
    dbg = DBG(55, f"data/{data}.fastq", strategy="tails")
    dbg.toFasta(f"out/{data}_tails")
    dbg.toGFA(f"out/{data}_tails")
    print(data, "done")

s_6.first1000 done
s_6.first10000 done
s_6.first100000 done


In [6]:
# Same export as the plain run, but with the "lowcov" strategy applied after the first condensation.
for data in ("s_6.first1000", "s_6.first10000", "s_6.first100000"):
    dbg = DBG(55, f"data/{data}.fastq", strategy="lowcov")
    dbg.toFasta(f"out/{data}_lowcov")
    dbg.toGFA(f"out/{data}_lowcov")
    print(data, "done")

s_6.first1000 done
s_6.first10000 done
s_6.first100000 done
