In [2]:

from collections import defaultdict

def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0.

        Args:
            a: string whose suffix is examined.
            b: string whose prefix is examined.
            min_length: smallest overlap length that counts (default 3).

        Returns:
            Length of the longest qualifying suffix/prefix overlap,
            or 0 when there is none. """
    # 'b' cannot supply a prefix of 'min_length' characters; without this
    # guard the truncated slice b[:min_length] could match inside 'a' and
    # yield an overlap shorter than 'min_length'.
    if len(b) < min_length:
        return 0
    seed = b[:min_length]  # shortest prefix of 'b' any valid overlap must contain
    start = 0  # scan 'a' left to right so the first hit is the longest suffix
    while True:
        start = a.find(seed, start)  # next occurrence of the seed in 'a'
        if start == -1:  # no more occurrences to the right
            return 0
        # seed found; the whole suffix a[start:] must be a prefix of 'b'
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1  # move just past the previous occurrence

In [3]:
# Sanity check: the suffix 'GGT' of the first read equals the first three
# characters of the second read, so the reported overlap length is 3.
overlap('ATGCGGGT', 'GGTGATTTA')

3

In [4]:
# Build an overlap graph: an edge a -> b means a suffix of read a (length >= k) matches a prefix of read b.

def overlaps(reads, k):
    """ Build an overlap graph over 'reads' using a k-mer index.

        An edge read -> other is recorded when 'other' is a different
        read (compared by value) and overlap(read, other, k) is non-zero.

        Args:
            reads: collection of read strings; it is iterated twice.
            k: minimum overlap length, also used as the k-mer size.

        Returns:
            A tuple (nodes, edges, graph) where 'graph' maps each read
            that has at least one outgoing edge to the set of reads it
            overlaps, 'nodes' is the number of such reads and 'edges'
            is the total number of edges. """
    # Map every k-mer to the set of reads that contain it.
    kmer_to_reads = defaultdict(set)
    for seq in reads:
        for pos in range(len(seq) - k + 1):
            kmer_to_reads[seq[pos:pos + k]].add(seq)

    graph = defaultdict(set)
    for src in reads:
        # Only reads containing src's length-k suffix can overlap with it.
        candidates = kmer_to_reads[src[-k:]]
        for dst in candidates:
            if dst == src:  # skip a read's overlap with itself
                continue
            if overlap(src, dst, k):  # suffix of src matches a prefix of dst
                graph[src].add(dst)

    # Total edge count is the sum of the out-degrees.
    edges = sum(len(targets) for targets in graph.values())

    return len(graph), edges, graph
    
