In [1]:
from __future__ import print_function
import subprocess
from tempfile import TemporaryFile
import kevlar
import khmer
import pysam

In [2]:
def get_unique_kmers(sequence, ksize=31):
    """Yield the k-mers of ``sequence`` that have not been seen before.

    Two k-mers count as the same when ``kevlar.revcommin`` maps them to the
    same key (presumably the lexicographically smaller of the k-mer and its
    reverse complement -- confirm against kevlar). The k-mer is yielded in the
    orientation in which it was first encountered, not as the key.

    Args:
        sequence: the sequence to decompose into k-mers.
        ksize: k-mer length handed to the khmer table.

    Yields:
        Each k-mer, in order of first appearance.
    """
    # The table is never populated; it only serves as a k-mer iterator.
    kmer_source = khmer._Counttable(ksize, [1])
    seen = set()
    for kmer in kmer_source.get_kmers(sequence):
        key = kevlar.revcommin(kmer)
        if key in seen:
            continue
        seen.add(key)
        yield kmer

In [3]:
def unique_kmer_string(infile, ksize=31):
    """Render the unique k-mers of every sequence in a FASTA file as FASTA text.

    Each record of ``infile`` is decomposed with ``get_unique_kmers`` and every
    k-mer is written as a two-line FASTA record named ``kmer<n>``.

    Args:
        infile: path to the FASTA file to read.
        ksize: k-mer length passed through to ``get_unique_kmers``.

    Returns:
        A single string holding all generated FASTA records (empty if the
        input produced no k-mers).
    """
    records = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(infile, 'r') as instream:
        for defline, sequence in kevlar.seqio.parse_fasta(instream):
            # NOTE: numbering restarts at 0 for each input sequence, so record
            # names repeat when the file holds more than one sequence.
            for n, kmer in enumerate(get_unique_kmers(sequence, ksize)):
                records.append('>kmer{:d}\n{:s}\n'.format(n, kmer))
    # Join once instead of growing a string inside the loop.
    return ''.join(records)

In [4]:
def get_exact_matches(infile, bwaindexfile, ksize=31):
    """Align the unique k-mers of ``infile`` with ``bwa mem`` and yield the hits.

    The k-mers are streamed to bwa on stdin; its SAM output is captured in a
    temporary file and parsed with pysam. Both the minimum seed length (``-k``)
    and the minimum output score (``-T``) are set to ``ksize``.

    Args:
        infile: path to the FASTA file whose k-mers are to be aligned.
        bwaindexfile: path (prefix) of the bwa index to align against.
        ksize: k-mer length.

    Yields:
        ``(seqid, position)`` for every mapped record, where ``position`` is
        the 0-based leftmost reference coordinate reported by pysam.

    Raises:
        subprocess.CalledProcessError: if bwa exits with a non-zero status.
    """
    kmers = unique_kmer_string(infile, ksize)
    # Build the argument list directly so an index path containing spaces is
    # passed to bwa as one argument (splitting a formatted string would not).
    cmdargs = [
        'bwa', 'mem', '-k', str(ksize), '-T', str(ksize), bwaindexfile, '-',
    ]
    with TemporaryFile() as samfile:
        bwaproc = subprocess.Popen(
            cmdargs, stdin=subprocess.PIPE, stdout=samfile,
            universal_newlines=True,
        )
        bwaproc.communicate(input=kmers)
        # Fail loudly here rather than letting pysam choke on partial output.
        if bwaproc.returncode != 0:
            raise subprocess.CalledProcessError(bwaproc.returncode, cmdargs)
        samfile.seek(0)
        with pysam.AlignmentFile(samfile, 'r') as sam:
            for record in sam:
                if record.is_unmapped:
                    continue
                seqid = sam.get_reference_name(record.reference_id)
                # reference_start is the non-deprecated spelling of .pos
                yield seqid, record.reference_start

In [5]:
# Collect every exact 25-mer hit of the partition against the reference, then
# report the leftmost and rightmost hit positions.
matchgen = get_exact_matches(
    'human-sim-pico/part.cc9.fa',
    'human-sim-pico/human.random.fa',
    ksize=25,
)
matches = list(matchgen)
len(matches)
minpos = min(pos for _, pos in matches)
maxpos = max(pos for _, pos in matches)
print(minpos, maxpos)

1527048 1527201


In [6]:
def select_region(matchlist, maxdiff=1000, delta=100):
    """Pick the reference interval spanned by a set of match positions.

    Args:
        matchlist: iterable of ``(seqid, position)`` pairs.
        maxdiff: largest allowed distance between the smallest and largest
            position; a wider spread is rejected.
        delta: padding subtracted from the smallest position and added to the
            largest position.

    Returns:
        ``(seqid, start, end)`` where ``start`` is the smallest position minus
        ``delta`` (never below 0) and ``end`` is the largest position plus
        ``delta``. Returns ``None`` when there are no matches, when the
        matches touch more than one sequence, or when they are spread over
        more than ``maxdiff``.
    """
    # Materialise once so one-shot iterables survive the two passes below.
    matchlist = list(matchlist)
    if not matchlist:
        return None

    seqids = set(s for s, p in matchlist)
    if len(seqids) > 1:
        return None

    positions = [p for s, p in matchlist]
    minpos = min(positions)
    maxpos = max(positions)
    if maxpos - minpos > maxdiff:
        return None
    # Honour the caller's padding and keep the start coordinate non-negative.
    return seqids.pop(), max(minpos - delta, 0), maxpos + delta

In [7]:
# Reduce the collected (seqid, position) matches to one padded interval;
# the result is None if select_region rejects the matches.
select_region(matches)

('seq1', 1526948, 1527301)

In [8]:
# Distance between the outermost match positions, taken from the computed
# values so the figure stays correct if the inputs change.
maxpos - minpos

153