# Burrows-Wheeler inexact matching with seeds


In [2]:
## Subroutines
import PatternMatching as pm
from collections import Counter

# pattern must have a k-mer of size k= n//(d+1) with exact match to match text with ≤d-mismatches
def split_pattern(pattern, d):
    k = len(pattern)//(d+1)
    seeds = []
    for i in range(0,d*k,k):        
        seeds.append(pattern[i:i+k])
    seeds.append(pattern[d*k:])
    return seeds, k

def get_first_last(fm_index):
    # recreate First/Last cols -> [(symbol, rank in last)]
    last = []
    for i, s in enumerate(fm_index.bwt):
        if s == '$':
            last.append((s,0))
        else:
            last.append((s, fm_index.checkpt[s][i//fm_index.k]+ Counter(fm_index.bwt[i - i%fm_index.k: i])[s]))
    first = [(fm_index.bwt[i],last[i][1]) for i in range(len(fm_index.bwt))]
    first = sorted(first, key = lambda x: x[0])        
    return first, last

def seed_detection(fm_index, seeds, k):
    
    detect_ranges = []
    putative = False
    for i in range(len(seeds)):
        seed_det = fm_index.pattern_match(seeds[i])
        if seed_det == None:
            putative = True
        detect_ranges.append(seed_det)
    
    match_starts = []
    first,last = get_first_last(fm_index)
    for seed in range(len(detect_ranges)):
        window = detect_ranges[seed]
        if window == None:
            match_starts.append([])
        else:
            seed_starts = []
            for i in range(window[0], window[1]):
                steps=0
                j=i
                while True:
                    if j in fm_index.psa:
                        seed_starts.append(fm_index.psa.index(j)*fm_index.k + steps)
                        break
                    steps+=1
                    j = first.index(last[j])
            match_starts.append((seed*k, seed_starts))
    
    # move match start to pattern start by zero'ing the offset
    to_extend = []
    for match in match_starts:
        if match:
            offset = match[0]
            for start in match[1]:
                hit = start - offset
                if hit not in to_extend and hit >= 0:
                    to_extend.append(hit)
    
    # add to set of hits to score by Hammond dist
    return list(set(to_extend))


def seed_extension(pattern, text, to_extend, d):
    
    def HammondD(a,b,threshold):
        # Hammond distance with break at threshold
        dist = 0
        for i in range(len(a)):
            if a[i] != b[i]: 
                dist += 1
                if dist > d:
                    return dist
        return dist
    
    hits = []
    for start in to_extend: 
        if start+len(pattern)<=len(text):
            if HammondD(pattern, text[start: start+len(pattern)], d) <= d:
                hits.append(start)
    return hits
 
def multiple_inexact_match(text, patterns, d):
    """return all start positions of ≤d-mismatch query patterns in text"""
    # create the FM-index using BW class in PatternMatch module
    fm_index = pm.BurrowsWheeler(text)
    seed_hits = []
    for pattern in patterns:
        # split into d+1 seeds
        seeds, k = split_pattern(pattern, d)
        # seed detection by exact pattern match
        to_extend = seed_detection(fm_index, seeds, k)
        # find hits with ≤d mismatches
        hits = seed_extension(pattern, text, to_extend, d)
        seed_hits += hits
    seed_hits.sort()
    return seed_hits


In [19]:
text = "ABABABABBBBABB"
patterns =['ABB']
d = 1

import time

start = time.time()

solution = multiple_inexact_match(text,patterns, 1)
solution = ' '.join(str(i) for i in solution)
print(time.time() - start)
solution

setting default k

+++ Finished early, at stage h= 4
done at h= 4
ManberMyers: time (s) = 0.0002570152282714844
0.0007910728454589844


'0 2 4 6 7 8 11'

In [21]:
with open("/Users/jasonmoggridge/Desktop/MultipleApproximatePatternMatching.txt", 'r') as file:
    dna = file.readline().strip()
    patterns = file.readline().strip().split(' ')
    d = int(file.readline().strip())
    solutions = [int(i) for i in file.readline().strip().split(' ')]
solution = multiple_inexact_match(dna, patterns, d)
solution = ' '.join(str(i) for i in solution)


setting default k

+++ Finished early, at stage h= 8
done at h= 8
ManberMyers: time (s) = 0.5323939323425293
392.5063650608063


In [22]:
[int(i) for i in solution.split(' ')] == solutions




False

In [23]:
solution[:10]

'4 11 15 20'

In [37]:
result = [int(i) for i in solution.split(' ')]


In [38]:
len(result)

2000

In [39]:
len(solutions)

2000

In [40]:
len(dna)

10000

In [41]:
solutions.sort()

In [42]:
result == solutions

True

In [3]:
with open("/Users/jasonmoggridge/Desktop/dataset_304_10.txt", 'r') as file:
    dna = file.readline().strip()
    patterns = file.readline().strip().split(' ')
    d = int(file.readline())
result = multiple_inexact_match(dna, patterns, d)
result_string = ' '.join(str(i) for i in result)

In [8]:
with open("/Users/jasonmoggridge/Desktop/rosalind_ba9o.txt", 'r') as file:
    dna = file.readline().strip()
    patterns = file.readline().strip().split(' ')
    d = int(file.readline())
result = multiple_inexact_match(dna, patterns, d)
result_string = ' '.join(str(i) for i in result)
with open ("/Users/jasonmoggridge/Desktop/rosalind_ba9o_output.txt", 'w') as file:
    file.write(result_string)

setting default k

+++ Finished early, at stage h= 8
done at h= 8
ManberMyers: time (s) = 0.5359389781951904


In [9]:
print('done')

done
