Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Fm index #57

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
backup*/*
benchar/cbenchar/build/*
.vscode
*/__pycache__
*/*/__pycache__
152 changes: 61 additions & 91 deletions string_indexing/fm_index.py
Original file line number Diff line number Diff line change
@@ -1,118 +1,88 @@

class FMIndex:

# all strings begin with # (idk why?)
# I suppose that patterns do not start with #

class _FMIndex:
def __init__ (self, SA, BWT, text, n):
self.L = BWT
self.F = '#$' + ''.join(text[SA[i]] for i in range(1, n + 1))
self.n = n
self.SA = SA
self.sampleSize = 8 # const for sampling
self.sample_size = 8 # const for sampling

#prepare char mapping for F
self.mapperOfChar = { self.F[2] : 0}
self.begginings = [2]
self.mapper_of_chars = { self.F[2] : 0}
self.beginnings = [2]
last = self.F[2]
lenOfBeginings = 1
for i in range(3, n+2):
if self.F[i] != last:
last = self.F[i]
self.begginings.append(i)
self.mapperOfChar[last] = lenOfBeginings
lenOfBeginings += 1
self.beginnings.append(i)
self.mapper_of_chars[last] = len(self.beginnings) - 1

self.lenOfAlphabet = len(self.mapperOfChar)
self.len_of_alphabet = len(self.mapper_of_chars)

#prepare closest samplings
currentSample = 0
self.closestSample = [0]
current_sample = 0
self.closest_sample = [0]
for i in range(1, n+2):
if abs(currentSample-i) > abs(currentSample + self.sampleSize-i) and (i + self.sampleSize < self.n):
currentSample += self.sampleSize
self.closestSample.append(currentSample)
if abs(current_sample-i) > abs(current_sample + self.sample_size-i) and (i + self.sample_size < self.n):
current_sample += self.sample_size
self.closest_sample.append(current_sample)

#Generate values for occ for given samples O(|A|*n)
self.occInSampleForChar = { self.L[i]: [0] for i in range(1, n+2)}
for c in self.mapperOfChar:
currValue = 0
nextSample = self.sampleSize
self.occ_in_sample_for_char = { self.L[i]: [0] for i in range(1, n+2)}
for c in self.mapper_of_chars:
current_value = 0
next_sample = self.sample_size
for i in range(1, n+2):
if self.L[i] == c:
currValue += 1
if i == nextSample:
self.occInSampleForChar[c].append(currValue)
nextSample = nextSample + self.sampleSize

# should be private
def getRangeOfOccurence(self, p, size):
if size > self.n:
return [-1, -1]

currChar = p[size-1]
if currChar not in self.mapperOfChar:
return [-1, -1]
current_value += 1
if i == next_sample:
self.occ_in_sample_for_char[c].append(current_value)
next_sample = next_sample + self.sample_size

def from_suffix_array_and_bwt(SA, BWT, text, n):
    """Build an FM index object from an already computed suffix array and BWT.

    Thin factory: the four arguments are forwarded unchanged, in the same
    order, to the ``_FMIndex`` constructor and the new instance is returned.
    """
    index = _FMIndex(SA, BWT, text, n)
    return index

mapIdx = self.mapperOfChar[currChar]
l = self.begginings[mapIdx]
r = self.n + 1
if mapIdx != self.lenOfAlphabet - 1:
r = self.begginings[mapIdx + 1] - 1

for i in range(size-2, -1, -1):
currChar = p[i]
if currChar not in self.mapperOfChar:
return [-1, -1]
occurencesBefore = self._getOcc(currChar, l - 1)
occurencesAfter = self._getOcc(currChar, r)
if occurencesBefore == occurencesAfter:
return [-1, -1]
mapIdx = self.mapperOfChar[currChar]
l = self.begginings[mapIdx] + occurencesBefore
r = self.begginings[mapIdx] + occurencesAfter - 1
if r < l:
return [-1, -1]
return [l, r]
# O(|p|)
def count(fm, p, size):
(low, high) = _get_range_of_occurrences(fm, p, size)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please replace with low, high = _get_range_of_occurrences(...)

return max(high - low + 1, 0) if low > -1 else 0

# O(|p|)
def count(self, p, size):
    """Return the number of occurrences reported for pattern ``p``.

    ``getRangeOfOccurence`` signals "no match" with -1 as its first bound;
    otherwise the result is the width of the inclusive range it returns,
    clamped so that it is never negative.
    """
    bounds = self.getRangeOfOccurence(p, size)
    if bounds[0] != -1:
        return max(bounds[1] - bounds[0] + 1, 0)
    return 0
# O(|p| + k) where k is the number of occurrences of p in text
def contains(fm, p, l):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since structures are written SA, BWT etc., probably we should use FM not fm.

(low, high) = _get_range_of_occurrences(fm, p, l)
yield from sorted([fm.SA[i-1] for i in range(low, high + 1) if low > -1])


#Should be private
def _getOcc(self, c, i):
closestSample = self.closestSample[i]
toAdd = 0
if closestSample < i:
for j in range(closestSample + 1, i + 1):
if self.L[j] == c:
toAdd += 1
elif closestSample > i:
for j in range(i+1, closestSample + 1):
if self.L[j] == c:
toAdd -= 1

return self.occInSampleForChar[c][(closestSample)//self.sampleSize] + toAdd

#O(|p|)
def query(self, p, l):
    """Tell whether ``self.count(p, l)`` reports at least one occurrence."""
    occurrences = self.count(p, l)
    return occurrences > 0
def _get_occ(fm, c, i):
if fm.closest_sample[i] < i:
to_add = sum(1 for c_prim in fm.L[fm.closest_sample[i] + 1:i + 1] if c_prim == c)
else:
to_add = sum(-1 for c_prim in fm.L[i + 1:fm.closest_sample[i] + 1] if c_prim == c)
return fm.occ_in_sample_for_char[c][fm.closest_sample[i] // fm.sample_size] + to_add

# O(|p| + k) where k is the number of occurrences of p in text
def get_all_occurrance(self, p, l):
    """Return the ``self.SA`` entries for every row in the match range of ``p``.

    The inclusive row range comes from ``getRangeOfOccurence``; a first bound
    of -1 means "no match" and yields an empty list. Rows are shifted by one
    when indexing ``self.SA`` (row ``i`` maps to ``SA[i - 1]``).
    """
    bounds = self.getRangeOfOccurence(p, l)
    first = bounds[0]
    if first == -1:
        return []
    positions = []
    for row in range(first, bounds[1] + 1):
        positions.append(self.SA[row - 1])
    return positions
def _get_range_of_occurrences(fm, p, size):
if size > fm.n or size == 0:
return (-1, -1)

if p[-1] not in fm.mapper_of_chars:
return (-1, -1)

# O(|p|)
def get_any_occurrance(self, p, l):
    """Return one ``self.SA`` entry for pattern ``p``, or -1 when it is absent.

    Uses the first row of the range reported by ``getRangeOfOccurence``; a
    first bound of -1 means "no match". Row ``i`` maps to ``SA[i - 1]``.
    """
    first = self.getRangeOfOccurence(p, l)[0]
    return -1 if first == -1 else self.SA[first - 1]
map_idx = fm.mapper_of_chars[p[-1]]
l = fm.beginnings[map_idx]
r = fm.n + 1

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

l, r = fm.beginnings[map_idx], fm.n + 1
or even better

l = fm.beginnings[map_idx]
r = fm.beginnings[map_idx + 1] - 1 if map_idx != fm.len_of_alphabet - 1 else fm.n + 1

if map_idx != fm.len_of_alphabet - 1:
r = fm.beginnings[map_idx + 1] - 1

for i in range(size-1, 0, -1):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't `for c in p[::-1]:` be simpler?

if p[i] not in fm.mapper_of_chars:
return (-1, -1)
occurencesBefore = _get_occ(fm, p[i], l - 1)
occurencesAfter = _get_occ(fm, p[i], r)
if occurencesBefore == occurencesAfter:
return (-1, -1)
map_idx = fm.mapper_of_chars[p[i]]
l = fm.beginnings[map_idx] + occurencesBefore
r = fm.beginnings[map_idx] + occurencesAfter - 1
if r < l:
return (-1, -1)
return (l, r)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a new line and return -1, -1 etc., not (-1, -1) (though I guess it should be equivalent)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add an empty line at the end - and don't forget to run the `make check` linter

10 changes: 9 additions & 1 deletion test/test_exact_string_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,20 @@

from generator import rand
from exact_string_matching import forward, backward, other
from string_indexing import lcp, suffix_tree, suffix_array
from string_indexing import lcp, suffix_tree, suffix_array, fm_index
from compression import burrows_wheeler

def lcp_lr_contains(t, w, n, m):
    """Adapter that runs ``lcp.contains`` with the common (t, w, n, m) signature.

    Builds the suffix array of ``t`` with ``suffix_array.skew``, derives the
    LCP array with ``lcp.kasai`` and the LCP-LR structure with
    ``lcp.build_lcp_lr``, then hands everything to ``lcp.contains``.
    """
    SA = suffix_array.skew(t, n)
    LCP = lcp.kasai(SA, t, n)
    LCP_LR = lcp.build_lcp_lr(LCP, n)
    return lcp.contains(SA, LCP_LR, t, w, n, m)

def fm_index_contains(t, w, n, m):
SA = suffix_array.skew(t, n)
BWT = burrows_wheeler.transform_from_suffix_array(SA, t, n)
fm = fm_index.from_suffix_array_and_bwt(SA, BWT, t, n)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Staying true to the convention above, let this be FM, ok?

return fm_index.contains(fm, w, m)

EXACT_STRING_MATCHING_ALGORITHMS = [
[ 'Morris-Pratt', forward.morris_pratt ],
[ 'Knuth-Morris-Pratt', forward.knuth_morris_pratt ],
Expand Down Expand Up @@ -45,6 +52,7 @@ def lcp_lr_contains(t, w, n, m):
suffix_array.prefix_doubling(t, n), t, w, n, m),
],
[ 'lcp-lr array', lcp_lr_contains ],
[ 'Fm index', fm_index_contains]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either FM index or fm index

]

class TestExactStringMatching(unittest.TestCase):
Expand Down
69 changes: 0 additions & 69 deletions test/test_fm_index.py

This file was deleted.