Skip to content

Commit

Permalink
Lexicon.find_stems now supports a tag filter
Browse files Browse the repository at this point in the history
  • Loading branch information
jtauber committed Apr 27, 2016
1 parent 6c1c372 commit 3f93805
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 3 deletions.
18 changes: 18 additions & 0 deletions docs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,24 @@ be:
>>> sorted(lexicon.stem_to_lemma_key_regex["παυσ"])
[('παύω', 'A[AM][NPDSO]', ()), ('παύω', 'F[AM]', ())]

Tag filters can be used to limit which stems are considered.

>>> lexicon.add("ἵστημι", "A[AM][NPDSO]", "στησ", {"-intransitive"})
>>> lexicon.add("ἵστημι", "A[AM][NPDSO]", "στα{root}", {"-transitive"})

Note that ``-intransitive`` means the stem is excluded when the tag filter contains ``intransitive``.
This approach means that in the absence of any tag filters, both possibilities
are returned (whereas had ``+transitive`` and ``+intransitive`` been used,
neither stem would come up in the default case of no tag filter).

>>> sorted(lexicon.find_stems("ἵστημι", "AAN"))
['στα{root}', 'στησ']
>>> lexicon.find_stems("ἵστημι", "AAN", {"transitive"})
{'στησ'}
>>> lexicon.find_stems("ἵστημι", "AAN", {"intransitive"})
{'στα{root}'}


Inflexion
---------

Expand Down
25 changes: 22 additions & 3 deletions inflexion/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,32 @@ def add(self, lemma, key_regex, stem, tags=None):
# we use tuple(sorted(...)) to make deterministically hashable
(lemma, key_regex, tuple(sorted(tags))))

def find_stems(self, lemma, key, tag_filter=None):
    """
    returns a (possibly empty) stem_set for the given lemma and key

    `tag_filter` is an optional collection of tag names (given without any
    `+` or `-` prefix) limiting which stems are considered: a stem tagged
    `+x` is skipped unless `x` is in the filter, and a stem tagged `-x` is
    skipped if `x` is in the filter. Tags with neither prefix (including
    an empty tag) have no effect on the filtering.

    If stems for more than one key_regex match the key, only the stems of
    the last matching key_regex are returned.
    """
    tag_filter = tag_filter or set()

    result = set()
    prev_key_regex = None

    for key_regex, stem, tags in self.lemma_to_stems[lemma]:

        # startswith (rather than tag[0]) so that an empty tag is simply
        # ignored instead of raising IndexError
        excluded = any(
            (tag.startswith("+") and tag[1:] not in tag_filter) or
            (tag.startswith("-") and tag[1:] in tag_filter)
            for tag in tags
        )
        if excluded:
            continue

        # we don't break or return on a match as we want the last
        # matching key_regex to win
        if re.match(key_regex, key):
            if key_regex != prev_key_regex:  # this means multiple stems
                result = set()               # for same key_regex must be
                prev_key_regex = key_regex   # contiguously added

            result.add(stem)

    return result
22 changes: 22 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,28 @@ def test_find_stems(self):
lexicon.add("FOO", "bar", "foo")
self.assertEqual(lexicon.find_stems("FOO", "barista"), {"foo"})

def test_find_stems_with_tags_1(self):
    """
    a ``+a`` stem is only found when ``a`` is in the tag filter and a
    ``-a`` stem is only found when it is not
    """
    lexicon = Lexicon()
    for stem, tags in (("faa", {'+a'}), ("fee", {'-a'})):
        lexicon.add("FOO", "bar", stem, tags)

    # no tag filter: the "+a" stem is skipped, the "-a" stem survives
    unfiltered = lexicon.find_stems("FOO", "barista")
    self.assertEqual(unfiltered, {"fee"})

    # filtering on "a" flips which of the two stems is eligible
    filtered = lexicon.find_stems("FOO", "barista", {"a"})
    self.assertEqual(filtered, {"faa"})

def test_find_stems_with_tags_2(self):
    """
    with no tag filter, stems carrying only ``-`` tags are all returned
    """
    lexicon = Lexicon()
    for stem, tags in (("faa", {'-a'}), ("fee", {'-b'})):
        lexicon.add("FOO", "bar", stem, tags)

    found = lexicon.find_stems("FOO", "barista")
    self.assertEqual(found, {"faa", "fee"})


class MainTest(unittest.TestCase):

Expand Down

0 comments on commit 3f93805

Please sign in to comment.