Lexicon.find_stems now supports a tag filter

jtauber · Apr 27, 2016 · 3f93805 · 3f93805
1 parent 6c1c372
commit 3f93805
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 3 deletions.
diff --git a/docs.rst b/docs.rst
@@ -144,6 +144,24 @@ be:
 >>> sorted(lexicon.stem_to_lemma_key_regex["παυσ"])
 [('παύω', 'A[AM][NPDSO]', ()), ('παύω', 'F[AM]', ())]
 
+Tag filters can be used to limit which stems are considered.
+
+>>> lexicon.add("ἵστημι", "A[AM][NPDSO]", "στησ", {"-intransitive"})
+>>> lexicon.add("ἵστημι", "A[AM][NPDSO]", "στα{root}", {"-transitive"})
+
+Note that ``-intransitive`` means the stem is excluded when ``intransitive``
+is in the tag filter. Because both stems carry only exclusion tags, both are
+returned when no tag filter is given (whereas had ``+transitive`` and
+``+intransitive`` been used, neither stem would be returned without a filter).
+
+>>> sorted(lexicon.find_stems("ἵστημι", "AAN"))
+['στα{root}', 'στησ']
+>>> lexicon.find_stems("ἵστημι", "AAN", {"transitive"})
+{'στησ'}
+>>> lexicon.find_stems("ἵστημι", "AAN", {"intransitive"})
+{'στα{root}'}
+
+
 Inflexion
 ---------
 

diff --git a/inflexion/lexicon.py b/inflexion/lexicon.py
@@ -17,13 +17,32 @@ def add(self, lemma, key_regex, stem, tags=None):
             # we use tuple(sorted(...)) to make deterministically hashable
             (lemma, key_regex, tuple(sorted(tags))))
 
-    def find_stems(self, lemma, key):
+    def find_stems(self, lemma, key, tag_filter=None):
         """
         returns a (possibly empty) stem_set for the given lemma and key
         """
-        result = set()
+        tag_filter = tag_filter or set()
+        result = set()  # stays empty if no stem passes the filter and matches
+        prev_key_regex = None
+
         for key_regex, stem, tags in self.lemma_to_stems[lemma]:
+            # "+x" needs x in tag_filter; "-x" rejects the stem if x is in it
+            skip = False
+            for tag in tags:
+                if tag[0] == "+" and tag[1:] not in tag_filter:
+                    skip = True
+                    break
+                if tag[0] == "-" and tag[1:] in tag_filter:
+                    skip = True
+                    break
+            if skip:
+                continue
+
             if re.match(key_regex, key):
-                result = {stem}  # we don't break or return as we want last @@@
+                if key_regex != prev_key_regex:  # this means multiple stems
+                    result = set()               # for same key_regex must be
+                    prev_key_regex = key_regex   # contiguously added
+
+                result.add(stem)
 
         return result
diff --git a/test.py b/test.py
@@ -145,6 +145,28 @@ def test_find_stems(self):
         lexicon.add("FOO", "bar", "foo")
         self.assertEqual(lexicon.find_stems("FOO", "barista"), {"foo"})
 
+    def test_find_stems_with_tags_1(self):
+        # the "+a" stem needs "a" in the filter; the "-a" stem is excluded by it
+        lex = Lexicon()
+        lex.add("FOO", "bar", "faa", {'+a'})
+        lex.add("FOO", "bar", "fee", {'-a'})
+
+        # same lemma and key, looked up without and then with the "a" filter
+        unfiltered = lex.find_stems("FOO", "barista")
+        filtered = lex.find_stems("FOO", "barista", {"a"})
+
+        self.assertEqual(unfiltered, {"fee"})
+        self.assertEqual(filtered, {"faa"})
+
+    def test_find_stems_with_tags_2(self):
+        # stems carrying only "-" tags are all returned when no filter is given
+        lex = Lexicon()
+        lex.add("FOO", "bar", "faa", {'-a'})
+        lex.add("FOO", "bar", "fee", {'-b'})
+
+        found = lex.find_stems("FOO", "barista")
+        self.assertEqual(found, {"faa", "fee"})
+
 
 class MainTest(unittest.TestCase):