StemmingRuleSet.inflect now supports tag filter

jtauber · Apr 26, 2016 · 3e45e25 · 3e45e25
1 parent 282ec8d
commit 3e45e25
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 2 deletions.
diff --git a/docs.rst b/docs.rst
@@ -91,6 +91,26 @@ We can also inflect a given stem according to a given key:
 ...     print(sorted(result.items()))
 [('base', 'παυ'), ('ending', 'ει'), ('rule', SandhiRule('|>ει<ει|')), ('used_default', True)]
 
+Rules can be included or excluded based on their tags. A rule tagged ``+tag`` is
+only used if ``tag`` is passed as a filter to ``possible_stems`` or ``inflect``.
+A rule tagged ``-tag`` is skipped if ``tag`` is passed as a filter.
+
+>>> rules.add("PAI.1P", "|α!>α<|μεν", {"-enclitic"})
+SandhiRule('|α!>α<|μεν', tags={'-enclitic'})
+>>> rules.add("PAI.1P", "|α!>α<|μέν", {"+enclitic"})
+SandhiRule('|α!>α<|μέν', tags={'+enclitic'})
+
+>>> for result in rules.inflect("ἱστα!", "PAI.1P"):
+...     print(result['base'] + result['ending'])
+ἱσταμεν
+
+>>> for result in rules.inflect("φα!", "PAI.1P", {"enclitic"}):
+...     print(result['base'] + result['ending'])
+φαμέν
+
+In the above two examples, different rules are triggered depending on whether
+the ``"enclitic"`` tag filter is passed in.
+
 
 lexicon
 -------

diff --git a/inflexion/sandhi.py b/inflexion/sandhi.py
@@ -19,7 +19,11 @@ def __init__(self, rule, tags=None):
         self.surface = self.a + self.c + self.e
 
     def __repr__(self):
-        return "SandhiRule('{0.a}|{0.b}>{0.c}<{0.d}|{0.e}')".format(self)
+        if self.tags:
+            return "SandhiRule('{0.a}|{0.b}>{0.c}<{0.d}|{0.e}', " \
+                "tags={1})".format(self, self.tags)
+        else:
+            return "SandhiRule('{0.a}|{0.b}>{0.c}<{0.d}|{0.e}')".format(self)
 
     def match_theme(self, stem):
         """

diff --git a/inflexion/stemming.py b/inflexion/stemming.py
@@ -17,11 +17,23 @@ def add(self, key, rule, tags=None):
         self.surface_to_key_stem[r.surface].add((key, r))
         return r
 
-    def inflect(self, stem, key):
+    def inflect(self, stem, key, tag_filter=None):
         base_endings = []
         default = []
+        tag_filter = tag_filter or set()
 
         for rule in self.key_to_rules[key]:
+            skip = False
+            for tag in rule.tags:
+                if tag[0] == "+" and tag[1:] not in tag_filter:
+                    skip = True
+                    break
+                if tag[0] == "-" and tag[1:] in tag_filter:
+                    skip = True
+                    break
+            if skip:
+                continue
+
             base = rule.match_theme(stem)
             if base is not None:
                 if rule.stem:

diff --git a/test.py b/test.py
@@ -41,6 +41,11 @@ def test_sandhirule_creation_2(self):
         self.assertEqual(rule.surface, "X")
         self.assertEqual(repr(rule), "SandhiRule('|><|X')")
 
+    def test_sandhirule_creation_3(self):
+        rule = SandhiRule("A|B>C<D|E", {"+tag"})
+        self.assertEqual(rule.tags, {"+tag"})
+        self.assertEqual(repr(rule), "SandhiRule('A|B>C<D|E', tags={'+tag'})")
+
     def test_match_theme_1(self):
         rule = SandhiRule("A|B>C<D|E")
         self.assertEqual(rule.match_theme("AB"), "A")
@@ -106,6 +111,20 @@ def test_inflect_3(self):
             "used_default": True,
         }])
 
+    def test_inflect_4(self):
+        rules = StemmingRuleSet()
+        rules.add("foo", "A|B>C<D|E")
+        rules.add("foo", "A|B>C<D|F", {"+bar"})
+        rules.add("foo", "A|B>C<D|G", {"-bar"})
+        self.assertEqual(sorted([
+            r["base"] + r["ending"]
+            for r in rules.inflect("FAB", "foo")]),
+            ["FACE", "FACG"])
+        self.assertEqual(sorted([
+            r["base"] + r["ending"]
+            for r in rules.inflect("FAB", "foo", {"bar"})]),
+            ["FACE", "FACF"])
+
 
 class LexiconTest(unittest.TestCase):