Skip to content

Commit

Permalink
StemmingRuleSet.inflect now supports tag filter
Browse files Browse the repository at this point in the history
  • Loading branch information
jtauber committed Apr 26, 2016
1 parent 282ec8d commit 3e45e25
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 2 deletions.
20 changes: 20 additions & 0 deletions docs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,26 @@ We can also inflect a given stem according to a given key:
... print(sorted(result.items()))
[('base', 'παυ'), ('ending', 'ει'), ('rule', SandhiRule('|>ει<ει|')), ('used_default', True)]

Rules can be included or excluded based on their tags. A rule tagged ``+tag``
will only be used if ``tag`` is passed as a filter to ``possible_stems`` or
``inflect``. A rule tagged ``-tag`` will not be used if ``tag`` is passed as a
filter.

>>> rules.add("PAI.1P", "|α!>α<|μεν", {"-enclitic"})
SandhiRule('|α!>α<|μεν', tags={'-enclitic'})
>>> rules.add("PAI.1P", "|α!>α<|μέν", {"+enclitic"})
SandhiRule('|α!>α<|μέν', tags={'+enclitic'})

>>> for result in rules.inflect("ἱστα!", "PAI.1P"):
... print(result['base'] + result['ending'])
ἱσταμεν

>>> for result in rules.inflect("φα!", "PAI.1P", {"enclitic"}):
... print(result['base'] + result['ending'])
φαμέν

In the above two examples, different rules are triggered depending on whether
the ``"enclitic"`` tag filter is passed in.


lexicon
-------
Expand Down
6 changes: 5 additions & 1 deletion inflexion/sandhi.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ def __init__(self, rule, tags=None):
self.surface = self.a + self.c + self.e

def __repr__(self):
    """
    Return a constructor-style representation of the rule.

    The rule is rendered as ``a|b>c<d|e``. When the rule has tags they
    are appended as ``tags={...}`` with the tags in sorted order, so the
    output does not depend on set iteration order (which can vary
    between runs for strings).
    """
    rule = "{0.a}|{0.b}>{0.c}<{0.d}|{0.e}".format(self)
    if not self.tags:
        # No tags (None or empty): keep the short form.
        return "SandhiRule('{0}')".format(rule)
    # Sort so that multi-tag rules always print the same way.
    tags = "{" + ", ".join(repr(tag) for tag in sorted(self.tags)) + "}"
    return "SandhiRule('{0}', tags={1})".format(rule, tags)

def match_theme(self, stem):
"""
Expand Down
14 changes: 13 additions & 1 deletion inflexion/stemming.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,23 @@ def add(self, key, rule, tags=None):
self.surface_to_key_stem[r.surface].add((key, r))
return r

def inflect(self, stem, key):
def inflect(self, stem, key, tag_filter=None):
base_endings = []
default = []
tag_filter = tag_filter or set()

for rule in self.key_to_rules[key]:
skip = False
for tag in rule.tags:
if tag[0] == "+" and tag[1:] not in tag_filter:
skip = True
break
if tag[0] == "-" and tag[1:] in tag_filter:
skip = True
break
if skip:
continue

base = rule.match_theme(stem)
if base is not None:
if rule.stem:
Expand Down
19 changes: 19 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def test_sandhirule_creation_2(self):
self.assertEqual(rule.surface, "X")
self.assertEqual(repr(rule), "SandhiRule('|><|X')")

def test_sandhirule_creation_3(self):
    # A rule constructed with a tag set exposes it as ``tags`` and
    # includes it in its repr.
    tagged = SandhiRule("A|B>C<D|E", {"+tag"})
    expected_repr = "SandhiRule('A|B>C<D|E', tags={'+tag'})"
    self.assertEqual(tagged.tags, {"+tag"})
    self.assertEqual(repr(tagged), expected_repr)

def test_match_theme_1(self):
rule = SandhiRule("A|B>C<D|E")
self.assertEqual(rule.match_theme("AB"), "A")
Expand Down Expand Up @@ -106,6 +111,20 @@ def test_inflect_3(self):
"used_default": True,
}])

def test_inflect_4(self):
    # Three rules share the key "foo": one untagged, one tagged "+bar"
    # and one tagged "-bar". The expected forms differ depending on
    # whether the "bar" tag filter is passed to inflect.
    rules = StemmingRuleSet()
    rules.add("foo", "A|B>C<D|E")
    rules.add("foo", "A|B>C<D|F", {"+bar"})
    rules.add("foo", "A|B>C<D|G", {"-bar"})

    # No filter.
    unfiltered = []
    for r in rules.inflect("FAB", "foo"):
        unfiltered.append(r["base"] + r["ending"])
    unfiltered.sort()
    self.assertEqual(unfiltered, ["FACE", "FACG"])

    # With the "bar" filter.
    filtered = []
    for r in rules.inflect("FAB", "foo", {"bar"}):
        filtered.append(r["base"] + r["ending"])
    filtered.sort()
    self.assertEqual(filtered, ["FACE", "FACF"])


class LexiconTest(unittest.TestCase):

Expand Down

0 comments on commit 3e45e25

Please sign in to comment.