Skip to content

Commit

Permalink
regex match themes and order stem list
Browse files Browse the repository at this point in the history
  • Loading branch information
jtauber committed Apr 26, 2016
1 parent e9ceb24 commit 3253f1c
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 31 deletions.
28 changes: 14 additions & 14 deletions docs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,20 +96,20 @@ matching ``A[AM]I`` (i.e. active or middle aorist indicatives) to "ἐπαυσ".

>>> from inflexion.lexicon import Lexicon
>>> lexicon = Lexicon()
>>> lexicon.add("παύω", {
... "P": "παυ",
... "I": "ἐπαυ",
... "F[AM]": "παυσ",
... "A[AM][NPDSO]": "παυσ",
... "A[AM]I": "ἐπαυσ",
... "XA": "πεπαυκ",
... "YA": "ἐπεπαυκ",
... "X[MP]": "πεπαυ",
... "Y[MP]": "ἐπεπαυ",
... "AP[NPDSO]": "παυθ",
... "API": "ἐπαυθ",
... "FP": "παυθησ",
... })
>>> lexicon.add("παύω", [
... ("P", "παυ"),
... ("I", "ἐπαυ"),
... ("F[AM]", "παυσ"),
... ("A[AM][NPDSO]", "παυσ"),
... ("A[AM]I", "ἐπαυσ"),
... ("XA", "πεπαυκ"),
... ("YA", "ἐπεπαυκ"),
... ("X[MP]", "πεπαυ"),
... ("Y[MP]", "ἐπεπαυ"),
... ("AP[NPDSO]", "παυθ"),
... ("API", "ἐπαυθ"),
... ("FP", "παυθησ"),
... ])

This can then be used to look up a stem (perhaps from
``StemmingRuleSet.possible_stems``) to see what lemma and key regex it could
Expand Down
15 changes: 8 additions & 7 deletions inflexion/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,19 @@ def __init__(self):

def add(self, lemma, stems):
"""
stems is a dictionary of key regex to stem
stems is a list of (key regex, stem) pairs
"""
self.lemma_to_stems[lemma] = stems
for key_regex, stem in stems.items():
for key_regex, stem in stems:
self.stem_to_lemma_key_regex[stem].add((lemma, key_regex))

def find_stems(self, lemma, key):
"""
returns a (possibly empty) set of stems for the given lemma and key
returns a stem (or None) for the given lemma and key
"""
stems = set()
for key_regex, stem in self.lemma_to_stems[lemma].items():
result = None
for key_regex, stem in self.lemma_to_stems[lemma]:
if re.match(key_regex, key):
stems.add(stem)
return stems
result = stem # we don't break or return as we want last

return result
2 changes: 1 addition & 1 deletion inflexion/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def add_stemming_rule_set(self, stemming_rule_set):
def generate(self, lemma, key):
stems = set()
for lexicon in self.lexicons:
stems.update(lexicon.find_stems(lemma, key))
stems.add(lexicon.find_stems(lemma, key))

results = set()
for stem in stems:
Expand Down
7 changes: 5 additions & 2 deletions inflexion/sandhi.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import re


class SandhiRule:

def __init__(self, rule):
Expand All @@ -18,12 +21,12 @@ def __repr__(self):

def match_theme(self, stem):
"""
If the given stem ends with this rule's stem part, return the theme
If the given stem matches this rule's stem part, return the theme
(which may be more than this rule's theme part if this rule's stem part
is only the rightmost part of the given stem) or return None if stems
don't match.
"""
if stem.endswith(self.stem):
if re.match(".*" + self.stem + "$", stem):
if self.b:
return stem[:-len(self.b)]
else:
Expand Down
3 changes: 1 addition & 2 deletions inflexion/stemming.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ def inflect(self, stem, key):

for rule in self.key_to_rules[key]:
base = rule.match_theme(stem)

if base:
if base is not None:
if rule.stem:
base_endings.append((base, rule.distinguisher, rule))
else:
Expand Down
10 changes: 5 additions & 5 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,24 +111,24 @@ class LexiconTest(unittest.TestCase):

def test_lexicon(self):
lexicon = Lexicon()
lexicon.add("FOO", {"bar": "foo"})
self.assertEqual(lexicon.lemma_to_stems["FOO"]["bar"], "foo")
lexicon.add("FOO", [("bar", "foo")])
self.assertEqual(lexicon.lemma_to_stems["FOO"], [("bar", "foo")])
self.assertEqual(
lexicon.stem_to_lemma_key_regex["foo"],
{("FOO", "bar")}
)

def test_find_stems(self):
lexicon = Lexicon()
lexicon.add("FOO", {"bar": "foo"})
self.assertEqual(lexicon.find_stems("FOO", "barista"), {"foo"})
lexicon.add("FOO", [("bar", "foo")])
self.assertEqual(lexicon.find_stems("FOO", "barista"), "foo")


class MainTest(unittest.TestCase):

def setUp(self):
lexicon = Lexicon()
lexicon.add("FOO", {"bar": "foo"})
lexicon.add("FOO", [("bar", "foo")])
rules = StemmingRuleSet()
rules.add("barista", "|o><|llow")
self.inflexion = Inflexion()
Expand Down

0 comments on commit 3253f1c

Please sign in to comment.