regex match themes and order stem list

jtauber · Apr 26, 2016 · 3253f1c · 3253f1c
1 parent e9ceb24
commit 3253f1c
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 31 deletions.
diff --git a/docs.rst b/docs.rst
@@ -96,20 +96,20 @@ matching ``A[AM]I`` (i.e. active or middle aorist indicatives) to "ἐπαυσ".
 
 >>> from inflexion.lexicon import Lexicon
 >>> lexicon = Lexicon()
->>> lexicon.add("παύω", {
-...     "P": "παυ",
-...     "I": "ἐπαυ",
-...     "F[AM]": "παυσ",
-...     "A[AM][NPDSO]": "παυσ",
-...     "A[AM]I": "ἐπαυσ",
-...     "XA": "πεπαυκ",
-...     "YA": "ἐπεπαυκ",
-...     "X[MP]": "πεπαυ",
-...     "Y[MP]": "ἐπεπαυ",
-...     "AP[NPDSO]": "παυθ",
-...     "API": "ἐπαυθ",
-...     "FP": "παυθησ",
-... })
+>>> lexicon.add("παύω", [
+...     ("P", "παυ"),
+...     ("I", "ἐπαυ"),
+...     ("F[AM]", "παυσ"),
+...     ("A[AM][NPDSO]", "παυσ"),
+...     ("A[AM]I", "ἐπαυσ"),
+...     ("XA", "πεπαυκ"),
+...     ("YA", "ἐπεπαυκ"),
+...     ("X[MP]", "πεπαυ"),
+...     ("Y[MP]", "ἐπεπαυ"),
+...     ("AP[NPDSO]", "παυθ"),
+...     ("API", "ἐπαυθ"),
+...     ("FP", "παυθησ"),
+... ])
 
 This can then be used to look up a stem (perhaps from
 ``StemmingRuleSet.possible_stems``) to see what lemma and key regex it could

diff --git a/inflexion/lexicon.py b/inflexion/lexicon.py
@@ -12,18 +12,19 @@ def __init__(self):
 
     def add(self, lemma, stems):
         """
-        stems is a dictionary of key regex to stem
+        stems is a list of (key regex, stem) pairs
         """
         self.lemma_to_stems[lemma] = stems
-        for key_regex, stem in stems.items():
+        for key_regex, stem in stems:
             self.stem_to_lemma_key_regex[stem].add((lemma, key_regex))
 
     def find_stems(self, lemma, key):
         """
-        returns a (possibly empty) set of stems for the given lemma and key
+        returns a stem (or None) for the given lemma and key
         """
-        stems = set()
-        for key_regex, stem in self.lemma_to_stems[lemma].items():
+        result = None
+        for key_regex, stem in self.lemma_to_stems[lemma]:
             if re.match(key_regex, key):
-                stems.add(stem)
-        return stems
+                result = stem  # we don't break or return as we want last
+
+        return result
diff --git a/inflexion/main.py b/inflexion/main.py
@@ -16,7 +16,7 @@ def add_stemming_rule_set(self, stemming_rule_set):
     def generate(self, lemma, key):
         stems = set()
         for lexicon in self.lexicons:
-            stems.update(lexicon.find_stems(lemma, key))
+            stems.add(lexicon.find_stems(lemma, key))
 
         results = set()
         for stem in stems:

diff --git a/inflexion/sandhi.py b/inflexion/sandhi.py
@@ -1,3 +1,6 @@
+import re
+
+
 class SandhiRule:
 
     def __init__(self, rule):
@@ -18,12 +21,12 @@ def __repr__(self):
 
     def match_theme(self, stem):
         """
-        If the given stem ends with this rule's stem part, return the theme
+        If the given stem matches this rule's stem part, return the theme
         (which may be more than this rule's theme part if this rule's stem part
         is only the rightmost part of the given stem) or return None if stems
         don't match.
         """
-        if stem.endswith(self.stem):
+        if re.match(".*" + self.stem + "$", stem):
             if self.b:
                 return stem[:-len(self.b)]
             else:

diff --git a/inflexion/stemming.py b/inflexion/stemming.py
@@ -23,8 +23,7 @@ def inflect(self, stem, key):
 
         for rule in self.key_to_rules[key]:
             base = rule.match_theme(stem)
-
-            if base:
+            if base is not None:
                 if rule.stem:
                     base_endings.append((base, rule.distinguisher, rule))
                 else:

diff --git a/test.py b/test.py
@@ -111,24 +111,24 @@ class LexiconTest(unittest.TestCase):
 
     def test_lexicon(self):
         lexicon = Lexicon()
-        lexicon.add("FOO", {"bar": "foo"})
-        self.assertEqual(lexicon.lemma_to_stems["FOO"]["bar"], "foo")
+        lexicon.add("FOO", [("bar", "foo")])
+        self.assertEqual(lexicon.lemma_to_stems["FOO"], [("bar", "foo")])
         self.assertEqual(
             lexicon.stem_to_lemma_key_regex["foo"],
             {("FOO", "bar")}
         )
 
     def test_find_stems(self):
         lexicon = Lexicon()
-        lexicon.add("FOO", {"bar": "foo"})
-        self.assertEqual(lexicon.find_stems("FOO", "barista"), {"foo"})
+        lexicon.add("FOO", [("bar", "foo")])
+        self.assertEqual(lexicon.find_stems("FOO", "barista"), "foo")
 
 
 class MainTest(unittest.TestCase):
 
     def setUp(self):
         lexicon = Lexicon()
-        lexicon.add("FOO", {"bar": "foo"})
+        lexicon.add("FOO", [("bar", "foo")])
         rules = StemmingRuleSet()
         rules.add("barista", "|o><|llow")
         self.inflexion = Inflexion()