diff --git a/docs.rst b/docs.rst index 6273b1c..0ef6a2a 100644 --- a/docs.rst +++ b/docs.rst @@ -116,7 +116,7 @@ lexicon ------- A ``Lexicon`` is currently a mapping between lemmas and stems where stems -are dictionaries mapping key regexes to stems. +are dictionaries mapping key regexes to stem sets. For example, the stems dictionary in the example below maps (for the lemma παύω) any key matching ``P`` (i.e. present forms) to the stem "παυ" and any key @@ -125,18 +125,18 @@ matching ``A[AM]I`` (i.e. active or middle aorist indicatives) to "ἐπαυσ". >>> from inflexion.lexicon import Lexicon >>> lexicon = Lexicon() >>> lexicon.add("παύω", [ -... ("P", "παυ"), -... ("I", "ἐπαυ"), -... ("F[AM]", "παυσ"), -... ("A[AM][NPDSO]", "παυσ"), -... ("A[AM]I", "ἐπαυσ"), -... ("XA", "πεπαυκ"), -... ("YA", "ἐπεπαυκ"), -... ("X[MP]", "πεπαυ"), -... ("Y[MP]", "ἐπεπαυ"), -... ("AP[NPDSO]", "παυθ"), -... ("API", "ἐπαυθ"), -... ("FP", "παυθησ"), +... ("P", {"παυ"}), +... ("I", {"ἐπαυ"}), +... ("F[AM]", {"παυσ"}), +... ("A[AM][NPDSO]", {"παυσ"}), +... ("A[AM]I", {"ἐπαυσ"}), +... ("XA", {"πεπαυκ"}), +... ("YA", {"ἐπεπαυκ"}), +... ("X[MP]", {"πεπαυ"}), +... ("Y[MP]", {"ἐπεπαυ"}), +... ("AP[NPDSO]", {"παυθ"}), +... ("API", {"ἐπαυθ"}), +... ("FP", {"παυθησ"}), ... ]) This can then be used look up a stem (perhaps from diff --git a/inflexion/lexicon.py b/inflexion/lexicon.py index 191aee1..a240ce8 100644 --- a/inflexion/lexicon.py +++ b/inflexion/lexicon.py @@ -5,26 +5,27 @@ class Lexicon: def __init__(self): - # mapping of lemma to stems (dictionary of key regex to stem) + # mapping of lemma to stems (dictionary of key regex to stem_set) self.lemma_to_stems = {} # a reverse mapping of stem to lemma, key regex pairs self.stem_to_lemma_key_regex = defaultdict(set) def add(self, lemma, stems): """ - stems is a list of (key regex, stem) pairs + stems is a list of (key regex, stem_set) pairs """ self.lemma_to_stems[lemma] = stems - for key_regex, stem in stems: - self.stem_to_lemma_key_regex[stem].add((lemma, key_regex)) + for key_regex, stem_set in stems: + for stem in stem_set: + self.stem_to_lemma_key_regex[stem].add((lemma, key_regex)) def find_stems(self, lemma, key): """ - returns a stem (or None) for the given lemma and key + returns a (possibly empty) stem_set for the given lemma and key """ - result = None - for key_regex, stem in self.lemma_to_stems[lemma]: + result = set() + for key_regex, stem_set in self.lemma_to_stems[lemma]: if re.match(key_regex, key): - result = stem # we don't break or return as we want last + result = stem_set # we don't break or return as we want last return result diff --git a/inflexion/main.py b/inflexion/main.py index cf20a99..596a79d 100644 --- a/inflexion/main.py +++ b/inflexion/main.py @@ -19,7 +19,7 @@ def generate(self, lemma, key, tag_filter=None): stems = set() for lexicon in self.lexicons: - stems.add(lexicon.find_stems(lemma, key)) + stems.update(lexicon.find_stems(lemma, key)) results = set() for stem in stems: diff --git a/test.py b/test.py index 0fdb8a1..64d8225 100755 --- a/test.py +++ b/test.py @@ -130,8 +130,8 @@ class LexiconTest(unittest.TestCase): def test_lexicon(self): lexicon = Lexicon() - lexicon.add("FOO", [("bar", "foo")]) - self.assertEqual(lexicon.lemma_to_stems["FOO"], [("bar", "foo")]) + lexicon.add("FOO", [("bar", {"foo"})]) + self.assertEqual(lexicon.lemma_to_stems["FOO"], [("bar", {"foo"})]) self.assertEqual( lexicon.stem_to_lemma_key_regex["foo"], {("FOO", "bar")} @@ -139,15 +139,15 @@ def test_lexicon(self): def test_find_stems(self): lexicon = Lexicon() - lexicon.add("FOO", [("bar", "foo")]) - self.assertEqual(lexicon.find_stems("FOO", "barista"), "foo") + lexicon.add("FOO", [("bar", {"foo"})]) + self.assertEqual(lexicon.find_stems("FOO", "barista"), {"foo"}) class MainTest(unittest.TestCase): def setUp(self): lexicon = Lexicon() - lexicon.add("FOO", [("bar", "foo")]) + lexicon.add("FOO", [("bar", {"foo"})]) rules = StemmingRuleSet() rules.add("barista", "|o><|llow") self.inflexion = Inflexion()