Skip to content

Commit

Permalink
stems in lexicon are now sets of stems
Browse files Browse the repository at this point in the history
  • Loading branch information
jtauber committed Apr 26, 2016
1 parent 9ae3f9d commit 2b86643
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 27 deletions.
26 changes: 13 additions & 13 deletions docs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ lexicon
-------

A ``Lexicon`` is currently a mapping between lemmas and stems where stems
are dictionaries mapping key regexes to stems.
are dictionaries mapping key regexes to stem sets.

For example, the stems dictionary in the example below maps (for the lemma
παύω) any key matching ``P`` (i.e. present forms) to the stem "παυ" and any key
Expand All @@ -125,18 +125,18 @@ matching ``A[AM]I`` (i.e. active or middle aorist indicatives) to "ἐπαυσ".
>>> from inflexion.lexicon import Lexicon
>>> lexicon = Lexicon()
>>> lexicon.add("παύω", [
... ("P", "παυ"),
... ("I", "ἐπαυ"),
... ("F[AM]", "παυσ"),
... ("A[AM][NPDSO]", "παυσ"),
... ("A[AM]I", "ἐπαυσ"),
... ("XA", "πεπαυκ"),
... ("YA", "ἐπεπαυκ"),
... ("X[MP]", "πεπαυ"),
... ("Y[MP]", "ἐπεπαυ"),
... ("AP[NPDSO]", "παυθ"),
... ("API", "ἐπαυθ"),
... ("FP", "παυθησ"),
... ("P", {"παυ"}),
... ("I", {"ἐπαυ"}),
... ("F[AM]", {"παυσ"}),
... ("A[AM][NPDSO]", {"παυσ"}),
... ("A[AM]I", {"ἐπαυσ"}),
... ("XA", {"πεπαυκ"}),
... ("YA", {"ἐπεπαυκ"}),
... ("X[MP]", {"πεπαυ"}),
... ("Y[MP]", {"ἐπεπαυ"}),
... ("AP[NPDSO]", {"παυθ"}),
... ("API", {"ἐπαυθ"}),
... ("FP", {"παυθησ"}),
... ])

This can then be used to look up a stem (perhaps from
Expand Down
17 changes: 9 additions & 8 deletions inflexion/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@
class Lexicon:

def __init__(self):
# mapping of lemma to stems (dictionary of key regex to stem)
# mapping of lemma to stems (dictionary of key regex to stem_set)
self.lemma_to_stems = {}
# a reverse mapping of stem to lemma, key regex pairs
self.stem_to_lemma_key_regex = defaultdict(set)

def add(self, lemma, stems):
"""
stems is a list of (key regex, stem) pairs
stems is a list of (key regex, stem_set) pairs
"""
self.lemma_to_stems[lemma] = stems
for key_regex, stem in stems:
self.stem_to_lemma_key_regex[stem].add((lemma, key_regex))
for key_regex, stem_set in stems:
for stem in stem_set:
self.stem_to_lemma_key_regex[stem].add((lemma, key_regex))

def find_stems(self, lemma, key):
"""
returns a stem (or None) for the given lemma and key
returns a (possibly empty) stem_set for the given lemma and key
"""
result = None
for key_regex, stem in self.lemma_to_stems[lemma]:
result = set()
for key_regex, stem_set in self.lemma_to_stems[lemma]:
if re.match(key_regex, key):
result = stem # we don't break or return as we want last
result = stem_set # we don't break or return as we want last

return result
2 changes: 1 addition & 1 deletion inflexion/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def generate(self, lemma, key, tag_filter=None):

stems = set()
for lexicon in self.lexicons:
stems.add(lexicon.find_stems(lemma, key))
stems.update(lexicon.find_stems(lemma, key))

results = set()
for stem in stems:
Expand Down
10 changes: 5 additions & 5 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,24 +130,24 @@ class LexiconTest(unittest.TestCase):

def test_lexicon(self):
lexicon = Lexicon()
lexicon.add("FOO", [("bar", "foo")])
self.assertEqual(lexicon.lemma_to_stems["FOO"], [("bar", "foo")])
lexicon.add("FOO", [("bar", {"foo"})])
self.assertEqual(lexicon.lemma_to_stems["FOO"], [("bar", {"foo"})])
self.assertEqual(
lexicon.stem_to_lemma_key_regex["foo"],
{("FOO", "bar")}
)

def test_find_stems(self):
lexicon = Lexicon()
lexicon.add("FOO", [("bar", "foo")])
self.assertEqual(lexicon.find_stems("FOO", "barista"), "foo")
lexicon.add("FOO", [("bar", {"foo"})])
self.assertEqual(lexicon.find_stems("FOO", "barista"), {"foo"})


class MainTest(unittest.TestCase):

def setUp(self):
lexicon = Lexicon()
lexicon.add("FOO", [("bar", "foo")])
lexicon.add("FOO", [("bar", {"foo"})])
rules = StemmingRuleSet()
rules.add("barista", "|o><|llow")
self.inflexion = Inflexion()
Expand Down

0 comments on commit 2b86643

Please sign in to comment.