Special analyzers #39

Open
wants to merge 6 commits

1 participant

@kmike
Owner

The idea is to build several specialized analyzers on top of the dictionary, in addition to the main one. For example, a separate analyzer for first names and surnames, which would parse and inflect personal names more accurately, or the same for geographical names. Such analyzers are built from the dictionary words that carry the corresponding grammemes.

There are two usage scenarios:

a) standalone - if the input is known to be, say, a geographical name, the specialized analyzer can be used instead of the general one;
b) integrated with the "main" analyzer - besides the regular hypotheses, extra hypotheses are generated for a word under the assumption that it is a first name / surname / geographical name, e.g. when the word is capitalized or is missing from the dictionary.

More work is needed before this can be merged: as far as I remember, parsing quality was far from ideal. The idea itself still seems right to me.
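A minimal sketch of scenario (a), using the ``geo`` and ``surnames`` properties added in this branch (the exact API may change before the merge):

    import pymorphy2

    morph = pymorphy2.MorphAnalyzer()

    # The specialized analyzers reuse the main analyzer's dictionary,
    # but run only the corresponding specialized unit.
    print(morph.geo.parse('Пушкино'))       # geographical names only
    print(morph.surnames.parse('Пушкина'))  # surnames (mostly Russian)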

Commits on May 5, 2013
  1. Specialized analyzers for names, surnames, patronymic names, geographical names and organization names
Commits on Jun 29, 2013
Commits on Jul 1, 2013
  1. merge changes from master
  2. better variable names
  3. minor presentational tweaks
42 pymorphy2/analyzer.py
@@ -7,6 +7,7 @@
from pymorphy2 import opencorpora_dict
from pymorphy2 import units
+from pymorphy2.utils import cached_property
logger = logging.getLogger(__name__)
@@ -80,7 +81,7 @@ class MorphAnalyzer(object):
with a path to dictionaries, or pass ``path`` argument
to :class:`pymorphy2.MorphAnalyzer` constructor::
- >>> morph = pymorphy2.MorphAnalyzer('/path/to/dictionaries') # doctest: +SKIP
+ >>> morph = pymorphy2.MorphAnalyzer('/path/to/dictionaries') # doctest: +SKIP
By default, methods of this class return parsing results
as namedtuples :class:`Parse`. This has performance implications
@@ -108,10 +109,16 @@ class MorphAnalyzer(object):
units.KnownSuffixAnalyzer,
]
- def __init__(self, path=None, result_type=Parse, units=None):
+ GEO_UNITS = [units.GeoAnalyzer]
+ SURNAME_UNITS = [units.SurnameAnalyzer]
- path = self.choose_dictionary_path(path)
- self.dictionary = opencorpora_dict.Dictionary(path)
+ def __init__(self, path=None, result_type=Parse, units=None, dictionary=None):
+
+ if dictionary:
+ self.dictionary = dictionary
+ else:
+ path = self.choose_dictionary_path(path)
+ self.dictionary = opencorpora_dict.Dictionary(path)
if result_type is not None:
# create a subclass with the same name,
@@ -153,6 +160,27 @@ def choose_dictionary_path(cls, path=None):
"or set %s environment variable.") % cls.ENV_VARIABLE
raise ValueError(msg)
+ @cached_property
+ def geo(self):
+ """
+ MorphAnalyzer instance that should be used for working with
+ geographical names.
+ """
+ return self.__class__(
+ dictionary = self.dictionary,
+ units = self.GEO_UNITS
+ )
+
+ @cached_property
+ def surnames(self):
+ """
+ MorphAnalyzer instance that should be used for working with
+ surnames (mostly Russian).
+ """
+ return self.__class__(
+ dictionary = self.dictionary,
+ units = self.SURNAME_UNITS
+ )
def parse(self, word):
"""
@@ -163,11 +191,14 @@ def parse(self, word):
(or plain tuples if ``result_type=None`` was used in constructor).
"""
+ return self._parse(word, self._units)
+
+ def _parse(self, word, units):
res = []
seen = set()
word_lower = word.lower()
- for analyzer in self._units:
+ for analyzer in units:
res.extend(analyzer.parse(word, word_lower, seen))
if res and analyzer.terminal:
@@ -178,7 +209,6 @@ def parse(self, word):
return [self._result_type(*p) for p in res]
-
def tag(self, word):
res = []
seen = set()
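The new ``dictionary`` constructor argument is what makes the specialized analyzers cheap: several ``MorphAnalyzer`` instances can share one loaded dictionary. A sketch of what ``geo``/``surnames`` do internally (assuming this branch):

    main = pymorphy2.MorphAnalyzer()

    # Reuse the already-loaded dictionary instead of loading it from disk again:
    geo = pymorphy2.MorphAnalyzer(
        dictionary=main.dictionary,
        units=pymorphy2.MorphAnalyzer.GEO_UNITS,
    )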
212 pymorphy2/opencorpora_dict/compile.py
@@ -19,16 +19,17 @@
from pymorphy2 import dawg
from pymorphy2.constants import PARADIGM_PREFIXES, PREDICTION_PREFIXES
-from pymorphy2.utils import longest_common_substring, largest_group
+from pymorphy2.utils import longest_common_substring, largest_elements
logger = logging.getLogger(__name__)
CompiledDictionary = collections.namedtuple(
'CompiledDictionary',
- 'gramtab suffixes paradigms words_dawg prediction_suffixes_dawgs parsed_dict prediction_options'
+ 'gramtab suffixes paradigms words_dawg prediction_suffixes_dawgs extra_prediction_dawgs parsed_dict prediction_options'
)
+EXTRA_GRAMMEMES_FOR_PREDICTION = ['Name', 'Surn', 'Patr', 'Geox', 'Orgn']
def convert_to_pymorphy2(opencorpora_dict_path, out_path, overwrite=False,
prediction_options=None):
@@ -68,8 +69,8 @@ def compile_parsed_dict(parsed_dict, prediction_options=None):
paradigms = []
words = []
- seen_tags = dict()
- seen_paradigms = dict()
+ tag_ids = dict()
+ paradigms_ids = dict()
logger.info("inlining lexeme derivational rules...")
lexemes = _join_lexemes(parsed_dict.lexemes, parsed_dict.links)
@@ -84,18 +85,18 @@ def compile_parsed_dict(parsed_dict, prediction_options=None):
# build gramtab
for suff, tag, pref in paradigm:
- if tag not in seen_tags:
- seen_tags[tag] = len(gramtab)
+ if tag not in tag_ids:
+ tag_ids[tag] = len(gramtab)
gramtab.append(tag)
# build paradigm index
- if paradigm not in seen_paradigms:
- seen_paradigms[paradigm] = len(paradigms)
+ if paradigm not in paradigms_ids:
+ paradigms_ids[paradigm] = len(paradigms)
paradigms.append(
- tuple([(suff, seen_tags[tag], pref) for suff, tag, pref in paradigm])
+ tuple([(suff, tag_ids[tag], pref) for suff, tag, pref in paradigm])
)
- para_id = seen_paradigms[paradigm]
+ para_id = paradigms_ids[paradigm]
paradigm_popularity[para_id] += 1
for idx, (suff, tag, pref) in enumerate(paradigm):
@@ -117,7 +118,7 @@ def get_form(para):
forms = [get_form(para) for para in paradigms]
suffixes = sorted(set(list(itertools.chain(*forms))))
- suffixes_dict = dict(
+ suffix_ids = dict(
(suff, index)
for index, suff in enumerate(suffixes)
)
@@ -127,31 +128,56 @@ def fix_strings(paradigm):
para = []
for suff, tag, pref in paradigm:
para.append(
- (suffixes_dict[suff], tag, PARADIGM_PREFIXES.index(pref))
+ (suffix_ids[suff], tag, PARADIGM_PREFIXES.index(pref))
)
return para
paradigms = (fix_strings(para) for para in paradigms)
paradigms = [_linearized_paradigm(paradigm) for paradigm in paradigms]
- logger.debug('calculating prediction data..')
+ logger.debug('calculating main prediction data..')
suffixes_dawgs_data = _suffixes_prediction_data(
- words, paradigm_popularity, gramtab, paradigms, suffixes, **_prediction_options
+ words, paradigm_popularity, gramtab, paradigms, suffixes,
+ _POS_tags(gramtab),
+ **_prediction_options
)
+ aux_dawgs_data = {}
+ aux_prediction_options = dict(
+ min_ending_freq = 2,
+ min_paradigm_popularity = 1,
+ max_suffix_length = _prediction_options['max_suffix_length'],
+ max_parses_per_grammeme = 4,
+ )
+ for grammeme in EXTRA_GRAMMEMES_FOR_PREDICTION:
+ logger.debug('calculating auxiliary prediction data for %s..', grammeme)
+
+ aux_dawgs_data[grammeme] = _suffixes_prediction_data(
+ words, paradigm_popularity, gramtab, paradigms, suffixes,
+ set([grammeme]),
+ **aux_prediction_options
+ )
+
+ # print(aux_dawgs_data)
+
logger.debug('building word DAWG..')
words_dawg = dawg.WordsDawg(words)
del words
- prediction_suffixes_dawgs = []
- for prefix_id, dawg_data in enumerate(suffixes_dawgs_data):
- logger.debug('building prediction_suffixes DAWGs #%d..' % prefix_id)
- prediction_suffixes_dawgs.append(dawg.PredictionSuffixesDAWG(dawg_data))
+ logger.debug('building prediction_suffixes DAWGs..')
+ prediction_suffixes_dawgs = [dawg.PredictionSuffixesDAWG(d) for d in suffixes_dawgs_data]
+
+ logger.debug('building extra prediction DAWGs..')
+
+ extra_prediction_dawgs = {}
+ for grammeme, data in aux_dawgs_data.items():
+ extra_prediction_dawgs[grammeme] = [dawg.PredictionSuffixesDAWG(d) for d in data]
return CompiledDictionary(tuple(gramtab), suffixes, paradigms,
- words_dawg, prediction_suffixes_dawgs, parsed_dict,
- _prediction_options)
+ words_dawg, prediction_suffixes_dawgs,
+ extra_prediction_dawgs,
+ parsed_dict, _prediction_options)
def _join_lexemes(lexemes, links):
@@ -238,38 +264,97 @@ def _to_paradigm(lexeme):
return stem, tuple(zip(suffixes, tags, prefixes))
-def _suffixes_prediction_data(words, paradigm_popularity, gramtab, paradigms, suffixes,
- min_ending_freq, min_paradigm_popularity, max_suffix_length):
+def _suffixes_prediction_data(words, paradigm_popularity, gramtab, paradigms, suffixes, grammemes,
+ min_ending_freq, min_paradigm_popularity, max_suffix_length,
+ max_parses_per_grammeme=1):
- logger.debug('calculating prediction data: removing non-productive paradigms..')
- productive_paradigms = set(
- para_id
- for (para_id, count) in paradigm_popularity.items()
- if count >= min_paradigm_popularity
- )
+ productive_paradigms = _popular_paradigms(paradigm_popularity, min_paradigm_popularity)
- # ["suffix"] => number of occurrences
- # this is for removing non-productive suffixes
- ending_counts = collections.defaultdict(int)
+ def iter_words():
+ for word, (para_id, idx) in _show_progress(words, 1e6):
+ if para_id not in productive_paradigms:
+ continue
+ yield word, (para_id, idx)
+
+ logger.debug('collecting word suffixes statistics..')
+ words_info = _iter_words_info(iter_words(), paradigms, gramtab, suffixes)
+ ending_counts, endings = _ending_stats(words_info, max_suffix_length, grammemes)
- # [form_prefix_id]["suffix"]["POS"][(para_id, idx)] => number or occurrences
- # this is for selecting most popular parses
+ # logger.debug('preparing data for DAWGs building..') # it is fast
+ dawgs_data = []
+ for form_prefix_id in sorted(endings.keys()):
+ _endings = endings[form_prefix_id]
+
+ counted_suffixes_dawg_data = []
+
+ for word_end in _endings:
+ if ending_counts[word_end] < min_ending_freq:
+ continue
+
+ for grammeme in _endings[word_end]:
+ common_endings = largest_elements(
+ _endings[word_end][grammeme].items(),
+ operator.itemgetter(1),
+ max_parses_per_grammeme
+ )
+
+ for form, cnt in common_endings:
+ record = word_end, (cnt,) + form
+ counted_suffixes_dawg_data.append(record)
+
+
+ dawgs_data.append(counted_suffixes_dawg_data)
+
+ return dawgs_data
+
+
+def _ending_stats(words_info, max_suffix_length, interesting_grammemes):
+ """
+ Return (ending_counts, endings) tuple.
+
+ ending_counts: ["suffix"] => number of occurrences
+ it is for removing non-productive suffixes
+
+ endings: [form_prefix_id]["suffix"]["grammeme"][(para_id, idx)] => number of occurrences
+ it is for selecting the most popular parses
+
+ """
endings = {}
for form_prefix_id in range(len(PARADIGM_PREFIXES)):
endings[form_prefix_id] = collections.defaultdict(
lambda: collections.defaultdict(
lambda: collections.defaultdict(int)))
+ ending_counts = collections.defaultdict(int)
+ interesting_grammemes = set(interesting_grammemes)
- logger.debug('calculating prediction data: checking word endings..')
- for word, (para_id, idx) in words:
-
- if para_id not in productive_paradigms:
+ for word, tag, form_prefix, form_suffix, form_prefix_id, para_id, idx in words_info:
+ grammemes = set(_to_grammemes(tag)) & interesting_grammemes
+ if not grammemes:
continue
- paradigm = paradigms[para_id]
+ _endings = endings[form_prefix_id]
- form_count = len(paradigm) // 3
+ for word_end in _iter_prediction_suffixes(word, form_suffix, max_suffix_length):
+ ending_counts[word_end] += 1
+ for grammeme in grammemes:
+ _endings[word_end][grammeme][(para_id, idx)] += 1
+
+ return ending_counts, endings
+
+
+def _popular_paradigms(paradigm_popularity, min_count):
+ return set(
+ para_id
+ for (para_id, count) in paradigm_popularity.items()
+ if count >= min_count
+ )
+
+
+def _iter_words_info(words, paradigms, gramtab, suffixes):
+ for word, (para_id, idx) in words:
+ paradigm = paradigms[para_id]
+ form_count = len(paradigm) // 3
tag = gramtab[paradigm[form_count + idx]]
form_prefix_id = paradigm[2*form_count + idx]
form_prefix = PARADIGM_PREFIXES[form_prefix_id]
@@ -283,40 +368,15 @@ def _suffixes_prediction_data(words, paradigm_popularity, gramtab, paradigms, su
# pseudo-paradigms are useless for prediction
continue
- POS = tuple(tag.replace(' ', ',', 1).split(','))[0]
+ yield word, tag, form_prefix, form_suffix, form_prefix_id, para_id, idx
- for i in range(max(len(form_suffix), 1), max_suffix_length+1): #was: 1,2,3,4,5
- word_end = word[-i:]
- ending_counts[word_end] += 1
- endings[form_prefix_id][word_end][POS][(para_id, idx)] += 1
+def _to_grammemes(tag):
+ return tag.replace(' ', ',', 1).split(',')
- dawgs_data = []
- for form_prefix_id in sorted(endings.keys()):
- logger.debug('calculating prediction data: preparing DAWGs data #%d..' % form_prefix_id)
- counted_suffixes_dawg_data = []
- endings_with_prefix = endings[form_prefix_id]
-
- for suff in endings_with_prefix:
- if ending_counts[suff] < min_ending_freq:
- continue
-
- for POS in endings_with_prefix[suff]:
-
- common_endings = largest_group(
- endings_with_prefix[suff][POS].items(),
- operator.itemgetter(1)
- )
-
- for form, cnt in common_endings:
- counted_suffixes_dawg_data.append(
- (suff, (cnt,)+ form)
- )
-
- dawgs_data.append(counted_suffixes_dawg_data)
-
- return dawgs_data
+def _POS_tags(gramtab):
+ return set(_to_grammemes(tag)[0] for tag in gramtab)
def _linearized_paradigm(paradigm):
@@ -339,3 +399,17 @@ def _create_out_path(out_path, overwrite=False):
return False
return True
+
+def _iter_prediction_suffixes(word, form_suffix, max_suffix_length):
+ min_length = max(len(form_suffix), 1)
+ for i in range(min_length, max_suffix_length+1):
+ yield word[-i:]
+
+
+def _show_progress(iterator, print_every):
+ """ Print "NUM done" message every ``print_every`` iteration. """
+ for index, el in enumerate(iterator):
+ if index and not (index % int(print_every)):
+ logger.debug("%d done", index)
+ yield el
+
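For reference, a doctest-style sketch of what ``_iter_prediction_suffixes`` yields; it enumerates word endings from the inflection-suffix length up to ``max_suffix_length``:

    >>> list(_iter_prediction_suffixes('москва', 'а', 5))
    ['а', 'ва', 'ква', 'сква', 'осква']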
36 pymorphy2/opencorpora_dict/storage.py
@@ -29,7 +29,7 @@
LoadedDictionary = collections.namedtuple(
'LoadedDictionary',
- 'meta, gramtab, suffixes, paradigms, words, prediction_prefixes, prediction_suffixes_dawgs, Tag, paradigm_prefixes'
+ 'meta, gramtab, suffixes, paradigms, words, prediction_prefixes, prediction_suffixes_dawgs, extra_prediction_dawgs, Tag, paradigm_prefixes'
)
@@ -59,14 +59,22 @@ def load_dict(path, gramtab_format='opencorpora-int'):
prediction_suffixes_dawgs = []
for prefix_id in range(len(paradigm_prefixes)):
fn = _f('prediction-suffixes-%s.dawg' % prefix_id)
-
assert os.path.exists(fn)
-
prediction_suffixes_dawgs.append(dawg.PredictionSuffixesDAWG().load(fn))
+ extra_prediction_dawgs = {}
+ for grammeme in meta.get('extra_prediction_dawg_lengths', []):
+ extra_prediction_dawgs[grammeme] = []
+ for prefix_id in range(len(paradigm_prefixes)):
+ fn = _f('%s-prediction-suffixes-%s.dawg' % (grammeme.lower(), prefix_id))
+ assert os.path.exists(fn)
+ d = dawg.PredictionSuffixesDAWG().load(fn)
+ extra_prediction_dawgs[grammeme].append(d)
+
+
return LoadedDictionary(meta, gramtab, suffixes, paradigms, words,
prediction_prefixes, prediction_suffixes_dawgs,
- Tag, paradigm_prefixes)
+ extra_prediction_dawgs, Tag, paradigm_prefixes)
def save_compiled_dict(compiled_dict, out_path):
@@ -80,12 +88,12 @@ def save_compiled_dict(compiled_dict, out_path):
json_write(_f('grammemes.json'), compiled_dict.parsed_dict.grammemes)
gramtab_formats = {}
- for format, Tag in tagset.registry.items():
+ for format_, Tag in tagset.registry.items():
Tag._init_grammemes(compiled_dict.parsed_dict.grammemes)
new_gramtab = [Tag._from_internal_tag(tag) for tag in compiled_dict.gramtab]
- gramtab_name = "gramtab-%s.json" % format
- gramtab_formats[format] = gramtab_name
+ gramtab_name = "gramtab-%s.json" % format_
+ gramtab_formats[format_] = gramtab_name
json_write(_f(gramtab_name), new_gramtab)
@@ -101,6 +109,11 @@ def save_compiled_dict(compiled_dict, out_path):
for prefix_id, prediction_suffixes_dawg in enumerate(compiled_dict.prediction_suffixes_dawgs):
prediction_suffixes_dawg.save(_f('prediction-suffixes-%s.dawg' % prefix_id))
+ for grammeme, dawgs in compiled_dict.extra_prediction_dawgs.items():
+ for prefix_id, prediction_suffixes_dawg in enumerate(dawgs):
+ fname = '%s-prediction-suffixes-%s.dawg' % (grammeme.lower(), prefix_id)
+ prediction_suffixes_dawg.save(_f(fname))
+
dawg.DAWG(PREDICTION_PREFIXES).save(_f('prediction-prefixes.dawg'))
json_write(_f('paradigm-prefixes.json'), PARADIGM_PREFIXES)
@@ -118,6 +131,12 @@ def _dawg_len(dawg):
for prediction_suffixes_dawg in compiled_dict.prediction_suffixes_dawgs:
prediction_suffixes_dawg_lenghts.append(_dawg_len(prediction_suffixes_dawg))
+ logger.debug(' extra_prediction_dawg_lengths')
+ extra_dawg_lengths = dict(
+ (grammeme, _dawg_len(d[0]))
+ for grammeme, d in compiled_dict.extra_prediction_dawgs.items()
+ )
+
meta = [
['format_version', CURRENT_FORMAT_VERSION],
['pymorphy2_version', pymorphy2.__version__],
@@ -139,6 +158,7 @@ def _dawg_len(dawg):
['prediction_suffixes_dawg_lengths', prediction_suffixes_dawg_lenghts],
['prediction_prefixes_dawg_length', len(PREDICTION_PREFIXES)],
['paradigm_prefixes_length', len(PARADIGM_PREFIXES)],
+ ['extra_prediction_dawg_lengths', extra_dawg_lengths],
]
json_write(_f('meta.json'), meta, indent=4)
@@ -161,7 +181,7 @@ def _load_tag_class(gramtab_format, grammemes_filename):
Tag = tagset.registry[gramtab_format]
- # FIXME: clone the class
+ # FIXME: clone the class?
# Tag = type(Tag.__name__, (Tag,), {
# 'KNOWN_GRAMMEMES': Tag.KNOWN_GRAMMEMES.copy(),
# })
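With the default ``EXTRA_GRAMMEMES_FOR_PREDICTION`` list, a compiled dictionary directory would gain files along these lines (a sketch; the names follow ``save_compiled_dict`` above, one DAWG per grammeme and paradigm prefix id):

    name-prediction-suffixes-0.dawg
    surn-prediction-suffixes-0.dawg
    patr-prediction-suffixes-0.dawg
    geox-prediction-suffixes-0.dawg
    orgn-prediction-suffixes-0.dawg
    ...
    meta.json   # gains an 'extra_prediction_dawg_lengths' entry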
9 pymorphy2/opencorpora_dict/wrapper.py
@@ -27,6 +27,7 @@ def __init__(self, path):
self.words = self._data.words
self.prediction_prefixes = self._data.prediction_prefixes
self.prediction_suffixes_dawgs = self._data.prediction_suffixes_dawgs
+ self.extra_prediction_dawgs = self._data.extra_prediction_dawgs
self.meta = self._data.meta
self.Tag = self._data.Tag
@@ -43,7 +44,6 @@ def build_tag_info(self, para_id, idx):
tag_id = paradigm[tag_info_offset + idx]
return self.gramtab[tag_id]
-
def build_paradigm_info(self, para_id):
"""
Return a list of
@@ -67,7 +67,6 @@ def build_paradigm_info(self, para_id):
)
return res
-
def build_normal_form(self, para_id, idx, fixed_word):
"""
Build a normal form.
@@ -89,7 +88,6 @@ def build_normal_form(self, para_id, idx, fixed_word):
return normal_prefix + stem + normal_suffix
-
def build_stem(self, paradigm, idx, fixed_word):
"""
Return word stem (given a word, paradigm and the word index).
@@ -107,7 +105,6 @@ def build_stem(self, paradigm, idx, fixed_word):
else:
return fixed_word[len(prefix):]
-
def word_is_known(self, word, strict_ee=False):
"""
Check if a ``word`` is in the dictionary.
@@ -127,7 +124,6 @@ def word_is_known(self, word, strict_ee=False):
else:
return bool(self.words.similar_keys(word, self.ee))
-
def iter_known_words(self, prefix=""):
"""
Return an iterator over ``(word, tag, normal_form, para_id, idx)``
@@ -140,8 +136,7 @@ def iter_known_words(self, prefix=""):
normal_form = self.build_normal_form(para_id, idx, word)
yield word, tag, normal_form, para_id, idx
-
def __repr__(self):
- return str("<%s>") % self.__class__.__name__
+ return str("%s(%r)") % (self.__class__.__name__, self.path)
8 pymorphy2/units/__init__.py
@@ -3,11 +3,13 @@
from .by_lookup import DictionaryAnalyzer
-from .by_analogy import (KnownPrefixAnalyzer, KnownSuffixAnalyzer,
- UnknownPrefixAnalyzer)
+from .by_analogy import (
+ KnownPrefixAnalyzer, KnownSuffixAnalyzer, UnknownPrefixAnalyzer,
+ NameAnalyzer, SurnameAnalyzer, PatronymicAnalyzer,
+ OrganizationAnalyzer, GeoAnalyzer)
from .by_hyphen import (HyphenatedWordsAnalyzer, HyphenAdverbAnalyzer,
HyphenSeparatedParticleAnalyzer)
from .by_shape import (LatinAnalyzer, PunctuationAnalyzer, NumberAnalyzer,
- RomanNumberAnalyzer)
+ RomanNumberAnalyzer)
54 pymorphy2/units/by_analogy.py
@@ -177,7 +177,7 @@ def parse(self, word, word_lower, seen_parses):
# or maybe use a proper discounting?
total_counts = [1] * len(self._paradigm_prefixes)
- for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower):
+ for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word, word_lower):
for i in self._prediction_splits:
@@ -193,6 +193,7 @@ def parse(self, word, word_lower, seen_parses):
tag = self.dict.build_tag_info(para_id, idx)
# skip non-productive tags
+ # XXX: move this check to dictionary compilation step?
if not tag.is_productive():
continue
total_counts[prefix_id] += cnt
@@ -228,7 +229,7 @@ def tag(self, word, word_lower, seen_tags):
# ``self.parse(...)``.
result = []
- for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower):
+ for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word, word_lower):
for i in self._prediction_splits:
@@ -240,9 +241,9 @@ def tag(self, word, word_lower, seen_tags):
for fixed_suffix, parses in para_data:
for cnt, para_id, idx in parses:
-
tag = self.dict.build_tag_info(para_id, idx)
+ # XXX: move this check to dictionary compilation step?
if not tag.is_productive():
continue
@@ -258,10 +259,53 @@ def tag(self, word, word_lower, seen_tags):
result.sort(reverse=True)
return [tag for cnt, tag in result]
- def _possible_prefixes(self, word):
+ def _possible_prefixes(self, word, word_lower):
for prefix_id, prefix in self._paradigm_prefixes:
- if not word.startswith(prefix):
+ if not word_lower.startswith(prefix):
continue
suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id]
yield prefix_id, prefix, suffixes_dawg
+
+
+
+class _SpecialKnownSuffixAnalyzer(KnownSuffixAnalyzer):
+ """
+ Parse a word by checking how words with similar suffixes
+ are parsed, assuming the word carries some predefined GRAMMEME
+ (which should be set in subclasses).
+
+ This class makes it possible to create specialized predictors for
+ given grammemes (e.g. for surnames or geographical names).
+
+ Prediction data for the grammeme must be available in the dictionary.
+ """
+ GRAMMEME = None
+ ESTIMATE_DECAY = 0.6
+
+ def _possible_prefixes(self, word, word_lower):
+ if not word:
+ return []
+
+ #if not word[0].isupper():
+ # # only run for title-cased words
+ # return []
+
+ suffixes_dawg = self.dict.extra_prediction_dawgs[self.GRAMMEME][0]
+ return [(0, '', suffixes_dawg)]
+
+
+class NameAnalyzer(_SpecialKnownSuffixAnalyzer):
+ GRAMMEME = 'Name'
+
+class SurnameAnalyzer(_SpecialKnownSuffixAnalyzer):
+ GRAMMEME = 'Surn'
+
+class PatronymicAnalyzer(_SpecialKnownSuffixAnalyzer):
+ GRAMMEME = 'Patr'
+
+class GeoAnalyzer(_SpecialKnownSuffixAnalyzer):
+ GRAMMEME = 'Geox'
+
+class OrganizationAnalyzer(_SpecialKnownSuffixAnalyzer):
+ GRAMMEME = 'Orgn'
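Adding another specialized predictor should then only take a new subclass, provided prediction data for its grammeme was compiled into the dictionary (i.e. the grammeme is listed in ``EXTRA_GRAMMEMES_FOR_PREDICTION``). A hypothetical sketch:

    class TradeMarkAnalyzer(_SpecialKnownSuffixAnalyzer):
        # 'Trad' (the OpenCorpora trademark grammeme) is hypothetical here:
        # it is NOT in EXTRA_GRAMMEMES_FOR_PREDICTION, so its prediction
        # DAWGs would have to be compiled into the dictionary first.
        GRAMMEME = 'Trad'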
31 pymorphy2/utils.py
@@ -7,6 +7,7 @@
import itertools
import codecs
import json
+import heapq
try:
from urllib.request import urlopen
@@ -93,18 +94,22 @@ def json_read(filename, **json_options):
return json.load(f, **json_options)
-def largest_group(iterable, key):
+def largest_elements(iterable, key, n=1):
"""
- Find a group of largest elements (according to ``key``).
+ Return a list of largest elements (according to ``key``).
>>> s = [-4, 3, 5, 7, 4, -7]
- >>> largest_group(s, abs)
+ >>> largest_elements(s, abs, 1)
[7, -7]
+ >>> largest_elements(s, abs, 2)
+ [5, 7, -7]
+ >>> largest_elements(s, abs, 3)
+ [-4, 5, 7, 4, -7]
"""
it1, it2 = itertools.tee(iterable)
- max_key = max(map(key, it1))
- return [el for el in it2 if key(el) == max_key]
+ top_keys = set(heapq.nlargest(n, set(map(key, it1))))
+ return [el for el in it2 if key(el) in top_keys]
def word_splits(word, min_reminder=3, max_prefix_length=5):
@@ -115,3 +120,19 @@ def word_splits(word, min_reminder=3, max_prefix_length=5):
max_split = min(max_prefix_length, len(word)-min_reminder)
split_indexes = range(1, 1+max_split)
return [(word[:i], word[i:]) for i in split_indexes]
+
+
+# from bottle.py
+class cached_property(object):
+ ''' A property that is only computed once per instance and then replaces
+ itself with an ordinary attribute. Deleting the attribute resets the
+ property. '''
+
+ def __init__(self, func):
+ self.__doc__ = getattr(func, '__doc__')
+ self.func = func
+
+ def __get__(self, obj, cls):
+ if obj is None: return self
+ value = obj.__dict__[self.func.__name__] = self.func(obj)
+ return value
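A small sketch of how the ``cached_property`` recipe behaves:

    class Example(object):
        @cached_property
        def value(self):
            print('computed')
            return 42

    e = Example()
    e.value      # prints 'computed', returns 42
    e.value      # returns 42; the function is not called again
    del e.value  # resets the cache; the next access recomputes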
9 tests/test_opencorpora_dict.py
@@ -2,15 +2,14 @@
from __future__ import absolute_import, unicode_literals
import os
+import pytest
+
import pymorphy2
-from pymorphy2.opencorpora_dict.compile import (_to_paradigm,
- convert_to_pymorphy2)
+from pymorphy2.opencorpora_dict.compile import (_to_paradigm, convert_to_pymorphy2)
from pymorphy2.opencorpora_dict.parse import parse_opencorpora_xml
from pymorphy2.dawg import assert_can_create
from pymorphy2.test_suite_generator import make_test_suite
-import pytest
-
class TestToyDictionary:
@@ -58,6 +57,8 @@ def test_convert_to_pymorphy2(self, tmpdir):
# use it
morph = pymorphy2.MorphAnalyzer(out_path)
assert morph.tag('ёжиться') == [morph.TagClass('INFN,impf,intr')]
+ assert morph.tag('корёжиться') == [morph.TagClass('INFN,impf,intr')]
+ assert morph.tag('коржиться') == [morph.TagClass('INFN,impf,intr')]
def test_test_suite_generator(self, tmpdir):
# just make sure it doesn't raise an exception