convert predictors.py to 'pymorphy2.units' package

commit 948bd7adfa56f439dc9cb53b0432093a77cbb712 (parent: 71b3bab)
@kmike authored
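
This commit is a rename and a split: the single pymorphy2.predictors module becomes the pymorphy2.units package, and every *Predictor class becomes an *Analyzer unit. The old-to-new mapping, as reconstructed from the diff below:

* PunctuationPredictor -> units.by_shape.PunctuationAnalyzer
* LatinPredictor -> units.by_shape.LatinAnalyzer
* HyphenSeparatedParticlePredictor -> units.by_hyphen.HyphenSeparatedParticleAnalyzer
* HyphenatedWordsPredictor -> units.by_hyphen.HyphenatedWordsAnalyzer
* KnownPrefixPredictor -> units.by_analogy.KnownPrefixAnalyzer
* UnknownPrefixPredictor -> units.by_analogy.UnknownPrefixAnalyzer
* KnownSuffixPredictor -> units.by_analogy.KnownSuffixAnalyzer

One structural change rides along: HyphenatedWordsAnalyzer now subclasses HyphenSeparatedParticleAnalyzer instead of the base class.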
docs/misc/api_reference.rst (12 lines changed)
@@ -8,9 +8,17 @@ Morphological Analyzer
:members:
:undoc-members:
-.. automodule:: pymorphy2.predictors
+Analyzer units
+~~~~~~~~~~~~~~
+
+.. automodule:: pymorphy2.units.by_analogy
+ :members:
+
+.. automodule:: pymorphy2.units.by_shape
+ :members:
+
+.. automodule:: pymorphy2.units.by_hyphen
:members:
- :undoc-members:
Tagset
------
pymorphy2/analyzer.py (40 lines changed)
@@ -6,7 +6,7 @@
import logging
from pymorphy2 import opencorpora_dict
-from pymorphy2 import predictors
+from pymorphy2 import units
logger = logging.getLogger(__name__)
@@ -309,18 +309,18 @@ class MorphAnalyzer(object):
"""
ENV_VARIABLE = 'PYMORPHY2_DICT_PATH'
- DEFAULT_PREDICTORS = [
- predictors.PunctuationPredictor,
- predictors.LatinPredictor,
- predictors.HyphenSeparatedParticlePredictor,
- predictors.HyphenatedWordsPredictor,
- predictors.KnownPrefixPredictor,
- predictors.UnknownPrefixPredictor,
- predictors.KnownSuffixPredictor,
+ DEFAULT_UNITS = [
+ units.PunctuationAnalyzer,
+ units.LatinAnalyzer,
+ units.HyphenSeparatedParticleAnalyzer,
+ units.HyphenatedWordsAnalyzer,
+ units.KnownPrefixAnalyzer,
+ units.UnknownPrefixAnalyzer,
+ units.KnownSuffixAnalyzer,
]
DEFAULT_DICTIONARY_CLASS = Dictionary
- def __init__(self, path=None, result_type=Parse, predictors=None,
+ def __init__(self, path=None, result_type=Parse, units=None,
dictionary_class=None):
if dictionary_class is None:
@@ -339,11 +339,11 @@ def __init__(self, path=None, result_type=Parse, predictors=None,
else:
self._result_type = None
- # initialize predictors
- if predictors is None:
- predictors = self.DEFAULT_PREDICTORS
+ # initialize units
+ if units is None:
+ units = self.DEFAULT_UNITS
- self._predictors = [cls(self) for cls in predictors]
+ self._units = [cls(self) for cls in units]
@classmethod
@@ -378,10 +378,10 @@ def parse(self, word):
if not res:
seen = set()
- for predictor in self._predictors:
- res.extend(predictor.parse(word, seen))
+ for unit in self._units:
+ res.extend(unit.parse(word, seen))
- if res and predictor.terminal:
+ if res and unit.terminal:
break
if self._result_type is None:
@@ -396,10 +396,10 @@ def tag(self, word):
if not res:
seen = set()
- for predictor in self._predictors:
- res.extend(predictor.tag(word, seen))
+ for unit in self._units:
+ res.extend(unit.tag(word, seen))
- if res and predictor.terminal:
+ if res and unit.terminal:
break
return res
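
The public constructor keyword changes accordingly, from predictors to units. A minimal sketch of the renamed API, assuming the pymorphy2_dicts dictionary package (listed in setup.py below) is installed; note that unit *classes*, not instances, are passed, matching the `self._units = [cls(self) for cls in units]` line above:

import pymorphy2
from pymorphy2 import units

# Default behaviour: all DEFAULT_UNITS are tried in order; a terminal
# unit that produces a result stops the chain.
morph = pymorphy2.MorphAnalyzer()

# Custom behaviour: restrict fallback analysis to the shape units.
morph_shapes_only = pymorphy2.MorphAnalyzer(
    units=[units.PunctuationAnalyzer, units.LatinAnalyzer]
)

print(morph.parse('кошка'))          # dictionary-based parses
print(morph_shapes_only.tag('pdf'))  # the LATN tag from LatinAnalyzer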
pymorphy2/predictors.py (516 lines removed; file deleted)
@@ -1,516 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import, unicode_literals, division
-import operator
-import logging
-
-from .utils import word_splits
-from .shapes import is_latin, is_punctuation
-
-logger = logging.getLogger(__name__)
-
-__all__ = [
- "KnownPrefixPredictor",
- "UnknownPrefixPredictor",
- "KnownSuffixPredictor",
- "HyphenSeparatedParticlePredictor",
- "HyphenatedWordsPredictor",
- "PunctuationPredictor",
- "LatinPredictor",
-]
-
-class BasePredictor(object):
-
- terminal = False
-
- def __init__(self, morph):
- """
- @type morph: pymorphy2.analyzer.MorphAnalyzer
- @type self.dict: pymorphy2.analyzer.Dictionary
- """
- self.morph = morph
- self.dict = morph.dictionary
-
- def parse(self, word, seen_parses):
- raise NotImplementedError()
-
- def tag(self, word, seen_tags):
- raise NotImplementedError()
-
- def get_lexeme(self, form, methods):
- # be default, predictor gets a lexeme from a previous predictor:
- assert methods[-1][0] is self
- if len(methods) == 1:
- return self.dict.get_lexeme(form, [])
-
- assert len(methods) > 1, len(methods)
- previous_predictor = methods[-2][0]
- return previous_predictor.get_lexeme(form, methods[:-1])
-
- def normalized(self, form):
- return self.dict.normalized(form)
-
- def __repr__(self):
- return str("<%s>") % self.__class__.__name__
-
-
-class KnownPrefixPredictor(BasePredictor):
- """
- Parse the word by checking if it starts with a known prefix
- and parsing the reminder.
- """
-
- terminal = True
- ESTIMATE_DECAY = 0.75
- MIN_REMINDER_LENGTH = 3
-
- def _word_prefixes(self, word):
- return sorted(
- self.dict.prediction_prefixes.prefixes(word),
- key=len,
- reverse=True,
- )
-
- def parse(self, word, seen_parses):
- result = []
- for prefix in self._word_prefixes(word):
- unprefixed_word = word[len(prefix):]
-
- if len(unprefixed_word) < self.MIN_REMINDER_LENGTH:
- continue
-
- method = (self, prefix)
-
- for fixed_word, tag, normal_form, para_id, idx, estimate, methods in self.morph.parse(unprefixed_word):
-
- if not tag.is_productive():
- continue
-
- parse = (
- prefix+fixed_word, tag, prefix+normal_form,
- para_id, idx, estimate*self.ESTIMATE_DECAY,
- methods+(method,)
- )
-
- _add_parse_if_not_seen(parse, result, seen_parses)
-
- return result
-
- def tag(self, word, seen_tags):
- result = []
- for prefix in self._word_prefixes(word):
- unprefixed_word = word[len(prefix):]
-
- if len(unprefixed_word) < self.MIN_REMINDER_LENGTH:
- continue
-
- for tag in self.morph.tag(unprefixed_word):
- if not tag.is_productive():
- continue
- _add_tag_if_not_seen(tag, result, seen_tags)
-
- return result
-
-
-class UnknownPrefixPredictor(BasePredictor):
- """
- Parse the word by parsing only the word suffix
- (with restrictions on prefix & suffix lengths).
- """
- terminal = False
- ESTIMATE_DECAY = 0.5
-
- def parse(self, word, seen_parses):
- result = []
- for prefix, unprefixed_word in word_splits(word):
-
- method = (self, prefix)
-
- for fixed_word, tag, normal_form, para_id, idx, estimate, methods in self.dict.parse(unprefixed_word):
-
- if not tag.is_productive():
- continue
-
- parse = (prefix+fixed_word, tag, prefix+normal_form,
- para_id, idx, estimate*self.ESTIMATE_DECAY,
- methods+(method,))
- _add_parse_if_not_seen(parse, result, seen_parses)
-
- return result
-
- def tag(self, word, seen_tags):
- result = []
- for _, unprefixed_word in word_splits(word):
- for tag in self.dict.tag(unprefixed_word):
-
- if not tag.is_productive():
- continue
-
- _add_tag_if_not_seen(tag, result, seen_tags)
-
- return result
-
-
-class KnownSuffixPredictor(BasePredictor):
- """
- Parse the word by checking how the words with similar suffixes
- are parsed.
- """
- terminal = False
- ESTIMATE_DECAY = 0.5
-
- def __init__(self, morph):
- super(KnownSuffixPredictor, self).__init__(morph)
-
- self._paradigm_prefixes = list(reversed(list(enumerate(self.dict.paradigm_prefixes))))
- max_suffix_length = self.dict.meta['prediction_options']['max_suffix_length']
- self._prediction_splits = list(reversed(range(1, max_suffix_length+1)))
-
-
- def parse(self, word, seen_parses):
- result = []
-
- # smoothing; XXX: isn't max_cnt better?
- total_counts = [1] * len(self._paradigm_prefixes)
-
- for prefix_id, prefix in self._paradigm_prefixes:
-
- if not word.startswith(prefix):
- continue
-
- suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id]
-
- for i in self._prediction_splits:
- end = word[-i:] # XXX: this should be counted once, not for each prefix
- para_data = suffixes_dawg.similar_items(end, self.dict.ee)
-
- for fixed_suffix, parses in para_data:
- method = (self, fixed_suffix)
-
- for cnt, para_id, idx in parses:
- tag = self.dict.build_tag_info(para_id, idx)
-
- if not tag.is_productive():
- continue
- total_counts[prefix_id] += cnt
-
- fixed_word = word[:-i] + fixed_suffix
- normal_form = self.dict.build_normal_form(para_id, idx, fixed_word)
-
- parse = (cnt, fixed_word, tag, normal_form,
- para_id, idx, prefix_id, (method,))
- reduced_parse = parse[1:4]
- if reduced_parse in seen_parses:
- continue
-
- result.append(parse)
-
- if total_counts[prefix_id] > 1:
- break
-
- result = [
- (fixed_word, tag, normal_form, para_id, idx, cnt/total_counts[prefix_id] * self.ESTIMATE_DECAY, methods)
- for (cnt, fixed_word, tag, normal_form, para_id, idx, prefix_id, methods) in result
- ]
- result.sort(key=operator.itemgetter(5), reverse=True)
- return result
-
-
- def tag(self, word, seen_tags):
- # XXX: the result order may be different from
- # ``self.parse(...)``.
-
- result = []
-
- for prefix_id, prefix in self._paradigm_prefixes:
-
- if not word.startswith(prefix):
- continue
-
- suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id]
-
- for i in self._prediction_splits:
- end = word[-i:] # XXX: this should be counted once, not for each prefix
- para_data = suffixes_dawg.similar_items(end, self.dict.ee)
- found = False
-
- for fixed_suffix, parses in para_data:
- for cnt, para_id, idx in parses:
-
- tag = self.dict.build_tag_info(para_id, idx)
-
- if not tag.is_productive():
- continue
-
- found = True
- if tag in seen_tags:
- continue
- seen_tags.add(tag)
- result.append((cnt, tag))
-
- if found:
- break
-
- result.sort(reverse=True)
- return [tag for cnt, tag in result]
-
-
-class HyphenSeparatedParticlePredictor(BasePredictor):
- """
- Parse the word by analyzing it without
- a particle after a hyphen (tokens like "смотри-ка").
-
- .. note::
-
- This predictor doesn't remove particles from the result
- so for normalization you may need to handle
- particles at tokenization level.
-
- """
- terminal = True
- ESTIMATE_DECAY = 0.9
-
- # XXX: maybe the code can be made faster by compiling this list to a DAWG?
- PARTICLES_AFTER_HYPHEN = [
- "-то", "-ка", "-таки", "-де", "-тко", "-тка", "", "-ста"
- ]
-
- def get_lexeme(self, form, methods):
- particle = methods[-1][1]
-
- return list(
- self._suffixed_lexeme(
- super(HyphenSeparatedParticlePredictor, self).get_lexeme(
- self._unsuffixed_form(form, particle),
- methods
- ),
- particle
- )
- )
-
- def _suffixed_lexeme(self, lexeme, suffix):
- for p in lexeme:
- word, tag, normal_form, para_id, idx, estimate, methods = p
- yield (word+suffix, tag, normal_form+suffix,
- para_id, idx, estimate, methods)
-
- def _unsuffixed_form(self, form, suffix):
- word, tag, normal_form, para_id, idx, estimate, methods = form
- return (word[:-len(suffix)], tag, normal_form[:-len(suffix)],
- para_id, idx, estimate, methods)
-
-
- def parse(self, word, seen_parses):
-
- result = []
- for particle in self.PARTICLES_AFTER_HYPHEN:
- if not word.endswith(particle):
- continue
-
- unsuffixed_word = word[:-len(particle)]
- if not unsuffixed_word:
- continue
-
- method = (self, particle)
-
- for fixed_word, tag, normal_form, para_id, idx, estimate, methods in self.morph.parse(unsuffixed_word):
- parse = (
- fixed_word+particle, tag, normal_form+particle,
- para_id, idx, estimate*self.ESTIMATE_DECAY,
- methods+(method,)
- )
- _add_parse_if_not_seen(parse, result, seen_parses)
-
- # If a word ends with with one of the particles,
- # it can't ends with an another.
- break
-
- return result
-
-
- def tag(self, word, seen_tags):
- result = []
- for particle in self.PARTICLES_AFTER_HYPHEN:
- if not word.endswith(particle):
- continue
-
- unsuffixed_word = word[:-len(particle)]
- if not unsuffixed_word:
- continue
-
- result.extend(self.morph.tag(unsuffixed_word))
-
- # If a word ends with with one of the particles,
- # it can't ends with an another.
- break
-
- return result
-
-
-class HyphenatedWordsPredictor(BasePredictor):
- """
- Parse the word by parsing its hyphen-separated parts.
- """
-
- terminal = True
- ESTIMATE_DECAY = 0.75
-
- def _similarity_features(self, tag):
- """
- @type tag: pymorphy2.tagset.OpencorporaTag
- """
- return (tag.POS, tag.number, tag.case, tag.person, tag.tense)
-
- def parse(self, word, seen_parses):
- if '-' not in word:
- return []
-
- result = []
-
- # If there are more than 2 parts, the rest would be parsed
- # by recursion.
- left, right = word.split('-', 1)
-
- left_parses = self.morph.parse(left)
- right_parses = self.morph.parse(right)
-
- # Step 1: Assume that the left part is an uninflected prefix.
- # Examples: интернет-магазин, воздушно-капельный
- method1 = (self, right)
- right_features = []
-
- for fixed_word, tag, normal_form, para_id, idx, estimate, methods in right_parses:
- parse = (
- '-'.join([left, fixed_word]), tag, '-'.join([left, normal_form]),
- para_id, idx, estimate*self.ESTIMATE_DECAY,
- methods+(method1,)
- )
- _add_parse_if_not_seen(parse, result, seen_parses)
- right_features.append(self._similarity_features(tag))
-
- # Step 2: if left and right can be parsed the same way,
- # then it may be the case that both parts should be inflected.
- # Examples: человек-гора, команд-участниц, компания-производитель
-
- method2 = (self, word)
-
- # FIXME: quadratic algorithm
- for left_parse in left_parses:
-
- left_feat = self._similarity_features(left_parse[1])
-
- for parse_index, right_parse in enumerate(right_parses):
- right_feat = right_features[parse_index]
-
- if left_feat != right_feat:
- continue
-
- # tag
- parse = (
- '-'.join([left_parse[0], right_parse[0]]), # word
- left_parse[1], # tag is from the left part
- '-'.join([left_parse[2], right_parse[2]]), # normal form
- left_parse[3], left_parse[4], # para_id, idx?
- left_parse[5]*self.ESTIMATE_DECAY,
- left_parse[6]+(method2,)
- )
- _add_parse_if_not_seen(parse, result, seen_parses)
-
- return result
-
- def tag(self, word, seen_tags):
- result = []
- for p in self.parse(word, set()):
- _add_tag_if_not_seen(p[1], result, seen_tags)
- return result
-
-
-
-
-class _ShapeAnalyzer(BasePredictor):
- ESTIMATE = 0.5
- EXTRA_GRAMMEMES = []
-
- def __init__(self, morph):
- super(_ShapeAnalyzer, self).__init__(morph)
- self.morph.TagClass.KNOWN_GRAMMEMES.update(self.EXTRA_GRAMMEMES)
-
- def _check_shape(self, word):
- raise NotImplementedError()
-
- def _get_tag(self, word, shape):
- raise NotImplementedError()
-
- def parse(self, word, seen_parses):
- shape = self._check_shape(word)
- if not shape:
- return []
-
- return [(
- word, self._get_tag(word, shape), word,
- None, None, self.ESTIMATE,
- [(self, )],
- )]
-
- def tag(self, word, seen_tags):
- shape = self._check_shape(word)
- if not shape:
- return []
- return [self._get_tag(word, shape)]
-
- def get_lexeme(self, form, methods):
- return [form]
-
- def normalized(self, form):
- return form
-
-
-class PunctuationPredictor(_ShapeAnalyzer):
- """
- This predictor tags punctuation marks as "PNCT".
- """
- terminal = True
- ESTIMATE = 0.5
- EXTRA_GRAMMEMES = ['PNCT']
-
- def __init__(self, morph):
- super(PunctuationPredictor, self).__init__(morph)
- self._tag = self.morph.TagClass('PNCT')
-
- def _get_tag(self, word, shape):
- return self._tag
-
- def _check_shape(self, word):
- return is_punctuation(word)
-
-
-class LatinPredictor(_ShapeAnalyzer):
- """
- This predictor marks latin words with "LATN" tag.
- """
- terminal = True
- ESTIMATE = 0.5
- EXTRA_GRAMMEMES = ['LATN']
-
- def __init__(self, morph):
- super(LatinPredictor, self).__init__(morph)
- self._tag = self.morph.TagClass('LATN')
-
- def _get_tag(self, word, shape):
- return self._tag
-
- def _check_shape(self, word):
- return is_latin(word)
-
-
-def _add_parse_if_not_seen(parse, result_list, seen_parses):
- reduced_parse = parse[:3]
- if reduced_parse in seen_parses:
- return
- seen_parses.add(reduced_parse)
- result_list.append(parse)
-
-def _add_tag_if_not_seen(tag, result_list, seen_tags):
- if tag in seen_tags:
- return
- seen_tags.add(tag)
- result_list.append(tag)
pymorphy2/units/__init__.py (6 lines added; new file)
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from .by_analogy import KnownPrefixAnalyzer, KnownSuffixAnalyzer, UnknownPrefixAnalyzer
+from .by_hyphen import HyphenatedWordsAnalyzer, HyphenSeparatedParticleAnalyzer
+from .by_shape import LatinAnalyzer, PunctuationAnalyzer
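
For downstream imports the old pymorphy2.predictors path is gone; the new package re-exports every unit at the top level, so both spellings below work. A sketch:

# Old path, removed by this commit:
#   from pymorphy2.predictors import LatinPredictor
# New paths:
from pymorphy2.units import LatinAnalyzer                   # top-level re-export
from pymorphy2.units.by_analogy import KnownSuffixAnalyzer  # full module path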
pymorphy2/units/base.py (37 lines added; new file)
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals, division
+
+class BaseAnalyzerUnit(object):
+
+ terminal = False
+
+ def __init__(self, morph):
+ """
+ @type morph: pymorphy2.analyzer.MorphAnalyzer
+ @type self.dict: pymorphy2.analyzer.Dictionary
+ """
+ self.morph = morph
+ self.dict = morph.dictionary
+
+ def parse(self, word, seen_parses):
+ raise NotImplementedError()
+
+ def tag(self, word, seen_tags):
+ raise NotImplementedError()
+
+ def get_lexeme(self, form, methods):
+ # by default, an analyzer unit gets a lexeme from the previous unit:
+ assert methods[-1][0] is self
+ if len(methods) == 1:
+ return self.dict.get_lexeme(form, [])
+
+ assert len(methods) > 1, len(methods)
+ previous_unit = methods[-2][0]
+ return previous_unit.get_lexeme(form, methods[:-1])
+
+ def normalized(self, form):
+ return self.dict.normalized(form)
+
+ def __repr__(self):
+ return str("<%s>") % self.__class__.__name__
+
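
BaseAnalyzerUnit spells out the contract each unit implements: parse() and tag() are required, while get_lexeme() and normalized() default to delegating down the methods chain and to the dictionary. A hypothetical custom unit, sketched after the _ShapeAnalyzer pattern from by_shape.py below; the EmailAnalyzer class and its EMAIL grammeme are illustrative assumptions, not part of this commit:

from pymorphy2.units.base import BaseAnalyzerUnit

class EmailAnalyzer(BaseAnalyzerUnit):
    """Hypothetical unit: tag any token containing '@'."""
    terminal = True
    ESTIMATE = 0.9

    def __init__(self, morph):
        super(EmailAnalyzer, self).__init__(morph)
        # register the custom grammeme first, as _ShapeAnalyzer does
        self.morph.TagClass.KNOWN_GRAMMEMES.update(['EMAIL'])
        self._tag = self.morph.TagClass('EMAIL')

    def parse(self, word, seen_parses):
        if '@' not in word:
            return []
        # the 7-tuple convention used throughout this commit:
        # (word, tag, normal_form, para_id, idx, estimate, methods)
        return [(word, self._tag, word, None, None, self.ESTIMATE, [(self,)])]

    def tag(self, word, seen_tags):
        return [p[1] for p in self.parse(word, set())]

    def get_lexeme(self, form, methods):
        return [form]  # email-like tokens don't inflect

    def normalized(self, form):
        return form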
pymorphy2/units/by_analogy.py (230 lines added; new file)
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+"""
+Analogy analyzer units
+----------------------
+
+This module provides analyzer units that analyze unknown words by looking
+at how similar known words are analyzed.
+
+"""
+
+from __future__ import absolute_import, unicode_literals, division
+
+import operator
+
+from pymorphy2.units.base import BaseAnalyzerUnit
+from pymorphy2.units.utils import add_parse_if_not_seen, add_tag_if_not_seen
+from pymorphy2.utils import word_splits
+
+
+class KnownPrefixAnalyzer(BaseAnalyzerUnit):
+ """
+ Parse the word by checking if it starts with a known prefix
+ and parsing the remainder.
+
+ Example: псевдокошка -> (псевдо) + кошка.
+ """
+
+ terminal = True
+ ESTIMATE_DECAY = 0.75
+ MIN_REMINDER_LENGTH = 3
+
+ def _word_prefixes(self, word):
+ return sorted(
+ self.dict.prediction_prefixes.prefixes(word),
+ key=len,
+ reverse=True,
+ )
+
+ def parse(self, word, seen_parses):
+ result = []
+ for prefix in self._word_prefixes(word):
+ unprefixed_word = word[len(prefix):]
+
+ if len(unprefixed_word) < self.MIN_REMINDER_LENGTH:
+ continue
+
+ method = (self, prefix)
+
+ for fixed_word, tag, normal_form, para_id, idx, estimate, methods in self.morph.parse(unprefixed_word):
+
+ if not tag.is_productive():
+ continue
+
+ parse = (
+ prefix+fixed_word, tag, prefix+normal_form,
+ para_id, idx, estimate*self.ESTIMATE_DECAY,
+ methods+(method,)
+ )
+
+ add_parse_if_not_seen(parse, result, seen_parses)
+
+ return result
+
+ def tag(self, word, seen_tags):
+ result = []
+ for prefix in self._word_prefixes(word):
+ unprefixed_word = word[len(prefix):]
+
+ if len(unprefixed_word) < self.MIN_REMINDER_LENGTH:
+ continue
+
+ for tag in self.morph.tag(unprefixed_word):
+ if not tag.is_productive():
+ continue
+ add_tag_if_not_seen(tag, result, seen_tags)
+
+ return result
+
+
+class UnknownPrefixAnalyzer(BaseAnalyzerUnit):
+ """
+ Parse the word by parsing only the word suffix
+ (with restrictions on prefix & suffix lengths).
+
+ Example: байткод -> (байт) + код
+
+ """
+ terminal = False
+ ESTIMATE_DECAY = 0.5
+
+ def parse(self, word, seen_parses):
+ result = []
+ for prefix, unprefixed_word in word_splits(word):
+
+ method = (self, prefix)
+
+ for fixed_word, tag, normal_form, para_id, idx, estimate, methods in self.dict.parse(unprefixed_word):
+
+ if not tag.is_productive():
+ continue
+
+ parse = (prefix+fixed_word, tag, prefix+normal_form,
+ para_id, idx, estimate*self.ESTIMATE_DECAY,
+ methods+(method,))
+ add_parse_if_not_seen(parse, result, seen_parses)
+
+ return result
+
+ def tag(self, word, seen_tags):
+ result = []
+ for _, unprefixed_word in word_splits(word):
+ for tag in self.dict.tag(unprefixed_word):
+
+ if not tag.is_productive():
+ continue
+
+ add_tag_if_not_seen(tag, result, seen_tags)
+
+ return result
+
+
+class KnownSuffixAnalyzer(BaseAnalyzerUnit):
+ """
+ Parse the word by checking how the words with similar suffixes
+ are parsed.
+
+ Example: бутявкать -> ...вкать
+
+ """
+
+ terminal = False
+ ESTIMATE_DECAY = 0.5
+
+ def __init__(self, morph):
+ super(KnownSuffixAnalyzer, self).__init__(morph)
+
+ self._paradigm_prefixes = list(reversed(list(enumerate(self.dict.paradigm_prefixes))))
+ max_suffix_length = self.dict.meta['prediction_options']['max_suffix_length']
+ self._prediction_splits = list(reversed(range(1, max_suffix_length+1)))
+
+
+ def parse(self, word, seen_parses):
+ result = []
+
+ # smoothing; XXX: isn't max_cnt better?
+ total_counts = [1] * len(self._paradigm_prefixes)
+
+ for prefix_id, prefix in self._paradigm_prefixes:
+
+ if not word.startswith(prefix):
+ continue
+
+ suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id]
+
+ for i in self._prediction_splits:
+ end = word[-i:] # XXX: this should be computed once, not for each prefix
+ para_data = suffixes_dawg.similar_items(end, self.dict.ee)
+
+ for fixed_suffix, parses in para_data:
+ method = (self, fixed_suffix)
+
+ for cnt, para_id, idx in parses:
+ tag = self.dict.build_tag_info(para_id, idx)
+
+ if not tag.is_productive():
+ continue
+ total_counts[prefix_id] += cnt
+
+ fixed_word = word[:-i] + fixed_suffix
+ normal_form = self.dict.build_normal_form(para_id, idx, fixed_word)
+
+ parse = (cnt, fixed_word, tag, normal_form,
+ para_id, idx, prefix_id, (method,))
+ reduced_parse = parse[1:4]
+ if reduced_parse in seen_parses:
+ continue
+
+ result.append(parse)
+
+ if total_counts[prefix_id] > 1:
+ break
+
+ result = [
+ (fixed_word, tag, normal_form, para_id, idx, cnt/total_counts[prefix_id] * self.ESTIMATE_DECAY, methods)
+ for (cnt, fixed_word, tag, normal_form, para_id, idx, prefix_id, methods) in result
+ ]
+ result.sort(key=operator.itemgetter(5), reverse=True)
+ return result
+
+
+ def tag(self, word, seen_tags):
+ # XXX: the result order may be different from
+ # ``self.parse(...)``.
+
+ result = []
+
+ for prefix_id, prefix in self._paradigm_prefixes:
+
+ if not word.startswith(prefix):
+ continue
+
+ suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id]
+
+ for i in self._prediction_splits:
+ end = word[-i:] # XXX: this should be computed once, not for each prefix
+ para_data = suffixes_dawg.similar_items(end, self.dict.ee)
+ found = False
+
+ for fixed_suffix, parses in para_data:
+ for cnt, para_id, idx in parses:
+
+ tag = self.dict.build_tag_info(para_id, idx)
+
+ if not tag.is_productive():
+ continue
+
+ found = True
+ if tag in seen_tags:
+ continue
+ seen_tags.add(tag)
+ result.append((cnt, tag))
+
+ if found:
+ break
+
+ result.sort(reverse=True)
+ return [tag for cnt, tag in result]
+
+
+
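
A usage sketch for the analogy units in isolation, following the pattern of tests/test_analyzer.py at the bottom of this diff; it assumes an installed dictionary, and the example words come from the docstrings above:

from pymorphy2 import MorphAnalyzer
from pymorphy2.units.by_analogy import KnownPrefixAnalyzer, KnownSuffixAnalyzer

morph = MorphAnalyzer()

# 'псевдокошка' = known prefix 'псевдо' + dictionary word 'кошка'
print(KnownPrefixAnalyzer(morph).tag('псевдокошка', set()))

# 'бутявкать' is out-of-vocabulary; its tags are guessed from
# dictionary words that also end in '...вкать'
print(KnownSuffixAnalyzer(morph).tag('бутявкать', set()))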
pymorphy2/units/by_hyphen.py (189 lines added; new file)
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+"""
+Analyzer units for unknown words with hyphens
+---------------------------------------------
+"""
+
+from __future__ import absolute_import, unicode_literals, division
+
+from pymorphy2.units.base import BaseAnalyzerUnit
+from pymorphy2.units.utils import add_parse_if_not_seen, add_tag_if_not_seen
+
+
+class HyphenSeparatedParticleAnalyzer(BaseAnalyzerUnit):
+ """
+ Parse the word by analyzing it without
+ a particle after a hyphen.
+
+ Example: смотри-ка -> смотри + "-ка".
+
+ .. note::
+
+ This analyzer doesn't remove particles from the result,
+ so for normalization you may need to handle
+ particles at the tokenization level.
+
+ """
+ terminal = True
+ ESTIMATE_DECAY = 0.9
+
+ # XXX: maybe the code can be made faster by compiling this list to a DAWG?
+ PARTICLES_AFTER_HYPHEN = [
+ "-то", "-ка", "-таки", "-де", "-тко", "-тка", "", "-ста"
+ ]
+
+ def get_lexeme(self, form, methods):
+ particle = methods[-1][1]
+
+ return list(
+ self._suffixed_lexeme(
+ super(HyphenSeparatedParticleAnalyzer, self).get_lexeme(
+ self._unsuffixed_form(form, particle),
+ methods
+ ),
+ particle
+ )
+ )
+
+ def _suffixed_lexeme(self, lexeme, suffix):
+ for p in lexeme:
+ word, tag, normal_form, para_id, idx, estimate, methods = p
+ yield (word+suffix, tag, normal_form+suffix,
+ para_id, idx, estimate, methods)
+
+ def _unsuffixed_form(self, form, suffix):
+ word, tag, normal_form, para_id, idx, estimate, methods = form
+ return (word[:-len(suffix)], tag, normal_form[:-len(suffix)],
+ para_id, idx, estimate, methods)
+
+
+ def parse(self, word, seen_parses):
+
+ result = []
+ for particle in self.PARTICLES_AFTER_HYPHEN:
+ if not word.endswith(particle):
+ continue
+
+ unsuffixed_word = word[:-len(particle)]
+ if not unsuffixed_word:
+ continue
+
+ method = (self, particle)
+
+ for fixed_word, tag, normal_form, para_id, idx, estimate, methods in self.morph.parse(unsuffixed_word):
+ parse = (
+ fixed_word+particle, tag, normal_form+particle,
+ para_id, idx, estimate*self.ESTIMATE_DECAY,
+ methods+(method,)
+ )
+ add_parse_if_not_seen(parse, result, seen_parses)
+
+ # If a word ends with one of the particles,
+ # it can't end with another.
+ break
+
+ return result
+
+
+ def tag(self, word, seen_tags):
+ result = []
+ for particle in self.PARTICLES_AFTER_HYPHEN:
+ if not word.endswith(particle):
+ continue
+
+ unsuffixed_word = word[:-len(particle)]
+ if not unsuffixed_word:
+ continue
+
+ result.extend(self.morph.tag(unsuffixed_word))
+
+ # If a word ends with one of the particles,
+ # it can't end with another.
+ break
+
+ return result
+
+
+class HyphenatedWordsAnalyzer(HyphenSeparatedParticleAnalyzer):
+ """
+ Parse the word by parsing its hyphen-separated parts.
+
+ Examples:
+
+ * интернет-магазин -> "интернет-" + магазин
+ * человек-гора -> человек + гора
+
+ """
+
+ terminal = True
+ ESTIMATE_DECAY = 0.75
+
+ def _similarity_features(self, tag):
+ """
+ @type tag: pymorphy2.tagset.OpencorporaTag
+ """
+ return (tag.POS, tag.number, tag.case, tag.person, tag.tense)
+
+ def parse(self, word, seen_parses):
+ if '-' not in word:
+ return []
+
+ result = []
+
+ # If there are more than 2 parts, the rest is handled
+ # by recursion.
+ left, right = word.split('-', 1)
+
+ left_parses = self.morph.parse(left)
+ right_parses = self.morph.parse(right)
+
+ # Step 1: Assume that the left part is an uninflected prefix.
+ # Examples: интернет-магазин, воздушно-капельный
+ method1 = (self, right)
+ right_features = []
+
+ for fixed_word, tag, normal_form, para_id, idx, estimate, methods in right_parses:
+ parse = (
+ '-'.join([left, fixed_word]), tag, '-'.join([left, normal_form]),
+ para_id, idx, estimate*self.ESTIMATE_DECAY,
+ methods+(method1,)
+ )
+ add_parse_if_not_seen(parse, result, seen_parses)
+ right_features.append(self._similarity_features(tag))
+
+ # Step 2: if left and right can be parsed the same way,
+ # then it may be the case that both parts should be inflected.
+ # Examples: человек-гора, команд-участниц, компания-производитель
+
+ method2 = (self, word)
+
+ # FIXME: quadratic algorithm
+ for left_parse in left_parses:
+
+ left_feat = self._similarity_features(left_parse[1])
+
+ for parse_index, right_parse in enumerate(right_parses):
+ right_feat = right_features[parse_index]
+
+ if left_feat != right_feat:
+ continue
+
+ # tag
+ parse = (
+ '-'.join([left_parse[0], right_parse[0]]), # word
+ left_parse[1], # tag is from the left part
+ '-'.join([left_parse[2], right_parse[2]]), # normal form
+ left_parse[3], left_parse[4], # para_id, idx?
+ left_parse[5]*self.ESTIMATE_DECAY,
+ left_parse[6]+(method2,)
+ )
+ add_parse_if_not_seen(parse, result, seen_parses)
+
+ return result
+
+ def tag(self, word, seen_tags):
+ result = []
+ for p in self.parse(word, set()):
+ add_tag_if_not_seen(p[1], result, seen_tags)
+ return result
+
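
The same kind of sketch for the hyphen units (installed dictionary assumed; tokens from the docstrings above):

from pymorphy2 import MorphAnalyzer
from pymorphy2.units.by_hyphen import (
    HyphenatedWordsAnalyzer, HyphenSeparatedParticleAnalyzer)

morph = MorphAnalyzer()

# 'смотри-ка' -> 'смотри' + particle '-ка'; the particle is kept in the result
print(HyphenSeparatedParticleAnalyzer(morph).tag('смотри-ка', set()))

# 'человек-гора': the parts parse alike, so both may be inflected
print(HyphenatedWordsAnalyzer(morph).tag('человек-гора', set()))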
pymorphy2/units/by_shape.py (89 lines added; new file)
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+"""
+Analyzer units that analyze non-word tokens
+-------------------------------------------
+"""
+
+from __future__ import absolute_import, unicode_literals, division
+
+from pymorphy2.units.base import BaseAnalyzerUnit
+from pymorphy2.shapes import is_latin, is_punctuation
+
+class _ShapeAnalyzer(BaseAnalyzerUnit):
+ ESTIMATE = 0.5
+ EXTRA_GRAMMEMES = []
+
+ def __init__(self, morph):
+ super(_ShapeAnalyzer, self).__init__(morph)
+ self.morph.TagClass.KNOWN_GRAMMEMES.update(self.EXTRA_GRAMMEMES)
+
+ def _check_shape(self, word):
+ raise NotImplementedError()
+
+ def _get_tag(self, word, shape):
+ raise NotImplementedError()
+
+ def parse(self, word, seen_parses):
+ shape = self._check_shape(word)
+ if not shape:
+ return []
+
+ return [(
+ word, self._get_tag(word, shape), word,
+ None, None, self.ESTIMATE,
+ [(self, )],
+ )]
+
+ def tag(self, word, seen_tags):
+ shape = self._check_shape(word)
+ if not shape:
+ return []
+ return [self._get_tag(word, shape)]
+
+ def get_lexeme(self, form, methods):
+ return [form]
+
+ def normalized(self, form):
+ return form
+
+
+class PunctuationAnalyzer(_ShapeAnalyzer):
+ """
+ This analyzer tags punctuation marks as "PNCT".
+ Example: "," -> PNCT
+ """
+ terminal = True
+ ESTIMATE = 0.5
+ EXTRA_GRAMMEMES = ['PNCT']
+
+ def __init__(self, morph):
+ super(PunctuationAnalyzer, self).__init__(morph)
+ self._tag = self.morph.TagClass('PNCT')
+
+ def _get_tag(self, word, shape):
+ return self._tag
+
+ def _check_shape(self, word):
+ return is_punctuation(word)
+
+
+class LatinAnalyzer(_ShapeAnalyzer):
+ """
+ This analyzer marks Latin words with the "LATN" tag.
+ Example: "pdf" -> LATN
+ """
+ terminal = True
+ ESTIMATE = 0.5
+ EXTRA_GRAMMEMES = ['LATN']
+
+ def __init__(self, morph):
+ super(LatinAnalyzer, self).__init__(morph)
+ self._tag = self.morph.TagClass('LATN')
+
+ def _get_tag(self, word, shape):
+ return self._tag
+
+ def _check_shape(self, word):
+ return is_latin(word)
+
+
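
And for the shape units, which look only at the token itself (the analyzer is still needed for construction, since every unit receives it):

from pymorphy2 import MorphAnalyzer
from pymorphy2.units.by_shape import LatinAnalyzer, PunctuationAnalyzer

morph = MorphAnalyzer()

print(PunctuationAnalyzer(morph).tag(',', set()))  # the 'PNCT' tag
print(LatinAnalyzer(morph).tag('pdf', set()))      # the 'LATN' tag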
pymorphy2/units/utils.py (15 lines added; new file)
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals, division
+
+def add_parse_if_not_seen(parse, result_list, seen_parses):
+ reduced_parse = parse[:3]
+ if reduced_parse in seen_parses:
+ return
+ seen_parses.add(reduced_parse)
+ result_list.append(parse)
+
+def add_tag_if_not_seen(tag, result_list, seen_tags):
+ if tag in seen_tags:
+ return
+ seen_tags.add(tag)
+ result_list.append(tag)
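
These two helpers implement the cross-unit deduplication contract: the seen set is shared by every unit within a single MorphAnalyzer.parse() or .tag() call. A small self-contained sketch, with plain strings standing in for tag objects:

from pymorphy2.units.utils import add_tag_if_not_seen

seen_tags = set()
result = []
add_tag_if_not_seen('NOUN sing,nomn', result, seen_tags)  # appended
add_tag_if_not_seen('NOUN sing,nomn', result, seen_tags)  # duplicate, skipped
print(result)  # ['NOUN sing,nomn']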
setup.py (7 lines changed)
@@ -24,7 +24,12 @@ def get_version():
long_description = open('README.rst').read(),
license = 'MIT license',
- packages = ['pymorphy2', 'pymorphy2.vendor', 'pymorphy2.opencorpora_dict'],
+ packages = [
+ 'pymorphy2',
+ 'pymorphy2.units',
+ 'pymorphy2.vendor',
+ 'pymorphy2.opencorpora_dict',
+ ],
scripts=['bin/pymorphy'],
requires=['dawg_python (>= 0.5)', 'pymorphy2_dicts (>2.0, <3.0)'],
tests/test_analyzer.py (6 lines changed)
@@ -2,7 +2,7 @@
from __future__ import absolute_import, unicode_literals
import pytest
import pymorphy2
-from pymorphy2.predictors import UnknownPrefixPredictor, KnownPrefixPredictor
+from pymorphy2.units.by_analogy import UnknownPrefixAnalyzer, KnownPrefixAnalyzer
from .utils import morph
@@ -231,8 +231,8 @@ class TestTagWithPrefix:
def test_tag_with_unknown_prefix(self):
word = 'мегакот'
- pred1 = UnknownPrefixPredictor(morph)
- pred2 = KnownPrefixPredictor(morph)
+ pred1 = UnknownPrefixAnalyzer(morph)
+ pred2 = KnownPrefixAnalyzer(morph)
parse1 = pred1.tag(word, set())
parse2 = pred2.tag(word, set())