Skip to content

Commit

Permalink
Merge d141a77 into 55c52b9
Browse files Browse the repository at this point in the history
  • Loading branch information
kmike committed Nov 4, 2013
2 parents 55c52b9 + d141a77 commit 9b27ea8
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 28 deletions.
15 changes: 15 additions & 0 deletions benchmarks/speed.py
Expand Up @@ -89,6 +89,18 @@ def _run_word_is_known():
for word, cnt in words:
morph.word_is_known(word)

def _run_cyr_repr():
    """Benchmark body: read ``tag.cyr_repr`` for every parse of every word.

    Walks the module-level ``words`` sequence of ``(word, count)`` pairs,
    parses each word with the module-level ``morph`` and throws the
    collected values away; only the work done matters.
    """
    for token, _count in words:
        [parsed.tag.cyr_repr for parsed in morph.parse(token)]

def _run_grammemes_cyr():
    """Benchmark body: read ``tag.grammemes_cyr`` for every parse of every word.

    Walks the module-level ``words`` sequence of ``(word, count)`` pairs,
    parses each word with the module-level ``morph`` and discards the
    collected values.
    """
    for token, _count in words:
        [parsed.tag.grammemes_cyr for parsed in morph.parse(token)]

def _run_POS_cyr():
    """Benchmark body: call ``morph.lat2cyr`` on the tag of every parse.

    Note that, despite the function name, the whole tag object (not only
    its part of speech) is passed to ``morph.lat2cyr``. The collected
    values are discarded.
    """
    for token, _count in words:
        [morph.lat2cyr(parsed.tag) for parsed in morph.parse(token)]

def _run_lexeme():
for word, cnt in words[::5]:
[p.lexeme for p in morph.parse(word)]
Expand All @@ -112,6 +124,9 @@ def show_info(bench_name, func, note='', count=len(words)):
show_info("[p.lexeme for p in morph.parse(w)]", _run_lexeme, count=len(words)/5)
show_info("[{'NOUN'} in p.tag for p in morph.parse(w)]", _run_is_noun)
show_info("[p.tag.POS == 'NOUN' for p in morph.parse(w)]", _run_is_noun2)
show_info("[p.tag.cyr_repr for p in morph.parse(word)]", _run_cyr_repr)
show_info("[p.tag.grammemes_cyr for p in morph.parse(word)]", _run_grammemes_cyr)
show_info("[morph.lat2cyr(p.tag) for p in morph.parse(word)]", _run_POS_cyr)

logger.info("")

Expand Down
33 changes: 30 additions & 3 deletions docs/user/guide.rst
Expand Up @@ -113,6 +113,8 @@ pymorphy2 умеет разбирать не только словарные с
Для того, чтоб проверить, есть ли в данном теге отдельная граммема
(или все граммемы из указанного множества), используйте оператор in::

>>> p.tag
OpencorporaTag('VERB,perf,intr plur,past,indc')
>>> 'NOUN' in p.tag # то же самое, что и {'NOUN'} in p.tag
False
>>> 'VERB' in p.tag
Expand All @@ -134,6 +136,8 @@ pymorphy2 умеет разбирать не только словарные с
Кроме того, у каждого тега есть атрибуты, через которые можно получить
часть речи, число и другие характеристики::

>>> p.tag
OpencorporaTag('VERB,perf,intr plur,past,indc')
>>> p.tag.POS # Part of Speech, часть речи
'VERB'
>>> p.tag.animacy # одушевленность
Expand Down Expand Up @@ -181,6 +185,29 @@ pymorphy2 выкидывает исключение, если встречает
...
ValueError: 'plur' is not a valid grammeme for this attribute.


Русские названия тегов и граммем
--------------------------------

Теги и граммемы в pymorphy2 записываются латиницей (например, ``NOUN``).
Но часто удобнее использовать кириллические названия граммем (например,
``СУЩ`` вместо ``NOUN``). Чтобы получить тег в виде строки,
записанной кириллицей, используйте свойство :attr:`OpencorporaTag.cyr_repr`::

>>> p.tag
OpencorporaTag('VERB,perf,intr plur,past,indc')
>>> p.tag.cyr_repr
'ГЛ,сов,неперех мн,прош,изъяв'

Для преобразования произвольных строк с тегами/граммемами между
кириллицей и латиницей используйте методы :meth:`MorphAnalyzer.cyr2lat`
и :meth:`MorphAnalyzer.lat2cyr`::

>>> morph.lat2cyr('NOUN,anim,masc plur,ablt')
'СУЩ,од,мр мн,тв'
>>> morph.cyr2lat('СУЩ,од,мр мн,тв')
'NOUN,anim,masc plur,ablt'

Склонение слов
--------------

Expand Down Expand Up @@ -231,7 +258,7 @@ pymorphy2 умеет склонять (ставить в какую-то дру

Посмотрим, что сделает pymorphy2 в этом примере:

>>> m.parse('думающему')[0].normal_form
>>> morph.parse('думающему')[0].normal_form
'думать'

pymorphy2 сейчас использует алгоритм нахождения нормальной формы,
Expand All @@ -242,7 +269,7 @@ pymorphy2 сейчас использует алгоритм нахождени
Если требуется нормализовывать слова иначе, можно воспользоваться
методом :meth:`Parse.inflect`::

>>> m.parse('думающему')[0].inflect({'sing', 'nomn'}).word
>>> morph.parse('думающему')[0].inflect({'sing', 'nomn'}).word
'думающий'

Согласование слов с числительными
Expand All @@ -269,7 +296,7 @@ pymorphy2 возвращает все допустимые варианты ра

У каждого разбора есть параметр score::

>>> m.parse('на')
>>> morph.parse('на')
[Parse(word='на', tag=OpencorporaTag('PREP'), normal_form='на', score=0.999628, methods_stack=((<DictionaryAnalyzer>, 'на', 23, 0),)),
Parse(word='на', tag=OpencorporaTag('INTJ'), normal_form='на', score=0.000318, methods_stack=((<DictionaryAnalyzer>, 'на', 20, 0),)),
Parse(word='на', tag=OpencorporaTag('PRCL'), normal_form='на', score=5.3e-05, methods_stack=((<DictionaryAnalyzer>, 'на', 21, 0),))]
Expand Down
8 changes: 8 additions & 0 deletions pymorphy2/analyzer.py
Expand Up @@ -330,6 +330,14 @@ def TagClass(self):
"""
return self.dictionary.Tag

def cyr2lat(self, tag_or_grammeme):
    """ Return Latin representation for ``tag_or_grammeme`` string.

    The conversion is delegated to ``cyr2lat`` of ``self.TagClass``.
    """
    tag_class = self.TagClass
    return tag_class.cyr2lat(tag_or_grammeme)

def lat2cyr(self, tag_or_grammeme):
    """ Return Cyrillic representation for ``tag_or_grammeme`` string.

    The conversion is delegated to ``lat2cyr`` of ``self.TagClass``.
    """
    tag_class = self.TagClass
    return tag_class.lat2cyr(tag_or_grammeme)

def __reduce__(self):
    """Pickle support: return ``(class, args, None)`` for re-creating the object.

    ``args`` holds the dictionary path, the originally requested result
    type and the unit classes, in that order.
    """
    ctor_args = (
        self.dictionary.path,
        self._result_type_orig,
        self._unit_classes,
    )
    return self.__class__, ctor_args, None
Expand Down
108 changes: 84 additions & 24 deletions pymorphy2/tagset.py
Expand Up @@ -20,7 +20,6 @@ class _select_grammeme_from(object):
"""
def __init__(self, grammeme_set):
self.grammeme_set = grammeme_set

# ... are descriptors not magical enough?

# In order to fight typos, raise an exception
Expand All @@ -33,7 +32,8 @@ def __eq__(self, other):
if other is None:
return False
if other not in grammeme_set:
raise ValueError("'%s' is not a valid grammeme for this attribute." % other)
known_grammemes = ", ".join(grammeme_set)
raise ValueError("'%s' is not a valid grammeme for this attribute. Valid grammemes: %s" % (other, known_grammemes))
return _str.__eq__(self, other)

def __ne__(self, other):
Expand Down Expand Up @@ -73,7 +73,7 @@ class OpencorporaTag(object):
>>> from pymorphy2 import MorphAnalyzer
>>> morph = MorphAnalyzer()
>>> Tag = morph.TagClass # get an initialized Tag class
>>> Tag = morph.TagClass # get an initialized Tag class
>>> tag = Tag('VERB,perf,tran plur,impr,excl')
>>> tag
OpencorporaTag('VERB,perf,tran plur,impr,excl')
Expand Down Expand Up @@ -120,12 +120,10 @@ class OpencorporaTag(object):
>>> tag.POS == 'plur'
Traceback (most recent call last):
...
ValueError: 'plur' is not a valid grammeme for this attribute.
ValueError: 'plur' is not a valid grammeme for this attribute. Valid grammemes: ...
"""

__slots__ = ['_grammemes_tuple', '_grammemes_cache', '_str', '_POS']

# Grammeme categories
# (see http://opencorpora.org/dict.php?act=gram for a full set)
# -------------------------------------------------------------
Expand Down Expand Up @@ -240,6 +238,8 @@ class OpencorporaTag(object):
}
_GRAMMEME_INDICES = collections.defaultdict(int)
_GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
_LAT2CYR = None
_CYR2LAT = None
KNOWN_GRAMMEMES = set()

_NUMERAL_AGREEMENT_GRAMMEMES = (
Expand All @@ -257,6 +257,9 @@ class OpencorporaTag(object):
'voct': 'nomn'
}

__slots__ = ['_grammemes_tuple', '_grammemes_cache', '_str', '_POS',
'_cyr', '_cyr_grammemes_cache']

def __init__(self, tag):
self._str = tag
# XXX: we lose information about which grammemes
Expand All @@ -275,13 +278,8 @@ def __init__(self, tag):
self._grammemes_tuple = grammemes_tuple
self._POS = self._grammemes_tuple[0]
self._grammemes_cache = None

@property
def grammemes(self):
""" A frozenset with grammemes for this tag. """
if self._grammemes_cache is None:
self._grammemes_cache = frozenset(self._grammemes_tuple)
return self._grammemes_cache
self._cyr_grammemes_cache = None
self._cyr = None

# attributes for grammeme categories
POS = _select_grammeme_from(PARTS_OF_SPEECH)
Expand All @@ -297,6 +295,38 @@ def grammemes(self):
transitivity = _select_grammeme_from(TRANSITIVITY)
voice = _select_grammeme_from(VOICES)

@property
def grammemes(self):
    """ A frozenset with grammemes for this tag (built on first access, then cached). """
    cached = self._grammemes_cache
    if cached is not None:
        return cached
    cached = frozenset(self._grammemes_tuple)
    self._grammemes_cache = cached
    return cached

@property
def grammemes_cyr(self):
    """ A frozenset with Cyrillic grammemes for this tag.

    The set is built on first access from ``self._grammemes_tuple`` using
    the ``_LAT2CYR`` mapping and then cached in
    ``self._cyr_grammemes_cache``.

    A grammeme that has no entry in ``_LAT2CYR`` is kept in its Latin
    form instead of raising ``KeyError``; this matches the pass-through
    behaviour of ``mapping.get(tok, tok)`` used for tag strings, so both
    Cyrillic views of a tag treat unmapped grammemes the same way.
    """
    if self._cyr_grammemes_cache is None:
        lat2cyr = self._LAT2CYR
        self._cyr_grammemes_cache = frozenset(
            lat2cyr.get(g, g) for g in self._grammemes_tuple
        )
    return self._cyr_grammemes_cache

@property
def cyr_repr(self):
    """ Cyrillic representation of this tag (computed once via ``self.lat2cyr``, then cached) """
    cached = self._cyr
    if cached is None:
        cached = self._cyr = self.lat2cyr(self)
    return cached

@classmethod
def cyr2lat(cls, tag_or_grammeme):
    """ Return Latin representation for ``tag_or_grammeme`` string.

    Translation uses the class-level ``_CYR2LAT`` mapping.
    """
    mapping = cls._CYR2LAT
    return _translate_tag(tag_or_grammeme, mapping)

@classmethod
def lat2cyr(cls, tag_or_grammeme):
    """ Return Cyrillic representation for ``tag_or_grammeme`` string.

    Translation uses the class-level ``_LAT2CYR`` mapping.
    """
    mapping = cls._LAT2CYR
    return _translate_tag(tag_or_grammeme, mapping)

def __contains__(self, grammeme):

# {'NOUN', 'sing'} in tag
Expand Down Expand Up @@ -387,7 +417,13 @@ def fix_rare_cases(cls, grammemes):
"""
Replace rare cases (loc2/voct/...) with common ones (loct/nomn/...).
"""
return frozenset(cls.RARE_CASES.get(g,g) for g in grammemes)
return frozenset(cls.RARE_CASES.get(g, g) for g in grammemes)

@classmethod
def add_grammemes_to_known(cls, lat, cyr):
    """
    Register a grammeme: add ``lat`` to ``KNOWN_GRAMMEMES`` and record
    the ``lat`` <-> ``cyr`` pair in both translation mappings.
    """
    cls.KNOWN_GRAMMEMES.add(lat)
    lat_to_cyr, cyr_to_lat = cls._LAT2CYR, cls._CYR2LAT
    lat_to_cyr[lat] = cyr
    cyr_to_lat[cyr] = lat

@classmethod
def _init_grammemes(cls, dict_grammemes):
Expand All @@ -403,18 +439,22 @@ def _init_grammemes(cls, dict_grammemes):
]
"""
gr = dict((name, parent) for (name, parent, alias, description) in dict_grammemes)
with threading.RLock():
cls.KNOWN_GRAMMEMES = set()
cls._CYR2LAT = {}
cls._LAT2CYR = {}
for name, parent, alias, description in dict_grammemes:
cls.add_grammemes_to_known(name, alias)

# figure out parents & children
children = collections.defaultdict(set)
for index, (name, parent, alias, description) in enumerate(dict_grammemes):
if parent:
children[parent].add(name)
if gr.get(parent, None): # parent's parent
children[gr[parent]].add(name)
gr = dict((name, parent) for (name, parent, alias, description) in dict_grammemes)

with threading.RLock():
cls.KNOWN_GRAMMEMES = set(gr.keys())
# figure out parents & children
children = collections.defaultdict(set)
for index, (name, parent, alias, description) in enumerate(dict_grammemes):
if parent:
children[parent].add(name)
if gr.get(parent, None): # parent's parent
children[gr[parent]].add(name)

# expand EXTRA_INCOMPATIBLE
for grammeme, g_set in cls._EXTRA_INCOMPATIBLE.items():
Expand Down Expand Up @@ -533,6 +573,26 @@ def _init_alias_map(cls, dict_grammemes):
for name, parent, alias, description in dict_grammemes:
cls._GRAMMEME_ALIAS_MAP[name] = alias


def _translate_tag(tag, mapping):
    """
    Translate ``tag`` according to ``mapping``.

    ``tag`` is a string or an ``OpencorporaTag`` (converted with ``str``).
    The text is split on whitespace, every part is translated with
    ``_translate_comma_separated`` and the parts are joined back with a
    single space. Commas keep their positions; leading/trailing
    whitespace is dropped and runs of whitespace become one space.
    """
    text = str(tag) if isinstance(tag, OpencorporaTag) else tag
    translated_parts = []
    for part in text.split():
        translated_parts.append(_translate_comma_separated(part, mapping))
    return " ".join(translated_parts)


def _translate_comma_separated(tag_part, mapping):
    """
    Translate every comma-separated token of ``tag_part`` via ``mapping``.

    Tokens missing from ``mapping`` are passed through unchanged.
    """
    translated = []
    for token in tag_part.split(','):
        translated.append(mapping.get(token, token))
    return ",".join(translated)


registry = dict()

for tag_type in [CyrillicOpencorporaTag, OpencorporaTag]:
Expand Down
13 changes: 12 additions & 1 deletion pymorphy2/units/by_shape.py
Expand Up @@ -9,15 +9,19 @@
from pymorphy2.units.base import BaseAnalyzerUnit
from pymorphy2.shapes import is_latin, is_punctuation, is_roman_number


class _ShapeAnalyzer(BaseAnalyzerUnit):

terminal = True
SCORE = 0.9
EXTRA_GRAMMEMES = []
EXTRA_GRAMMEMES_CYR = []

def __init__(self, morph):
super(_ShapeAnalyzer, self).__init__(morph)
self.morph.TagClass.KNOWN_GRAMMEMES.update(self.EXTRA_GRAMMEMES)

for lat, cyr in zip(self.EXTRA_GRAMMEMES, self.EXTRA_GRAMMEMES_CYR):
self.morph.TagClass.add_grammemes_to_known(lat, cyr)

def parse(self, word, word_lower, seen_parses):
shape = self.check_shape(word, word_lower)
Expand Down Expand Up @@ -49,10 +53,13 @@ def get_tag(self, word, shape):

class _SingleShapeAnalyzer(_ShapeAnalyzer):
TAG_STR = None
TAG_STR_CYR = None

def __init__(self, morph):
assert self.TAG_STR is not None
assert self.TAG_STR_CYR is not None
self.EXTRA_GRAMMEMES = self.TAG_STR.split(',')
self.EXTRA_GRAMMEMES_CYR = self.TAG_STR_CYR.split(',')
super(_SingleShapeAnalyzer, self).__init__(morph)
self._tag = self.morph.TagClass(self.TAG_STR)

Expand All @@ -66,6 +73,7 @@ class PunctuationAnalyzer(_SingleShapeAnalyzer):
Example: "," -> PNCT
"""
TAG_STR = 'PNCT'
TAG_STR_CYR = 'ЗПР' # aot.ru uses this name

def check_shape(self, word, word_lower):
return is_punctuation(word)
Expand All @@ -77,6 +85,7 @@ class LatinAnalyzer(_SingleShapeAnalyzer):
Example: "pdf" -> LATN
"""
TAG_STR = 'LATN'
TAG_STR_CYR = 'ЛАТ'

def check_shape(self, word, word_lower):
return is_latin(word)
Expand All @@ -93,13 +102,15 @@ class NumberAnalyzer(_SingleShapeAnalyzer):
"""
TAG_STR = 'NUMB'
TAG_STR_CYR = 'НОМ'

def check_shape(self, word, word_lower):
return word.isdigit()


class RomanNumberAnalyzer(_SingleShapeAnalyzer):
TAG_STR = 'ROMN'
TAG_STR_CYR = 'РИМ'
terminal = False # give LatinAnalyzer a chance

def check_shape(self, word, word_lower):
Expand Down
3 changes: 3 additions & 0 deletions tests/test_parsing.py
Expand Up @@ -6,6 +6,7 @@
import pymorphy2
from .utils import morph


def _to_test_data(text):
"""
Lines should be of this format: <word> <normal_form> <tag>.
Expand Down Expand Up @@ -188,12 +189,14 @@ def _check_analyzer(morph, parses):
parse = morph.parse(word)
assert_parse_is_correct(parse, word, normal_form, tag)


def _check_new_analyzer(parses):
    """
    Check every ``(word, normal_form, tag)`` triple from ``parses``
    against a freshly created ``pymorphy2.MorphAnalyzer``.
    """
    analyzer = pymorphy2.MorphAnalyzer()
    for word, normal_form, tag in parses:
        assert_parse_is_correct(analyzer.parse(word), word, normal_form, tag)


def _create_morph_analyzer(i):
morph = pymorphy2.MorphAnalyzer()
word, normal_form, tag = random.choice(PARSES)
Expand Down

0 comments on commit 9b27ea8

Please sign in to comment.