Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
ruscorpora.Tag class
  • Loading branch information
kmike committed Feb 4, 2013
1 parent 6190616 commit b1d94e6
Show file tree
Hide file tree
Showing 6 changed files with 437 additions and 23 deletions.
62 changes: 56 additions & 6 deletions README.rst
Expand Up @@ -15,14 +15,14 @@ Installation
Usage
=====

Obtaining corpora
-----------------
Corpus downloading
------------------

Download and unpack the archive with XML files from
http://www.ruscorpora.ru/corpora-usage.html

Using corpora
-------------
Corpus reading
--------------

The ``ruscorpora.parse_xml`` function parses a single XML file and returns
an iterator over sentences; each sentence is a list of ``ruscorpora.Token``
Expand All @@ -34,10 +34,60 @@ information.

::

>>> import ruscorpora as rc
>>> for sent in rc.simplify(rc.parse('fiction.xml')):
>>> import ruscorpora as rnc
>>> for sent in rnc.simplify(rnc.parse('fiction.xml')):
... print(sent)

Working with tags
-----------------

The ``ruscorpora.Tag`` class is a convenient wrapper for the tags used in
ruscorpora::

>>> tag = rnc.Tag('S,f,inan=sg,nom')
>>> tag.POS
'S'
>>> tag.gender
'f'
>>> tag.animacy
'inan'
>>> tag.number
'sg'
>>> tag.case
'nom'
>>> print(tag.tense)
None

(there are also other attributes).

Check if a grammeme is in a tag::

>>> 'S' in tag
True
>>> 'V' in tag
False
>>> 'Foo' in tag
Traceback (most recent call last):
...
ValueError: Grammeme is unknown: Foo

Test tag equality::

>>> tag == rnc.Tag('S,f,inan=sg,nom')
True
>>> tag == 'S,f,inan=sg,nom'
True
>>> tag == rnc.Tag('S,f,inan=sg,acc')
False
>>> tag == 'S,f,inan=sg,acc'
False
>>> tag == 'Foo,inan'
Traceback (most recent call last):
...
ValueError: Unknown grammemes: frozenset({Foo})

Tags returned by ``rnc.simplify`` are wrapped with this class by default.

Development
===========

Expand Down
17 changes: 15 additions & 2 deletions ruscorpora/__init__.py
Expand Up @@ -8,6 +8,7 @@
import warnings
import functools
from collections import namedtuple
from .tagset import Tag

Token = namedtuple('Token', 'text annotations')
Annotation = namedtuple('Annotation', 'lex gr joined')
Expand Down Expand Up @@ -49,15 +50,17 @@ def punct_tokens(txt):


def simplify(sents, remove_accents=True, join_split=True,
join_hyphenated=True, punct_tag='PNCT'):
join_hyphenated=True, punct_tag='PNCT', wrap_tags=True):
"""
Simplify the result of ``sents`` parsing:
* keep only a single annotation per word part;
* annotate punctuation with ``punct_tag``;
* join split words into a single token (if ``join_split==True``);
* join hyphenated words to a single token (if ``join_hyphenated==True``);
* remove accents (if ``remove_accents==True``).
* remove accents (if ``remove_accents==True``);
* convert string tag representation to ruscorpora.Tag instances
(if ``wrap_tags==True``).
"""

def remove_extra_annotations(token):
Expand Down Expand Up @@ -111,6 +114,13 @@ def fix_punct_tags(sent):

yield text, new_annotations

def with_wrapped_tags(sent):
    # Lazily re-emit every (text, annotations) pair of ``sent``, replacing
    # each annotation's ``gr`` field with a ``Tag`` built from its old value.
    for word, anns in sent:
        yield word, [ann._replace(gr=Tag(ann.gr)) for ann in anns]


for sent in sents:
sent = map(remove_extra_annotations, sent)
Expand All @@ -126,6 +136,9 @@ def fix_punct_tags(sent):

sent = fix_punct_tags(sent)

if wrap_tags:
sent = with_wrapped_tags(sent)

yield [Token(*t) for t in sent]


Expand Down

0 comments on commit b1d94e6

Please sign in to comment.