Permalink
Browse files

Corpora.paras() and Corpora.sents() are fixed; better as_text() methods; test coverage
  • Loading branch information...
1 parent 79c5252 commit bad9818a74ef75a92dda07855c8a322ecca6c7eb @kmike committed Apr 8, 2012
Showing with 55 additions and 26 deletions.
  1. +1 −0 .hgignore
  2. +14 −5 opencorpora/__init__.py
  3. +0 −1 opencorpora/compat.py
  4. +4 −16 opencorpora/xml_utils.py
  5. +26 −2 tests/test_corpora.py
  6. +10 −2 tox.ini
View
@@ -18,6 +18,7 @@ Thumbs.db$
^dist
^MANIFEST$
\.egg-info$
+\.coverage$
#project-specific files
\.tox
View
@@ -39,22 +39,25 @@ class Sentence(OpenCorporaBase):
Sentence.
"""
def __init__(self, xml):
- self.root = xml_utils.copy_element(xml)
+ self.root = xml
def itertokens(self):
for token in self.root.findall('tokens//token'):
yield token.get('text')
def source(self):
- return self.root.find('source')
+ return self.root.find('source').text
+
+ def as_text(self):
+ return self.source()
class Paragraph(OpenCorporaBase):
"""
Text paragraph.
"""
def __init__(self, xml):
- self.root = xml_utils.copy_element(xml)
+ self.root = xml
def itertokens(self):
for token in self.root.findall('sentence//token'):
@@ -64,13 +67,16 @@ def itersents(self):
for sent in self.root.findall('sentence'):
yield Sentence(sent)
+ def as_text(self):
+ return ' '.join(sent.as_text() for sent in self.itersents())
+
class Text(OpenCorporaBase):
"""
Single OpenCorpora text.
"""
def __init__(self, xml):
- self.root = xml_utils.copy_element(xml)
+ self.root = xml
def title(self):
return self.root.get('name')
@@ -87,6 +93,9 @@ def itersents(self):
for sent in self.root.findall('paragraphs//sentence'):
yield Sentence(sent)
+ def as_text(self):
+ return "\n\n".join(para.as_text() for para in self.iterparas())
+
class Corpora(OpenCorporaBase):
"""
@@ -117,7 +126,7 @@ def itertokens(self):
"""
Returns an iterator over corpus tokens.
"""
- for token in xml_utils.iterparse(self.filename, 'token'):
+ for token in xml_utils.iterparse(self.filename, 'token', clear=True):
yield token.get('text')
def itersents(self):
View
@@ -6,7 +6,6 @@
except ImportError:
from xml.etree import ElementTree
-
try:
from collections import OrderedDict
except ImportError:
View
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-from __future__ import absolute_import
+from __future__ import absolute_import, print_function, division
import codecs
from collections import namedtuple
import re
@@ -8,30 +8,18 @@
Bounds = namedtuple('Bounds', 'line_start line_end byte_start byte_end')
-def iterparse(source, tag):
+def iterparse(source, tag, clear=False):
"""
iterparse variant that supports 'tag' parameter (like lxml),
handles only 'end' event, yields elements and clears nodes after parsing.
"""
for event, elem in ElementTree.iterparse(source):
if elem.tag == tag:
yield elem
- elem.clear()
+ if clear:
+ elem.clear()
-def copy_element(original):
- """
- This creates a shallow copy of en Element;
- subelements will be shared with the original tree.
-
- Extracted from ElementTree 1.3.
- """
- elem = original.makeelement(original.tag, original.attrib)
- elem.text = original.text
- elem.tail = original.tail
- elem[:] = list(original)
- return elem
-
def unescape_attribute(text):
    """
    Unescape the value of an XML attribute.

    ``xml.sax.saxutils.unescape`` handles ``&amp;``, ``&lt;`` and ``&gt;``
    on its own; the extra entity map additionally turns ``&quot;`` back
    into a literal double quote.
    """
    # The key must be the escaped entity; mapping '"' to '"' is a no-op.
    return xml.sax.saxutils.unescape(text, {'&quot;': '"'})
View
@@ -28,11 +28,18 @@ def test_raw_loading(self):
loaded_line = self.corpus._get_text_by_line_offset(3) # this is reliable
self.assertEqual(loaded_raw, loaded_line)
- def test_text_xml(self):
+ def test_single_text_xml(self):
xml = self.corpus._text_xml(3)
tokens = xml.findall('paragraphs//token')
self.assertEqual(tokens[17].get('text'), 'арт-группы')
+ def test_texts_xml(self):
+ text = self.corpus.texts()[2]
+ tokens = text.tokens()
+ self.assertTrue(tokens)
+ self.assertEqual(tokens[17], 'арт-группы')
+
+
def test_text_titles(self):
titles = [text.title() for text in self.corpus.itertexts()]
catalog_titles = list(dict(self.corpus.catalog()).values())
@@ -51,8 +58,18 @@ def test_tokens(self):
def test_paras(self):
paras = self.corpus.paras()
+ self.assertEqual(len(paras), 41)
+
for para in paras:
- self.assertTrue(len(para.tokens()) > 0)
+ self.assertTrue(para.tokens())
+ self.assertTrue(para.sents())
+
+ def test_sents(self):
+ sents = self.corpus.sents()
+ self.assertEqual(len(sents), 102)
+
+ for sent in sents:
+ self.assertTrue(sent.tokens())
class TextTest(BaseTest):
@@ -64,3 +81,10 @@ def test_tokens(self):
self.assertEqual(len(tokens), 1027)
self.assertEqual(tokens[9], 'градус')
+
+ def test_sents(self):
+ sents = self.corpus.get_text(2).sents()
+ self.assertEqual(len(sents), 44)
+ self.assertEqual(sents[1].as_text(), 'Сохранится ли градус дискуссии в новом сезоне?')
+
+
View
12 tox.ini
@@ -4,12 +4,20 @@ envlist = py26,py27,py32,pypy
[testenv]
deps=
nose
+ coverage
commands=
- nosetests --nocapture
+ nosetests --nocapture --with-coverage --cover-package=opencorpora
[testenv:py26]
deps=
nose
+ coverage
ordereddict
- argparse
+ argparse
+
+[testenv:pypy]
+
+; current coverage is super-slow under pypy
+commands=
+ nosetests --nocapture

0 comments on commit bad9818

Please sign in to comment.