first draft

kmike · Apr 8, 2012 · b98b35e · b98b35e
commit b98b35e
Show file tree

Hide file tree

Showing 10 changed files with 311 additions and 0 deletions.
diff --git a/.hgignore b/.hgignore
@@ -0,0 +1,26 @@
+\.settings
+\.project
+\.pydevproject
+\.cache/*
+\.idea/*
+
+#temp files
+\.pyc$
+\.orig$
+~$
+
+#os files
+\.DS_Store
+Thumbs.db$
+
+#setup
+^build
+^dist
+^MANIFEST$
+\.egg-info$
+
+#project-specific files
+\.tox
+^stuff
+annot.opcorpora.xml$
+\.bz2$
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2012 Mikhail Korobov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+include *.rst
+include tox.ini
diff --git a/README.rst b/README.rst
@@ -0,0 +1,23 @@
+=================
+opencorpora-tools
+=================
+
+This package provides a Python interface to http://opencorpora.org/
+
+Installation
+============
+
+::
+
+    pip install opencorpora-tools
+
+If you use Python < 2.7, the argparse and ordereddict packages are also required::
+
+    pip install argparse
+    pip install ordereddict
+
+Usage
+=====
+
+TODO
+
diff --git a/bin/opencorpora b/bin/opencorpora
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+import sys
+from opencorpora import cli
+sys.exit(cli.main())  # run the command-line interface; its return value becomes the exit status
diff --git a/opencorpora/__init__.py b/opencorpora/__init__.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+import re
+import codecs
+from collections import namedtuple
+from .compat import ElementTree, OrderedDict
+
+TextOffset = namedtuple('TextOffset', 'line_start line_end raw_start raw_end')  # position of one text in the file: 0-based line indices and byte offsets
+
+class Corpora(object):
+    """
+    Opencorpora.ru corpora reader. Provides fast access to individual
+    texts without loading and parsing the whole XML; is capable of iterating
+    over individual paragraphs, sentences and tokens without loading
+    all data to memory.
+    """
+    def __init__(self, filename):
+        self.filename = filename
+        self._text_meta = OrderedDict()
+        self._populate_text_meta()
+
+    def _populate_text_meta(self):
+        """
+        Populates texts meta information cache for fast lookups.
+        """
+        for text_id, offset in self._text_offsets():
+            self._text_meta[text_id] = offset
+
+    def _text_offsets(self):
+        START_RE = re.compile(r'<text id="(\d+)"')
+        text_id, line_start, line_end, raw_start, raw_end = None, None, None, None, None
+        offset = 0
+        with open(self.filename, 'rb') as f:
+            for index, line in enumerate(f):
+                line_text = line.decode('utf8')
+                mo = re.match(START_RE, line_text)
+                if mo:
+                    text_id, line_start, raw_start = mo.group(1), index, offset
+
+                offset += len(line)
+
+                if '</text>' in line_text:
+                    yield text_id, TextOffset(line_start, index, raw_start, offset)
+                    text_id, line_start, line_end, raw_start, raw_end = None, None, None, None, None
+
+    def _get_text_by_raw_offset(self, text_id):
+        """
+        Loads text from xml using bytes offset information.
+        XXX: this is not tested under Windows.
+        """
+        offset = self._text_meta[text_id]
+        with open(self.filename, 'rb') as f:
+            f.seek(offset.raw_start)
+            return f.read(offset.raw_end-offset.raw_start).decode('utf8')
+
+    def _get_text_by_line_offset(self, text_id):
+        """
+        Loads text from xml using line offset information.
+        This is much slower than _get_text_by_raw_offset but should
+        work everywhere.
+        """
+        offset = self._text_meta[text_id]
+        lines = []
+        with codecs.open(self.filename, 'rb', 'utf8') as f:
+            for index, line in enumerate(f):
+                if index >= offset.line_start:
+                    lines.append(line)
+                if index >= offset.line_end:
+                    break
+        return ''.join(lines)
+
+    def get_text_ids(self):
+        return list(self._text_meta.keys())
+
+    def get_text_xml(self, text_id):
+        """
+        Returns xml Element for the text text_id.
+        """
+        text_str = self._get_text_by_raw_offset(str(text_id))
+        return ElementTree.XML(text_str.encode('utf8'))
+#
+#    def words(self):
+#        # list of str
+#        pass
+#
+#    def sents(self):
+#        # list of (list of str)
+#        pass
+#
+#    def paras(self):
+#        #list of (list of (list of str))
+#        pass
+#
+#    def tagged_words(self):
+#        # list of (str,str) tuple
+#        pass
+#
+#    def tagged_sents(self):
+#        # list of (list of (str,str))
+#        pass
+#
+#    def tagged_paras(self):
+#        # list of (list of (list of (str,str)))
+#        pass
+#
diff --git a/opencorpora/cli.py b/opencorpora/cli.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+import sys
+import bz2
+import argparse
+import urllib2
+
+CORPORA_URL_BZ2 = 'http://opencorpora.org/files/export/annot/annot.opcorpora.xml.bz2'  # default value of --url
+DEFAULT_OUT_FILE = 'annot.opcorpora.xml.bz2'  # default value of --output
+CHUNK_SIZE = 256*1024  # number of bytes requested from the server per read
+
+parser = argparse.ArgumentParser(
+    description='opencorpora.org interface',
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+subparsers = parser.add_subparsers()  # each action of the tool is a sub-command
+
+parser_download = subparsers.add_parser('download',
+    help='download and decompress annotated XML corpora',
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+parser_download.add_argument('-o', '--output', type=str, help='destination file', default=DEFAULT_OUT_FILE)
+parser_download.add_argument('--url', help='download url', default=CORPORA_URL_BZ2)
+parser_download.add_argument('--decompress',  help='decompress', action='store_true')  # off by default: the archive is saved as is
+
+def download(args):
+    decompressor = bz2.BZ2Decompressor()
+    out_file = args.output
+    if args.decompress and out_file == DEFAULT_OUT_FILE:
+        out_file = DEFAULT_OUT_FILE[:-4]
+
+    print('Connecting...')
+    fp = urllib2.urlopen(args.url, timeout=30)
+
+    print('Creating %s from %s' % (out_file, args.url))
+    with open(out_file, 'w') as out:
+        while 1:
+            data = fp.read(CHUNK_SIZE)
+            if not data:
+                break
+
+            if args.decompress:
+                out.write(decompressor.decompress(data))
+            else:
+                out.write(data)
+
+            sys.stdout.write('.')
+            sys.stdout.flush()
+
+    print('\nDone.')
+
+parser_download.set_defaults(func=download)
+
+def main():
+    args = parser.parse_args()
+    return args.func(args)
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/opencorpora/compat.py b/opencorpora/compat.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+try:
+    from xml.etree import cElementTree as ElementTree
+except ImportError:
+    from xml.etree import ElementTree
+
+
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
diff --git a/setup.py b/setup.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+import sys
+from distutils.core import setup
+
+for cmd in ('egg_info', 'develop'):
+    if cmd in sys.argv:
+        from setuptools import setup
+
+__version__ = '0.1'
+
+setup(
+    name = 'opencorpora-tools',
+    version = __version__,
+    author = 'Mikhail Korobov',
+    author_email = 'kmike84@gmail.com',
+    url = 'https://github.com/kmike/opencorpora-tools/',
+
+    description = 'opencorpora.org python interface',
+    long_description = open('README.rst').read(),
+
+    license = 'MIT license',
+    packages = ['opencorpora'],
+    scripts=['bin/opencorpora'],
+    requires = ['argparse', 'ordereddict'],
+
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: MIT License',
+        'Natural Language :: Russian',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: PyPy',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: Scientific/Engineering :: Information Analysis',
+        'Topic :: Text Processing :: Linguistic',
+    ],
+)
diff --git a/tox.ini b/tox.ini
@@ -0,0 +1,15 @@
+[tox]
+envlist = py26,py27,py32,pypy
+
+[testenv]
+deps=
+    nose
+
+commands=
+    nosetests --nocapture
+
+[testenv:py26]
+deps=
+    nose
+    ordereddict
+    argparse