Skip to content

Commit

Permalink
first draft
Browse files Browse the repository at this point in the history
  • Loading branch information
kmike committed Apr 8, 2012
0 parents commit b98b35e
Show file tree
Hide file tree
Showing 10 changed files with 311 additions and 0 deletions.
26 changes: 26 additions & 0 deletions .hgignore
@@ -0,0 +1,26 @@
\.settings
\.project
\.pydevproject
\.cache/*
\.idea/*

#temp files
\.pyc$
\.orig$
~$

#os files
\.DS_Store
Thumbs.db$

#setup
^build
^dist
^MANIFEST$
\.egg-info$

#project-specific files
\.tox
^stuff
annot.opcorpora.xml$
\.bz2$
19 changes: 19 additions & 0 deletions LICENSE.txt
@@ -0,0 +1,19 @@
Copyright (c) 2012 Mikhail Korobov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,2 @@
include *.rst
include tox.ini
23 changes: 23 additions & 0 deletions README.rst
@@ -0,0 +1,23 @@
=================
opencorpora-tools
=================

This package provides a Python interface to http://opencorpora.org/

Installation
============

::

    pip install opencorpora-tools

If you have Python < 2.7, the ``argparse`` and ``ordereddict`` packages are also required::

    pip install argparse
    pip install ordereddict

Usage
=====

TODO

4 changes: 4 additions & 0 deletions bin/opencorpora
@@ -0,0 +1,4 @@
#!/usr/bin/env python
# Command-line launcher: runs opencorpora.cli.main() and passes its return
# value to sys.exit() as the process exit status.
import sys
from opencorpora import cli
sys.exit(cli.main())
105 changes: 105 additions & 0 deletions opencorpora/__init__.py
@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import re
import codecs
from collections import namedtuple
from .compat import ElementTree, OrderedDict

# Location of one <text> element inside the corpus file:
#   line_start / line_end -- 0-based indices of its first and last line;
#   raw_start / raw_end   -- byte offsets of its first byte and of the byte
#                            just past its last line.
TextOffset = namedtuple('TextOffset', 'line_start line_end raw_start raw_end')


class Corpora(object):
    """
    Opencorpora.ru corpora reader.

    The corpus file is scanned once on construction to record where each
    ``<text>`` element starts and ends. Individual texts can then be loaded
    by id without parsing the whole XML document.
    """

    def __init__(self, filename):
        """
        Remembers ``filename`` and builds the text offsets index
        (this reads the whole file once).
        """
        self.filename = filename
        self._text_meta = OrderedDict()
        self._populate_text_meta()

    def _populate_text_meta(self):
        """
        Populates texts meta information cache for fast lookups.
        """
        for text_id, offset in self._text_offsets():
            self._text_meta[text_id] = offset

    def _text_offsets(self):
        """
        Yields ``(text_id, TextOffset)`` pairs, in file order.

        The file is read in binary mode so that the byte offsets are exact
        and lines are split on ``\\n`` only. An opening tag is recognised
        only at the very beginning of a line; a text ends on the first
        following line containing ``</text>``.
        """
        start_re = re.compile(r'<text id="(\d+)"')
        text_id = line_start = raw_start = None
        offset = 0  # number of bytes consumed so far
        with open(self.filename, 'rb') as f:
            for index, line in enumerate(f):
                line_text = line.decode('utf8')
                mo = start_re.match(line_text)
                if mo:
                    text_id, line_start, raw_start = mo.group(1), index, offset

                # from here on ``offset`` points just past the current line
                offset += len(line)

                if '</text>' in line_text:
                    yield text_id, TextOffset(line_start, index, raw_start, offset)
                    text_id = line_start = raw_start = None

    def _get_text_by_raw_offset(self, text_id):
        """
        Loads text from xml using bytes offset information.
        XXX: this is not tested under Windows.

        Returns the decoded (unicode) XML fragment; raises ``KeyError``
        if ``text_id`` is not in the index.
        """
        offset = self._text_meta[text_id]
        with open(self.filename, 'rb') as f:
            f.seek(offset.raw_start)
            return f.read(offset.raw_end - offset.raw_start).decode('utf8')

    def _get_text_by_line_offset(self, text_id):
        """
        Loads text from xml using line offset information.
        This is much slower than _get_text_by_raw_offset but should
        work everywhere.

        Returns the decoded (unicode) XML fragment; raises ``KeyError``
        if ``text_id`` is not in the index.
        """
        offset = self._text_meta[text_id]
        lines = []
        # The line numbers were computed over binary lines (split on '\n'
        # only), so the file must be read the same way here: a decoding
        # reader also splits on characters such as U+2028 or U+0085 and
        # would shift the indices.
        with open(self.filename, 'rb') as f:
            for index, line in enumerate(f):
                if index >= offset.line_start:
                    lines.append(line)
                if index >= offset.line_end:
                    break
        return b''.join(lines).decode('utf8')

    def get_text_ids(self):
        """
        Returns the list of text ids (strings) in the order they appear
        in the corpus file.
        """
        return list(self._text_meta.keys())

    def get_text_xml(self, text_id):
        """
        Returns xml Element for the text text_id.

        ``text_id`` may be an int or a string; raises ``KeyError`` if there
        is no such text.
        """
        text_str = self._get_text_by_raw_offset(str(text_id))
        return ElementTree.XML(text_str.encode('utf8'))
#
# def words(self):
# # list of str
# pass
#
# def sents(self):
# # list of (list of str)
# pass
#
# def paras(self):
# #list of (list of (list of str))
# pass
#
# def tagged_words(self):
# # list of (str,str) tuple
# pass
#
# def tagged_sents(self):
# # list of (list of (str,str))
# pass
#
# def tagged_paras(self):
# # list of (list of (list of (str,str)))
# pass
#
60 changes: 60 additions & 0 deletions opencorpora/cli.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import sys
import bz2
import argparse
try:
    import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2

# Remote location of the annotated corpus archive and the default local name.
CORPORA_URL_BZ2 = 'http://opencorpora.org/files/export/annot/annot.opcorpora.xml.bz2'
DEFAULT_OUT_FILE = 'annot.opcorpora.xml.bz2'
# Number of bytes requested per read while downloading.
CHUNK_SIZE = 256 * 1024

# Every (sub)parser shows argument defaults in its help output.
_HELP_FORMATTER = argparse.ArgumentDefaultsHelpFormatter

parser = argparse.ArgumentParser(
    formatter_class=_HELP_FORMATTER,
    description='opencorpora.org interface',
)
subparsers = parser.add_subparsers()

# "download" sub-command and its options.
parser_download = subparsers.add_parser(
    'download',
    formatter_class=_HELP_FORMATTER,
    help='download and decompress annotated XML corpora',
)
parser_download.add_argument(
    '-o', '--output',
    type=str,
    default=DEFAULT_OUT_FILE,
    help='destination file',
)
parser_download.add_argument(
    '--url',
    default=CORPORA_URL_BZ2,
    help='download url',
)
parser_download.add_argument(
    '--decompress',
    action='store_true',
    help='decompress',
)

def download(args):
    """
    Handler of the ``download`` sub-command.

    Fetches ``args.url`` in chunks of ``CHUNK_SIZE`` bytes and writes the
    data to ``args.output``, printing a dot per chunk as a progress
    indicator. With ``args.decompress`` the bz2 stream is decompressed on
    the fly; if the output name was left at its default, the ``.bz2``
    suffix is dropped from it in that case.

    Returns ``None``.
    """
    out_file = args.output
    if args.decompress and out_file == DEFAULT_OUT_FILE:
        out_file = DEFAULT_OUT_FILE[:-4]  # strip the '.bz2' suffix

    decompressor = bz2.BZ2Decompressor() if args.decompress else None

    print('Connecting...')
    fp = urllib2.urlopen(args.url, timeout=30)
    try:
        print('Creating %s from %s' % (out_file, args.url))
        # The payload is bytes (compressed or not), so the file must be
        # opened in binary mode: text mode would mangle newlines on Windows
        # and rejects bytes on Python 3.
        with open(out_file, 'wb') as out:
            while True:
                data = fp.read(CHUNK_SIZE)
                if not data:
                    break

                if decompressor is not None:
                    data = decompressor.decompress(data)
                out.write(data)

                sys.stdout.write('.')
                sys.stdout.flush()
    finally:
        # always release the connection, even if writing fails
        fp.close()

    print('\nDone.')

# main() dispatches on ``args.func``; bind the "download" sub-command to its handler.
parser_download.set_defaults(func=download)

def main(argv=None):
    """
    Parses the command line and runs the selected sub-command.

    ``argv`` is the list of arguments to parse; ``None`` (the default)
    means ``sys.argv[1:]``. Returns whatever the sub-command handler
    returns, suitable for passing to ``sys.exit``.
    """
    args = parser.parse_args(argv)
    func = getattr(args, 'func', None)
    if func is None:
        # Python 3's argparse accepts a command line without a sub-command,
        # leaving ``func`` unset; report it as a usage error (prints the
        # usage message and exits with status 2) instead of crashing.
        parser.error('too few arguments')
    return func(args)


if __name__ == '__main__':
    sys.exit(main())
13 changes: 13 additions & 0 deletions opencorpora/compat.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

try:
from xml.etree import cElementTree as ElementTree
except ImportError:
from xml.etree import ElementTree


try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
44 changes: 44 additions & 0 deletions setup.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python
import sys
from distutils.core import setup

# The ``egg_info`` and ``develop`` commands are provided by setuptools,
# so use its ``setup`` when one of them is requested.
if any(cmd in sys.argv for cmd in ('egg_info', 'develop')):
    from setuptools import setup

__version__ = '0.1'

# Read the long description with a context manager so the file handle
# is closed right away instead of being left to the garbage collector.
with open('README.rst') as readme:
    long_description = readme.read()

setup(
    name = 'opencorpora-tools',
    version = __version__,
    author = 'Mikhail Korobov',
    author_email = 'kmike84@gmail.com',
    url = 'https://github.com/kmike/opencorpora-tools/',

    description = 'opencorpora.org python interface',
    long_description = long_description,

    license = 'MIT license',
    packages = ['opencorpora'],
    scripts=['bin/opencorpora'],
    requires = ['argparse', 'ordereddict'],

    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: Russian',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing :: Linguistic',
    ],
)
15 changes: 15 additions & 0 deletions tox.ini
@@ -0,0 +1,15 @@
[tox]
envlist = py26,py27,py32,pypy

[testenv]
deps=
nose

commands=
nosetests --nocapture

[testenv:py26]
deps=
nose
ordereddict
argparse

0 comments on commit b98b35e

Please sign in to comment.