Skip to content

Commit

Permalink
Merge pull request #51 from rero/rero-encoding-support
Browse files Browse the repository at this point in the history
feat: add encoding support
  • Loading branch information
david-caro committed Jan 11, 2018
2 parents 8b5c0a7 + e4d3311 commit a4a2b54
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 14 deletions.
13 changes: 9 additions & 4 deletions invenio_oaiharvester/api.py
Expand Up @@ -45,7 +45,7 @@


def list_records(metadata_prefix=None, from_date=None, until_date=None,
url=None, name=None, setspecs=None):
url=None, name=None, setspecs=None, encoding=None):
"""Harvest multiple records from an OAI repo.
:param metadata_prefix: The prefix for the metadata return
Expand All @@ -56,6 +56,8 @@ def list_records(metadata_prefix=None, from_date=None, until_date=None,
:param name: The name of the OAIHarvestConfig to use instead of passing
specific parameters.
:param setspecs: The 'set' criteria for the harvesting (optional).
:param encoding: Override the encoding returned by the server. ISO-8859-1
if it is not provided by the server.
:return: request object, list of harvested records
"""
lastrun = None
Expand All @@ -73,7 +75,7 @@ def list_records(metadata_prefix=None, from_date=None, until_date=None,
"Retry using the parameters -n <name> or -u <url>."
)

request = Sickle(url)
request = Sickle(url, encoding=encoding)

# By convention, when we have a url we have no lastrun, and when we use
# the name we can either have from_date (if provided) or lastrun.
Expand Down Expand Up @@ -114,7 +116,8 @@ def list_records(metadata_prefix=None, from_date=None, until_date=None,
return request, records.values()


def get_records(identifiers, metadata_prefix=None, url=None, name=None):
def get_records(identifiers, metadata_prefix=None, url=None, name=None,
encoding=None):
"""Harvest specific records from an OAI repo via OAI-PMH identifiers.
:param metadata_prefix: The prefix for the metadata return
Expand All @@ -123,6 +126,8 @@ def get_records(identifiers, metadata_prefix=None, url=None, name=None):
:param url: The url to be used to create the endpoint.
:param name: The name of the OAIHarvestConfig to use instead of passing
specific parameters.
:param encoding: Override the encoding returned by the server. ISO-8859-1
if it is not provided by the server.
:return: request object, list of harvested records
"""
if name:
Expand All @@ -137,7 +142,7 @@ def get_records(identifiers, metadata_prefix=None, url=None, name=None):
"Retry using the parameters -n <name> or -u <url>."
)

request = Sickle(url)
request = Sickle(url, encoding=encoding)
records = []
for identifier in identifiers:
arguments = {
Expand Down
12 changes: 9 additions & 3 deletions invenio_oaiharvester/cli.py
Expand Up @@ -61,9 +61,13 @@ def oaiharvester():
help="Enqueue harvesting and return immediately.")
@click.option('--signals/--no-signals', default=True,
help="Signals sent with OAI-PMH harvesting results.")
@click.option('-e', '--encoding', default=None,
help="Override the encoding returned by the server. ISO-8859-1 "
"if it is not provided by the server.")
@with_appcontext
def harvest(metadata_prefix, name, setspecs, identifiers, from_date,
until_date, url, directory, arguments, quiet, enqueue, signals):
until_date, url, directory, arguments, quiet, enqueue, signals,
encoding):
"""Harvest records from an OAI repository."""
arguments = dict(x.split('=', 1) for x in arguments)
records = None
Expand All @@ -84,7 +88,8 @@ def harvest(metadata_prefix, name, setspecs, identifiers, from_date,
until_date,
url,
name,
setspecs
setspecs,
encoding
)
else:
if (from_date is not None) or (until_date is not None):
Expand All @@ -104,7 +109,8 @@ def harvest(metadata_prefix, name, setspecs, identifiers, from_date,
identifiers,
metadata_prefix,
url,
name
name,
encoding
)

if records:
Expand Down
15 changes: 11 additions & 4 deletions invenio_oaiharvester/tasks.py
Expand Up @@ -30,7 +30,8 @@

@shared_task
def get_specific_records(identifiers, metadata_prefix=None, url=None,
name=None, signals=True, **kwargs):
name=None, signals=True, encoding=None,
**kwargs):
"""Harvest specific records from an OAI repo via OAI-PMH identifiers.
:param metadata_prefix: The prefix for the metadata return (e.g. 'oai_dc')
Expand All @@ -39,9 +40,12 @@ def get_specific_records(identifiers, metadata_prefix=None, url=None,
:param name: The name of the OAIHarvestConfig to use instead of passing
specific parameters.
:param signals: If signals should be emitted about results.
:param encoding: Override the encoding returned by the server. ISO-8859-1
if it is not provided by the server.
"""
identifiers = get_identifier_names(identifiers)
request, records = get_records(identifiers, metadata_prefix, url, name)
request, records = get_records(identifiers, metadata_prefix, url, name,
encoding)
if signals:
oaiharvest_finished.send(request, records=records, name=name, **kwargs)

Expand All @@ -50,7 +54,7 @@ def get_specific_records(identifiers, metadata_prefix=None, url=None,
def list_records_from_dates(metadata_prefix=None, from_date=None,
until_date=None, url=None,
name=None, setspecs=None, signals=True,
**kwargs):
encoding=None, **kwargs):
"""Harvest multiple records from an OAI repo.
:param metadata_prefix: The prefix for the metadata return (e.g. 'oai_dc')
Expand All @@ -61,14 +65,17 @@ def list_records_from_dates(metadata_prefix=None, from_date=None,
specific parameters.
:param setspecs: The 'set' criteria for the harvesting (optional).
:param signals: If signals should be emitted about results.
:param encoding: Override the encoding returned by the server. ISO-8859-1
if it is not provided by the server.
"""
request, records = list_records(
metadata_prefix,
from_date,
until_date,
url,
name,
setspecs
setspecs,
encoding
)
if signals:
oaiharvest_finished.send(request, records=records, name=name, **kwargs)
4 changes: 2 additions & 2 deletions setup.py
Expand Up @@ -41,7 +41,7 @@
'pytest-cov>=2.4.0',
'pytest-pep8>=1.0.6',
'pytest>=2.8.0',
'responses>=0.5.1',
'responses>=0.8.0',
'celery>=3.1.25,<4.0',
]

Expand Down Expand Up @@ -75,7 +75,7 @@
'Flask>=0.12',
'flask-celeryext>=0.2.2',
'blinker>=1.4',
'sickle>=0.5',
'sickle>=0.6.1',
]

packages = find_packages()
Expand Down
9 changes: 9 additions & 0 deletions tests/conftest.py
Expand Up @@ -102,6 +102,15 @@ def sample_record_xml():
return raw_xml


@pytest.fixture()
def sample_record_xml_utf8():
    """Return the contents of the UTF-8 sample arXiv OAI-PMH response.

    The payload is read from ``data/sample_arxiv_response_utf8.xml``,
    resolved relative to the directory containing this module.
    """
    xml_path = os.path.join(
        os.path.dirname(__file__),
        "data/sample_arxiv_response_utf8.xml"
    )
    # Read through a context manager so the file handle is closed as soon as
    # the content has been loaded, instead of being leaked until garbage
    # collection.
    with open(xml_path) as xml_file:
        return xml_file.read()


@pytest.fixture()
def sample_record_xml_oai_dc():
raw_xml = open(os.path.join(
Expand Down
45 changes: 45 additions & 0 deletions tests/data/sample_arxiv_response_utf8.xml
@@ -0,0 +1,45 @@
<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
<responseDate>2017-12-07T10:34:37Z</responseDate>
<request verb="GetRecord" identifier="oai:arXiv.org:1207.1019" metadataPrefix="oai_dc">http://export.arxiv.org/oai2</request>
<GetRecord>
<record>
<header>
<identifier>oai:arXiv.org:1207.1019</identifier>
<datestamp>2015-01-16</datestamp>
<setSpec>cs</setSpec>
<setSpec>stat</setSpec>
</header>
<metadata>
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>PAC-Bayesian Majority Vote for Late Classifier Fusion</dc:title>
<dc:creator>Morvant, Emilie</dc:creator>
<dc:creator>Habrard, Amaury</dc:creator>
<dc:creator>Ayache, Stéphane</dc:creator>
<dc:subject>Statistics - Machine Learning</dc:subject>
<dc:subject>Computer Science - Computer Vision and Pattern Recognition</dc:subject>
<dc:subject>Computer Science - Learning</dc:subject>
<dc:subject>Computer Science - Multimedia</dc:subject>
<dc:description> A lot of attention has been devoted to multimedia indexing over the past few
years. In the literature, we often consider two kinds of fusion schemes: The
early fusion and the late fusion. In this paper we focus on late classifier
fusion, where one combines the scores of each modality at the decision level.
To tackle this problem, we investigate a recent and elegant well-founded
quadratic program named MinCq coming from the Machine Learning PAC-Bayes
theory. MinCq looks for the weighted combination, over a set of real-valued
functions seen as voters, leading to the lowest misclassification rate, while
making use of the voters' diversity. We provide evidence that this method is
naturally adapted to late fusion procedure. We propose an extension of MinCq by
adding an order- preserving pairwise loss for ranking, helping to improve Mean
Averaged Precision measure. We confirm the good behavior of the MinCq-based
fusion approaches with experiments on a real image benchmark.
</dc:description>
<dc:description>Comment: 7 pages, Research report</dc:description>
<dc:date>2012-07-04</dc:date>
<dc:type>text</dc:type>
<dc:identifier>http://arxiv.org/abs/1207.1019</dc:identifier>
</oai_dc:dc>
</metadata>
</record>
</GetRecord>
</OAI-PMH>
3 changes: 2 additions & 1 deletion tests/test_cli.py
Expand Up @@ -109,7 +109,8 @@ def test_cli_harvest_list(script_info, sample_empty_set):
'-m', 'arXiv',
'-s', 'physics',
'-f', '2015-01-17',
'-t', '2015-01-17'],
'-t', '2015-01-17',
'-e', 'utf-8'],
obj=script_info
)
assert result.exit_code == 0
Expand Down
34 changes: 34 additions & 0 deletions tests/test_harvesting.py
Expand Up @@ -44,6 +44,40 @@ def test_model_based_harvesting(app, sample_config, sample_record_xml):
assert len(records) == 1


@responses.activate
def test_model_based_utf8_harvesting(app, sample_config,
                                     sample_record_xml_utf8):
    """Check that a UTF-8 record is harvested with its accents intact.

    The same payload is served twice: once with the charset declared in the
    Content-Type header, and once without it while the encoding is forced
    through the ``encoding`` argument of ``get_records``.
    """
    endpoint = 'http://export.arxiv.org/oai2'
    record_id = 'oai:arXiv.org:1207.1019'

    # First pass: the mocked server declares the charset itself.
    responses.add(
        responses.GET,
        endpoint,
        body=sample_record_xml_utf8,
        content_type='text/xml;charset=utf-8'
    )
    with app.app_context():
        _, harvested = get_records([record_id], name=sample_config)
        first_record = harvested.pop()
        assert u'Stéphane' in first_record.raw

    # Second pass: swap the mocked response for one without a charset in the
    # Content-Type header and pass the encoding explicitly instead.
    responses.remove(responses.GET, endpoint)
    responses.add(
        responses.GET,
        endpoint,
        body=sample_record_xml_utf8,
        content_type='text/xml'
    )
    with app.app_context():
        _, harvested = get_records([record_id], name=sample_config,
                                   encoding='utf-8')
        second_record = harvested.pop()
        assert u'Stéphane' in second_record.raw


@responses.activate
def test_model_based_harvesting_list(app, sample_config, sample_list_xml):
"""Test harvesting using model."""
Expand Down

0 comments on commit a4a2b54

Please sign in to comment.