diff --git a/invenio_oaiharvester/api.py b/invenio_oaiharvester/api.py index 255acdf..b5a8930 100644 --- a/invenio_oaiharvester/api.py +++ b/invenio_oaiharvester/api.py @@ -45,7 +45,7 @@ def list_records(metadata_prefix=None, from_date=None, until_date=None, - url=None, name=None, setspecs=None): + url=None, name=None, setspecs=None, encoding=None): """Harvest multiple records from an OAI repo. :param metadata_prefix: The prefix for the metadata return @@ -56,6 +56,8 @@ def list_records(metadata_prefix=None, from_date=None, until_date=None, :param name: The name of the OAIHarvestConfig to use instead of passing specific parameters. :param setspecs: The 'set' criteria for the harvesting (optional). + :param encoding: Override the encoding returned by the server. ISO-8859-1 + if it is not provided by the server. :return: request object, list of harvested records """ lastrun = None @@ -73,7 +75,7 @@ def list_records(metadata_prefix=None, from_date=None, until_date=None, "Retry using the parameters -n or -u ." ) - request = Sickle(url) + request = Sickle(url, encoding=encoding) # By convention, when we have a url we have no lastrun, and when we use # the name we can either have from_date (if provided) or lastrun. @@ -114,7 +116,8 @@ def list_records(metadata_prefix=None, from_date=None, until_date=None, return request, records.values() -def get_records(identifiers, metadata_prefix=None, url=None, name=None): +def get_records(identifiers, metadata_prefix=None, url=None, name=None, + encoding=None): """Harvest specific records from an OAI repo via OAI-PMH identifiers. :param metadata_prefix: The prefix for the metadata return @@ -123,6 +126,8 @@ def get_records(identifiers, metadata_prefix=None, url=None, name=None): :param url: The The url to be used to create the endpoint. :param name: The name of the OAIHarvestConfig to use instead of passing specific parameters. + :param encoding: Override the encoding returned by the server. 
ISO-8859-1 + if it is not provided by the server. :return: request object, list of harvested records """ if name: @@ -137,7 +142,7 @@ def get_records(identifiers, metadata_prefix=None, url=None, name=None): "Retry using the parameters -n or -u ." ) - request = Sickle(url) + request = Sickle(url, encoding=encoding) records = [] for identifier in identifiers: arguments = { diff --git a/invenio_oaiharvester/cli.py b/invenio_oaiharvester/cli.py index bcdeb3f..16cac30 100644 --- a/invenio_oaiharvester/cli.py +++ b/invenio_oaiharvester/cli.py @@ -61,9 +61,13 @@ def oaiharvester(): help="Enqueue harvesting and return immediately.") @click.option('--signals/--no-signals', default=True, help="Signals sent with OAI-PMH harvesting results.") +@click.option('-e', '--encoding', default=None, + help="Override the encoding returned by the server. ISO-8859-1 " + "if it is not provided by the server.") @with_appcontext def harvest(metadata_prefix, name, setspecs, identifiers, from_date, - until_date, url, directory, arguments, quiet, enqueue, signals): + until_date, url, directory, arguments, quiet, enqueue, signals, + encoding): """Harvest records from an OAI repository.""" arguments = dict(x.split('=', 1) for x in arguments) records = None @@ -84,7 +88,8 @@ def harvest(metadata_prefix, name, setspecs, identifiers, from_date, until_date, url, name, - setspecs + setspecs, + encoding ) else: if (from_date is not None) or (until_date is not None): @@ -104,7 +109,8 @@ def harvest(metadata_prefix, name, setspecs, identifiers, from_date, identifiers, metadata_prefix, url, - name + name, + encoding ) if records: diff --git a/invenio_oaiharvester/tasks.py b/invenio_oaiharvester/tasks.py index 7295a14..3166d7e 100644 --- a/invenio_oaiharvester/tasks.py +++ b/invenio_oaiharvester/tasks.py @@ -30,7 +30,8 @@ @shared_task def get_specific_records(identifiers, metadata_prefix=None, url=None, - name=None, signals=True, **kwargs): + name=None, signals=True, encoding=None, + **kwargs): """Harvest 
specific records from an OAI repo via OAI-PMH identifiers. :param metadata_prefix: The prefix for the metadata return (e.g. 'oai_dc') @@ -39,9 +40,12 @@ def get_specific_records(identifiers, metadata_prefix=None, url=None, :param name: The name of the OAIHarvestConfig to use instead of passing specific parameters. :param signals: If signals should be emitted about results. + :param encoding: Override the encoding returned by the server. ISO-8859-1 + if it is not provided by the server. """ identifiers = get_identifier_names(identifiers) - request, records = get_records(identifiers, metadata_prefix, url, name) + request, records = get_records(identifiers, metadata_prefix, url, name, + encoding) if signals: oaiharvest_finished.send(request, records=records, name=name, **kwargs) @@ -50,7 +54,7 @@ def get_specific_records(identifiers, metadata_prefix=None, url=None, def list_records_from_dates(metadata_prefix=None, from_date=None, until_date=None, url=None, name=None, setspecs=None, signals=True, - **kwargs): + encoding=None, **kwargs): """Harvest multiple records from an OAI repo. :param metadata_prefix: The prefix for the metadata return (e.g. 'oai_dc') @@ -61,6 +65,8 @@ def list_records_from_dates(metadata_prefix=None, from_date=None, specific parameters. :param setspecs: The 'set' criteria for the harvesting (optional). :param signals: If signals should be emitted about results. + :param encoding: Override the encoding returned by the server. ISO-8859-1 + if it is not provided by the server. 
""" request, records = list_records( metadata_prefix, @@ -68,7 +74,8 @@ def list_records_from_dates(metadata_prefix=None, from_date=None, until_date, url, name, - setspecs + setspecs, + encoding ) if signals: oaiharvest_finished.send(request, records=records, name=name, **kwargs) diff --git a/setup.py b/setup.py index f911d18..031ea80 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ 'pytest-cov>=2.4.0', 'pytest-pep8>=1.0.6', 'pytest>=2.8.0', - 'responses>=0.5.1', + 'responses>=0.8.0', 'celery>=3.1.25,<4.0', ] @@ -75,7 +75,7 @@ 'Flask>=0.12', 'flask-celeryext>=0.2.2', 'blinker>=1.4', - 'sickle>=0.5', + 'sickle>=0.6.1', ] packages = find_packages() diff --git a/tests/conftest.py b/tests/conftest.py index 8d219f3..a758d0c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -102,6 +102,15 @@ def sample_record_xml(): return raw_xml +@pytest.fixture() +def sample_record_xml_utf8(): + raw_xml = open(os.path.join( + os.path.dirname(__file__), + "data/sample_arxiv_response_utf8.xml" + )).read() + return raw_xml + + @pytest.fixture() def sample_record_xml_oai_dc(): raw_xml = open(os.path.join( diff --git a/tests/data/sample_arxiv_response_utf8.xml b/tests/data/sample_arxiv_response_utf8.xml new file mode 100644 index 0000000..30e8dc3 --- /dev/null +++ b/tests/data/sample_arxiv_response_utf8.xml @@ -0,0 +1,45 @@ + + + 2017-12-07T10:34:37Z + http://export.arxiv.org/oai2 + + +
+ oai:arXiv.org:1207.1019 + 2015-01-16 + cs + stat +
+ + + PAC-Bayesian Majority Vote for Late Classifier Fusion + Morvant, Emilie + Habrard, Amaury + Ayache, Stéphane + Statistics - Machine Learning + Computer Science - Computer Vision and Pattern Recognition + Computer Science - Learning + Computer Science - Multimedia + A lot of attention has been devoted to multimedia indexing over the past few + years. In the literature, we often consider two kinds of fusion schemes: The + early fusion and the late fusion. In this paper we focus on late classifier + fusion, where one combines the scores of each modality at the decision level. + To tackle this problem, we investigate a recent and elegant well-founded + quadratic program named MinCq coming from the Machine Learning PAC-Bayes + theory. MinCq looks for the weighted combination, over a set of real-valued + functions seen as voters, leading to the lowest misclassification rate, while + making use of the voters' diversity. We provide evidence that this method is + naturally adapted to late fusion procedure. We propose an extension of MinCq by + adding an order- preserving pairwise loss for ranking, helping to improve Mean + Averaged Precision measure. We confirm the good behavior of the MinCq-based + fusion approaches with experiments on a real image benchmark. + + Comment: 7 pages, Research report + 2012-07-04 + text + http://arxiv.org/abs/1207.1019 + + +
+
+
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d397274..e615be8 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -109,7 +109,8 @@ def test_cli_harvest_list(script_info, sample_empty_set):
          '-m', 'arXiv',
          '-s', 'physics',
          '-f', '2015-01-17',
-         '-t', '2015-01-17'],
+         '-t', '2015-01-17',
+         '-e', 'utf-8'],
         obj=script_info
     )
     assert result.exit_code == 0
diff --git a/tests/test_harvesting.py b/tests/test_harvesting.py
index d112698..283c910 100644
--- a/tests/test_harvesting.py
+++ b/tests/test_harvesting.py
@@ -44,6 +44,48 @@ def test_model_based_harvesting(app, sample_config, sample_record_xml):
         assert len(records) == 1
 
 
+@responses.activate
+def test_model_based_utf8_harvesting(app, sample_config,
+                                     sample_record_xml_utf8):
+    """Test harvesting of a UTF-8 encoded response.
+
+    Covers both ways the encoding can be determined: announced by the
+    server in the Content-Type header, or forced by the caller through the
+    ``encoding`` argument when the server does not announce it.
+    """
+    url = 'http://export.arxiv.org/oai2'
+    identifier = 'oai:arXiv.org:1207.1019'
+
+    # Case 1: the server declares the charset, no override is needed.
+    responses.add(
+        responses.GET,
+        url,
+        body=sample_record_xml_utf8,
+        content_type='text/xml;charset=utf-8'
+    )
+
+    with app.app_context():
+        _, records = get_records([identifier], name=sample_config)
+        record = records.pop()
+        assert u'Stéphane' in record.raw
+
+    # Case 2: the server omits the charset, so the caller overrides the
+    # encoding explicitly.
+    responses.remove(responses.GET, url)
+    responses.add(
+        responses.GET,
+        url,
+        body=sample_record_xml_utf8,
+        content_type='text/xml'
+    )
+
+    with app.app_context():
+        _, records = get_records([identifier], name=sample_config,
+                                 encoding='utf-8')
+        record = records.pop()
+        assert u'Stéphane' in record.raw
+
+
 @responses.activate
 def test_model_based_harvesting_list(app, sample_config, sample_list_xml):
     """Test harvesting using model."""