Skip to content

Commit

Permalink
cli: reharvesting of all OpenAIRE projects
Browse files Browse the repository at this point in the history
Signed-off-by: Krzysztof Nowak <k.nowak@cern.ch>
  • Loading branch information
Krzysztof Nowak committed Jun 14, 2017
1 parent cb3ec8d commit 8c82c42
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 28 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ cache:
services:
- postgresql
- elasticsearch
- redis

addons:
postgresql: "9.4"
Expand Down
45 changes: 41 additions & 4 deletions invenio_openaire/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@

from __future__ import absolute_import, print_function

import json
import os

import click
from flask.cli import with_appcontext

from invenio_openaire.loaders import OAIREDumper
from invenio_openaire.tasks import harvest_fundref, harvest_openaire_projects
from invenio_openaire.tasks import harvest_all_openaire_projects, \
harvest_fundref, harvest_openaire_projects, register_grant


@click.group()
Expand Down Expand Up @@ -64,11 +66,46 @@ def loadfunders(source=None):
type=str,
default='projects',
help="Set to harvest (default: projects).")
@click.option(
'--all', '-A', 'all_grants',
default=False,
is_flag=True,
help="Harvest all grants (default: False).")
@with_appcontext
def loadgrants(source=None, setspec=None):
def loadgrants(source=None, setspec=None, all_grants=False):
"""Harvest grants from OpenAIRE.
:param source: Load the grants from a local sqlite db (offline).
The value of the parameter should be a path to the local file.
:type source: str
:param setspec: Harvest specific set through OAI-PMH
Creates a remote connection to OpenAIRE.
:type setspec: str
:param all_grants: Harvest all sets through OAI-PMH,
as specified in the configuration OPENAIRE_GRANTS_SPEC. Sets are
harvested sequentially in the order specified in the configuration.
Creates a remote connection to OpenAIRE.
:type all_grants: bool
"""
if all_grants:
harvest_all_openaire_projects.delay()
else:
harvest_openaire_projects.delay(source=source, setspec=setspec)
click.echo("Background task sent to queue.")


@openaire.command()
@click.option(
'--source',
type=click.Path(file_okay=True, dir_okay=False, readable=True,
resolve_path=True, exists=True),
help="JSON file with grant information.")
@with_appcontext
def registergrant(source=None, setspec=None):
"""Harvest grants from OpenAIRE."""
harvest_openaire_projects.delay(source=source, setspec=setspec)
click.echo("Background task sent to queue.")
with open(source, 'r') as fp:
data = json.load(fp)
register_grant(data)


@openaire.command()
Expand Down
12 changes: 12 additions & 0 deletions invenio_openaire/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@
OPENAIRE_SCHEMAS_DEFAULT_GRANT = 'grants/grant-v1.0.0.json'
OPENAIRE_JSONRESOLVER_GRANTS_HOST = 'inveniosoftware.org'

OPENAIRE_GRANTS_SPECS = [
'ARCProjects',
'ECProjects',
'FCTProjects',
'MESTDProjects',
'MZOSProjects',
'NHMRCProjects',
'NWOProjects',
'WTProjects',
'NSFProjects',
]


OPENAIRE_REST_ENDPOINTS = dict(
frdoi=dict(
Expand Down
3 changes: 3 additions & 0 deletions invenio_openaire/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,9 @@ def __init__(self, data=None):
'fct_________::FCT': 'http://dx.doi.org/10.13039/501100001871',
'wt__________::WT': 'http://dx.doi.org/10.13039/100004440',
'nsf_________::NSF': 'http://dx.doi.org/10.13039/100000001',
'mestd_______::MESTD': 'http://dx.doi.org/10.13039/501100004564',
'nwo_________::NWO': 'http://dx.doi.org/10.13039/501100003246',
'irb_hr______::MZOS': 'http://dx.doi.org/10.13039/501100006588',
}
self.data = data or fixed_funders
self.inverse_data = {v: k for k, v in self.data.items()}
Expand Down
30 changes: 28 additions & 2 deletions invenio_openaire/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@

from __future__ import absolute_import, print_function

from celery import shared_task
from copy import deepcopy

from celery import chain, shared_task
from flask import current_app
from invenio_db import db
from invenio_indexer.api import RecordIndexer
from invenio_pidstore.errors import PIDDoesNotExistError
from invenio_pidstore.resolver import Resolver
from invenio_records.api import Record
Expand Down Expand Up @@ -55,6 +59,18 @@ def harvest_openaire_projects(source=None, setspec=None):
register_grant.delay(grant_json)


@shared_task(ignore_result=True)
def harvest_all_openaire_projects():
"""Reharvest all grants from OpenAIRE.
Harvest all OpenAIRE grants in a chain to prevent OpenAIRE
overloading from multiple parallel harvesting.
"""
setspecs = current_app.config['OPENAIRE_GRANTS_SPECS']
chain(harvest_openaire_projects.s(setspec=setspec)
for setspec in setspecs).apply_async()


@shared_task(ignore_result=True)
def register_funder(data):
"""Register the funder JSON in records and create a PID."""
Expand All @@ -74,11 +90,21 @@ def create_or_update_record(data, pid_type, id_key, minter):

try:
pid, record = resolver.resolve(data[id_key])
if data['remote_modified'] != record['remote_modified']:
data_c = deepcopy(data)
del data_c['remote_modified']
record_c = deepcopy(data)
del record_c['remote_modified']
# All grants on OpenAIRE are modified periodically even if nothing
# has changed. We need to check for actual differences in the metadata
if data_c != record_c:
record.update(data)
record.commit()
record_id = record.id
db.session.commit()
RecordIndexer().index_by_id(str(record_id))
except PIDDoesNotExistError:
record = Record.create(data)
record_id = record.id
minter(record.id, data)
db.session.commit()
RecordIndexer().index_by_id(str(record_id))
13 changes: 9 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,21 @@ def es(app):
list(current_search.create())
except RequestError:
list(current_search.delete(ignore=[404]))
list(current_search.create())
list(current_search.create(ignore=[400]))
yield current_search
list(current_search.delete(ignore=[404]))


@pytest.yield_fixture()
def indexed_records(app, es, db):
"""Provide elasticsearch access."""
harvest_openaire_projects(source='tests/testdata/openaire_test.sqlite')
def funders(app, es, db):
"""Funder records fixture."""
harvest_fundref(source='tests/testdata/fundref_test.rdf')


@pytest.yield_fixture()
def grants(app, es, db, funders):
"""Grant records fixture."""
harvest_openaire_projects(source='tests/testdata/openaire_test.sqlite')
records = []
for record in RecordMetadata.query.all():
records.append(record.id)
Expand Down
11 changes: 5 additions & 6 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,12 @@
from os.path import dirname, join

from click.testing import CliRunner
from conftest import MockSickle
from invenio_pidstore.models import PersistentIdentifier
from mock import patch

from invenio_openaire.cli import openaire


def test_loadfunders(script_info):
def test_loadfunders(script_info, es):
"""Test CLI for loading grants."""
assert PersistentIdentifier.query.count() == 0
runner = CliRunner()
Expand All @@ -49,9 +47,10 @@ def test_loadfunders(script_info):
assert PersistentIdentifier.query.count() == 6


def test_loadgrants(script_info):
def test_loadgrants(script_info, es, funders):
"""Test CLI for loading grants."""
assert PersistentIdentifier.query.count() == 0
# Funders only
assert PersistentIdentifier.query.count() == 6
runner = CliRunner()
result = runner.invoke(
openaire,
Expand All @@ -60,4 +59,4 @@ def test_loadgrants(script_info):
obj=script_info)
print(result.output)
assert result.exit_code == 0
assert PersistentIdentifier.query.count() == 40
assert PersistentIdentifier.query.count() == 46
4 changes: 2 additions & 2 deletions tests/test_resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def load_all_testdata():

# Test the resolver rule for two possible DOI hosts: dx.doi.org and doi.org
@pytest.mark.parametrize("doi_host", ['dx.doi.org', 'doi.org'])
def test_funders_json_resolving(doi_host, app, db):
def test_funders_json_resolving(doi_host, app, db, es):
"""Test the loadef for the FundRef dataset."""
# Test loading the real FundRef dataset.
# 'grant': {'$ref': 'https://zenodo.org/funders/10.19/11/grants/22'}
Expand All @@ -86,7 +86,7 @@ def test_funders_json_resolving(doi_host, app, db):
assert out_json['parent']['parent']['name'] == 'University of Foo'


def test_grants_json_resolving(app, db):
def test_grants_json_resolving(app, db, es, funders):
"""Test the loadef for the FundRef dataset."""
load_grants_testdata()
grant_ref = {'$ref': 'http://inveniosoftware.org/grants/'
Expand Down
2 changes: 1 addition & 1 deletion tests/test_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from invenio_openaire.config import OPENAIRE_REST_ENDPOINTS


def test_records_rest(app, db, es, indexed_records):
def test_records_rest(app, db, es, grants):
"""Test Records REST."""
app.config['RECORDS_REST_ENDPOINTS'] = OPENAIRE_REST_ENDPOINTS
app.config['RECORDS_REST_DEFAULT_READ_PERMISSION_FACTORY'] = None
Expand Down
17 changes: 8 additions & 9 deletions tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@
from invenio_openaire.tasks import harvest_fundref, harvest_openaire_projects


def test_harvest_openaire_projects(app, db):
def test_harvest_openaire_projects(app, db, es, funders):
"""Test harvest_openaire_projects."""
with app.app_context():
# Use local OpenAIRE loader
harvest_openaire_projects(source='tests/testdata/openaire_test.sqlite')
assert PersistentIdentifier.query.count() == 40
assert RecordMetadata.query.count() == 10
assert PersistentIdentifier.query.count() == 46
assert RecordMetadata.query.count() == 15


def test_harvest_fundref(app, db):
def test_harvest_fundref(app, db, es):
"""Test harvest_openaire_projects."""
with app.app_context():
harvest_fundref(source='tests/testdata/fundref_test.rdf')
Expand All @@ -51,26 +51,25 @@ def test_harvest_fundref(app, db):
assert RecordMetadata.query.count() == 5


def test_reharvest_fundref(app, db):
def test_reharvest_fundref(app, db, es):
"""Test harvest_openaire_projects."""
with app.app_context():
harvest_fundref(source='tests/testdata/fundref_test.rdf')
assert PersistentIdentifier.query.count() == 6
assert RecordMetadata.query.count() == 5
recid = PersistentIdentifier.query.first().object_uuid
test_date = "2002-01-01T16:00:00.000000"
record = Record.get_record(recid)
record['remote_modified'] = test_date
record['title'] = 'Foobar'
record.commit()
db.session.commit()
harvest_fundref(source='tests/testdata/fundref_test.rdf')
assert PersistentIdentifier.query.count() == 6
assert RecordMetadata.query.count() == 5
record = Record.get_record(recid)
assert record['remote_modified'] != test_date
assert record['remote_modified'] != 'Foobar'


def test_harvest_all(app, db):
def test_harvest_all(app, db, es):
"""Test harvest_openaire_projects."""
with app.app_context():
harvest_fundref(source='tests/testdata/fundref_test.rdf')
Expand Down

0 comments on commit 8c82c42

Please sign in to comment.