Skip to content

Commit

Permalink
datastreams: add configs for funders and affiliations
Browse files Browse the repository at this point in the history
  • Loading branch information
tmorrell committed May 29, 2024
1 parent 70ab72f commit 17a7023
Show file tree
Hide file tree
Showing 11 changed files with 241 additions and 156 deletions.
10 changes: 8 additions & 2 deletions invenio_vocabularies/contrib/affiliations/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from invenio_i18n import lazy_gettext as _

from ...datastreams.writers import ServiceWriter
from .config import affiliation_schemes


class AffiliationsServiceWriter(ServiceWriter):
Expand All @@ -38,13 +39,18 @@ def _entry_id(self, entry):
{
"type": "zip",
"args": {
"regex": "(?<!_schema_v2)\\.json$",
"regex": "_schema_v2\\.json$",
},
},
{"type": "json"},
],
"transformers": [
{"type": "ror"},
{
"type": "ror",
"args": {
"vocab_schemes": affiliation_schemes,
},
},
],
"writers": [
{
Expand Down
18 changes: 0 additions & 18 deletions invenio_vocabularies/contrib/common/ror/config.py

This file was deleted.

62 changes: 38 additions & 24 deletions invenio_vocabularies/contrib/common/ror/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
from invenio_vocabularies.datastreams.readers import BaseReader
from invenio_vocabularies.datastreams.transformers import BaseTransformer

from .config import funder_fundref_doi_prefix, funder_schemes


class RORHTTPReader(BaseReader):
"""ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
Expand Down Expand Up @@ -74,24 +72,35 @@ def read(self, item=None, *args, **kwargs):
class RORTransformer(BaseTransformer):
"""Transforms a JSON ROR record into a funders or affiliations record."""

def __init__(
    self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
):
    """Initializes the transformer.

    :param vocab_schemes: mapping whose keys are the external identifier
        schemes that ``apply`` keeps from a ROR entry; ``None`` (the
        default) or an empty mapping contributes no schemes.
    :param funder_fundref_doi_prefix: when set, ``apply`` rewrites
        ``fundref`` identifiers as ``<prefix>/<value>`` with scheme ``doi``.
    :param args: positional arguments forwarded to the parent initializer.
    :param kwargs: keyword arguments forwarded to the parent initializer.
    """
    # Stored before delegating so the attributes exist by the time the
    # parent initializer runs.
    self.vocab_schemes = vocab_schemes
    self.funder_fundref_doi_prefix = funder_fundref_doi_prefix
    super().__init__(*args, **kwargs)

def apply(self, stream_entry, **kwargs):
"""Applies the transformation to the stream entry."""
record = stream_entry.entry
funder = {}
funder["title"] = {}
ror = {}
ror["title"] = {}

funder["id"] = normalize_ror(record.get("id"))
if not funder["id"]:
ror["id"] = normalize_ror(record.get("id"))
if not ror["id"]:
raise TransformerError(_("Id not found in ROR entry."))

aliases = []
acronym = None
for name in record.get("names"):
lang = name.get("lang", "en")
if lang == None:
lang = "en"
if "ror_display" in name["types"]:
funder["name"] = name["value"]
ror["name"] = name["value"]
if "label" in name["types"]:
funder["title"][lang] = name["value"]
print(lang)
ror["title"][lang] = name["value"]
if "alias" in name["types"]:
aliases.append(name["value"])
if "acronym" in name["types"]:
Expand All @@ -102,51 +111,56 @@ def apply(self, stream_entry, **kwargs):
else:
aliases.append(name["value"])
if acronym:
funder["acronym"] = acronym
ror["acronym"] = acronym
if aliases:
funder["aliases"] = aliases
ror["aliases"] = aliases

# ror_display is required and should be in every entry
if not funder["name"]:
if not ror["name"]:
raise TransformerError(
_("Name with type ror_display not found in ROR entry.")
)

# This only gets the first location, to maintain compatibility
# with existing data structure
location = record.get("locations", [{}])[0].get("geonames_details", {})
funder["country"] = location.get("country_code")
funder["country_name"] = location.get("country_name")
funder["location_name"] = location.get("name")
ror["country"] = location.get("country_code")
ror["country_name"] = location.get("country_name")
ror["location_name"] = location.get("name")

funder["types"] = record.get("types")
ror["types"] = record.get("types")

status = record.get("status")
funder["status"] = status
ror["status"] = status

# The ROR is always listed in identifiers, expected by serialization
funder["identifiers"] = [{"identifier": funder["id"], "scheme": "ror"}]
valid_schemes = set(funder_schemes.keys())
ror["identifiers"] = [{"identifier": ror["id"], "scheme": "ror"}]
if self.vocab_schemes:
valid_schemes = set(self.vocab_schemes.keys())
else:
valid_schemes = set()
fund_ref = "fundref"
valid_schemes.add(fund_ref)
if self.funder_fundref_doi_prefix:
valid_schemes.add(fund_ref)
for identifier in record.get("external_ids"):
scheme = identifier["type"]
if scheme in valid_schemes:
value = identifier.get("preferred") or identifier.get("all")[0]
if scheme == fund_ref:
value = f"{funder_fundref_doi_prefix}/{value}"
scheme = "doi"
funder["identifiers"].append(
if self.funder_fundref_doi_prefix:
value = f"{self.funder_fundref_doi_prefix}/{value}"
scheme = "doi"
ror["identifiers"].append(
{
"identifier": value,
"scheme": scheme,
}
)

stream_entry.entry = funder
stream_entry.entry = ror
return stream_entry


VOCABULARIES_DATASTREAM_TRANSFORMERS = {
"ror-funder": RORTransformer,
"ror": RORTransformer,
}
9 changes: 8 additions & 1 deletion invenio_vocabularies/contrib/funders/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from invenio_i18n import lazy_gettext as _

from ...datastreams.writers import ServiceWriter
from .config import funder_fundref_doi_prefix, funder_schemes


class FundersServiceWriter(ServiceWriter):
Expand Down Expand Up @@ -46,7 +47,13 @@ def _entry_id(self, entry):
{"type": "json"},
],
"transformers": [
{"type": "ror"},
{
"type": "ror",
"args": {
"vocab_schemes": funder_schemes,
"funder_fundref_doi_prefix": funder_fundref_doi_prefix,
},
},
],
"writers": [
{
Expand Down
1 change: 0 additions & 1 deletion invenio_vocabularies/datastreams/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def create(cls, config):
try:
type_ = config["type"]
args = config.get("args", {})
print(cls.options())
return cls.options()[type_](**args)
except KeyError:
raise FactoryError(name=cls.FACTORY_NAME, key=type_)
Expand Down
6 changes: 3 additions & 3 deletions invenio_vocabularies/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
import yaml
from invenio_records_resources.proxies import current_service_registry

from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config
from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config
from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
from .contrib.affiliations.datastreams import (
DATASTREAM_CONFIG as affiliations_ds_config,
)
from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config
from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config
from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config


class VocabularyConfig:
Expand Down
104 changes: 104 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# Copyright (C) 2020 CERN.
# Copyright (C) 2021 TU Wien.
# Copyright (C) 2024 California Institute of Technology.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand Down Expand Up @@ -43,6 +44,7 @@
from invenio_app.factory import create_api as _create_api
from invenio_cache import current_cache

from invenio_vocabularies.datastreams import StreamEntry
from invenio_vocabularies.records.api import Vocabulary
from invenio_vocabularies.records.models import VocabularyType

Expand Down Expand Up @@ -218,6 +220,108 @@ def client_with_credentials(db, client, user, role):
return client


@pytest.fixture()
def dict_ror_entry():
    """Provide a sample ROR v2 data dump record wrapped in a StreamEntry.

    The record describes the California Institute of Technology. It carries
    ``fundref``, ``grid``, ``isni`` and ``wikidata`` external ids, and names
    both with an explicit language and with ``lang`` set to ``None``.
    """
    caltech_record = {
        "locations": [
            {
                "geonames_id": 5381396,
                "geonames_details": {
                    "country_code": "US",
                    "country_name": "United States",
                    "lat": 34.14778,
                    "lng": -118.14452,
                    "name": "Pasadena",
                },
            }
        ],
        "established": 1891,
        "external_ids": [
            {
                "type": "fundref",
                "all": ["100006961", "100009676"],
                "preferred": "100006961",
            },
            {
                "type": "grid",
                "all": ["grid.20861.3d"],
                "preferred": "grid.20861.3d",
            },
            # No preferred value for these two: only the "all" list is set.
            {"type": "isni", "all": ["0000 0001 0706 8890"], "preferred": None},
            {"type": "wikidata", "all": ["Q161562"], "preferred": None},
        ],
        "id": "https://ror.org/05dxps055",
        "domains": [],
        "links": [
            {"type": "website", "value": "http://www.caltech.edu/"},
            {
                "type": "wikipedia",
                "value": "http://en.wikipedia.org/wiki/California_Institute_of_Technology",
            },
        ],
        "names": [
            {"value": "CIT", "types": ["acronym"], "lang": None},
            {
                "value": "California Institute of Technology",
                "types": ["ror_display", "label"],
                "lang": "en",
            },
            {"value": "Caltech", "types": ["alias"], "lang": None},
            {
                "value": "Instituto de Tecnología de California",
                "types": ["label"],
                "lang": "es",
            },
        ],
        "relationships": [
            {
                "label": "Caltech Submillimeter Observatory",
                "type": "child",
                "id": "https://ror.org/01e6j9276",
            },
            {
                "label": "Infrared Processing and Analysis Center",
                "type": "child",
                "id": "https://ror.org/05q79g396",
            },
            {
                "label": "Joint Center for Artificial Photosynthesis",
                "type": "child",
                "id": "https://ror.org/05jtgpc57",
            },
            {
                "label": "Keck Institute for Space Studies",
                "type": "child",
                "id": "https://ror.org/05xkke381",
            },
            {
                "label": "Jet Propulsion Laboratory",
                "type": "child",
                "id": "https://ror.org/027k65916",
            },
            {
                "label": "Institute for Collaborative Biotechnologies",
                "type": "child",
                "id": "https://ror.org/04kz4p343",
            },
            {
                "label": "Resnick Sustainability Institute",
                "type": "child",
                "id": "https://ror.org/04bxjes65",
            },
        ],
        "status": "active",
        "types": ["education", "funder"],
        "admin": {
            "created": {"date": "2018-11-14", "schema_version": "1.0"},
            "last_modified": {"date": "2024-05-13", "schema_version": "2.0"},
        },
    }
    return StreamEntry(caltech_record)


# FIXME: https://github.com/inveniosoftware/pytest-invenio/issues/30
# Without this, success of test depends on the tests order
@pytest.fixture()
Expand Down
31 changes: 31 additions & 0 deletions tests/contrib/affiliations/test_affiliations_datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,44 @@
from invenio_access.permissions import system_identity

from invenio_vocabularies.contrib.affiliations.api import Affiliation
from invenio_vocabularies.contrib.affiliations.config import affiliation_schemes
from invenio_vocabularies.contrib.affiliations.datastreams import (
AffiliationsServiceWriter,
)
from invenio_vocabularies.contrib.common.ror.datastreams import RORTransformer
from invenio_vocabularies.datastreams import StreamEntry
from invenio_vocabularies.datastreams.errors import WriterError


@pytest.fixture(scope="module")
def expected_from_ror_json():
    """Return the record expected from transforming the sample ROR entry.

    The identifiers listed are the ROR id itself plus the ``grid`` and
    ``isni`` values.
    """
    titles = {
        "en": "California Institute of Technology",
        "es": "Instituto de Tecnología de California",
    }
    identifiers = [
        {"scheme": "ror", "identifier": "05dxps055"},
        {"scheme": "grid", "identifier": "grid.20861.3d"},
        {"scheme": "isni", "identifier": "0000 0001 0706 8890"},
    ]
    return {
        "id": "05dxps055",
        "name": "California Institute of Technology",
        "title": titles,
        "acronym": "CIT",
        "aliases": ["Caltech"],
        "country": "US",
        "country_name": "United States",
        "location_name": "Pasadena",
        "status": "active",
        "identifiers": identifiers,
        "types": ["education", "funder"],
    }


def test_ror_transformer(app, dict_ror_entry, expected_from_ror_json):
    """Check the ROR transformer output when configured with affiliation schemes."""
    transformed = RORTransformer(vocab_schemes=affiliation_schemes).apply(
        dict_ror_entry
    )
    assert expected_from_ror_json == transformed.entry


def test_affiliations_service_writer_create(app, search_clear, affiliation_full_data):
writer = AffiliationsServiceWriter()
affiliation_rec = writer.write(StreamEntry(affiliation_full_data))
Expand Down
Loading

0 comments on commit 17a7023

Please sign in to comment.