datastreams: add configs for funders and affiliations

inveniosoftware · May 29, 2024 · 17a7023 · 17a7023
1 parent 70ab72f
commit 17a7023
Show file tree

Hide file tree

Showing 11 changed files with 241 additions and 156 deletions.
diff --git a/invenio_vocabularies/contrib/affiliations/datastreams.py b/invenio_vocabularies/contrib/affiliations/datastreams.py
@@ -13,6 +13,7 @@
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
+from .config import affiliation_schemes
 
 
 class AffiliationsServiceWriter(ServiceWriter):
@@ -38,13 +39,18 @@ def _entry_id(self, entry):
         {
             "type": "zip",
             "args": {
-                "regex": "(?<!_schema_v2)\\.json$",
+                "regex": "_schema_v2\\.json$",
             },
         },
         {"type": "json"},
     ],
     "transformers": [
-        {"type": "ror"},
+        {
+            "type": "ror",
+            "args": {
+                "vocab_schemes": affiliation_schemes,
+            },
+        },
     ],
     "writers": [
         {

diff --git a/invenio_vocabularies/contrib/common/ror/config.py b/invenio_vocabularies/contrib/common/ror/config.py
diff --git a/invenio_vocabularies/contrib/common/ror/datastreams.py b/invenio_vocabularies/contrib/common/ror/datastreams.py
@@ -18,8 +18,6 @@
 from invenio_vocabularies.datastreams.readers import BaseReader
 from invenio_vocabularies.datastreams.transformers import BaseTransformer
 
-from .config import funder_fundref_doi_prefix, funder_schemes
-
 
 class RORHTTPReader(BaseReader):
     """ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
@@ -74,24 +72,49 @@ def read(self, item=None, *args, **kwargs):
 class RORTransformer(BaseTransformer):
-    """Transforms a JSON ROR record into a funders record."""
+    """Transforms a JSON ROR record into a funder or affiliation record."""
 
+    def __init__(
+        self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
+    ):
+        """Initializes the transformer.
+
+        :param vocab_schemes: mapping keyed by the identifier schemes to keep
+            from the ROR ``external_ids``; ``None`` keeps none of them.
+        :param funder_fundref_doi_prefix: DOI prefix used to turn ``fundref``
+            identifiers into DOIs; when unset, ``fundref`` ids are only kept
+            if ``vocab_schemes`` lists that scheme.
+        """
+        self.vocab_schemes = vocab_schemes
+        self.funder_fundref_doi_prefix = funder_fundref_doi_prefix
+        super().__init__(*args, **kwargs)
+
     def apply(self, stream_entry, **kwargs):
-        """Applies the transformation to the stream entry."""
+        """Applies the transformation to the stream entry.
+
+        :param stream_entry: entry whose ``entry`` attribute holds a ROR v2 record.
+        :returns: the same stream entry with ``entry`` replaced by the
+            transformed record.
+        :raises TransformerError: if no ROR id or no ``ror_display`` name
+            is found in the record.
+        """
         record = stream_entry.entry
-        funder = {}
-        funder["title"] = {}
+        ror = {}
+        ror["title"] = {}
 
-        funder["id"] = normalize_ror(record.get("id"))
-        if not funder["id"]:
+        ror["id"] = normalize_ror(record.get("id"))
+        if not ror["id"]:
             raise TransformerError(_("Id not found in ROR entry."))
 
         aliases = []
         acronym = None
         for name in record.get("names"):
             lang = name.get("lang", "en")
+            # An explicit ``"lang": null`` bypasses the ``.get`` default.
+            if lang is None:
+                lang = "en"
             if "ror_display" in name["types"]:
-                funder["name"] = name["value"]
+                ror["name"] = name["value"]
             if "label" in name["types"]:
-                funder["title"][lang] = name["value"]
+                ror["title"][lang] = name["value"]
             if "alias" in name["types"]:
                 aliases.append(name["value"])
             if "acronym" in name["types"]:
@@ -102,51 +125,56 @@ def apply(self, stream_entry, **kwargs):
                 else:
                     aliases.append(name["value"])
         if acronym:
-            funder["acronym"] = acronym
+            ror["acronym"] = acronym
         if aliases:
-            funder["aliases"] = aliases
+            ror["aliases"] = aliases
 
         # ror_display is required and should be in every entry
-        if not funder["name"]:
+        if not ror.get("name"):
             raise TransformerError(
                 _("Name with type ror_display not found in ROR entry.")
             )
 
-        # This only gets the first location, to maintain compatability
+        # This only gets the first location, to maintain compatibility
         # with existing data structure
         location = record.get("locations", [{}])[0].get("geonames_details", {})
-        funder["country"] = location.get("country_code")
-        funder["country_name"] = location.get("country_name")
-        funder["location_name"] = location.get("name")
+        ror["country"] = location.get("country_code")
+        ror["country_name"] = location.get("country_name")
+        ror["location_name"] = location.get("name")
 
-        funder["types"] = record.get("types")
+        ror["types"] = record.get("types")
 
         status = record.get("status")
-        funder["status"] = status
+        ror["status"] = status
 
         # The ROR is always listed in identifiers, expected by serialization
-        funder["identifiers"] = [{"identifier": funder["id"], "scheme": "ror"}]
-        valid_schemes = set(funder_schemes.keys())
+        ror["identifiers"] = [{"identifier": ror["id"], "scheme": "ror"}]
+        if self.vocab_schemes:
+            valid_schemes = set(self.vocab_schemes.keys())
+        else:
+            valid_schemes = set()
         fund_ref = "fundref"
-        valid_schemes.add(fund_ref)
+        if self.funder_fundref_doi_prefix:
+            valid_schemes.add(fund_ref)
         for identifier in record.get("external_ids"):
             scheme = identifier["type"]
             if scheme in valid_schemes:
                 value = identifier.get("preferred") or identifier.get("all")[0]
                 if scheme == fund_ref:
-                    value = f"{funder_fundref_doi_prefix}/{value}"
-                    scheme = "doi"
-                funder["identifiers"].append(
+                    if self.funder_fundref_doi_prefix:
+                        value = f"{self.funder_fundref_doi_prefix}/{value}"
+                        scheme = "doi"
+                ror["identifiers"].append(
                     {
                         "identifier": value,
                         "scheme": scheme,
                     }
                 )
 
-        stream_entry.entry = funder
+        stream_entry.entry = ror
         return stream_entry
 
 
 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
-    "ror-funder": RORTransformer,
+    "ror": RORTransformer,
 }
diff --git a/invenio_vocabularies/contrib/funders/datastreams.py b/invenio_vocabularies/contrib/funders/datastreams.py
@@ -14,6 +14,7 @@
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
+from .config import funder_fundref_doi_prefix, funder_schemes
 
 
 class FundersServiceWriter(ServiceWriter):
@@ -46,7 +47,13 @@ def _entry_id(self, entry):
         {"type": "json"},
     ],
     "transformers": [
-        {"type": "ror"},
+        {
+            "type": "ror",
+            "args": {
+                "vocab_schemes": funder_schemes,
+                "funder_fundref_doi_prefix": funder_fundref_doi_prefix,
+            },
+        },
     ],
     "writers": [
         {

diff --git a/invenio_vocabularies/datastreams/factories.py b/invenio_vocabularies/datastreams/factories.py
@@ -36,7 +36,6 @@ def create(cls, config):
         try:
             type_ = config["type"]
             args = config.get("args", {})
-            print(cls.options())
             return cls.options()[type_](**args)
         except KeyError:
             raise FactoryError(name=cls.FACTORY_NAME, key=type_)

diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py
@@ -12,12 +12,12 @@
 import yaml
 from invenio_records_resources.proxies import current_service_registry
 
-from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config
-from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config
-from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
 from .contrib.affiliations.datastreams import (
     DATASTREAM_CONFIG as affiliations_ds_config,
 )
+from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config
+from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config
+from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
 
 
 class VocabularyConfig:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2020 CERN.
 # Copyright (C) 2021 TU Wien.
+# Copyright (C) 2024 California Institute of Technology.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -43,6 +44,7 @@
 from invenio_app.factory import create_api as _create_api
 from invenio_cache import current_cache
 
+from invenio_vocabularies.datastreams import StreamEntry
 from invenio_vocabularies.records.api import Vocabulary
 from invenio_vocabularies.records.models import VocabularyType
 
@@ -218,6 +220,108 @@ def client_with_credentials(db, client, user, role):
     return client
 
 
+@pytest.fixture()
+def dict_ror_entry():
+    """Example ROR v2 data dump entry (Caltech), wrapped in a ``StreamEntry``."""
+    return StreamEntry(
+        {
+            "locations": [
+                {
+                    "geonames_id": 5381396,
+                    "geonames_details": {
+                        "country_code": "US",
+                        "country_name": "United States",
+                        "lat": 34.14778,
+                        "lng": -118.14452,
+                        "name": "Pasadena",
+                    },
+                }
+            ],
+            "established": 1891,
+            "external_ids": [
+                {
+                    "type": "fundref",
+                    "all": ["100006961", "100009676"],
+                    "preferred": "100006961",
+                },
+                {
+                    "type": "grid",
+                    "all": ["grid.20861.3d"],
+                    "preferred": "grid.20861.3d",
+                },
+                {"type": "isni", "all": ["0000 0001 0706 8890"], "preferred": None},
+                {"type": "wikidata", "all": ["Q161562"], "preferred": None},
+            ],
+            "id": "https://ror.org/05dxps055",
+            "domains": [],
+            "links": [
+                {"type": "website", "value": "http://www.caltech.edu/"},
+                {
+                    "type": "wikipedia",
+                    "value": "http://en.wikipedia.org/wiki/California_Institute_of_Technology",
+                },
+            ],
+            "names": [
+                {"value": "CIT", "types": ["acronym"], "lang": None},
+                {
+                    "value": "California Institute of Technology",
+                    "types": ["ror_display", "label"],
+                    "lang": "en",
+                },
+                {"value": "Caltech", "types": ["alias"], "lang": None},
+                {
+                    "value": "Instituto de Tecnología de California",
+                    "types": ["label"],
+                    "lang": "es",
+                },
+            ],
+            "relationships": [
+                {
+                    "label": "Caltech Submillimeter Observatory",
+                    "type": "child",
+                    "id": "https://ror.org/01e6j9276",
+                },
+                {
+                    "label": "Infrared Processing and Analysis Center",
+                    "type": "child",
+                    "id": "https://ror.org/05q79g396",
+                },
+                {
+                    "label": "Joint Center for Artificial Photosynthesis",
+                    "type": "child",
+                    "id": "https://ror.org/05jtgpc57",
+                },
+                {
+                    "label": "Keck Institute for Space Studies",
+                    "type": "child",
+                    "id": "https://ror.org/05xkke381",
+                },
+                {
+                    "label": "Jet Propulsion Laboratory",
+                    "type": "child",
+                    "id": "https://ror.org/027k65916",
+                },
+                {
+                    "label": "Institute for Collaborative Biotechnologies",
+                    "type": "child",
+                    "id": "https://ror.org/04kz4p343",
+                },
+                {
+                    "label": "Resnick Sustainability Institute",
+                    "type": "child",
+                    "id": "https://ror.org/04bxjes65",
+                },
+            ],
+            "status": "active",
+            "types": ["education", "funder"],
+            "admin": {
+                "created": {"date": "2018-11-14", "schema_version": "1.0"},
+                "last_modified": {"date": "2024-05-13", "schema_version": "2.0"},
+            },
+        },
+    )
+
+
 # FIXME: https://github.com/inveniosoftware/pytest-invenio/issues/30
 # Without this, success of test depends on the tests order
 @pytest.fixture()

diff --git a/tests/contrib/affiliations/test_affiliations_datastreams.py b/tests/contrib/affiliations/test_affiliations_datastreams.py
@@ -15,13 +15,44 @@
 from invenio_access.permissions import system_identity
 
 from invenio_vocabularies.contrib.affiliations.api import Affiliation
+from invenio_vocabularies.contrib.affiliations.config import affiliation_schemes
 from invenio_vocabularies.contrib.affiliations.datastreams import (
     AffiliationsServiceWriter,
 )
+from invenio_vocabularies.contrib.common.ror.datastreams import RORTransformer
 from invenio_vocabularies.datastreams import StreamEntry
 from invenio_vocabularies.datastreams.errors import WriterError
 
 
+@pytest.fixture(scope="module")
+def expected_from_ror_json():
+    """Expected transform of ``dict_ror_entry`` using the affiliation schemes."""
+    return {
+        "id": "05dxps055",
+        "name": "California Institute of Technology",
+        "title": {
+            "en": "California Institute of Technology",
+            "es": "Instituto de Tecnología de California",
+        },
+        "acronym": "CIT",
+        "aliases": ["Caltech"],
+        "country": "US",
+        "country_name": "United States",
+        "location_name": "Pasadena",
+        "status": "active",
+        "identifiers": [
+            {"scheme": "ror", "identifier": "05dxps055"},
+            {"scheme": "grid", "identifier": "grid.20861.3d"},
+            {"scheme": "isni", "identifier": "0000 0001 0706 8890"},
+        ],
+        "types": ["education", "funder"],
+    }
+
+
+def test_ror_transformer(app, dict_ror_entry, expected_from_ror_json):
+    """Test transforming a ROR entry with the affiliation schemes."""
+    transformer = RORTransformer(vocab_schemes=affiliation_schemes)
+    assert expected_from_ror_json == transformer.apply(dict_ror_entry).entry
+
+
 def test_affiliations_service_writer_create(app, search_clear, affiliation_full_data):
     writer = AffiliationsServiceWriter()
     affiliation_rec = writer.write(StreamEntry(affiliation_full_data))