Skip to content

Commit

Permalink
handle errata
Browse files Browse the repository at this point in the history
  • Loading branch information
MJedr committed Nov 29, 2021
1 parent cc9645a commit 76ddf4b
Show file tree
Hide file tree
Showing 9 changed files with 500 additions and 27 deletions.
15 changes: 15 additions & 0 deletions inspire_json_merger/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from inspire_json_merger.config import (
ArxivOnArxivOperations,
ArxivOnPublisherOperations,
ErratumOnPublisherOperations,
ManualMergeOperations,
PublisherOnArxivOperations,
PublisherOnPublisherOperations,
Expand Down Expand Up @@ -100,6 +101,9 @@ def get_configuration(head, update, head_source=None):
if is_manual_merge(head, update):
return ManualMergeOperations

if is_erratum(update):
return ErratumOnPublisherOperations

if head_source == 'arxiv':
if update_source == 'arxiv':
return ArxivOnArxivOperations
Expand Down Expand Up @@ -138,3 +142,14 @@ def get_acquisition_source(json_obj):

def is_manual_merge(head, update):
    """Tell whether ``head`` and ``update`` are two different numbered records.

    Returns ``True`` only when both records carry a ``control_number`` and the
    two values differ; ``False`` when either one lacks the key or the numbers
    are equal.
    """
    if 'control_number' not in update or 'control_number' not in head:
        return False
    return update['control_number'] != head['control_number']


def is_erratum(update):
    """Tell whether ``update`` looks like an erratum.

    The update is treated as an erratum when at least one of these holds:

    * its titles, lower-cased and joined with spaces, contain "erratum",
      "corrigendum" or "publisher's note";
    * one of its titles starts with "correction to:" (case-insensitive);
    * one of its DOIs has ``material`` equal to ``"erratum"``.

    Args:
        update (dict): the incoming record.

    Returns:
        bool: ``True`` for an erratum, ``False`` otherwise.
    """
    erratum_keywords = ("erratum", "corrigendum", "publisher's note")
    lowered_titles = [
        title.lower() for title in get_value(update, "titles.title", [])
    ]
    joined_titles = " ".join(lowered_titles)

    if any(keyword in joined_titles for keyword in erratum_keywords):
        return True
    if any(title.startswith('correction to:') for title in lowered_titles):
        return True
    # Always hand back a real boolean: the previous version fell off the end
    # of the function and returned ``None`` for non-errata.
    return 'erratum' in get_value(update, "dois.material", [])
19 changes: 11 additions & 8 deletions inspire_json_merger/comparators.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,11 @@ class Ret(PrimaryKeyComparator):
CollectionsComparator = get_pk_comparator(['primary'])
CreationDatetimeComparator = get_pk_comparator(['creation_datetime'])
DateComparator = get_pk_comparator(['date'])
DoiComparator = get_pk_comparator([['source', 'value', 'material']])
FundingInfoComparator = get_pk_comparator(['project_number'])
ImprintsComparator = get_pk_comparator(['publisher'])
LanguageComparator = get_pk_comparator(['language'])
LicenseComparator = get_pk_comparator(['imposing'])
LicenseComparator = get_pk_comparator(['imposing', 'material'])
MaterialComparator = get_pk_comparator(['material'])
RefComparator = get_pk_comparator(['$ref'])
SchemaComparator = get_pk_comparator(['schema'])
Expand All @@ -112,19 +113,21 @@ class Ret(PrimaryKeyComparator):


PublicationInfoComparator = get_pk_comparator([
['journal_title', 'journal_volume']
['journal_title', 'journal_volume', 'material']
])

FigureComparator = get_pk_comparator([
['key']
['key', 'material']
])

DocumentComparator = get_pk_comparator([
['source', 'description'],
['source', 'fulltext'],
['source', 'original_url'],
['source', 'description', 'material'],
['source', 'fulltext', 'material'],
['source', 'original_url', 'material'],
])

PersistentIdentifierComparator = get_pk_comparator(['value', 'material'])

COMPARATORS = {
'_desy_bookkeeping': DateComparator,
'_private_notes': SourceComparator,
Expand All @@ -139,7 +142,7 @@ class Ret(PrimaryKeyComparator):
'copyright': MaterialComparator,
'deleted_records': RefComparator,
'documents': DocumentComparator,
'dois': SourceValueComparator,
'dois': DoiComparator,
'external_system_identifiers': SchemaComparator,
'figures': FigureComparator,
'funding_info': FundingInfoComparator,
Expand All @@ -148,7 +151,7 @@ class Ret(PrimaryKeyComparator):
'keywords': ValueComparator,
'license': LicenseComparator,
'new_record': RefComparator,
'persistent_identifiers': ValueComparator,
'persistent_identifiers': PersistentIdentifierComparator,
'public_notes': SourceComparator,
'publication_info': PublicationInfoComparator,
'references.reference.authors': AuthorComparator,
Expand Down
22 changes: 21 additions & 1 deletion inspire_json_merger/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
filter_publisher_references,
update_authors_with_ordering_info,
remove_references_from_update,
clean_root_for_acquisition_source
clean_root_for_acquisition_source,
update_material,
remove_root
)
from .comparators import COMPARATORS, GROBID_ON_ARXIV_COMPARATORS

Expand Down Expand Up @@ -317,3 +319,21 @@ class GrobidOnArxivAuthorsOperations(MergerConfigurationOperations):
}
comparators = GROBID_ON_ARXIV_COMPARATORS
conflict_filters = ["authors.full_name"]


class ErratumOnPublisherOperations(MergerConfigurationOperations):
    """Merger configuration used when the update is detected as an erratum."""

    # Fallback operations for fields without an explicit entry below.
    default_dict_merge_op = D.FALLBACK_KEEP_HEAD
    default_list_merge_op = U.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST

    # The order of the filters is kept exactly as declared.
    pre_filters = [
        update_material,
        filter_curated_references,
        update_authors_with_ordering_info,
        remove_root,
    ]

    list_merge_ops = {
        'abstracts': U.KEEP_ONLY_HEAD_ENTITIES,
        'authors': U.KEEP_UPDATE_ENTITIES_CONFLICT_ON_HEAD_DELETE,
        # The next three use the same operation as the default list
        # operation; they are spelled out explicitly.
        'publication_info': U.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST,
        'dois': U.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST,
        'references': U.KEEP_UPDATE_AND_HEAD_ENTITIES_HEAD_FIRST,
    }
18 changes: 17 additions & 1 deletion inspire_json_merger/pre_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@

import pyrsistent
from inspire_utils.record import get_value
from pyrsistent import freeze, thaw
from pyrsistent import freeze, thaw, ny
from six.moves import zip

from inspire_json_merger.utils import ORDER_KEY

# Top-level record fields whose elements receive a ``material`` key in
# ``update_material``.
FIELDS_WITH_MATERIAL_KEY = [
    'dois',
    'publication_info',
    'copyright',
    'documents',
    'license',
    'figures',
    'persistent_identifiers',
]


def remove_elements_with_source(source, field):
"""Remove all elements matching ``source`` in ``field``."""
Expand Down Expand Up @@ -201,3 +205,15 @@ def clean_root_for_acquisition_source(root, head, update):

# Callables built by binding the field name ('documents' / 'figures') as the
# first argument of ``keep_only_update_source_in_field``.
filter_documents_same_source = partial(keep_only_update_source_in_field, 'documents')
filter_figures_same_source = partial(keep_only_update_source_in_field, 'figures')


def update_material(root, head, update):
    """Set ``material`` to ``"erratum"`` on the material-aware fields of ``update``.

    If any DOI of the update already has ``material`` equal to ``"erratum"``,
    the update is returned untouched. Otherwise every element of each field
    listed in ``FIELDS_WITH_MATERIAL_KEY`` that is present in the update gets
    ``material`` set to ``"erratum"``.

    Args:
        root: returned unchanged.
        head: returned unchanged.
        update: frozen (pyrsistent) update record; it must support
            ``transform`` and is thawed before being passed to ``get_value``.

    Returns:
        tuple: ``(root, head, update)`` with the possibly transformed update.
    """
    # Default to an empty list so that an update without DOIs (or without any
    # ``material``) does not raise ``TypeError`` on the ``in`` check.
    if "erratum" in get_value(thaw(update), 'dois.material', []):
        return root, head, update

    for field in FIELDS_WITH_MATERIAL_KEY:
        # Skip fields the update does not have: depending on the pyrsistent
        # version, transforming through a missing key can insert an empty
        # mapping under it.
        if field not in update:
            continue
        update = update.transform(
            [field, ny],
            lambda element: element.set("material", "erratum"),
        )
    return root, head, update


def remove_root(root, head, update):
    """Drop ``root``: return an empty dict in its place.

    ``head`` and ``update`` are passed through unchanged.
    """
    empty_root = {}
    return empty_root, head, update
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
'munkres==1.0.12',
'inspire-schemas~=61.0',
'inspire-utils~=3.0,>=3.0.0',
'json-merger[contrib]~=0.0,==0.7.6',
'json-merger[contrib]~=0.0,==0.7.11',
'pyrsistent~=0.0,>=0.14.0',
]

Expand Down
43 changes: 40 additions & 3 deletions tests/unit/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import pytest

from utils import validate_subschema, assert_ordered_conflicts
from inspire_json_merger.config import GrobidOnArxivAuthorsOperations

from inspire_json_merger.api import (
get_acquisition_source,
Expand All @@ -39,11 +38,13 @@
merge,
)
from inspire_json_merger.config import (
ErratumOnPublisherOperations,
ArxivOnArxivOperations,
ArxivOnPublisherOperations,
PublisherOnArxivOperations,
PublisherOnPublisherOperations,
ManualMergeOperations
ManualMergeOperations,
GrobidOnArxivAuthorsOperations
)


Expand Down Expand Up @@ -140,6 +141,39 @@ def publisher_record():
}


@pytest.fixture
def erratum_1():
    """Record whose title contains the keyword "Erratum"."""
    record = {
        "_collections": ["literature"],
        "document_type": ["article"],
        "titles": [{"title": "Erratum: that was a wrong title"}],
        "dois": [{"value": "10.1023/A:1026654312961"}],
        "acquisition_source": {"source": "ejl"},
    }
    return record


@pytest.fixture
def erratum_2():
    """Record whose title starts with "Correction to:"."""
    record = {
        "_collections": ["literature"],
        "document_type": ["article"],
        "titles": [{"title": "Correction to: an article"}],
        "dois": [{"value": "10.1023/A:1026654312961"}],
        "acquisition_source": {"source": "ejl"},
    }
    return record


@pytest.fixture
def erratum_3():
    """Record with a neutral title whose DOI has ``material`` set to "erratum"."""
    record = {
        "_collections": ["literature"],
        "document_type": ["article"],
        "titles": [{"title": "A title"}],
        "dois": [{"value": "10.1023/A:1026654312961", "material": "erratum"}],
        "acquisition_source": {"source": "ejl"},
    }
    return record


def test_get_head_source_freetext_pub_info_with_eprint(rec_publication_info):
# record has pubinfo_freetext and arxiv_eprints, no dois
validate_subschema(rec_publication_info)
Expand Down Expand Up @@ -220,11 +254,14 @@ def test_get_head_source_arxiv_dois_and_freetext_but_no_arxiv_eprint(rec_dois, r
assert get_head_source(rec_dois) == 'publisher'


def test_get_configuration(arxiv_record, publisher_record):
def test_get_configuration(arxiv_record, publisher_record, erratum_1, erratum_2, erratum_3):
assert get_configuration(arxiv_record, arxiv_record) == ArxivOnArxivOperations
assert get_configuration(arxiv_record, publisher_record) == PublisherOnArxivOperations
assert get_configuration(publisher_record, arxiv_record) == ArxivOnPublisherOperations
assert get_configuration(publisher_record, publisher_record) == PublisherOnPublisherOperations
assert get_configuration(publisher_record, erratum_1) == ErratumOnPublisherOperations
assert get_configuration(publisher_record, erratum_2) == ErratumOnPublisherOperations
assert get_configuration(publisher_record, erratum_3) == ErratumOnPublisherOperations

arxiv1 = arxiv_record
arxiv1['control_number'] = 1
Expand Down
Loading

0 comments on commit 76ddf4b

Please sign in to comment.