Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Generalize Zenodo content provider to support other Invenio repositories #704

Merged
merged 8 commits into from
Jun 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 59 additions & 31 deletions repo2docker/contentproviders/zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from os import makedirs
from os import path
from urllib.request import build_opener, urlopen, Request
from urllib.error import HTTPError
from zipfile import ZipFile, is_zipfile

from .base import ContentProvider
from ..utils import copytree
from ..utils import copytree, deep_get
from ..utils import normalize_doi, is_doi
from .. import __version__


Expand All @@ -28,39 +30,64 @@ def _urlopen(self, req, headers=None):

return urlopen(req)

def _doi2url(self, doi):
    """Resolve a DOI to the URL it points at.

    If `doi` does not look like a DOI (per `is_doi`) it is assumed to
    already be a URL and is returned unchanged.  Otherwise the DOI is
    normalized and looked up via https://doi.org/; the URL reported by
    the response is returned.  If the lookup fails with an `HTTPError`
    the normalized DOI itself is returned.
    """
    if not is_doi(doi):
        # Not a DOI: what we were given is actually just a URL
        return doi

    doi = normalize_doi(doi)
    try:
        resp = self._urlopen("https://doi.org/{}".format(doi))
    except HTTPError:
        # The DOI doesn't resolve, hand back the (normalized) DOI
        return doi

    # Only the resolved URL is needed; close the response so the
    # underlying connection is not leaked.
    try:
        return resp.url
    finally:
        resp.close()

def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Zenodo record"""
# To support Zenodo instances not hosted at zenodo.org we need to
# start maintaining a list of known DOI prefixes and their hostname.
# We should also change to returning a complete `record_url` that
# fetch() can use instead of constructing a URL there
doi = doi.lower()
# 10.5281 is the Zenodo DOI prefix
if doi.startswith("10.5281/"):
resp = self._urlopen("https://doi.org/{}".format(doi))
self.record_id = resp.url.rsplit("/", maxsplit=1)[1]
return {"record": self.record_id}

elif doi.startswith("https://doi.org/10.5281/") or doi.startswith(
"http://doi.org/10.5281/"
):
resp = self._urlopen(doi)
self.record_id = resp.url.rsplit("/", maxsplit=1)[1]
return {"record": self.record_id}

elif doi.startswith("https://zenodo.org/record/") or doi.startswith(
"http://zenodo.org/record/"
):
self.record_id = doi.rsplit("/", maxsplit=1)[1]
return {"record": self.record_id}
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
# We need the hostname (url where records are), api url (for metadata),
# filepath (path to files in metadata), filename (path to filename in
# metadata), download (path to file download URL), and type (path to item type in metadata)
hosts = [
{
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
{
"hostname": [
"https://data.caltech.edu/records/",
"http://data.caltech.edu/records/",
],
"api": "https://data.caltech.edu/api/record/",
"filepath": "metadata.electronic_location_and_access",
"filename": "electronic_name.0",
"download": "uniform_resource_identifier",
"type": "metadata.resourceType.resourceTypeGeneral",
},
]

url = self._doi2url(doi)

for host in hosts:
if any([url.startswith(s) for s in host["hostname"]]):
tmorrell marked this conversation as resolved.
Show resolved Hide resolved
self.record_id = url.rsplit("/", maxsplit=1)[1]
return {"record": self.record_id, "host": host}

def fetch(self, spec, output_dir, yield_output=False):
"""Fetch and unpack a Zenodo record"""
record_id = spec["record"]
host = spec["host"]

yield "Fetching Zenodo record {}.\n".format(record_id)
req = Request(
"https://zenodo.org/api/records/{}".format(record_id),
"{}{}".format(host["api"], record_id),
headers={"accept": "application/json"},
)
resp = self._urlopen(req)
Expand All @@ -70,8 +97,8 @@ def fetch(self, spec, output_dir, yield_output=False):
def _fetch(file_ref, unzip=False):
# the assumption is that `unzip=True` means that this is the only
# file related to the zenodo record
with self._urlopen(file_ref["links"]["download"]) as src:
fname = file_ref["filename"]
with self._urlopen(deep_get(file_ref, host["download"])) as src:
fname = deep_get(file_ref, host["filename"])
if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir):
Expand Down Expand Up @@ -105,9 +132,10 @@ def _fetch(file_ref, unzip=False):
copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d))

is_software = record["metadata"]["upload_type"] == "software"
only_one_file = len(record["files"]) == 1
for file_ref in record["files"]:
is_software = deep_get(record, host["type"]).lower() == "software"
files = deep_get(record, host["filepath"])
only_one_file = len(files) == 1
for file_ref in files:
for line in _fetch(file_ref, unzip=is_software and only_one_file):
yield line

Expand Down
42 changes: 42 additions & 0 deletions repo2docker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,3 +391,45 @@ def copytree(
if errors:
raise Error(errors)
return dst


def deep_get(dikt, path):
    """Look up a value inside nested dictionaries/lists.

    `path` is a period separated string.  Each component is used as a
    key into the current container; components made up only of digits
    are converted to `int` so they can index into lists:

        deep_get(data, "data.files.0") == data["data"]["files"][0]
    """
    current = dikt
    for part in path.split("."):
        # all-digit components are treated as integer indexes
        key = int(part) if part.isdigit() else part
        current = current[key]
    return current


# doi_regexp, is_doi, and normalize_doi are from idutils (https://github.com/inveniosoftware/idutils)
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2018 Alan Rubin.
# Licensed under BSD-3-Clause license
# Raw string: `\s`, `\.` and `\d` are regex escapes, not Python string
# escapes.  Without the `r` prefix they trigger invalid-escape-sequence
# warnings.  The compiled pattern is unchanged.
# Group 1 is the optional "doi:" / doi.org URL prefix, group 2 the bare DOI.
doi_regexp = re.compile(
    r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I
)


def is_doi(val):
    """Return a match object if `val` matches the pattern of a DOI.

    Returns None if `val` doesn't match, so the result can be used
    directly in a truth test.
    http://en.wikipedia.org/wiki/Digital_object_identifier.
    """
    return doi_regexp.match(val)


def normalize_doi(val):
    """Strip any URL or `doi:` prefix from `val` and return the bare DOI.

    For example `https://doi.org/10.1234/jshd123` becomes
    `10.1234/jshd123`.  `val` is expected to match `doi_regexp`.
    """
    # group 2 of the pattern is the DOI itself, without any prefix
    return doi_regexp.match(val).group(2)
119 changes: 99 additions & 20 deletions tests/unit/contentproviders/test_zenodo.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import pytest

from contextlib import contextmanager
from io import BytesIO
Expand All @@ -20,32 +21,70 @@ def test_content_id():
assert zen.content_id == "3232985"


def test_detect():
# Parametrization data for the detect test: one entry per supported host.
# Each entry is a tuple of
#   * a list of three equivalent inputs (record URL, bare DOI, doi.org URL)
#   * the spec that `Zenodo().detect()` is expected to return for each of
#     them: the matching host description and the record id.
test_hosts = [
(
[
"https://zenodo.org/record/3232985",
"10.5281/zenodo.3232985",
"https://doi.org/10.5281/zenodo.3232985",
],
{
"host": {
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "3232985",
},
),
(
[
"https://data.caltech.edu/records/1235",
"10.22002/d1.1235",
"https://doi.org/10.22002/d1.1235",
],
{
"host": {
"hostname": [
"https://data.caltech.edu/records/",
"http://data.caltech.edu/records/",
],
"api": "https://data.caltech.edu/api/record/",
"filepath": "metadata.electronic_location_and_access",
"filename": "electronic_name.0",
"download": "uniform_resource_identifier",
"type": "metadata.resourceType.resourceTypeGeneral",
},
"record": "1235",
},
),
]


@pytest.mark.parametrize("test_input,expected", test_hosts)
def test_detect_zenodo(test_input, expected):
with patch.object(Zenodo, "_urlopen") as fake_urlopen:
fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
fake_urlopen.return_value.url = test_input[0]
# valid Zenodo DOIs trigger this content provider
assert Zenodo().detect("10.5281/zenodo.3232985") == {"record": "3232985"}
assert Zenodo().detect("https://doi.org/10.5281/zenodo.3232985") == {
"record": "3232985"
}
assert Zenodo().detect("https://zenodo.org/record/3232985") == {
"record": "3232985"
}

assert Zenodo().detect(test_input[0]) == expected
assert Zenodo().detect(test_input[1]) == expected
assert Zenodo().detect(test_input[2]) == expected
# only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 2

with patch.object(Zenodo, "_urlopen") as fake_urlopen:
# Don't trigger the Zenodo content provider
assert Zenodo().detect("/some/path/here") is None
assert Zenodo().detect("https://example.com/path/here") is None
# donn't handle DOIs that aren't from Zenodo
# don't handle DOIs that aren't from Zenodo
fake_urlopen.return_value.url = (
"http://joss.theoj.org/papers/10.21105/joss.01277"
)
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None

# none of the examples are Zenodo like, so we should not attempt to
# resolve a DOI either
assert not fake_urlopen.called


@contextmanager
def zenodo_archive(prefix="a_directory"):
Expand Down Expand Up @@ -83,10 +122,24 @@ def mock_urlopen(self, req):

with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
zen = Zenodo()
spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}

with TemporaryDirectory() as d:
output = []
for l in zen.fetch({"record": "1234"}, d):
for l in zen.fetch(spec, d):
output.append(l)

unpacked_files = set(os.listdir(d))
Expand Down Expand Up @@ -123,9 +176,22 @@ def mock_urlopen(self, req):
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
zen = Zenodo()

spec = spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}
output = []
for l in zen.fetch({"record": "1234"}, d):
for l in zen.fetch(spec, d):
output.append(l)

unpacked_files = set(os.listdir(d))
Expand Down Expand Up @@ -164,9 +230,22 @@ def mock_urlopen(self, req):
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
zen = Zenodo()

spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}
output = []
for l in zen.fetch({"record": "1234"}, d):
for l in zen.fetch(spec, d):
output.append(l)

unpacked_files = set(os.listdir(d))
Expand Down
28 changes: 28 additions & 0 deletions tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,31 @@ def test_invalid_port_mapping(port_spec):
utils.validate_and_generate_port_mapping([port_spec])

assert 'Port specification "{}"'.format(port_spec) in str(e.value)


def test_deep_get():
    """deep_get follows dotted paths through nested dicts and lists."""
    nested = {"data": {"files": [1, 2, 3]}}
    # (path, value expected at that path)
    cases = [
        ("data.files.0", 1),
        ("data.files.1", 2),
        ("data.files", [1, 2, 3]),
        ("data", {"files": [1, 2, 3]}),
    ]
    for dotted_path, expected in cases:
        assert utils.deep_get(nested, dotted_path) == expected


def test_is_doi():
assert utils.is_doi("10.1234/jshd123") != None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For even more Pythonic code, people usually write `something is None` and `something is not None` instead of `== None` and `!= None`. To be honest, I'd have to dig a bit to remember/explain why, or in which edge cases, `is` is better :D

assert utils.is_doi("10.1234/JSHD.8192") != None
assert utils.is_doi("doi.org/10.1234/jshd123") != None
assert utils.is_doi("http://doi.org/10.1234/jshd123") != None
assert utils.is_doi("https://doi.org/10.1234/jshd123") != None
assert utils.is_doi("http://dx.doi.org/10.1234/jshd123") != None
assert utils.is_doi("101234/jshd123") == None
assert utils.is_doi("https://mybinder.org") == None


def test_normalize_doi():
    """normalize_doi strips URL prefixes and keeps the DOI's own case."""
    # (input, bare DOI expected back)
    cases = [
        ("10.1234/jshd123", "10.1234/jshd123"),
        ("10.1234/JSHD.8192", "10.1234/JSHD.8192"),
        ("doi.org/10.1234/jshd123", "10.1234/jshd123"),
        ("http://doi.org/10.1234/jshd123", "10.1234/jshd123"),
        ("https://doi.org/10.1234/jshd123", "10.1234/jshd123"),
        ("http://dx.doi.org/10.1234/jshd123", "10.1234/jshd123"),
    ]
    for raw, bare_doi in cases:
        assert utils.normalize_doi(raw) == bare_doi