Skip to content

Commit

Permalink
Merge pull request #1253 from IQSS/1242-dataverse-original-file-format
Browse files: browse the repository at this point in the history
[MRG] download original file formats from Dataverse #1242
  • Loading branch information
minrk committed Mar 29, 2023
2 parents 0d84b9e + 48f4cc6 commit 43ff7bb
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 4 deletions.
13 changes: 10 additions & 3 deletions repo2docker/contentproviders/dataverse.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,18 @@ def fetch(self, spec, output_dir, yield_output=False):

for fobj in deep_get(record, "latestVersion.files"):
file_url = (
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
# without format=original you get the preservation format (plain text, tab separated)
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
)
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
filename = fobj["label"]
original_filename = fobj["dataFile"].get("originalFileName", None)
if original_filename:
# replace preservation format filename (foo.tab) with original filename (foo.dta)
filename = original_filename

file_ref = {"download": file_url, "filename": filename}
filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)

file_ref = {"download": file_url, "filename": filename_with_path}
fetch_map = {key: key for key in file_ref.keys()}

yield from self.fetch_file(file_ref, fetch_map, output_dir)
Expand Down
4 changes: 3 additions & 1 deletion tests/unit/contentproviders/test_dataverse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.parse import urlsplit
from urllib.request import Request, urlopen

import pytest
Expand Down Expand Up @@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

def mock_filecontent(req, context):
file_no = int(req.url.split("/")[-1]) - 1
parts = urlsplit(req.url)
file_no = int(parts.path.split("/")[-1]) - 1
return open(dv_files[file_no], "rb").read()

requests_mock.get(
Expand Down

0 comments on commit 43ff7bb

Please sign in to comment.