Skip to content

Commit

Permalink
Merge pull request #169 from inbo/download_zenodo
Browse files Browse the repository at this point in the history
download_zenodo(): add fixes, refactor multi-file approach, add unit tests
  • Loading branch information
florisvdh committed Nov 20, 2023
2 parents 4ce5b03 + 9107fa8 commit 353ae47
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 50 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ Imports:
sf,
stringr,
tidyr (>= 1.0.0),
withr
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.3
Expand All @@ -55,13 +54,15 @@ Suggests:
knitr,
mapview,
openssl,
parallel,
raster (>= 3.3-16),
readxl,
remotes,
rmarkdown,
testthat (>= 3.0.0),
tidyverse,
tools,
units,
utils
utils,
withr
VignetteBuilder: knitr
Config/testthat/edition: 3
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,3 @@ importFrom(tidyr,nest)
importFrom(tidyr,spread)
importFrom(tidyr,unnest)
importFrom(utils,packageVersion)
importFrom(withr,with_options)
4 changes: 2 additions & 2 deletions R/GRTSmh.R
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,12 @@ convert_dec_to_base4frac <-
#'
#' @export
#' @importFrom dplyr %>%
#' @importFrom withr with_options
#' @importFrom stringr str_sub str_pad str_split
convert_base4frac_to_dec <-
function(x, level) {

with_options(
require_pkgs("withr")
withr::with_options(
c(scipen = 999,
digits = 15), {

Expand Down
49 changes: 12 additions & 37 deletions R/filemanagement.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,9 @@ fileman_folders <- function(root = c("rproj", "git"), path = NA) {
#' @param path Path where the data must be downloaded.
#' Defaults to the working directory.
#' @param doi a doi pointer to the Zenodo archive starting with '10.5281/zenodo.'. See examples.
#' @param parallel Logical (\code{FALSE} by default).
#' If \code{TRUE}, will run a number of parallel processes, each downloading
#' another file.
#' This is useful when multiple large files are present in the Zenodo
#' record, which otherwise would be downloaded sequentially.
#' @param parallel Logical.
#' If \code{TRUE} (the default), files will be
#' downloaded concurrently for multi-file records.
#' Note that download speed remains constrained by the available bandwidth and network traffic.
#' @param quiet Logical (\code{FALSE} by default).
#' Do you want to suppress informative messages (not warnings)?
Expand Down Expand Up @@ -135,7 +133,7 @@ fileman_folders <- function(root = c("rproj", "git"), path = NA) {
#' }
download_zenodo <- function(doi,
path = ".",
parallel = FALSE,
parallel = TRUE,
quiet = FALSE) {

assert_that(is.string(doi), is.string(path))
Expand All @@ -159,7 +157,7 @@ download_zenodo <- function(doi,

# extract individual file names and urls
file_urls <- content$files$links$self
filenames <- str_match(file_urls, ".+/([^/]+)")[,2]
filenames <- basename(content$files$key)
destfiles <- file.path(path, filenames)

# extract check-sum(s)
Expand All @@ -186,36 +184,13 @@ download_zenodo <- function(doi,
)
}

if (parallel) {
if (length(file_urls) > 1 && parallel) {

require_pkgs("parallel")

nr_nodes <- min(10, length(file_urls))

if (!quiet) message("Initializing parallel download on ",
nr_nodes,
" R session nodes...\n")

clus <- parallel::makeCluster(nr_nodes)

if (!quiet) {
message("Starting parallel downloads. ",
"This may take a while (and I can't show you the overall progress).\n",
"Be patient...\n")
}

parallel::clusterMap(clus,
function(src, dest) {
curl::curl_download(url = src,
destfile = dest,
quiet = quiet)
},
file_urls,
destfiles)

parallel::stopCluster(clus)

if (!quiet) message("Ended parallel downloads.")
curl::multi_download(
urls = file_urls,
destfiles = destfiles,
progress = !quiet
)

} else {

Expand All @@ -235,7 +210,7 @@ download_zenodo <- function(doi,
destfile <- destfiles[i]
md5 <- unname(tools::md5sum(destfile))
zenodo_md5 <- str_split(file_md5[i], ":")[[1]][2]
if (all.equal(md5, zenodo_md5)) {
if (identical(md5, zenodo_md5)) {
if (!quiet) message(filename,
" was downloaded and its integrity verified (md5sum: ",
md5,
Expand Down
10 changes: 4 additions & 6 deletions man/download_zenodo.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion n2khab.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 4
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: knitr
Expand Down
12 changes: 12 additions & 0 deletions tests/testthat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
# * https://testthat.r-lib.org/articles/special-files.html

library(testthat)
library(n2khab)

test_check("n2khab")
54 changes: 54 additions & 0 deletions tests/testthat/_snaps/zenodo.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# download_zenodo() works for a single-file record

Code
download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir)
Message
Will download 1 file (total size: 32.5 KiB) from https://doi.org/10.5281/zenodo.3784149 (Distribution of the Natura 2000 habitat type 7220 (Cratoneurion) in Flanders and Brussels Capital Region, Belgium (version 2020); version: habitatsprings_2020v2)
Verifying file integrity...
habitatsprings.geojson was downloaded and its integrity verified (md5sum: 64c3db07d17274da047b3962aab28e80)

# download_zenodo() works for a GitHub code record

Code
download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir)
Message
Will download 1 file (total size: 236.7 KiB) from https://doi.org/10.5281/zenodo.7335805 (R package n2khab: providing preprocessed reference data for Flemish Natura 2000 habitat analyses; version: 0.8.0)
Verifying file integrity...
n2khab-v0.8.0.zip was downloaded and its integrity verified (md5sum: 25fb33360d257c085bce567da8f6a2cb)

# download_zenodo() works for a multi-file record

Code
download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir)
Message
Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7)
Verifying file integrity...
habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917)
habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff)
habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c)
habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99)

# download_zenodo() can work sequentially for a multi-file record

Code
download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir, parallel = FALSE)
Message
Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7)
Verifying file integrity...
habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917)
habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff)
habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c)
habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99)

43 changes: 43 additions & 0 deletions tests/testthat/test-zenodo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Network-dependent test: downloads a single-file Zenodo record and compares
# the emitted messages with the stored snapshot.
test_that("download_zenodo() works for a single-file record", {
  # Skip (rather than fail) when the Zenodo host cannot be reached.
  skip_if_offline(host = "zenodo.org")
  # Temporary directory that is created now and removed when the test exits.
  zenodo_dir <- withr::local_tempdir()
  expect_snapshot(
    download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir)
  )
})

# Network-dependent test: downloads a Zenodo record that archives a GitHub
# release and compares the emitted messages with the stored snapshot.
test_that("download_zenodo() works for a GitHub code record", {
  # Skip (rather than fail) when the Zenodo host cannot be reached.
  skip_if_offline(host = "zenodo.org")
  # Temporary directory that is created now and removed when the test exits.
  zenodo_dir <- withr::local_tempdir()
  expect_snapshot(
    download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir)
  )
})

# Network-dependent test: downloads a multi-file Zenodo record with the
# default settings and compares the emitted messages with the stored snapshot.
test_that("download_zenodo() works for a multi-file record", {
  # Skip (rather than fail) when the Zenodo host cannot be reached.
  skip_if_offline(host = "zenodo.org")
  # Temporary directory that is created now and removed when the test exits.
  zenodo_dir <- withr::local_tempdir()
  expect_snapshot(
    download_zenodo(
      doi = "10.5281/zenodo.4420858",
      path = zenodo_dir
    )
  )
})

# Network-dependent test: downloads a multi-file Zenodo record with
# `parallel = FALSE` and compares the emitted messages with the stored
# snapshot.
test_that("download_zenodo() can work sequentially for a multi-file record", {
  # Skip (rather than fail) when the Zenodo host cannot be reached.
  skip_if_offline(host = "zenodo.org")
  # Temporary directory that is created now and removed when the test exits.
  zenodo_dir <- withr::local_tempdir()
  expect_snapshot(
    download_zenodo(
      doi = "10.5281/zenodo.4420858",
      path = zenodo_dir,
      parallel = FALSE
    )
  )
})

0 comments on commit 353ae47

Please sign in to comment.