Skip to content

Commit

Permalink
Adding support for multihash()
Browse files Browse the repository at this point in the history
  • Loading branch information
jeroen committed Mar 27, 2020
1 parent 5e306f0 commit ec9e879
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 8 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ SystemRequirements: OpenSSL >= 1.0.1
VignetteBuilder: knitr
Imports: askpass
Suggests:
testthat,
testthat (>= 2.1.0),
digest,
knitr,
rmarkdown,
jsonlite,
jose,
sodium
RoxygenNote: 6.1.1
RoxygenNote: 7.1.0
Encoding: UTF-8
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ export(encrypt_envelope)
export(fingerprint)
export(md4)
export(md5)
export(multihash)
export(my_key)
export(my_pubkey)
export(openssl_config)
Expand Down
41 changes: 35 additions & 6 deletions R/hash.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
#' (hashed message authentication code) when \code{key} is not \code{NULL}. Supported
#' inputs are binary (raw vector), strings (character vector) or a connection object.
#'
#' The most efficient way to calculate hashes is by using input \link{connections},
#' such as a \link[base:connections]{file()} or \link[base:connections]{url()} object.
#' In this case the hash is calculated in a streaming fashion, using almost no memory
#' or disk space, regardless of the data size. When using a connection input in the
#' \link{multihash} function, the data is read only once while streaming to multiple
#' hash functions simultaneously. Therefore several hashes are calculated
#' simultaneously, without the need to store any data or download it multiple times.
#'
#' Functions are vectorized for the case of character vectors: a vector with \code{n}
#' strings returns \code{n} hashes. When passing a connection object, the contents will
#' be stream-hashed which minimizes the amount of required memory. This is recommended
Expand Down Expand Up @@ -125,6 +133,22 @@ ripemd160 <- function(x, key = NULL){
rawstringhash(x, "ripemd160", key)
}

#' @rdname hash
#' @export
multihash <- function(x, algos = c('md5', 'sha1', 'sha256', 'sha384', 'sha512')){
  # Compute several digests of the same input in a single call.
  #
  # x:     a connection, a raw vector, or a character vector.
  # algos: character vector with the names of the digests to compute.
  #
  # Result depends on the type of `x`:
  #   * connection -> whatever connectionhashes() returns for `algos`
  #   * raw vector -> list named by `algos`, one rawstringhash() result each
  #   * character  -> data.frame with one column per algo and one row per string
  # Any other input type raises an error instead of silently returning NULL.
  if(inherits(x, 'connection')){
    connectionhashes(x, algos = algos)
  } else if(is.raw(x)){
    out <- lapply(algos, function(algo){rawstringhash(x, algo = algo, key = NULL)})
    structure(out, names = algos)
  } else if(is.character(x)){
    # FUN.VALUE = x makes vapply() expect length(x) strings back per algo
    m <- vapply(algos, function(algo){stringhash(x, algo = algo, key = NULL)}, FUN.VALUE = x)
    # For a single string vapply() yields a plain named vector rather than a
    # matrix; transpose it so that the algos still end up as columns.
    if(length(x) == 1)
      m <- t(m)
    data.frame(m, stringsAsFactors = FALSE)
  } else {
    stop("Argument 'x' must be a connection, raw vector or character vector.", call. = FALSE)
  }
}

# Low level interfaces, not exported.
rawhash <- function(x, algo, key = NULL){
stopifnot(is.raw(x))
Expand All @@ -139,22 +163,27 @@ stringhash <- function(x, algo, key = NULL){
.Call(R_digest,x, as.character(algo), key)
}

connectionhash <- function(con, algo){
md <- md_init(algo);
# Reads a connection once and computes one digest per entry of `algos`.
#
# con:   a connection object; opened here (and closed again on exit) only if
#        it is not already open.
# algos: character vector of digest names passed to md_init().
#
# Returns a list named by `algos`; each element is the md_final() result
# carrying class c("hash", <algo>).
#
# NOTE(review): this view appears to interleave lines from the earlier
# single-digest version (the ones referencing `md`) with the new lines;
# those are flagged below and should be confirmed against the real file.
connectionhashes <- function(con, algos){
# Only take ownership of the connection if the caller has not opened it
if(!isOpen(con)){
open(con, "rb")
on.exit(close(con))
}
# One digest context per algorithm; the name is kept in the 'algo' attribute
# so it can be used for the result class at the end.
mds <- lapply(algos, function(algo){
structure(md_init(algo), algo = algo)
})
if(summary(con)$text == "binary"){
# Binary connection: read chunks of up to 512*1024 bytes until nothing is returned
while(length(data <- readBin(con, raw(), 512*1024))){
# NOTE(review): `md` is not defined in this function; presumably a leftover from the old version; confirm
md_feed(md, data)
# Feed the same chunk to every digest context
lapply(mds, md_feed, data = data)
}
} else {
# Text connection: read one line at a time and feed its bytes.
# readLines() returns lines without their terminators.
while(length(data <- readLines(con, n = 1L, warn = FALSE))){
# NOTE(review): `md` is not defined in this function; presumably a leftover from the old version; confirm
md_feed(md, charToRaw(data))
lapply(mds, md_feed, data = charToRaw(data))
}
}
# NOTE(review): `md` is not defined in this function; presumably a leftover from the old version; confirm
md_final(md)
# Finalize every context and tag the result with its algorithm name
hashes <- lapply(mds, function(md){
structure(md_final(md), class = c("hash", attr(md, 'algo')))
})
structure(hashes, names = algos)
}

connectionhmac <- function(con, algo, key){
Expand Down Expand Up @@ -182,7 +211,7 @@ rawstringhash <- function(x, algo, key){
key <- charToRaw(key)
hash <- if(inherits(x, "connection")){
if(is.null(key)){
connectionhash(x, algo)
connectionhashes(x, algo)[[algo]]
} else {
connectionhmac(x, algo, key)
}
Expand Down
36 changes: 36 additions & 0 deletions tests/testthat/test_hash_multihash.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
context("Multihash")

# Hashing a raw buffer and streaming the same file through a connection must
# give the same named list, and each entry must match the single-algo function.
test_that("Multihash for connections or raw vectors", {
  algos <- c("md5", "sha1", "sha256", "sha512")
  path <- system.file('DESCRIPTION')
  from_raw <- multihash(readBin(path, raw(), 1e5), algos = algos)
  from_con <- multihash(file(path), algos = algos)
  expect_identical(from_raw, from_con)
  expect_named(from_raw, algos)

  # Compare every multihash entry against the dedicated hash function
  single <- list(md5 = md5, sha1 = sha1, sha256 = sha256, sha512 = sha512)
  for(algo in algos){
    expect_equal(from_raw[[algo]], single[[algo]](file(path)))
  }
})

# Character input must always yield a data.frame with one column per algo
# and one row per input string, including the empty and length-one cases.
test_that("Multihash for text vectors", {
  algos <- c("md5", "sha1", "sha256", "sha512")

  # Runs multihash on `input`, checks the shape of the result and returns it
  check_frame <- function(input, rows){
    res <- multihash(input, algos = algos)
    expect_is(res, 'data.frame')
    expect_named(res, algos)
    expect_equal(nrow(res), rows)
    res
  }

  none <- check_frame(character(), 0)
  one <- check_frame("foo", 1)
  two <- check_frame(c("foo", "bar"), 2)

  # The first row of the two-string result equals the single-string result
  expect_equal(two[1,], one)
})

0 comments on commit ec9e879

Please sign in to comment.