From c8e596568f18bafbf62c06ffae7803a6e1cbf073 Mon Sep 17 00:00:00 2001 From: Li Shandross <57642277+lshandross@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:10:47 -0400 Subject: [PATCH 1/3] Create Hmisc-utils.R --- R/Hmisc-utils.R | 212 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 R/Hmisc-utils.R diff --git a/R/Hmisc-utils.R b/R/Hmisc-utils.R new file mode 100644 index 0000000..e0c8625 --- /dev/null +++ b/R/Hmisc-utils.R @@ -0,0 +1,212 @@ +# All functions adapted from Hmisc pkg functions +# Compute weighted quantiles +# Source: https://github.com/harrelfe/Hmisc/blob/f80ecfb27e1deb88b3c6cc075c7252e3c577f66b/R/wtd.stats.s#L48 +wtd.quantile <- function(x, weights=NULL, probs=c(0, .25, .5, .75, 1), + type=c('quantile','(i-1)/(n-1)','i/(n+1)','i/n'), + normwt=FALSE, na.rm=TRUE) +{ + if(! length(weights)) + return(stats::quantile(x, probs=probs, na.rm=na.rm)) + + type <- match.arg(type) + if(any(probs < 0 | probs > 1)) + stop("Probabilities must be between 0 and 1 inclusive") + + nams <- paste(format(round(probs * 100, if(length(probs) > 1) + 2 - log10(diff(range(probs))) else 2)), + "%", sep = "") + + i <- is.na(weights) | weights == 0 + if(any(i)) { + x <- x[! i] + weights <- weights[! i] + } + if(type == 'quantile') { + w <- wtd.table(x, weights, na.rm=na.rm, normwt=normwt, type='list') + x <- w$x + wts <- w$sum.of.weights + n <- sum(wts) + order <- 1 + (n - 1) * probs + low <- pmax(floor(order), 1) + high <- pmin(low + 1, n) + order <- order %% 1 + ## Find low and high order statistics + ## These are minimum values of x such that the cum. freqs >= c(low,high) + allq <- stats::approx(cumsum(wts), x, xout=c(low,high), + method='constant', f=1, rule=2)$y + k <- length(probs) + quantiles <- (1 - order)*allq[1:k] + order*allq[-(1:k)] + names(quantiles) <- nams + return(quantiles) + } + w <- wtd.Ecdf(x, weights, na.rm=na.rm, type=type, normwt=normwt) + structure(stats::approx(w$ecdf, w$x, xout=probs, rule=2)$y, + names=nams) +} + + +# Compute weighted empirical distribution function +# Source: https://github.com/harrelfe/Hmisc/blob/f80ecfb27e1deb88b3c6cc075c7252e3c577f66b/R/wtd.stats.s#L92 +wtd.Ecdf <- function(x, weights=NULL, + type=c('i/n','(i-1)/(n-1)','i/(n+1)'), + normwt=FALSE, na.rm=TRUE) +{ + type <- match.arg(type) + switch(type, + '(i-1)/(n-1)'={a <- b <- -1}, + 'i/(n+1)' ={a <- 0; b <- 1}, + 'i/n' ={a <- b <- 0}) + + if(! length(weights)) { + ##.Options$digits <- 7 ## to get good resolution for names(table(x)) + oldopt <- options('digits') + options(digits=7) + on.exit(options(oldopt)) + cumu <- table(x) ## R does not give names for cumsum + isdate <- testDateTime(x) ## 31aug02 + ax <- attributes(x) + ax$names <- NULL + x <- as.numeric(names(cumu)) + if(isdate) attributes(x) <- c(attributes(x),ax) + cumu <- cumsum(cumu) + cdf <- (cumu + a)/(cumu[length(cumu)] + b) + if(cdf[1]>0) { + x <- c(x[1], x); + cdf <- c(0,cdf) + } + + return(list(x = x, ecdf=cdf)) + } + + w <- wtd.table(x, weights, normwt=normwt, na.rm=na.rm) + cumu <- cumsum(w$sum.of.weights) + cdf <- (cumu + a)/(cumu[length(cumu)] + b) + list(x = c(if(cdf[1]>0) w$x[1], w$x), ecdf=c(if(cdf[1]>0)0, cdf)) +} + + +# Compute weighted frequency table +# Source: https://github.com/harrelfe/Hmisc/blob/f80ecfb27e1deb88b3c6cc075c7252e3c577f66b/R/wtd.stats.s#L130 +wtd.table <- function(x, weights=NULL, type=c('list','table'), + normwt=FALSE, na.rm=TRUE) +{ + type <- match.arg(type) + if(! length(weights)) + weights <- rep(1, length(x)) + + isdate <- testDateTime(x) ## 31aug02 + next 2 + ax <- attributes(x) + ax$names <- NULL + + if(is.character(x)) x <- as.factor(x) + lev <- levels(x) + x <- unclass(x) + + if(na.rm) { + s <- ! is.na(x + weights) + x <- x[s, drop=FALSE] ## drop is for factor class + weights <- weights[s] + } + + n <- length(x) + if(normwt) + weights <- weights * length(x) / sum(weights) + + i <- order(x) # R does not preserve levels here + x <- x[i]; weights <- weights[i] + + if(anyDuplicated(x)) { ## diff(x) == 0 faster but doesn't handle Inf + weights <- tapply(weights, x, sum) + if(length(lev)) { + levused <- lev[sort(unique(x))] + if((length(weights) > length(levused)) && + any(is.na(weights))) + weights <- weights[! is.na(weights)] + + if(length(weights) != length(levused)) + stop('program logic error') + + names(weights) <- levused + } + + if(! length(names(weights))) + stop('program logic error') + + if(type=='table') + return(weights) + + x <- all.is.numeric(names(weights), 'vector') + if(isdate) + attributes(x) <- c(attributes(x),ax) + + names(weights) <- NULL + return(list(x=x, sum.of.weights=weights)) + } + + xx <- x + if(isdate) + attributes(xx) <- c(attributes(xx),ax) + + if(type=='list') + list(x=if(length(lev))lev[x] + else xx, + sum.of.weights=weights) + else { + names(weights) <- if(length(lev)) lev[x] + else xx + weights + } +} + + +# Determine if variable is a date, time, or date/time variable +# Source: https://github.com/harrelfe/Hmisc/blob/f80ecfb27e1deb88b3c6cc075c7252e3c577f66b/R/Misc.s#L1125 +testDateTime <- function(x, what=c('either','both','timeVaries')) +{ + what <- match.arg(what) + cl <- class(x) + if(!length(cl)) + return(FALSE) + + dc <- c('Date', 'POSIXt','POSIXct','dates','times','chron') + + dtc <- c('POSIXt','POSIXct','chron') + + switch(what, + either = any(cl %in% dc), + both = any(cl %in% dtc), + timeVaries = { + if('chron' %in% cl || 'Date' %in% cl) { + ## chron or S+ timeDate + y <- as.numeric(x) + length(unique(round(y - floor(y),13))) > 1 + } + else length(unique(format(x,'%H%M%S'))) > 1 + }) +} + + +# Check if all elements in a character vector are numeric +# Source: https://github.com/harrelfe/Hmisc/blob/f80ecfb27e1deb88b3c6cc075c7252e3c577f66b/R/Misc.s#L241 +all.is.numeric <- function(x, what=c('test','vector','nonnum'), + extras=c('.','NA')) +{ + what <- match.arg(what) + x <- sub('[[:space:]]+$', '', x) + x <- sub('^[[:space:]]+', '', x) + xs <- x[x %nin% c('', extras)] + if(! length(xs) || all(is.na(x))) + return(switch(what, test = FALSE, vector=x, nonnum=x[0])) + isnon <- suppressWarnings(! is.na(xs) & is.na(as.numeric(xs))) + isnum <- ! any(isnon) + # suppressWarnings below handles extras present in x + switch(what, + test = isnum, + vector = if(isnum) suppressWarnings(as.numeric(x)) else x, + nonnum = xs[isnon]) +} + + +# Find matching (or non-matching) elements +# Source: https://github.com/harrelfe/Hmisc/blob/f80ecfb27e1deb88b3c6cc075c7252e3c577f66b/R/in.operator.s#L1 +"%nin%" <- function(x, table) match(x, table, nomatch = 0) == 0 \ No newline at end of file From dd86a1904702ca1a43dcd3413f13d3a4e6ef9491 Mon Sep 17 00:00:00 2001 From: Li Shandross <57642277+lshandross@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:11:20 -0400 Subject: [PATCH 2/3] Remove `Hmisc` dependencies --- DESCRIPTION | 3 +-- R/linear_pool_quantile.R | 2 +- tests/testthat/test-linear_pool.R | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c7b3bb8..ca1687f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: hubEnsembles Title: Ensemble methods for combining hub model outputs -Version: 0.1.1 +Version: 0.1.2 Authors@R: c( person(given = "Evan L", family = "Ray", @@ -44,7 +44,6 @@ Imports: cli, distfromq (>= 1.0.2), dplyr, - Hmisc, hubUtils (>= 0.0.1), magrittr, matrixStats, diff --git a/R/linear_pool_quantile.R b/R/linear_pool_quantile.R index acc1e5f..f8447a9 100644 --- a/R/linear_pool_quantile.R +++ b/R/linear_pool_quantile.R @@ -75,7 +75,7 @@ linear_pool_quantile <- function(model_outputs, weights = NULL, dplyr::group_by(dplyr::across(dplyr::all_of(task_id_cols))) |> dplyr::summarize( output_type_id = list(quantile_levels), - value = list(do.call(Hmisc::wtd.quantile, args = agg_args)), + value = list(do.call(wtd.quantile, args = agg_args)), .groups = "drop") |> tidyr::unnest(cols = tidyselect::all_of(c("output_type_id", "value"))) |> dplyr::mutate(model_id = model_id, .before = 1) |> diff --git a/tests/testthat/test-linear_pool.R b/tests/testthat/test-linear_pool.R index 1d9c96d..4ac9232 100644 --- a/tests/testthat/test-linear_pool.R +++ b/tests/testthat/test-linear_pool.R @@ -1,4 +1,3 @@ -library(Hmisc) library(distfromq) library(matrixStats) library(dplyr) From 0316a2dab6a6c1083d8fc94a848cd0041b2eb257 Mon Sep 17 00:00:00 2001 From: Li Shandross <57642277+lshandross@users.noreply.github.com> Date: Thu, 11 Apr 2024 14:56:50 -0400 Subject: [PATCH 3/3] Fix `Hmisc-utils` failing lint checks --- .lintr | 4 ++++ R/Hmisc-utils.R | 2 +- R/linear_pool_quantile.R | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.lintr b/.lintr index 4c5d639..1fbc773 100644 --- a/.lintr +++ b/.lintr @@ -2,3 +2,7 @@ linters: linters_with_defaults( line_length_linter = line_length_linter(120L), commented_code_linter = NULL ) +exclusions: list( + "./R/Hmisc-utils.R" + ) + diff --git a/R/Hmisc-utils.R b/R/Hmisc-utils.R index e0c8625..b59df6c 100644 --- a/R/Hmisc-utils.R +++ b/R/Hmisc-utils.R @@ -209,4 +209,4 @@ all.is.numeric <- function(x, what=c('test','vector','nonnum'), # Find matching (or non-matching) elements # Source: https://github.com/harrelfe/Hmisc/blob/f80ecfb27e1deb88b3c6cc075c7252e3c577f66b/R/in.operator.s#L1 -"%nin%" <- function(x, table) match(x, table, nomatch = 0) == 0 \ No newline at end of file +"%nin%" <- function(x, table) match(x, table, nomatch = 0) == 0 diff --git a/R/linear_pool_quantile.R b/R/linear_pool_quantile.R index 8ebaa8c..f71c942 100644 --- a/R/linear_pool_quantile.R +++ b/R/linear_pool_quantile.R @@ -79,7 +79,7 @@ linear_pool_quantile <- function(model_outputs, weights = NULL, dplyr::group_by(dplyr::across(dplyr::all_of(task_id_cols))) |> dplyr::summarize( output_type_id = list(quantile_levels), - value = list(do.call(wtd.quantile, args = agg_args)), + value = list(do.call("wtd.quantile", args = agg_args)), .groups = "drop" ) |> tidyr::unnest(cols = tidyselect::all_of(c("output_type_id", "value"))) |>