hubverse-org · elray1 · Aug 16, 2023 · Jul 12, 2023 · Jul 20, 2023 · Jul 21, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,9 +24,14 @@ BugReports: https://github.com/Infectious-Disease-Modeling-Hubs/hubEnsembles/iss
 Imports: 
     cli,
+    distfromq,
     dplyr,
+    Hmisc,
     hubUtils,
     magrittr,
     matrixStats,
-    rlang
+    rlang,
+    tidyr,
+    tidyselect
 Remotes:
-    Infectious-Disease-Modeling-Hubs/hubUtils
+    Infectious-Disease-Modeling-Hubs/hubUtils,
+    reichlab/distfromq
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(linear_pool)
 export(simple_ensemble)
 importFrom(magrittr,"%>%")
diff --git a/R/linear_pool.R b/R/linear_pool.R
@@ -0,0 +1,201 @@
+#' Compute ensemble model outputs as a linear pool
+#'
+#' Compute ensemble model outputs as a linear pool, otherwise known as a
+#' distributional mixture, of component model outputs for
+#' each combination of model task, output type, and output type id. Supported
+#' output types are `mean`, `quantile`, `cdf`, and `pmf`. The `sample` output
+#' type is recognized but not yet implemented and results in an error.
+#'
+#' @details For the `mean`, `cdf`, and `pmf` output types, the ensemble value
+#'   is the (optionally weighted) mean of the component values, computed by
+#'   `simple_ensemble()`. For the `quantile` output type, a quantile function
+#'   is estimated for each component model and task with
+#'   `distfromq::make_q_fn()` and evaluated on a fine grid of probability
+#'   levels; the resulting values are pooled across models and summarized at
+#'   the requested quantile levels with `Hmisc::wtd.quantile()`.
+#'
+#' @param model_outputs a `data.frame` or `model_out_tbl` with component
+#'   model outputs (e.g., predictions). A plain `data.frame` is converted with
+#'   `hubUtils::as_model_out_tbl()`.
+#' @param weights an optional `data.frame` with component model weights. If
+#'   provided, it should have a column named `model_id` and a column containing
+#'   model weights. Optionally, it may contain additional columns corresponding
+#'   to task id variables, `output_type`, or `output_type_id`, if weights are
+#'   specific to values of those variables. The default is `NULL`, in which case
+#'   an equally-weighted ensemble is calculated.
+#' @param weights_col_name `character` string naming the column in `weights`
+#'   with model weights. Defaults to `"weight"`.
+#' @param model_id `character` string with the identifier to use for the
+#'   ensemble model.
+#' @param task_id_cols `character` vector with names of columns in
+#'   `model_outputs` that specify modeling tasks. Defaults to `NULL`, in which
+#'   case all columns in `model_outputs` other than `"model_id"`,
+#'   `"output_type"`, `"output_type_id"`, and `"value"` are used as task ids.
+#' @param ... parameters that are passed to `distfromq::make_q_fn`, specifying
+#'   details of how to estimate a quantile function from provided quantile
+#'   levels and quantile values.
+#'
+#' @return a `model_out_tbl` object of ensemble predictions. Note that
+#'   any additional columns in the input `model_outputs` are dropped.
+#'
+#' @export
+linear_pool <- function(model_outputs, weights = NULL,
+                        weights_col_name = "weight",
+                        model_id = "hub-ensemble",
+                        task_id_cols = NULL,
+                        ...) {
+
+  if (!is.data.frame(model_outputs)) {
+    cli::cli_abort(c("x" = "{.arg model_outputs} must be a `data.frame`."))
+  }
+
+  if (!inherits(model_outputs, "model_out_tbl")) {
+    model_outputs <- hubUtils::as_model_out_tbl(model_outputs)
+  }
+
+  model_out_cols <- colnames(model_outputs)
+
+  non_task_cols <- c("model_id", "output_type", "output_type_id", "value")
+  if (is.null(task_id_cols)) {
+    task_id_cols <- model_out_cols[!model_out_cols %in% non_task_cols]
+  }
+
+  if (!all(task_id_cols %in% model_out_cols)) {
+    cli::cli_abort(c(
+      "x" = "{.arg model_outputs} did not have all listed task id columns
+             {.val {task_id_cols}}."
+    ))
+  }
+
+  # check `model_outputs` has all standard columns with correct data type
+  # and `model_outputs` has > 0 rows
+  hubUtils::validate_model_out_tbl(model_outputs)
+
+  valid_types <- c("mean", "quantile", "cdf", "pmf", "sample")
+  unique_types <- unique(model_outputs[["output_type"]])
+  invalid_types <- unique_types[!unique_types %in% valid_types]
+  if (length(invalid_types) > 0) {
+    cli::cli_abort(c(
+      "x" = "{.arg model_outputs} contains unsupported output type.",
+      "!" = "Included invalid output type{?s}: {.val {invalid_types}}.",
+      "i" = "Supported output types: {.val {valid_types}}."
+    ))
+  }
+
+  # fail fast instead of silently dropping sample rows from the ensemble
+  if ("sample" %in% unique_types) {
+    cli::cli_abort(c(
+      "x" = "The linear pool is not yet implemented for the
+             `sample` output type."
+    ))
+  }
+
+  # calculate linear opinion pool for different types
+  pooled_means <- pooled_quantiles <- NULL
+
+  if (any(unique_types %in% c("mean", "cdf", "pmf"))) {
+    # linear pool calculation for mean, cdf, pmf output types
+    pooled_means <- model_outputs %>%
+      dplyr::filter(output_type %in% c("mean", "cdf", "pmf")) %>%
+      hubEnsembles::simple_ensemble(weights = weights,
+                                    weights_col_name = weights_col_name,
+                                    agg_fun = "mean", agg_args = list(),
+                                    model_id = model_id,
+                                    task_id_cols = task_id_cols)
+  }
+
+  if ("quantile" %in% unique_types) {
+    # linear pool calculation for quantile output type
+    n_samples <- 1e4
+    # restrict to quantile rows so other output types sharing the table do not
+    # contribute levels or values
+    quantile_rows <- dplyr::filter(model_outputs, output_type == "quantile")
+    quantile_levels <- unique(quantile_rows[["output_type_id"]])
+
+    if (is.null(weights)) {
+      member_group_cols <- task_id_cols
+      agg_args <- list(x = quote(.data[["pred_qs"]]),
+                       probs = as.numeric(quantile_levels))
+    } else {
+      req_weight_cols <- c("model_id", weights_col_name)
+      if (!all(req_weight_cols %in% colnames(weights))) {
+        cli::cli_abort(c(
+          "x" = "{.arg weights} did not include required columns
+                 {.val {req_weight_cols}}."
+        ))
+      }
+
+      weight_by_cols <- setdiff(colnames(weights), weights_col_name)
+
+      if ("value" %in% weight_by_cols) {
+        cli::cli_abort(c(
+          "x" = "{.arg weights} included a column named {.val {\"value\"}},
+                 which is not allowed."
+        ))
+      }
+
+      invalid_cols <- weight_by_cols[!weight_by_cols %in% model_out_cols]
+      if (length(invalid_cols) > 0) {
+        cli::cli_abort(c(
+          "x" = "{.arg weights} included {length(invalid_cols)} column{?s} that
+                 {?was/were} not present in {.arg model_outputs}:
+                 {.val {invalid_cols}}"
+        ))
+      }
+
+      if (weights_col_name %in% model_out_cols) {
+        cli::cli_abort(c(
+          "x" = "The specified {.arg weights_col_name},
+                 {.val {weights_col_name}}, is already a column in
+                 {.arg model_outputs}."
+        ))
+      }
+
+      quantile_rows <- dplyr::left_join(quantile_rows, weights,
+                                        by = weight_by_cols)
+
+      # carry the weight through the per-model step so it is still available
+      # when pooling across models
+      member_group_cols <- c(task_id_cols, weights_col_name)
+      agg_args <- list(x = quote(.data[["pred_qs"]]),
+                       weights = quote(.data[[weights_col_name]]),
+                       normwt = TRUE,
+                       probs = as.numeric(quantile_levels))
+    }
+
+    # probability levels at which each component quantile function is
+    # evaluated to generate pseudo-samples
+    # NOTE(review): `2:n_samples` keeps n_samples - 1 levels, dropping 0 and the
+    # two largest grid points; confirm whether `2:(n_samples + 1)` was intended
+    sample_q_lvls <- seq(0, 1, length.out = n_samples + 2)[2:n_samples]
+
+    pooled_quantiles <- quantile_rows %>%
+      dplyr::group_by(
+        model_id, dplyr::across(dplyr::all_of(member_group_cols))
+      ) %>%
+      dplyr::summarize(
+        pred_qs = list(distfromq::make_q_fn(
+          ps = as.numeric(output_type_id),
+          qs = value,
+          ...
+        )(sample_q_lvls)),
+        .groups = "drop"
+      ) %>%
+      tidyr::unnest(pred_qs) %>%
+      # pool across models: group by task only, since grouping by the weight
+      # column would split models with different weights into separate groups
+      dplyr::group_by(dplyr::across(dplyr::all_of(task_id_cols))) %>%
+      dplyr::summarize(
+        output_type_id = list(quantile_levels),
+        value = list(do.call(Hmisc::wtd.quantile, args = agg_args)),
+        .groups = "drop"
+      ) %>%
+      tidyr::unnest(cols = tidyselect::all_of(c("output_type_id", "value"))) %>%
+      # the summarize above drops the `model_id` column, so this is the argument
+      dplyr::mutate(model_id = model_id, .before = 1) %>%
+      dplyr::mutate(output_type = "quantile", .before = output_type_id)
+  }
+
+  rbind(pooled_means, pooled_quantiles) %>%
+    hubUtils::as_model_out_tbl()
+}
diff --git a/man/linear_pool.Rd b/man/linear_pool.Rd
diff --git a/tests/testthat/test-linear_pool.R b/tests/testthat/test-linear_pool.R
@@ -0,0 +1,76 @@
+library(Hmisc)
+library(distfromq)
+library(matrixStats)
+library(dplyr)
+library(tidyr)
+
+# fixture: quantile forecasts from four models at two locations
+quantile_outputs <- expand.grid(
+  model_id = letters[1:4],
+  location = c("222", "888"),
+  horizon = 1, # week
+  target = "inc death",
+  target_date = as.Date("2021-12-25"),
+  output_type = "quantile",
+  output_type_id = c(0.1, 0.5, 0.9),
+  value = NA_real_,
+  stringsAsFactors = FALSE)
+
+v2.1 <- c(10, 30, 15, 20)
+quantile_outputs$value[quantile_outputs$location == "222" &
+                         quantile_outputs$output_type_id == 0.1] <- v2.1
+v2.5 <- c(40, 40, 45, 50)
+quantile_outputs$value[quantile_outputs$location == "222" &
+                         quantile_outputs$output_type_id == 0.5] <- v2.5
+v2.9 <- c(60, 70, 75, 80)
+quantile_outputs$value[quantile_outputs$location == "222" &
+                         quantile_outputs$output_type_id == 0.9] <- v2.9
+v8.1 <- c(100, 300, 400, 250)
+quantile_outputs$value[quantile_outputs$location == "888" &
+                         quantile_outputs$output_type_id == 0.1] <- v8.1
+v8.5 <- c(150, 325, 500, 300)
+quantile_outputs$value[quantile_outputs$location == "888" &
+                         quantile_outputs$output_type_id == 0.5] <- v8.5
+v8.9 <- c(250, 350, 500, 350)
+quantile_outputs$value[quantile_outputs$location == "888" &
+                         quantile_outputs$output_type_id == 0.9] <- v8.9
+
+cdf_outputs <- mutate(quantile_outputs, output_type = "cdf")
+
+fweight2 <- data.frame(
+  model_id = letters[1:4], location = "222", weight = 0.1 * (1:4)
+)
+fweight8 <- data.frame(
+  model_id = letters[1:4], location = "888", weight = 0.1 * (4:1)
+)
+fweight <- bind_rows(fweight2, fweight8)
+
+
+test_that("non-default columns are dropped from output", {
+  # extra columns that are not declared as task ids must not survive pooling
+  task_ids <- c("target_date", "target", "horizon", "location")
+  padded_outputs <- quantile_outputs %>%
+    dplyr::mutate(extra_col_1 = "a", extra_col_2 = "a")
+
+  pooled <- linear_pool(padded_outputs, task_id_cols = task_ids)
+
+  expect_equal(sort(names(quantile_outputs)), sort(names(pooled)))
+})
+
+
+test_that("invalid output type throws error", {
+  # "median" is not one of the output types that linear_pool() accepts
+  bad_type_outputs <- quantile_outputs %>%
+    dplyr::mutate(output_type = "median")
+
+  expect_error(linear_pool(bad_type_outputs))
+})
+
+
+test_that("weights column already in quantile_outputs generates error", {
+  # a pre-existing `weight` column collides with the weights column name
+  clashing_outputs <- quantile_outputs %>%
+    dplyr::mutate(weight = "a")
+
+  expect_error(linear_pool(clashing_outputs, weights = fweight))
+})