From a60811d2973e5f7fe594c300ee23642a7b84a043 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Thu, 8 Jun 2023 15:25:38 -0400 Subject: [PATCH 1/5] start at vignettte --- .gitignore | 1 + DESCRIPTION | 3 ++ vignettes/.gitignore | 2 + vignettes/hubEnsembles.Rmd | 90 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+) create mode 100644 vignettes/.gitignore create mode 100644 vignettes/hubEnsembles.Rmd diff --git a/.gitignore b/.gitignore index 565f2b6..f47ffab 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .Rdata .httr-oauth .DS_Store +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 04875dc..be59a53 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,6 +14,8 @@ Description: Functions for combining model outputs (e.g. predictions or estimates) from multiple models into an aggregated ensemble model output. License: MIT + file LICENSE Suggests: + knitr, + rmarkdown, testthat (>= 3.0.0) Config/testthat/edition: 3 Encoding: UTF-8 @@ -30,3 +32,4 @@ Imports: rlang Remotes: Infectious-Disease-Modeling-Hubs/hubUtils +VignetteBuilder: knitr diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd new file mode 100644 index 0000000..886f9ce --- /dev/null +++ b/vignettes/hubEnsembles.Rmd @@ -0,0 +1,90 @@ +--- +title: "The `hubEnsembles` package" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{The `hubEnsembles` package} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(dplyr) +library(plotly) +library(hubUtils) +library(hubEnsembles) +``` + +# Introduction + +The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette. + +# Example data: a simple forecast hub + +The `example-simple-forecast-hub` has been created by the Consortium of Infectious Disease Modeling Hubs as a simple example hub to demonstrate the set up and functionality for the hubverse. The hub includes both target data and example model output data. + +```{r} +hub_path <- system.file("example-data/example-simple-forecast-hub", + package = "hubEnsembles") + +model_outputs <- hubUtils::connect_hub(hub_path) %>% + dplyr::collect() + +head(model_outputs) +``` + +# Creating ensembles with `simple_ensemble` + +```{r} +mean_ens <- hubEnsembles::simple_ensemble(model_outputs) +head(mean_ens) +``` + +# Plots + +```{r} +basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c(0.975, 0.025)) { + + plain_df <- dplyr::filter(plot_df, output_type_id == plain_line) + + ribbon_df <- dplyr::filter(plot_df, output_type_id %in% ribbon) %>% + dplyr::mutate(output_type_id = ifelse(output_type_id == min(ribbon), + "min", "max")) %>% + tidyr::pivot_wider(names_from = output_type_id, values_from = value) + + plot_model <- plot_ly(height = 1050, colors = scales::hue_pal()(50)) + + if (!is.null(truth_df)) { + plot_model <- plot_model %>% + add_trace(data = truth_df, x = ~time_idx, y = ~value, type = "scatter", + mode = "lines+markers", line = list(color = "#6e6e6e"), + hoverinfo = "text", name = "ground truth", + hovertext = paste("Date: ", truth_df$time_value, "
", + "Ground truth: ", + format(truth_df$value, big.mark = ","), + sep = ""), + marker = list(color = "#6e6e6e", size = 7)) + } + plot_model <- plot_model %>% + add_lines(data = plain_df, x = ~target_date, y = ~value, + color = ~model_id) %>% + add_ribbons(data = ribbon_df, x = ~target_date, ymin = ~min, + ymax = ~max, color = ~model_id, opacity = 0.25, + line = list(width = 0), showlegend = FALSE) +} +``` + +```{r} +plot_df <- dplyr::bind_rows(model_outputs, mean_ens) %>% + dplyr::filter(location == "US") %>% + dplyr::mutate(target_date = origin_date + horizon) + +plot <- basic_plot_function(plot_df, truth_df = NULL) +plot +``` From ea6065ef0bfc2de27b2f02af4b2226d52e19816d Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Fri, 16 Jun 2023 09:56:15 -0400 Subject: [PATCH 2/5] progress on vignette example with mean ensemble --- vignettes/hubEnsembles.Rmd | 52 +++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd index 886f9ce..7fa77eb 100644 --- a/vignettes/hubEnsembles.Rmd +++ b/vignettes/hubEnsembles.Rmd @@ -10,10 +10,18 @@ vignette: > ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, - comment = "#>" + comment = "#>", + fig.width = 7, + fig.height = 4 ) ``` +# Introduction + +The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette. + +This vignette uses the following R packages: + ```{r setup} library(dplyr) library(plotly) @@ -21,10 +29,6 @@ library(hubUtils) library(hubEnsembles) ``` -# Introduction - -The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette. - # Example data: a simple forecast hub The `example-simple-forecast-hub` has been created by the Consortium of Infectious Disease Modeling Hubs as a simple example hub to demonstrate the set up and functionality for the hubverse. The hub includes both target data and example model output data. @@ -35,8 +39,12 @@ hub_path <- system.file("example-data/example-simple-forecast-hub", model_outputs <- hubUtils::connect_hub(hub_path) %>% dplyr::collect() - head(model_outputs) + +target_data_path <- file.path(hub_path, "target-data", + "covid-hospitalizations.csv") +target_data <- read.csv(target_data_path) +head(target_data) ``` # Creating ensembles with `simple_ensemble` @@ -49,17 +57,18 @@ head(mean_ens) # Plots ```{r} -basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c(0.975, 0.025)) { - - plain_df <- dplyr::filter(plot_df, output_type_id == plain_line) - +basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c(0.975, 0.025), + forecast_date) { + + plain_df <- dplyr::filter(plot_df, output_type_id == plain_line) + ribbon_df <- dplyr::filter(plot_df, output_type_id %in% ribbon) %>% - dplyr::mutate(output_type_id = ifelse(output_type_id == min(ribbon), + dplyr::mutate(output_type_id = ifelse(output_type_id == min(ribbon), "min", "max")) %>% tidyr::pivot_wider(names_from = output_type_id, values_from = value) - - plot_model <- plot_ly(height = 1050, colors = scales::hue_pal()(50)) - + + plot_model <- plot_ly(height = 600, colors = scales::hue_pal()(50)) + if (!is.null(truth_df)) { plot_model <- plot_model %>% add_trace(data = truth_df, x = ~time_idx, y = ~value, type = "scatter", @@ -76,15 +85,24 @@ basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c( color = ~model_id) %>% add_ribbons(data = ribbon_df, x = ~target_date, ymin = ~min, ymax = ~max, color = ~model_id, opacity = 0.25, - line = list(width = 0), showlegend = FALSE) + line = list(width = 0), showlegend = FALSE) %>% + plotly::layout(shapes = list(type = "line", y0 = 0, y1 = 1, yref = "paper", + x0 = forecast_date, x1 = forecast_date, + line = list(color = "gray"))) } ``` ```{r} plot_df <- dplyr::bind_rows(model_outputs, mean_ens) %>% - dplyr::filter(location == "US") %>% + dplyr::filter(location == "US", origin_date == "2022-12-12") %>% dplyr::mutate(target_date = origin_date + horizon) -plot <- basic_plot_function(plot_df, truth_df = NULL) +plot <- basic_plot_function( + plot_df, + truth_df = target_data %>% + dplyr::filter(location == "US", + time_idx >= "2022-10-01", + time_idx <= "2023-03-01"), + forecast_date = "2022-12-12") plot ``` From 240ce1f93e3c48e193ad973afe555b2b04d6f98b Mon Sep 17 00:00:00 2001 From: eahowerton Date: Fri, 22 Sep 2023 15:45:49 -0400 Subject: [PATCH 3/5] add simple_ensemble examples to vignette #16 --- vignettes/hubEnsembles.Rmd | 46 +++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd index 7fa77eb..43648f0 100644 --- a/vignettes/hubEnsembles.Rmd +++ b/vignettes/hubEnsembles.Rmd @@ -18,7 +18,7 @@ knitr::opts_chunk$set( # Introduction -The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette. +The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models and combined into ensemble model outputs. The package includes two main functions: `simple_ensemble` and `linear_pool`. We illustrate these functions in this vignette, and briefly compare them. This vignette uses the following R packages: @@ -49,11 +49,55 @@ head(target_data) # Creating ensembles with `simple_ensemble` +The `simple_ensemble` function is used to summarize across component model outputs; this function can be applied to predictions with an `output_type` of `mean`, `median`, `quantile`, `cdf`, or `pmf`. + +The `simple_ensemble` function defaults to calculating an equally weighted mean across all component model outputs for each unique `output_type_id`. For our example data, which contains two output types (`median` and `quantile`), this means the resulting ensemble will be the mean of component model-submitted values for each quantile. + ```{r} mean_ens <- hubEnsembles::simple_ensemble(model_outputs) head(mean_ens) ``` +## Changing the aggregation function + +We can change the function used to aggregate across model outputs. For example, we may want to calculate a median of component model-submitted values for each quantile. We will also use the `model_id` argument to distinguish this ensemble. + +```{r} +median_ens <- hubEnsembles::simple_ensemble(model_outputs, + agg_fun = median, + model_id = "hub-ensemble-median") +head(median_ens) +``` + +Custom functions can also be passed into the `agg_fun` argument. For example, a geometric mean may be a more appropriate way to combine component model outputs. Any custom function to be used requires an argument `x` for the vector of numeric values to summarize, and if relevant, an argument `w` of numeric weights. + +```{r} +geometric_mean <- function(x){ + n <- length(x) + return(prod(x)^(1/n)) +} + +geometric_mean_ens <- hubEnsembles::simple_ensemble(model_outputs, + agg_fun = geometric_mean, + model_id = "hub-ensemble-geometric") +head(geometric_mean_ens) +``` + +## Weighting model contributions + +In addition, we can weight the contributions of each model by providing a table of weights, which are provided in a `data.frame` with a `model_id` column and a `weight` column. + +```{r} +model_weights <- data.frame(model_id = c("UMass-ar", "UMass-gbq", "simple_hub-baseline"), + weight = c(0.4, 0.4, 0.2)) + +weighted_mean_ens <- hubEnsembles::simple_ensemble(model_outputs, + weights = model_weights, + model_id = "hub-ensemble-weighted-mean") +head(weighted_mean_ens) +``` + + # Plots ```{r} From 54ed7c74c20c1043ef2c062268b5f46ca590b29c Mon Sep 17 00:00:00 2001 From: eahowerton Date: Tue, 26 Sep 2023 15:46:12 -0400 Subject: [PATCH 4/5] start linear_pool() section --- vignettes/hubEnsembles.Rmd | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd index 43648f0..fc9839c 100644 --- a/vignettes/hubEnsembles.Rmd +++ b/vignettes/hubEnsembles.Rmd @@ -49,9 +49,9 @@ head(target_data) # Creating ensembles with `simple_ensemble` -The `simple_ensemble` function is used to summarize across component model outputs; this function can be applied to predictions with an `output_type` of `mean`, `median`, `quantile`, `cdf`, or `pmf`. +The `simple_ensemble` function is used to summarize across component model outputs; this function can be applied to predictions with an `output_type` of `mean`, `median`, `quantile`, `cdf`, or `pmf`. -The `simple_ensemble` function defaults to calculating an equally weighted mean across all component model outputs for each unique `output_type_id`. For our example data, which contains two output types (`median` and `quantile`), this means the resulting ensemble will be the mean of component model-submitted values for each quantile. +The `simple_ensemble` function defaults to calculating an equally weighted mean across all component model outputs for each unique `output_type_id`. For our example data, which contains two output types (`median` and `quantile`), this means the resulting ensemble will be the mean of component model submitted values for each quantile. ```{r} mean_ens <- hubEnsembles::simple_ensemble(model_outputs) @@ -60,7 +60,7 @@ head(mean_ens) ## Changing the aggregation function -We can change the function used to aggregate across model outputs. For example, we may want to calculate a median of component model-submitted values for each quantile. We will also use the `model_id` argument to distinguish this ensemble. +We can change the function used to aggregate across model outputs. For example, we may want to calculate a median of component model submitted values for each quantile. We will also use the `model_id` argument to distinguish this ensemble. ```{r} median_ens <- hubEnsembles::simple_ensemble(model_outputs, @@ -97,6 +97,17 @@ weighted_mean_ens <- hubEnsembles::simple_ensemble(model_outputs, head(weighted_mean_ens) ``` +# Creating ensembles with `linear_pool` + +An alternative approach to generate an ensemble is a linear pool, or a distributional mixture; this function can be applied to predictions with an `output_type` of `mean`, `quantile`, `cdf`, or `pmf`. Our example hub includes `median` output type, so we exclude it from the calculation. + +For `mean`, `cdf` and `pmf` output types, the linear pool is equivalent to using a mean `simple_ensemble`. For `quantile` model outputs, the `linear_pool` function needs to approximate a full probability distribution using the value-quantile pairs from each component model. As a default, this is done with functions in the `distfromq` package, which defaults to fitting a monotonic cubic spline. + +```{r} +linear_pool_ens <- hubEnsembles::linear_pool(model_outputs %>% + filter(output_type != "median")) +head(linear_pool_ens) +``` # Plots From 5a4684293ccf1d3fb7d93a36febb853bb3949c6d Mon Sep 17 00:00:00 2001 From: Li Shandross <57642277+lshandross@users.noreply.github.com> Date: Wed, 18 Oct 2023 16:51:26 -0400 Subject: [PATCH 5/5] resolve branch conflicts with main --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f47ffab..9168bf8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ .Rdata .httr-oauth .DS_Store -inst/doc +docs