From a60811d2973e5f7fe594c300ee23642a7b84a043 Mon Sep 17 00:00:00 2001
From: Evan Ray <elray@umass.edu>
Date: Thu, 8 Jun 2023 15:25:38 -0400
Subject: [PATCH 1/5] start at vignettte

---
 .gitignore                 |  1 +
 DESCRIPTION                |  3 ++
 vignettes/.gitignore       |  2 +
 vignettes/hubEnsembles.Rmd | 90 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+)
 create mode 100644 vignettes/.gitignore
 create mode 100644 vignettes/hubEnsembles.Rmd
diff --git a/.gitignore b/.gitignore
index 565f2b6..f47ffab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 .Rdata
 .httr-oauth
 .DS_Store
+inst/doc
diff --git a/DESCRIPTION b/DESCRIPTION
index 04875dc..be59a53 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -14,6 +14,8 @@ Description: Functions for combining model outputs (e.g. predictions or
     estimates) from multiple models into an aggregated ensemble model output.
 License: MIT + file LICENSE
 Suggests: 
+    knitr,
+    rmarkdown,
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
 Encoding: UTF-8
@@ -30,3 +32,4 @@ Imports:
     rlang
 Remotes:
     Infectious-Disease-Modeling-Hubs/hubUtils
+VignetteBuilder: knitr
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 0000000..097b241
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd
new file mode 100644
index 0000000..886f9ce
--- /dev/null
+++ b/vignettes/hubEnsembles.Rmd
@@ -0,0 +1,90 @@
+---
+title: "The `hubEnsembles` package"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{The `hubEnsembles` package}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+```{r setup}
+library(dplyr)
+library(plotly)
+library(hubUtils)
+library(hubEnsembles)
+```
+
+# Introduction
+
+The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette.
+
+# Example data: a simple forecast hub
+
+The `example-simple-forecast-hub` has been created by the Consortium of Infectious Disease Modeling Hubs as a simple example hub to demonstrate the set up and functionality for the hubverse. The hub includes both target data and example model output data.
+
+```{r}
+hub_path <- system.file("example-data/example-simple-forecast-hub",
+                        package = "hubEnsembles")
+
+model_outputs <- hubUtils::connect_hub(hub_path) %>%
+  dplyr::collect()
+
+head(model_outputs)
+```
+
+# Creating ensembles with `simple_ensemble`
+
+```{r}
+mean_ens <- hubEnsembles::simple_ensemble(model_outputs)
+head(mean_ens)
+```
+
+# Plots
+
+```{r}
+basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c(0.975, 0.025)) {
+  
+  plain_df <- dplyr::filter(plot_df, output_type_id == plain_line) 
+  
+  ribbon_df <- dplyr::filter(plot_df, output_type_id %in% ribbon) %>%
+    dplyr::mutate(output_type_id = ifelse(output_type_id == min(ribbon), 
+                                          "min", "max")) %>% 
+    tidyr::pivot_wider(names_from = output_type_id, values_from = value)
+  
+  plot_model <- plot_ly(height = 1050, colors = scales::hue_pal()(50)) 
+  
+  if (!is.null(truth_df)) {
+    plot_model <- plot_model %>% 
+      add_trace(data = truth_df, x = ~time_idx, y = ~value, type = "scatter",
+                mode = "lines+markers", line = list(color = "#6e6e6e"),
+                hoverinfo = "text", name = "ground truth",
+                hovertext = paste("Date: ", truth_df$time_value, "<br>", 
+                                  "Ground truth: ", 
+                                  format(truth_df$value, big.mark = ","), 
+                             sep = ""), 
+                marker = list(color = "#6e6e6e", size = 7))
+  }
+  plot_model <- plot_model %>% 
+    add_lines(data = plain_df, x = ~target_date, y = ~value, 
+              color = ~model_id) %>% 
+    add_ribbons(data = ribbon_df, x = ~target_date, ymin = ~min, 
+                ymax = ~max, color = ~model_id, opacity = 0.25, 
+                line = list(width = 0), showlegend = FALSE)
+}
+```
+
+```{r}
+plot_df <- dplyr::bind_rows(model_outputs, mean_ens) %>%
+  dplyr::filter(location == "US") %>%
+  dplyr::mutate(target_date = origin_date + horizon)
+
+plot <- basic_plot_function(plot_df, truth_df = NULL)
+plot
+```

From ea6065ef0bfc2de27b2f02af4b2226d52e19816d Mon Sep 17 00:00:00 2001
From: Evan Ray <elray@umass.edu>
Date: Fri, 16 Jun 2023 09:56:15 -0400
Subject: [PATCH 2/5] progress on vignette example with mean ensemble

---
 vignettes/hubEnsembles.Rmd | 52 +++++++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd
index 886f9ce..7fa77eb 100644
--- a/vignettes/hubEnsembles.Rmd
+++ b/vignettes/hubEnsembles.Rmd
@@ -10,10 +10,18 @@ vignette: >
 ```{r, include = FALSE}
 knitr::opts_chunk$set(
   collapse = TRUE,
-  comment = "#>"
+  comment = "#>",
+  fig.width = 7,
+  fig.height = 4
 )
 ```
 
+# Introduction
+
+The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette.
+
+This vignette uses the following R packages:
+
 ```{r setup}
 library(dplyr)
 library(plotly)
@@ -21,10 +29,6 @@ library(hubUtils)
 library(hubEnsembles)
 ```
 
-# Introduction
-
-The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette.
-
 # Example data: a simple forecast hub
 
 The `example-simple-forecast-hub` has been created by the Consortium of Infectious Disease Modeling Hubs as a simple example hub to demonstrate the set up and functionality for the hubverse. The hub includes both target data and example model output data.
@@ -35,8 +39,12 @@ hub_path <- system.file("example-data/example-simple-forecast-hub",
 
 model_outputs <- hubUtils::connect_hub(hub_path) %>%
   dplyr::collect()
-
 head(model_outputs)
+
+target_data_path <- file.path(hub_path, "target-data",
+                              "covid-hospitalizations.csv")
+target_data <- read.csv(target_data_path)
+head(target_data)
 ```
 
 # Creating ensembles with `simple_ensemble`
@@ -49,17 +57,18 @@ head(mean_ens)
 # Plots
 
 ```{r}
-basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c(0.975, 0.025)) {
-  
-  plain_df <- dplyr::filter(plot_df, output_type_id == plain_line) 
-  
+basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c(0.975, 0.025),
+                                forecast_date) {
+
+  plain_df <- dplyr::filter(plot_df, output_type_id == plain_line)
+
   ribbon_df <- dplyr::filter(plot_df, output_type_id %in% ribbon) %>%
-    dplyr::mutate(output_type_id = ifelse(output_type_id == min(ribbon), 
+    dplyr::mutate(output_type_id = ifelse(output_type_id == min(ribbon),
                                           "min", "max")) %>% 
     tidyr::pivot_wider(names_from = output_type_id, values_from = value)
-  
-  plot_model <- plot_ly(height = 1050, colors = scales::hue_pal()(50)) 
-  
+
+  plot_model <- plot_ly(height = 600, colors = scales::hue_pal()(50)) 
+
   if (!is.null(truth_df)) {
     plot_model <- plot_model %>% 
       add_trace(data = truth_df, x = ~time_idx, y = ~value, type = "scatter",
@@ -76,15 +85,24 @@ basic_plot_function <- function(plot_df, truth_df, plain_line = 0.5, ribbon = c(
               color = ~model_id) %>% 
     add_ribbons(data = ribbon_df, x = ~target_date, ymin = ~min, 
                 ymax = ~max, color = ~model_id, opacity = 0.25, 
-                line = list(width = 0), showlegend = FALSE)
+                line = list(width = 0), showlegend = FALSE) %>%
+    plotly::layout(shapes = list(type = "line", y0 = 0, y1 = 1, yref = "paper",
+                                x0 = forecast_date, x1 = forecast_date,
+                                line = list(color = "gray")))
 }
 ```
 
 ```{r}
 plot_df <- dplyr::bind_rows(model_outputs, mean_ens) %>%
-  dplyr::filter(location == "US") %>%
+  dplyr::filter(location == "US", origin_date == "2022-12-12") %>%
   dplyr::mutate(target_date = origin_date + horizon)
 
-plot <- basic_plot_function(plot_df, truth_df = NULL)
+plot <- basic_plot_function(
+    plot_df,
+    truth_df = target_data %>%
+        dplyr::filter(location == "US",
+                      time_idx >= "2022-10-01",
+                      time_idx <= "2023-03-01"),
+    forecast_date = "2022-12-12")
 plot
 ```

From 240ce1f93e3c48e193ad973afe555b2b04d6f98b Mon Sep 17 00:00:00 2001
From: eahowerton <eah394@psu.edu>
Date: Fri, 22 Sep 2023 15:45:49 -0400
Subject: [PATCH 3/5] add simple_ensemble examples to vignette #16

---
 vignettes/hubEnsembles.Rmd | 46 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd
index 7fa77eb..43648f0 100644
--- a/vignettes/hubEnsembles.Rmd
+++ b/vignettes/hubEnsembles.Rmd
@@ -18,7 +18,7 @@ knitr::opts_chunk$set(
 
 # Introduction
 
-The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models into combined ensemble model outputs. Currently the package includes just one function: `simple_ensemble`. We illustrate that function in this vignette.
+The `hubEnsembles` package includes functionality for aggregating model outputs, such as forecasts or projections, that are submitted to a hub by multiple models and combined into ensemble model outputs. The package includes two main functions: `simple_ensemble` and `linear_pool`. We illustrate these functions in this vignette, and briefly compare them.
 
 This vignette uses the following R packages:
 
@@ -49,11 +49,55 @@ head(target_data)
 
 # Creating ensembles with `simple_ensemble`
 
+The `simple_ensemble` function is used to summarize across component model outputs; this function can be applied to predictions with an `output_type` of `mean`, `median`, `quantile`, `cdf`, or `pmf`. 
+
+The `simple_ensemble` function defaults to calculating an equally weighted mean across all component model outputs for each unique `output_type_id`. For our example data, which contains two output types (`median` and `quantile`), this means the resulting ensemble will be the mean of component model-submitted values for each quantile. 
+
 ```{r}
 mean_ens <- hubEnsembles::simple_ensemble(model_outputs)
 head(mean_ens)
 ```
 
+## Changing the aggregation function
+
+We can change the function used to aggregate across model outputs. For example, we may want to calculate a median of component model-submitted values for each quantile. We will also use the `model_id` argument to distinguish this ensemble.
+
+```{r}
+median_ens <- hubEnsembles::simple_ensemble(model_outputs, 
+                                            agg_fun = median, 
+                                            model_id = "hub-ensemble-median")
+head(median_ens)
+```
+
+Custom functions can also be passed into the `agg_fun` argument. For example, a geometric mean may be a more appropriate way to combine component model outputs. Any custom function to be used requires an argument `x` for the vector of numeric values to summarize, and if relevant, an argument `w` of numeric weights.
+
+```{r}
+geometric_mean <- function(x){
+    n <- length(x)
+    return(prod(x)^(1/n))
+}
+
+geometric_mean_ens <-  hubEnsembles::simple_ensemble(model_outputs, 
+                                            agg_fun = geometric_mean, 
+                                            model_id = "hub-ensemble-geometric")
+head(geometric_mean_ens)
+```
+
+## Weighting model contributions
+
+In addition, we can weight the contributions of each model by providing a table of weights, which are provided in a `data.frame` with a `model_id` column and a `weight` column.
+
+```{r}
+model_weights <- data.frame(model_id = c("UMass-ar", "UMass-gbq", "simple_hub-baseline"), 
+                            weight = c(0.4, 0.4, 0.2))
+
+weighted_mean_ens <- hubEnsembles::simple_ensemble(model_outputs, 
+                                                   weights = model_weights, 
+                                                   model_id = "hub-ensemble-weighted-mean")
+head(weighted_mean_ens)
+```
+
+
 # Plots
 
 ```{r}

From 54ed7c74c20c1043ef2c062268b5f46ca590b29c Mon Sep 17 00:00:00 2001
From: eahowerton <eah394@psu.edu>
Date: Tue, 26 Sep 2023 15:46:12 -0400
Subject: [PATCH 4/5] start linear_pool() section

---
 vignettes/hubEnsembles.Rmd | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/vignettes/hubEnsembles.Rmd b/vignettes/hubEnsembles.Rmd
index 43648f0..fc9839c 100644
--- a/vignettes/hubEnsembles.Rmd
+++ b/vignettes/hubEnsembles.Rmd
@@ -49,9 +49,9 @@ head(target_data)
 
 # Creating ensembles with `simple_ensemble`
 
-The `simple_ensemble` function is used to summarize across component model outputs; this function can be applied to predictions with an `output_type` of `mean`, `median`, `quantile`, `cdf`, or `pmf`. 
+The `simple_ensemble` function is used to summarize across component model outputs; this function can be applied to predictions with an `output_type` of `mean`, `median`, `quantile`, `cdf`, or `pmf`.
 
-The `simple_ensemble` function defaults to calculating an equally weighted mean across all component model outputs for each unique `output_type_id`. For our example data, which contains two output types (`median` and `quantile`), this means the resulting ensemble will be the mean of component model-submitted values for each quantile. 
+The `simple_ensemble` function defaults to calculating an equally weighted mean across all component model outputs for each unique `output_type_id`. For our example data, which contains two output types (`median` and `quantile`), this means the resulting ensemble will be the mean of component model submitted values for each quantile.
 
 ```{r}
 mean_ens <- hubEnsembles::simple_ensemble(model_outputs)
@@ -60,7 +60,7 @@ head(mean_ens)
 
 ## Changing the aggregation function
 
-We can change the function used to aggregate across model outputs. For example, we may want to calculate a median of component model-submitted values for each quantile. We will also use the `model_id` argument to distinguish this ensemble.
+We can change the function used to aggregate across model outputs. For example, we may want to calculate a median of component model submitted values for each quantile. We will also use the `model_id` argument to distinguish this ensemble.
 
 ```{r}
 median_ens <- hubEnsembles::simple_ensemble(model_outputs, 
@@ -97,6 +97,17 @@ weighted_mean_ens <- hubEnsembles::simple_ensemble(model_outputs,
 head(weighted_mean_ens)
 ```
 
+# Creating ensembles with `linear_pool`
+
+An alternative approach to generate an ensemble is a linear pool, or a distributional mixture; this function can be applied to predictions with an `output_type` of `mean`, `quantile`, `cdf`, or `pmf`. Our example hub includes `median` output type, so we exclude it from the calculation.
+
+For `mean`, `cdf` and `pmf` output types, the linear pool is equivalent to using a mean `simple_ensemble`. For `quantile` model outputs, the `linear_pool` function needs to approximate a full probability distribution using the value-quantile pairs from each component model. As a default, this is done with functions in the `distfromq` package, which defaults to fitting a monotonic cubic spline.
+
+```{r}
+linear_pool_ens <- hubEnsembles::linear_pool(model_outputs %>%
+                                               filter(output_type != "median"))
+head(linear_pool_ens)
+```
 
 # Plots
 

From 5a4684293ccf1d3fb7d93a36febb853bb3949c6d Mon Sep 17 00:00:00 2001
From: Li Shandross <57642277+lshandross@users.noreply.github.com>
Date: Wed, 18 Oct 2023 16:51:26 -0400
Subject: [PATCH 5/5] resolve branch conflicts with main

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index f47ffab..9168bf8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,4 @@
 .Rdata
 .httr-oauth
 .DS_Store
-inst/doc
+docs