Glmnet defaults #10

Merged: 15 commits, Feb 7, 2022
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: uniCATE
Title: Univariate Conditional Average Treatment Effect Estimation
Version: 0.3.0
Version: 0.4.0
Authors@R: c(
person(given = "Philippe",
family = "Boileau",
@@ -35,7 +35,6 @@ Suggests:
nnls,
Rsolnp,
speedglm,
glmnet,
polspline,
xgboost,
ranger,
@@ -52,6 +51,7 @@ Imports:
purrr,
rlang,
sl3,
glmnet,
tibble,
future
Config/testthat/edition: 3
7 changes: 4 additions & 3 deletions NAMESPACE
@@ -17,6 +17,7 @@ importFrom(dplyr,mutate)
importFrom(dplyr,pull)
importFrom(dplyr,select)
importFrom(dplyr,setequal)
importFrom(glmnet,cv.glmnet)
importFrom(magrittr,"%>%")
importFrom(matrixStats,colVars)
importFrom(origami,cross_validate)
@@ -30,12 +31,12 @@ importFrom(rlang,"!!")
importFrom(rlang,":=")
importFrom(rlang,enquo)
importFrom(rlang,sym)
importFrom(stats,coef)
importFrom(stats,lm)
importFrom(stats,as.formula)
importFrom(stats,model.matrix)
importFrom(stats,p.adjust)
importFrom(stats,pnorm)
importFrom(stats,predict)
importFrom(stats,quantile)
importFrom(stats,var)
importFrom(tibble,as_tibble)
importFrom(tibble,is_tibble)
importFrom(tibble,tibble)
9 changes: 9 additions & 0 deletions NEWS.md
@@ -1,3 +1,12 @@
## uniCATE 0.4.0 (2022-02-07)

* When the super learner arguments of `unicate()` and `sunicate()` are set to
`NULL`, cross-validated elastic net regressions are now fit to estimate the
nuisance parameters instead of the previously used default `sl3` super
learners. This reduces computation time and makes these methods easier to
apply to small datasets by default, without affecting the asymptotic
behaviour of the `unicate()` estimator.

## uniCATE 0.3.0 (2022-02-01)

* `uniCATE` is going public!
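A minimal, self-contained sketch of what the new default amounts to: a 5-fold
cross-validated elastic net (`alpha = 0.5`) fit on a design matrix containing
all treatment-covariate interactions, mirroring `hold_out_calculation()` below.
The toy data and variable names are illustrative only, not part of the package.

```r
library(glmnet)

set.seed(510)
n <- 100
toy <- data.frame(
  A = factor(sample(c("trt", "ctl"), n, replace = TRUE),
             levels = c("trt", "ctl")),
  W1 = rnorm(n),
  W2 = rnorm(n)
)
toy$y <- rnorm(n)

# design matrix with main effects plus all treatment-covariate interactions
x_train <- model.matrix(~ A + W1 + W2 + W1:A + W2:A, data = toy)

# the new default nuisance estimator: a 5-fold CV elastic net
fit <- cv.glmnet(x = x_train, y = toy$y, nfolds = 5, alpha = 0.5,
                 family = "gaussian")

# predict the outcome with everyone assigned to the "treatment" level
toy_trt <- toy
toy_trt$A <- factor("trt", levels = levels(toy$A))
x_trt <- model.matrix(~ A + W1 + W2 + W1:A + W2:A, data = toy_trt)
y_hat_trt <- predict(fit, newx = x_trt, s = "lambda.min", type = "response")
```

For a binary outcome the same call is made with `family = "binomial"`.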
264 changes: 126 additions & 138 deletions R/estimate_univariate_cates.R
@@ -22,14 +22,10 @@
#' @param biomarkers A \code{character} vector listing the pre-treatment
#' biomarker variables in \code{data}. \code{biomarkers} must be a subset of
#' \code{covariates}.
#' @param super_learner A \code{\link[sl3:Lrnr_sl]{Lrnr_sl}} object. If set
#' to \code{NULL}, a default SuperLearner is used. If the outcome variable is
#' continuous, the default's library of base learners is made up of a
#' linear model, penalized linear models (LASSO and elasticnet), a spline
#' regression, XGBoost, a Random Forest, and the mean model. When the outcome
#' variable is binary, the base learner library consists of (penalized)
#' logistic regression models, XGBoost, a Random Forest, and the mean model.
#' The type of outcome is automatically detected.
#' @param super_learner A \code{\link[sl3:Lrnr_sl]{Lrnr_sl}} object used to
#' estimate the conditional outcome regression. If set to \code{NULL} (the
#' default), a cross-validated elastic net regression is used instead. This
#' default behaviour is recommended when analyzing small datasets.
#' @param propensity_score_ls A named \code{numeric} \code{list} providing the
#' propensity scores for the treatment conditions. The first element of the
#' list should correspond to the "treatment" condition, and the second to the
@@ -151,14 +147,10 @@ estimate_univariate_cates <- function(data,
#' @param biomarkers A \code{character} vector listing the pre-treatment
#' biomarker variables in \code{data}. \code{biomarkers} must be a subset of
#' \code{covariates}.
#' @param super_learner A \code{\link[sl3:Lrnr_sl]{Lrnr_sl}} object. If set
#' to \code{NULL}, a default SuperLearner is used. If the outcome variable is
#' continuous, the default's library of base learners is made up of a
#' linear model, penalized linear models (LASSO and elasticnet), a spline
#' regression, XGBoost, a Random Forest, and the mean model. When the outcome
#' variable is binary, the base learner library consists of (penalized)
#' logistic regression models, XGBoost, a Random Forest, and the mean model.
#' The type of outcome is automatically detected.
#' @param super_learner A \code{\link[sl3:Lrnr_sl]{Lrnr_sl}} object used to
#' estimate the conditional outcome regression. If set to \code{NULL} (the
#' default), a cross-validated elastic net regression is used instead. This
#' default behaviour is recommended when analyzing small datasets.
#' @param propensity_score_ls A named \code{numeric} \code{list} providing the
#' propensity scores for the treatment conditions. The first element of the
#' list should correspond to the "treatment" condition, and the second to the
@@ -176,9 +168,10 @@ estimate_univariate_cates <- function(data,
#' @importFrom rlang !! enquo
#' @importFrom magrittr %>%
#' @importFrom origami training validation
#' @importFrom stats var coef lm
#' @importFrom stats as.formula model.matrix predict
#' @importFrom purrr map
#' @importFrom matrixStats colVars
#' @importFrom glmnet cv.glmnet
#' @import sl3
#'
#' @keywords internal
@@ -190,121 +183,111 @@ hold_out_calculation <- function(fold, data, outcome, treatment, biomarkers,
train_data <- origami::training(data)
valid_data <- origami::validation(data)

# construct the SuperLearner task for the outcome regression
covar_names <- colnames(train_data)
covar_names <- covar_names[which(!grepl(outcome, covar_names))]
train_task <- sl3::make_sl3_Task(
data = train_data,
covariates = covar_names,
outcome = outcome,
outcome_type = outcome_type
# construct the modified validation data with fixed treatment assignments
valid_data_treat <- valid_data
valid_data_treat[treatment] <- names(propensity_score_ls)[1]
valid_data_treat[treatment] <- factor(
dplyr::pull(valid_data_treat, treatment),
levels = names(propensity_score_ls)
)
valid_data_cont <- valid_data
valid_data_cont[treatment] <- names(propensity_score_ls)[2]
valid_data_cont[treatment] <- factor(
dplyr::pull(valid_data_cont, treatment),
levels = names(propensity_score_ls)
)

# should the built-in SuperLearner be used to estimate the outcome regression?
# should the built-in learner be used to estimate the outcome regression?
if (is.null(super_learner)) {

# define the treatment-covariate interaction for certain learners
covars <- covar_names[which(!grepl(treatment, covar_names))]
interactions <- lapply(covars, function(w) c(w, treatment))
lrnr_interactions <- sl3::Lrnr_define_interactions$new(interactions)
# specify the outcome
y_train <- train_data[[outcome]]
covar_names <- colnames(train_data)
covar_names <- covar_names[which(!grepl(
paste0(outcome, "|", treatment),
covar_names
))]
formula_string <- paste(
"~", treatment, "+", paste(covar_names, collapse = " + "), "+",
paste(paste(covar_names, treatment, sep = ":"), collapse = " + ")
)
x_train <- stats::model.matrix(
stats::as.formula(formula_string),
data = train_data
)
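# illustrative example: with treatment "A" and covariates "W1", "W2", the
# formula string above expands to "~ A + W1 + W2 + W1:A + W2:A", so x_train
# holds the main effects plus every treatment-covariate interaction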

if (outcome_type == "continuous") {

# define the base learners for continuous outcomes
lrnr_glm <- sl3::make_learner(
sl3::Pipeline, lrnr_interactions, sl3::Lrnr_glm_fast$new()
)
lrnr_lasso <- sl3::make_learner(
sl3::Pipeline, lrnr_interactions, sl3::Lrnr_glmnet$new()
)
lrnr_enet <- sl3::make_learner(
sl3::Pipeline, lrnr_interactions, sl3::Lrnr_glmnet$new(alpha = 0.5)
)
lrnr_spline <- sl3::make_learner(
sl3::Pipeline, lrnr_interactions, sl3::Lrnr_polspline$new()
)
lrnr_xgboost <- sl3::Lrnr_xgboost$new()
lrnr_rf <- sl3::Lrnr_ranger$new()
lrnr_mean <- sl3::Lrnr_mean$new()

# assemble learners
learner_library <- sl3::make_learner(
sl3::Stack, lrnr_glm, lrnr_spline, lrnr_lasso, lrnr_enet, lrnr_xgboost,
lrnr_rf, lrnr_mean
)

# define the metalearner
meta_learner <- sl3::make_learner(
sl3::Lrnr_solnp,
loss_function = sl3::loss_squared_error,
learner_function = sl3::metalearner_linear
# train the elastic net
glmnet_fit <- cv.glmnet(
x = x_train,
y = y_train,
nfolds = 5,
alpha = 0.5,
family = "gaussian"
)
} else {

# define the base learners for binary outcomes
lrnr_glm <- sl3::make_learner(
sl3::Pipeline, lrnr_interactions, sl3::Lrnr_glm_fast$new()
)
lrnr_lasso <- sl3::make_learner(
sl3::Pipeline, lrnr_interactions, sl3::Lrnr_glmnet$new()
)
lrnr_enet <- sl3::make_learner(
sl3::Pipeline, lrnr_interactions, sl3::Lrnr_glmnet$new(alpha = 0.5)
)
lrnr_xgboost <- sl3::Lrnr_xgboost$new()
lrnr_rf <- sl3::Lrnr_ranger$new()
lrnr_mean <- sl3::Lrnr_mean$new()

# assemble learners
learner_library <- sl3::make_learner(
sl3::Stack, lrnr_glm, lrnr_lasso, lrnr_enet, lrnr_xgboost, lrnr_rf,
lrnr_mean
)

# define the metalearner
meta_learner <- sl3::make_learner(
sl3::Lrnr_solnp,
loss_function = sl3::loss_loglik_binomial,
learner_function = sl3::metalearner_logistic_binomial
# train the elastic net
glmnet_fit <- cv.glmnet(
x = x_train,
y = y_train,
nfolds = 5,
alpha = 0.5,
family = "binomial"
)
}

# initialize the SuperLearner
super_learner <- sl3::Lrnr_sl$new(
learners = learner_library,
metalearner = meta_learner
# predict the outcomes in the validation sets with fixed treatments
x_data_treat <- stats::model.matrix(
stats::as.formula(formula_string),
data = valid_data_treat
)
}
valid_data$y_hat_treat <- stats::predict(
glmnet_fit,
newx = x_data_treat,
s = "lambda.min",
type = "response"
)
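# s = "lambda.min" selects the penalty value minimizing cross-validated error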
x_data_cont <- stats::model.matrix(
stats::as.formula(formula_string),
data = valid_data_cont
)
valid_data$y_hat_cont <- stats::predict(
glmnet_fit,
newx = x_data_cont,
s = "lambda.min",
type = "response"
)
} else {

# train the SuperLearner on the training data
sl_fit <- super_learner$train(train_task)
# construct the SuperLearner task for the outcome regression
covar_names <- colnames(train_data)
covar_names <- covar_names[which(!grepl(outcome, covar_names))]
train_task <- sl3::make_sl3_Task(
data = train_data,
covariates = covar_names,
outcome = outcome,
outcome_type = outcome_type
)

# estimate the outcome of each observation for each treatment level
valid_data_treat <- valid_data
valid_data_treat[treatment] <- names(propensity_score_ls)[1]
valid_data_treat[treatment] <- factor(
dplyr::pull(valid_data_treat, treatment),
levels = names(propensity_score_ls)
)
pred_task_treat <- sl3::make_sl3_Task(
data = valid_data_treat,
covariates = covar_names,
outcome = outcome
)
valid_data$y_hat_treat <- sl_fit$predict(pred_task_treat)
# train the SuperLearner on the training data
sl_fit <- super_learner$train(train_task)

valid_data_cont <- valid_data
valid_data_cont[treatment] <- names(propensity_score_ls)[2]
valid_data_cont[treatment] <- factor(
dplyr::pull(valid_data_cont, treatment),
levels = names(propensity_score_ls)
)
pred_task_cont <- sl3::make_sl3_Task(
data = valid_data_cont,
covariates = covar_names,
outcome = outcome
)
valid_data$y_hat_cont <- sl_fit$predict(pred_task_cont)
# estimate the outcome of each observation for each treatment level
pred_task_treat <- sl3::make_sl3_Task(
data = valid_data_treat,
covariates = covar_names,
outcome = outcome
)
valid_data$y_hat_treat <- sl_fit$predict(pred_task_treat)
pred_task_cont <- sl3::make_sl3_Task(
data = valid_data_cont,
covariates = covar_names,
outcome = outcome
)
valid_data$y_hat_cont <- sl_fit$predict(pred_task_cont)
}

# apply the doubly-robust A-IPTW transform to each observation for each
# treatment level in valid_data
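# (for reference, assuming the collapsed code below implements the standard
# A-IPTW difference transform:
# y_diff = I(A == trt) / pi_trt * (y - y_hat_treat)
# - I(A == cont) / pi_cont * (y - y_hat_cont)
# + y_hat_treat - y_hat_cont)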
@@ -328,23 +311,39 @@ hold_out_calculation <- function(fold, data, outcome, treatment, biomarkers,
)
valid_data <- valid_data %>% dplyr::select(-y_diff)
coefs_and_ic_ls <- valid_data %>%
purrr::map(
function(bio) {
purrr::map2(
colnames(valid_data),
function(bio, bio_name) {

# center the biomarker measurements
bio <- as.vector(base::scale(bio, center = TRUE, scale = FALSE))
var_bio <- mean(bio^2)

# if the variance of the biomarker is nonzero, then proceed
# otherwise, fail gracefully
if (var_bio > 0) {

# estimate the best linear approximation using the estimating equation
# formula
bio_coef <- sum(y_diff * bio) / sum(bio^2)
# estimate the best linear approximation using the estimating equation
# formula
bio_coef <- mean(y_diff * bio) / var_bio
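# (this is the closed-form OLS slope of y_diff regressed on the centered
# biomarker: beta_hat = mean(y_diff * bio) / mean(bio^2))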

# compute the unscaled empirical IC of each observation
unsc_inf_curves <- (y_diff - bio_coef * bio) * bio
# compute the empirical IC of each observation
inf_curves <- ((y_diff - bio_coef * bio) * bio) / var_bio
} else {

# set the coefficient to zero, and make the influence curve enormous
bio_coef <- 0
inf_curves <- rep(1000000, length(y_diff))
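# the enormous IC values inflate the variance estimate, so a biomarker with
# (near-)zero variance is never spuriously flagged as significant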
message(paste(
"Biomarker", bio_name, "has low variability. Remove",
"it and repeat the analysis."
))
}

# return the beta coefficients and the influence curves
return(list(
"bio_coef" = bio_coef,
"unsc_inf_curves" = unsc_inf_curves
"inf_curves" = inf_curves
))
}
)
@@ -357,24 +356,13 @@ hold_out_calculation <- function(fold, data, outcome, treatment, biomarkers,
names(beta_coefs) <- biomarkers

# extract the table of un-scaled influence curves
unsc_ic_mat <- lapply(
seq_len(
length(coefs_and_ic_ls)
),
function(idx) coefs_and_ic_ls[[idx]]$unsc_inf_curves
ic_ls <- lapply(
seq_len(length(coefs_and_ic_ls)),
function(idx) coefs_and_ic_ls[[idx]]$inf_curves
)
unsc_ic_mat <- do.call(cbind, unsc_ic_mat)

# scale the un-scaled influence curves by the training set's biomarkers' vars
train_var_bio_vec <- train_data %>%
dplyr::select(dplyr::all_of(biomarkers)) %>%
as.matrix() %>%
matrixStats::colVars()
ic_df <- tcrossprod(unsc_ic_mat, diag(1 / train_var_bio_vec)) %>%
dplyr::as_tibble(.name_repair = "minimal")
ic_df <- do.call(cbind, ic_ls) %>% dplyr::as_tibble(.name_repair = "minimal")
colnames(ic_df) <- biomarkers


# return the vector of coefficients and the table of empirical IC values
return(
list(