spell check and small patches

koalaverse · Aug 26, 2018 · f6915f3 · f6915f3
1 parent f8d9c94
commit f6915f3
Show file tree

Hide file tree

Showing 17 changed files with 214 additions and 100 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -6,7 +6,7 @@
 
 * Changed `truncate_feature_names` argument of `vi()` to `abbreviate_feature_names` which abbreviates all feature names, rather than just truncating them.
 
-* Added CRAN badge [(#32)](https://github.com/koalaverse/vip/issues/32).
+* Added CRAN-related badges [(#32)](https://github.com/koalaverse/vip/issues/32).
 
 * New generic `vi_permute()` for constructing permutation-based variable importance scores [(#19)](https://github.com/koalaverse/vip/issues/19).
 

diff --git a/R/get_predictions.R b/R/get_predictions.R
@@ -5,7 +5,9 @@ get_predictions <- function(object, type = c("raw", "prob")) {
 
 
 #' @keywords internal
-get_predictions.default <- stats::predict
+get_predictions.default <- function(object, type = c("raw", "prob")) {
+  stats::predict
+}
 
 
 #' @keywords internal

diff --git a/R/metrics.R b/R/metrics.R
@@ -10,7 +10,7 @@ perf_rmse <- ModelMetrics::rmse
 
 #' @keywords internal
 perf_rsquared <- function(actual, predicted) {
-  stats::cor(actual, predicted)^2
+  stats::cor(x = actual, y = predicted) ^ 2
 }
 
 
@@ -27,7 +27,10 @@ perf_auc <- function(actual, predicted) {
   if (NCOL(predicted) != 2L) {
     stop("Expected a 2 column matrix of predicted class probabilities.")
   }
-  ModelMetrics::auc(actual = actual, predicted = predicted[, 1L, drop = TRUE])
+  ModelMetrics::auc(
+    actual = actual,
+    predicted = predicted[, 1L, drop = TRUE]
+  )
 }
 
 
@@ -36,7 +39,10 @@ perf_logLoss <- function(actual, predicted) {
   if (NCOL(predicted) != 2L) {
     stop("Expected a 2 column matrix of predicted class probabilities.")
   }
-  ModelMetrics::logLoss(actual = actual, predicted = predicted[, 1L, drop = TRUE])
+  ModelMetrics::logLoss(
+    actual = actual,
+    predicted = predicted[, 1L, drop = TRUE]
+  )
 }
 
 
@@ -47,7 +53,10 @@ perf_mauc <- function(actual, predicted) {
   if (NCOL(predicted) <= 2L) {
     stop("Expected a >2 column matrix of predicted class probabilities.")
   }
-  ModelMetrics::mauc(actual = actual, predicted = predicted)$mauc
+  ModelMetrics::mauc(
+    actual = actual,
+    predicted = predicted
+  )$mauc
 }
 
 
@@ -56,7 +65,10 @@ perf_mlogLoss <- function(actual, predicted) {
   if (NCOL(predicted) <= 2L) {
     stop("Expected a >2 column matrix of predicted class probabilities.")
   }
-  ModelMetrics::mlogLoss(actual = actual, predicted = predicted)
+  ModelMetrics::mlogLoss(
+    actual = actual,
+    predicted = predicted
+  )
 }
 
 

diff --git a/R/vi.R b/R/vi.R
@@ -14,7 +14,7 @@
 #' @param feature_names Character string giving the names of the predictor
 #' variables (i.e., features) of interest.
 #'
-#' @param FUN List with two componenets, \code{"cat"} and \code{"con"},
+#' @param FUN List with two components, \code{"cat"} and \code{"con"},
 #' containing the functions to use for categorical and continuous features,
 #' respectively. If \code{NULL}, the standard deviation is used for continuous
 #' features. For categorical features, the range statistic is used (i.e.,
@@ -74,17 +74,19 @@ vi <- function(
 
   # Construct VI scores
   method <- match.arg(method)
-  if (method %in% c("pdp", "ice", "permute")) {
+  if (method %in% c("pdp", "ice")) {
     if (missing(feature_names)) {
       feature_names <- get_feature_names(object)
     }
   }
 
+  # Construct tibble of VI scores
   tib <- switch(method,
-                "model" = vi_model(object, ...),
-                "pdp" = vi_pdp(object, feature_names = feature_names, FUN = FUN, ...),
-                "ice" = vi_ice(object, feature_names = feature_names, FUN = FUN, ...),
-                vi_permute(object, feature_names = feature_names, ...))
+    "model" = vi_model(object, ...),
+    "pdp" = vi_pdp(object, feature_names = feature_names, FUN = FUN, ...),
+    "ice" = vi_ice(object, feature_names = feature_names, FUN = FUN, ...),
+    vi_permute(object, feature_names = feature_names, ...)
+  )
 
   # Save attribute
   vi_type <- attr(tib, which = "type")

diff --git a/R/vi_ice.R b/R/vi_ice.R
@@ -8,7 +8,7 @@
 #' @param feature_names Character string giving the names of the predictor
 #' variables (i.e., features) of interest.
 #'
-#' @param FUN List with two componenets, \code{"cat"} and \code{"con"},
+#' @param FUN List with two components, \code{"cat"} and \code{"con"},
 #' containing the functions to use for categorical and continuous features,
 #' respectively. If \code{NULL}, the standard deviation is used for continuous
 #' features. For categorical features, the range statistic is used (i.e.,

diff --git a/R/vi_pdp.R b/R/vi_pdp.R
@@ -8,7 +8,7 @@
 #' @param feature_names Character string giving the names of the predictor
 #' variables (i.e., features) of interest.
 #'
-#' @param FUN List with two componenets, \code{"cat"} and \code{"con"},
+#' @param FUN List with two components, \code{"cat"} and \code{"con"},
 #' containing the functions to use for categorical and continuous features,
 #' respectively. If \code{NULL}, the standard deviation is used for continuous
 #' features. For categorical features, the range statistic is used (i.e.,

diff --git a/R/vi_permute.R b/R/vi_permute.R
@@ -5,34 +5,40 @@
 #'
 #' @param object A fitted model object (e.g., a \code{"randomForest"} object).
 #'
-#' @param train Data frame containing the original training data.
+#' @param train A matrix-like R object (e.g., a data frame or matrix)
+#' containing the training data.
 #'
-#' @param response_name Character string giving the name (or position) of the
-#' traget column in \code{train}.
-#'
-#' @param pred_fun Optional prediction function that requires two arguments,
-#' \code{object} and \code{newdata}. Default is \code{NULL}.
+#' @param target Either a character string giving the name (or position) of the
+#' target column in \code{train} or, if \code{train} only contains feature
+#' columns, a vector containing the target values used to train \code{object}.
 #'
 #' @param metric Either a function or character string specifying the
-#' performancefor metric to use in computing model performance (e.g.,
-#' RMSE for regression or accuracy for binary classification). If \code{metric}
-#' is a function, then it requires two arguments, \code{actual} and
-#' \code{predicted}, and should return a single, numeric value.
+#' performance metric to use in computing model performance (e.g., RMSE for
+#' regression or accuracy for binary classification). If \code{metric} is a
+#' function, then it requires two arguments, \code{actual} and \code{predicted},
+#' and should return a single, numeric value.
 #'
 #' @param smaller_is_better Logical indicating whether or not a smaller value
 #' of \code{metric} is better. Default is \code{NULL}. Must be supplied if
 #' \code{metric} is a user-supplied function.
 #'
-#' @param pos_class Character string specifying which category in `obs`
-#' represents the "positive" class (i.e., the class for which the predicted
-#' class probabilties correspond to). Only needed for binary classification
+#' @param reference_class Character string specifying which response category
+#' represents the "reference" class (i.e., the class for which the predicted
+#' class probabilities correspond to). Only needed for binary classification
 #' problems.
 #'
+#' @param pred_fun Optional prediction function that requires two arguments,
+#' \code{object} and \code{newdata}. Default is \code{NULL}. Must be supplied
+#' whenever \code{metric} is a custom function.
+#'
 #' @return A tidy data frame (i.e., a \code{"tibble"} object) with two columns:
 #' \code{Variable} and \code{Importance}. For \code{"glm"}-like object, an
 #' additional column, called \code{Sign}, is also included which gives the sign
 #' (i.e., POS/NEG) of the original coefficient.
 #'
+#' @param verbose Logical indicating whether or not to print information during
+#' the construction of variable importance scores. Default is \code{FALSE}.
+#'
 #' @param progress Character string giving the name of the progress bar to use.
 #' See \code{\link[plyr]{create_progress_bar}} for details. Default is
 #' \code{"none"}.
@@ -62,7 +68,7 @@
 #'
 #' # Simulate training data
 #' set.seed(101)  # for reproducibility
-#' trn <- as.data.frame(mlbench.friedman1(500)  # ?mlbench.friedman1
+#' trn <- as.data.frame(mlbench.friedman1(500))  # ?mlbench.friedman1
 #'
 #' # Inspect data
 #' tibble::as.tibble(trn)
@@ -76,9 +82,9 @@
 #'
 #' # Plot VI scores
 #' set.seed(2021)  # for reproducibility
-#' p1 <- vip(pp, method = "permute", response_name = "y", metric = "rsquared",
+#' p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared",
 #'           pred_fun = predict) + ggtitle("PPR")
-#' p2 <- vip(nn, method = "permute", response_name = "y", metric = "rsquared",
+#' p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared",
 #'           pred_fun = predict) + ggtitle("NN")
 #' grid.arrange(p1, p2, ncol = 2)
 #'
@@ -89,12 +95,11 @@
 #'
 #' # Permutation-based VIP with user-defined MAE metric
 #' set.seed(1101)  # for reproducibility
-#' vip(pp, method = "permute",
-#'     response_name = "y",
-#'     metric = mae,
+#' vip(pp, method = "permute", target = "y", metric = mae,
 #'     smaller_is_better = TRUE,
 #'     pred_fun = function(object, newdata) predict(object, newdata)  # wrapper
 #' ) + ggtitle("PPR")
+#' }
 vi_permute <- function(object, ...) {
   UseMethod("vi_permute")
 }
@@ -103,19 +108,9 @@ vi_permute <- function(object, ...) {
 #' @rdname vi_permute
 #'
 #' @export
-vi_permute.default <- function(
-  object,
-  train,
-  response_name,
-  # perf_fun = NULL,
-  metric = "auto",  # add log loss, auc, mae, mape, etc.
-  smaller_is_better = NULL,
-  pos_class = NULL,
-  pred_fun = NULL,
-  progress = "none",
-  parallel = FALSE,
-  paropts = NULL,
-  ...
+vi_permute.default <- function(object, train, target, metric = "auto",
+  smaller_is_better = NULL, reference_class = NULL, pred_fun = NULL,
+  verbose = FALSE, progress = "none", parallel = FALSE, paropts = NULL, ...
 ) {
 
   # Issue warning until this function is complete!
@@ -127,11 +122,16 @@ vi_permute.default <- function(
     train <- get_training_data(object)
   }
 
-  # Feature names
-  feature_names <- setdiff(names(train), response_name)
-
-  # Observed (training) response values
-  obs <- train[[response_name]]
+  # Extract feature names and separate features from target (if necessary)
+  if (is.character(target)) {
+    feature_names <- setdiff(colnames(train), target)
+    train_x <- train[, feature_names]
+    train_y <- train[, target, drop = TRUE]
+  } else {
+    feature_names <- colnames(train)
+    train_x <- train
+    train_y <- target
+  }
 
   # Metric
   if (is.function(metric)) {  # user-supplied function
@@ -150,7 +150,7 @@ vi_permute.default <- function(
            call. = FALSE)
     } else {
       # Check prediction function arguments
-      if (!identical(c("object", "newdata"), names(formals(pred_fun)))) {
+      if (!all(c("object", "newdata") %in% names(formals(pred_fun)))) {
         stop("`pred_fun()` must be a function with arguments `object` and ",
              "`newdata`.", call. = FALSE)
       }
@@ -229,17 +229,21 @@ vi_permute.default <- function(
       pred_fun <- get_predictions(object, type = type)
     }
 
-  }
+    # Determine reference class (binary classification only)
+    if (is.null(reference_class) && metric %in% c("auc", "logloss")) {
+      stop("Please specify the reference class via the `reference_class` ",
+           "argument when using \"auc\" or \"logloss\".")
+    }
+    if (!is.null(reference_class) && metric %in% c("auc", "logloss")) {
+      train_y <- ifelse(train_y == reference_class, yes = 1, no = 0)
+    }
 
-  # Determine reference class (classification only)
-  if (!is.null(pos_class)) {
-    obs <- ifelse(obs == pos_class, yes = 1, no = 0)
   }
 
   # Compute baseline metric for comparison
   baseline <- perf_fun(
-    actual = obs,
-    predicted = pred_fun(object, newdata = train)
+    actual = train_y,
+    predicted = pred_fun(object, newdata = train_x)
   )
 
   # Construct VI scores
@@ -253,11 +257,14 @@ vi_permute.default <- function(
   vis <- unlist(plyr::llply(feature_names, .progress = progress,
     .parallel = parallel, .paropts = paropts,
     .fun = function(x) {
-      copy <- train  # make copy
-      copy[[x]] <- sample(copy[[x]])  # permute values
+      if (verbose && !parallel) {
+        message("Computing variable importance for ", x, "...")
+      }
+      train_x_permuted <- train_x  # make copy
+      train_x_permuted[[x]] <- sample(train_x_permuted[[x]])  # permute values
       permuted <- perf_fun(
-        actual = obs,
-        predicted = pred_fun(object, newdata = copy)
+        actual = train_y,
+        predicted = pred_fun(object, newdata = train_x_permuted)
       )
       if (smaller_is_better) {
         permuted - baseline

diff --git a/R/vip.R b/R/vip.R
@@ -16,8 +16,8 @@
 #' @param horizontal Logical indicating whether or not to plot the importance
 #' scores on the x-axis (\code{TRUE}). Default is \code{TRUE}.
 #'
-#' @param alpha Numeric value between 0 and 1 giving the trasparency of the
-#' bars.
+#' @param alpha Numeric value between 0 and 1 giving the transparency of the
+#' bars (\code{bar = TRUE}) or points (\code{bar = FALSE}).
 #'
 #' @param color Character string specifying the color to use for the borders of
 #' the bars. Could also be a function, such as

diff --git a/README.Rmd b/README.Rmd
@@ -41,4 +41,4 @@ if (!requireNamespace("devtools")) {
 devtools::install_github("koalaverse/vip")
 ```
 
-For details and example usage, click the [Get started](https://koalaverse.github.io/vip/articles/vip.html) tab on the [`vip` package website](https://koalaverse.github.io/vip/index.html).
+For details and example usage, visit the [**vip** package website](https://koalaverse.github.io/vip/index.html).
diff --git a/man/vi.Rd b/man/vi.Rd
diff --git a/man/vi_ice.Rd b/man/vi_ice.Rd
diff --git a/man/vi_pdp.Rd b/man/vi_pdp.Rd