diff --git a/NEWS.md b/NEWS.md index a322591a..f109e1c8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ * Changed `truncate_feature_names` argument of `vi()` to `abbreviate_feature_names` which abbreviates all feature names, rather than just truncating them. -* Added CRAN badge [(#32)](https://github.com/koalaverse/vip/issues/32). +* Added CRAN-related badges [(#32)](https://github.com/koalaverse/vip/issues/32). * New generic `vi_permute()` for constructing permutation-based variable importance scores [(#19)](https://github.com/koalaverse/vip/issues/19). diff --git a/R/get_predictions.R b/R/get_predictions.R index 3793942d..f28b3d33 100644 --- a/R/get_predictions.R +++ b/R/get_predictions.R @@ -5,7 +5,9 @@ get_predictions <- function(object, type = c("raw", "prob")) { #' @keywords internal -get_predictions.default <- stats::predict +get_predictions.default <- function(object, type = c("raw", "prob")) { + stats::predict +} #' @keywords internal diff --git a/R/metrics.R b/R/metrics.R index fad0698c..67845ad7 100644 --- a/R/metrics.R +++ b/R/metrics.R @@ -10,7 +10,7 @@ perf_rmse <- ModelMetrics::rmse #' @keywords internal perf_rsquared <- function(actual, predicted) { - stats::cor(actual, predicted)^2 + stats::cor(x = actual, y = predicted) ^ 2 } @@ -27,7 +27,10 @@ perf_auc <- function(actual, predicted) { if (NCOL(predicted) != 2L) { stop("Expected a 2 column matrix of predicted class probabilities.") } - ModelMetrics::auc(actual = actual, predicted = predicted[, 1L, drop = TRUE]) + ModelMetrics::auc( + actual = actual, + predicted = predicted[, 1L, drop = TRUE] + ) } @@ -36,7 +39,10 @@ perf_logLoss <- function(actual, predicted) { if (NCOL(predicted) != 2L) { stop("Expected a 2 column matrix of predicted class probabilities.") } - ModelMetrics::logLoss(actual = actual, predicted = predicted[, 1L, drop = TRUE]) + ModelMetrics::logLoss( + actual = actual, + predicted = predicted[, 1L, drop = TRUE] + ) } @@ -47,7 +53,10 @@ perf_mauc <- function(actual, predicted) { 
if (NCOL(predicted) <= 2L) { stop("Expected a >2 column matrix of predicted class probabilities.") } - ModelMetrics::mauc(actual = actual, predicted = predicted)$mauc + ModelMetrics::mauc( + actual = actual, + predicted = predicted + )$mauc } @@ -56,7 +65,10 @@ perf_mlogLoss <- function(actual, predicted) { if (NCOL(predicted) <= 2L) { stop("Expected a >2 column matrix of predicted class probabilities.") } - ModelMetrics::mlogLoss(actual = actual, predicted = predicted) + ModelMetrics::mlogLoss( + actual = actual, + predicted = predicted + ) } diff --git a/R/vi.R b/R/vi.R index 9ff0f73b..7a0396d6 100644 --- a/R/vi.R +++ b/R/vi.R @@ -14,7 +14,7 @@ #' @param feature_names Character string giving the names of the predictor #' variables (i.e., features) of interest. #' -#' @param FUN List with two componenets, \code{"cat"} and \code{"con"}, +#' @param FUN List with two components, \code{"cat"} and \code{"con"}, #' containing the functions to use for categorical and continuous features, #' respectively. If \code{NULL}, the standard deviation is used for continuous #' features. For categorical features, the range statistic is used (i.e., @@ -74,17 +74,19 @@ vi <- function( # Construct VI scores method <- match.arg(method) - if (method %in% c("pdp", "ice", "permute")) { + if (method %in% c("pdp", "ice")) { if (missing(feature_names)) { feature_names <- get_feature_names(object) } } + # Construct tibble of VI scores tib <- switch(method, - "model" = vi_model(object, ...), - "pdp" = vi_pdp(object, feature_names = feature_names, FUN = FUN, ...), - "ice" = vi_ice(object, feature_names = feature_names, FUN = FUN, ...), - vi_permute(object, feature_names = feature_names, ...)) + "model" = vi_model(object, ...), + "pdp" = vi_pdp(object, feature_names = feature_names, FUN = FUN, ...), + "ice" = vi_ice(object, feature_names = feature_names, FUN = FUN, ...), + vi_permute(object, feature_names = feature_names, ...) 
+ ) # Save attribute vi_type <- attr(tib, which = "type") diff --git a/R/vi_ice.R b/R/vi_ice.R index d445f34f..0ba521fa 100644 --- a/R/vi_ice.R +++ b/R/vi_ice.R @@ -8,7 +8,7 @@ #' @param feature_names Character string giving the names of the predictor #' variables (i.e., features) of interest. #' -#' @param FUN List with two componenets, \code{"cat"} and \code{"con"}, +#' @param FUN List with two components, \code{"cat"} and \code{"con"}, #' containing the functions to use for categorical and continuous features, #' respectively. If \code{NULL}, the standard deviation is used for continuous #' features. For categorical features, the range statistic is used (i.e., diff --git a/R/vi_pdp.R b/R/vi_pdp.R index 2bc367b7..b518337a 100644 --- a/R/vi_pdp.R +++ b/R/vi_pdp.R @@ -8,7 +8,7 @@ #' @param feature_names Character string giving the names of the predictor #' variables (i.e., features) of interest. #' -#' @param FUN List with two componenets, \code{"cat"} and \code{"con"}, +#' @param FUN List with two components, \code{"cat"} and \code{"con"}, #' containing the functions to use for categorical and continuous features, #' respectively. If \code{NULL}, the standard deviation is used for continuous #' features. For categorical features, the range statistic is used (i.e., diff --git a/R/vi_permute.R b/R/vi_permute.R index 90f8495b..daad13d8 100644 --- a/R/vi_permute.R +++ b/R/vi_permute.R @@ -5,34 +5,40 @@ #' #' @param object A fitted model object (e.g., a \code{"randomForest"} object). #' -#' @param train Data frame containing the original training data. +#' @param train A matrix-like R object (e.g., a data frame or matrix) +#' containing the training data. #' -#' @param response_name Character string giving the name (or position) of the -#' traget column in \code{train}. -#' -#' @param pred_fun Optional prediction function that requires two arguments, -#' \code{object} and \code{newdata}. Default is \code{NULL}. 
+#' @param target Either a character string giving the name of the +#' target column in \code{train} or, if \code{train} only contains feature +#' columns, a vector containing the target values used to train \code{object}. #' #' @param metric Either a function or character string specifying the -#' performancefor metric to use in computing model performance (e.g., -#' RMSE for regression or accuracy for binary classification). If \code{metric} -#' is a function, then it requires two arguments, \code{actual} and -#' \code{predicted}, and should return a single, numeric value. +#' performance metric to use in computing model performance (e.g., RMSE for +#' regression or accuracy for binary classification). If \code{metric} is a +#' function, then it requires two arguments, \code{actual} and \code{predicted}, +#' and should return a single, numeric value. #' #' @param smaller_is_better Logical indicating whether or not a smaller value #' of \code{metric} is better. Default is \code{NULL}. Must be supplied if #' \code{metric} is a user-supplied function. #' -#' @param pos_class Character string specifying which category in `obs` -#' represents the "positive" class (i.e., the class for which the predicted -#' class probabilties correspond to). Only needed for binary classification +#' @param reference_class Character string specifying which response category +#' represents the "reference" class (i.e., the class to which the predicted +#' class probabilities correspond). Only needed for binary classification #' problems. #' +#' @param pred_fun Optional prediction function that requires two arguments, +#' \code{object} and \code{newdata}. Default is \code{NULL}. Must be supplied +#' whenever \code{metric} is a custom function. #' #' @return A tidy data frame (i.e., a \code{"tibble"} object) with two columns: #' \code{Variable} and \code{Importance}. 
For \code{"glm"}-like object, an #' additional column, called \code{Sign}, is also included which gives the sign #' (i.e., POS/NEG) of the original coefficient. #' +#' @param verbose Logical indicating whether or not to print information during +#' the construction of variable importance scores. Default is \code{FALSE}. +#' #' @param progress Character string giving the name of the progress bar to use. #' See \code{\link[plyr]{create_progress_bar}} for details. Default is #' \code{"none"}. @@ -62,7 +68,7 @@ #' #' # Simulate training data #' set.seed(101) # for reproducibility -#' trn <- as.data.frame(mlbench.friedman1(500) # ?mlbench.friedman1 +#' trn <- as.data.frame(mlbench.friedman1(500)) # ?mlbench.friedman1 #' #' # Inspect data #' tibble::as.tibble(trn) @@ -76,9 +82,9 @@ #' #' # Plot VI scores #' set.seed(2021) # for reproducibility -#' p1 <- vip(pp, method = "permute", response_name = "y", metric = "rsquared", +#' p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared", #' pred_fun = predict) + ggtitle("PPR") -#' p2 <- vip(nn, method = "permute", response_name = "y", metric = "rsquared", +#' p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared", #' pred_fun = predict) + ggtitle("NN") #' grid.arrange(p1, p2, ncol = 2) #' @@ -89,12 +95,11 @@ #' #' # Permutation-based VIP with user-defined MAE metric #' set.seed(1101) # for reproducibility -#' vip(pp, method = "permute", -#' response_name = "y", -#' metric = mae, +#' vip(pp, method = "permute", target = "y", metric = mae, #' smaller_is_better = TRUE, #' pred_fun = function(object, newdata) predict(object, newdata) # wrapper #' ) + ggtitle("PPR") +#' } vi_permute <- function(object, ...) { UseMethod("vi_permute") } @@ -103,19 +108,9 @@ vi_permute <- function(object, ...) { #' @rdname vi_permute #' #' @export -vi_permute.default <- function( - object, - train, - response_name, - # perf_fun = NULL, - metric = "auto", # add log loss, auc, mae, mape, etc. 
- smaller_is_better = NULL, - pos_class = NULL, - pred_fun = NULL, - progress = "none", - parallel = FALSE, - paropts = NULL, - ... +vi_permute.default <- function(object, train, target, metric = "auto", + smaller_is_better = NULL, reference_class = NULL, pred_fun = NULL, + verbose = FALSE, progress = "none", parallel = FALSE, paropts = NULL, ... ) { # Issue warning until this function is complete! @@ -127,11 +122,16 @@ vi_permute.default <- function( train <- get_training_data(object) } - # Feature names - feature_names <- setdiff(names(train), response_name) - - # Observed (training) response values - obs <- train[[response_name]] + # Extract feature names and separate features from target (if necessary) + if (is.character(target)) { + feature_names <- setdiff(colnames(train), target) + train_x <- train[, feature_names] + train_y <- train[, target, drop = TRUE] + } else { + feature_names <- colnames(train) + train_x <- train + train_y <- target + } # Metric if (is.function(metric)) { # user-supplied function @@ -150,7 +150,7 @@ vi_permute.default <- function( call. = FALSE) } else { # Check prediction function arguments - if (!identical(c("object", "newdata"), names(formals(pred_fun)))) { + if (!all(c("object", "newdata") %in% names(formals(pred_fun)))) { stop("`pred_fun()` must be a function with arguments `object` and ", "`newdata`.", call. 
= FALSE) } @@ -229,17 +229,21 @@ vi_permute.default <- function( pred_fun <- get_predictions(object, type = type) } - } + # Determine reference class (binary classification only) + if (is.null(reference_class) && metric %in% c("auc", "logloss")) { + stop("Please specify the reference class via the `reference_class` ", + "argument when using \"auc\" or \"logloss\".") + } + if (!is.null(reference_class) && metric %in% c("auc", "logloss")) { + train_y <- ifelse(train_y == reference_class, yes = 1, no = 0) + } - # Determine reference class (classification only) - if (!is.null(pos_class)) { - obs <- ifelse(obs == pos_class, yes = 1, no = 0) } # Compute baseline metric for comparison baseline <- perf_fun( - actual = obs, - predicted = pred_fun(object, newdata = train) + actual = train_y, + predicted = pred_fun(object, newdata = train_x) ) # Construct VI scores @@ -253,11 +257,14 @@ vi_permute.default <- function( vis <- unlist(plyr::llply(feature_names, .progress = progress, .parallel = parallel, .paropts = paropts, .fun = function(x) { - copy <- train # make copy - copy[[x]] <- sample(copy[[x]]) # permute values + if (verbose && !parallel) { + message("Computing variable importance for ", x, "...") + } + train_x_permuted <- train_x # make copy + train_x_permuted[[x]] <- sample(train_x_permuted[[x]]) # permute values permuted <- perf_fun( - actual = obs, - predicted = pred_fun(object, newdata = copy) + actual = train_y, + predicted = pred_fun(object, newdata = train_x_permuted) ) if (smaller_is_better) { permuted - baseline diff --git a/R/vip.R b/R/vip.R index 76121023..279e6d56 100644 --- a/R/vip.R +++ b/R/vip.R @@ -16,8 +16,8 @@ #' @param horizontal Logical indicating whether or not to plot the importance #' scores on the x-axis (\code{TRUE}). Default is \code{TRUE}. #' -#' @param alpha Numeric value between 0 and 1 giving the trasparency of the -#' bars. 
+#' @param alpha Numeric value between 0 and 1 giving the transparency of the +#' bars (\code{bar = TRUE}) or points (\code{bar = FALSE}). #' #' @param color Character string specifying the color to use for the borders of #' the bars. Could also be a function, such as diff --git a/README.Rmd b/README.Rmd index e2c1d0de..8be149ea 100644 --- a/README.Rmd +++ b/README.Rmd @@ -41,4 +41,4 @@ if (!requireNamespace("devtools")) { devtools::install_github("koalaverse/vip") ``` -For details and example usage, click the [Get started](https://koalaverse.github.io/vip/articles/vip.html) tab on the [`vip` package website](https://koalaverse.github.io/vip/index.html). +For details and example usage, visit the [**vip** package website](https://koalaverse.github.io/vip/index.html). diff --git a/man/vi.Rd b/man/vi.Rd index d7a76ae0..100d3865 100644 --- a/man/vi.Rd +++ b/man/vi.Rd @@ -21,7 +21,7 @@ reference below.} \item{feature_names}{Character string giving the names of the predictor variables (i.e., features) of interest.} -\item{FUN}{List with two componenets, \code{"cat"} and \code{"con"}, +\item{FUN}{List with two components, \code{"cat"} and \code{"con"}, containing the functions to use for categorical and continuous features, respectively. If \code{NULL}, the standard deviation is used for continuous features. For categorical features, the range statistic is used (i.e., diff --git a/man/vi_ice.Rd b/man/vi_ice.Rd index 9d938c13..588cd28d 100644 --- a/man/vi_ice.Rd +++ b/man/vi_ice.Rd @@ -18,7 +18,7 @@ vi_ice(object, ...) \item{feature_names}{Character string giving the names of the predictor variables (i.e., features) of interest.} -\item{FUN}{List with two componenets, \code{"cat"} and \code{"con"}, +\item{FUN}{List with two components, \code{"cat"} and \code{"con"}, containing the functions to use for categorical and continuous features, respectively. If \code{NULL}, the standard deviation is used for continuous features. 
For categorical features, the range statistic is used (i.e., diff --git a/man/vi_pdp.Rd b/man/vi_pdp.Rd index 43b16522..68bed551 100644 --- a/man/vi_pdp.Rd +++ b/man/vi_pdp.Rd @@ -18,7 +18,7 @@ vi_pdp(object, ...) \item{feature_names}{Character string giving the names of the predictor variables (i.e., features) of interest.} -\item{FUN}{List with two componenets, \code{"cat"} and \code{"con"}, +\item{FUN}{List with two components, \code{"cat"} and \code{"con"}, containing the functions to use for categorical and continuous features, respectively. If \code{NULL}, the standard deviation is used for continuous features. For categorical features, the range statistic is used (i.e., diff --git a/man/vi_permute.Rd b/man/vi_permute.Rd index 5e383337..c441f0ae 100644 --- a/man/vi_permute.Rd +++ b/man/vi_permute.Rd @@ -7,9 +7,9 @@ \usage{ vi_permute(object, ...) -\method{vi_permute}{default}(object, train, response_name, - metric = "auto", smaller_is_better = NULL, pos_class = NULL, - pred_fun = NULL, progress = "none", parallel = FALSE, +\method{vi_permute}{default}(object, train, target, metric = "auto", + smaller_is_better = NULL, reference_class = NULL, pred_fun = NULL, + verbose = FALSE, progress = "none", parallel = FALSE, paropts = NULL, ...) } \arguments{ @@ -17,28 +17,34 @@ vi_permute(object, ...) \item{...}{Additional optional arguments. 
(Currently ignored.)} -\item{train}{Data frame containing the original training data.} +\item{train}{A matrix-like R object (e.g., a data frame or matrix) +containing the training data.} -\item{response_name}{Character string giving the name (or position) of the -traget column in \code{train}.} +\item{target}{Either a character string giving the name of the +target column in \code{train} or, if \code{train} only contains feature +columns, a vector containing the target values used to train \code{object}.} \item{metric}{Either a function or character string specifying the -performancefor metric to use in computing model performance (e.g., -RMSE for regression or accuracy for binary classification). If \code{metric} -is a function, then it requires two arguments, \code{actual} and -\code{predicted}, and should return a single, numeric value.} +performance metric to use in computing model performance (e.g., RMSE for +regression or accuracy for binary classification). If \code{metric} is a +function, then it requires two arguments, \code{actual} and \code{predicted}, +and should return a single, numeric value.} \item{smaller_is_better}{Logical indicating whether or not a smaller value of \code{metric} is better. Default is \code{NULL}. Must be supplied if \code{metric} is a user-supplied function.} -\item{pos_class}{Character string specifying which category in `obs` -represents the "positive" class (i.e., the class for which the predicted -class probabilties correspond to). Only needed for binary classification +\item{reference_class}{Character string specifying which response category +represents the "reference" class (i.e., the class to which the predicted +class probabilities correspond). Only needed for binary classification problems.} \item{pred_fun}{Optional prediction function that requires two arguments, -\code{object} and \code{newdata}. Default is \code{NULL}.} +\code{object} and \code{newdata}. Default is \code{NULL}. 
Must be supplied +whenever \code{metric} is a custom function.} + +\item{verbose}{Logical indicating whether or not to print information during +the construction of variable importance scores. Default is \code{FALSE}.} \item{progress}{Character string giving the name of the progress bar to use. See \code{\link[plyr]{create_progress_bar}} for details. Default is @@ -65,3 +71,45 @@ model. (This function is meant for internal use only.) \details{ Coming soon! } +\examples{ +\dontrun{ +# Load required packages +library(ggplot2) # for ggtitle() function +library(mlbench) # for ML benchmark data sets +library(nnet) # for fitting neural networks + +# Simulate training data +set.seed(101) # for reproducibility +trn <- as.data.frame(mlbench.friedman1(500)) # ?mlbench.friedman1 + +# Inspect data +tibble::as.tibble(trn) + +# Fit PPR and NN models (hyperparameters were chosen using the caret package +# with 5 repeats of 5-fold cross-validation) +pp <- ppr(y ~ ., data = trn, nterms = 11) +set.seed(0803) # for reproducibility +nn <- nnet(y ~ ., data = trn, size = 7, decay = 0.1, linout = TRUE, + maxit = 500) + +# Plot VI scores +set.seed(2021) # for reproducibility +p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared", + pred_fun = predict) + ggtitle("PPR") +p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared", + pred_fun = predict) + ggtitle("NN") +grid.arrange(p1, p2, ncol = 2) + +# Mean absolute error +mae <- function(actual, predicted) { + mean(abs(actual - predicted)) +} + +# Permutation-based VIP with user-defined MAE metric +set.seed(1101) # for reproducibility +vip(pp, method = "permute", target = "y", metric = mae, + smaller_is_better = TRUE, + pred_fun = function(object, newdata) predict(object, newdata) # wrapper +) + ggtitle("PPR") +} +} diff --git a/man/vip.Rd b/man/vip.Rd index 0d99b27c..12adc32b 100644 --- a/man/vip.Rd +++ b/man/vip.Rd @@ -28,8 +28,8 @@ is \code{TRUE}. 
If \code{bar = FALSE}, then a dotchart is displayed instead.} \item{horizontal}{Logical indicating whether or not to plot the importance scores on the x-axis (\code{TRUE}). Default is \code{TRUE}.} -\item{alpha}{Numeric value between 0 and 1 giving the trasparency of the -bars.} +\item{alpha}{Numeric value between 0 and 1 giving the transparency of the +bars (\code{bar = TRUE}) or points (\code{bar = FALSE}).} \item{color}{Character string specifying the color to use for the borders of the bars. Could also be a function, such as diff --git a/slowtests/slowtests_vi_permute.R b/slowtests/slowtests_vi_permute.R index 6d6b00ad..43d8e306 100644 --- a/slowtests/slowtests_vi_permute.R +++ b/slowtests/slowtests_vi_permute.R @@ -4,18 +4,66 @@ library(randomForest) library(vip) + # Regression ------------------------------------------------------------------- +# Random forest +boston <- pdp::boston +set.seed(101) +rfo1 <- randomForest(cmedv ~ ., data = boston) + +# VIPs +set.seed(102) +vips <- lapply(c("mse", "rmse", "r2"), FUN = function(m) { + vip(rfo1, method = "permute", response_name = "cmedv", metric = m) + # or use pred_fun = predict + ggplot2::ggtitle(m) +}) +grid.arrange(grobs = vips, ncol = 3) + +# VIP based on user-supplied prediction function +mae <- function(actual, predicted) { + mean(abs(actual - predicted)) +} +set.seed(103) +pfun <- function(object, newdata) predict +vip(rfo1, method = "permute", response_name = "cmedv", metric = mae, + smaller_is_better = TRUE, pred_fun = randomForest:::predict.randomForest) + + ggplot2::ggtitle("Use-supplied metric: MAE") # Classification (binary) ------------------------------------------------- +# Random forest +pima <- na.omit(pdp::pima) +set.seed(201) +rfo2 <- randomForest(diabetes ~ ., data = pima) + +# Prediction wrappers +class_probs <- function(object, newdata) { + predict(object, newdata = newdata, type = "prob") +} +class_labels <- function(object, newdata) { + predict(object, newdata = newdata, type = 
"response") +} + +# VIPs +set.seed(202) +vips <- lapply(c("error", "auc", "logloss"), FUN = function(m) { + vip(rfo2, method = "permute", response_name = "diabetes", metric = m, + reference_class = "neg", + pred_fun = if (m == "error") class_labels else class_probs) + + ggplot2::ggtitle(m) +}) +grid.arrange(grobs = vips, ncol = 3) + # Classification (multiclass) --------------------------------------------- +# Random forest set.seed(301) rfo3 <- randomForest(Species ~ ., data = iris) +# Prediction wrappers class_probs <- function(object, newdata) { predict(object, newdata = newdata, type = "prob") } @@ -23,6 +71,7 @@ class_labels <- function(object, newdata) { predict(object, newdata = newdata, type = "response") } +# VIPs set.seed(302) vips <- lapply(c("error", "mauc", "mlogloss"), FUN = function(m) { vip(rfo3, method = "permute", response_name = "Species", metric = m, diff --git a/vignettes/vip-tensorflow.Rmd b/vignettes/vip-tensorflow.Rmd index b0960c53..88c2a641 100644 --- a/vignettes/vip-tensorflow.Rmd +++ b/vignettes/vip-tensorflow.Rmd @@ -116,13 +116,12 @@ A simple measure of variable importance can be obtained using the permutation ap set.seed(102) # for reproducibility p1 <- vip( object = model, # fitted model - method = "permute", # permutation-based VI scores + method = "permute", # request permutation-based VI scores num_features = ncol(train_x), # default only plots top 10 features - feature_names = colnames(train_x), # feature names in training data pred_fun = pred_wrapper, # user-defined prediction function - obs = train_y, # response values used for training + target = train_y, # target values used for training metric = "rsquared", # evaluation metric - train = as.data.frame(train_x) # training data + train = as.data.frame(train_x), # training data # progress = "text" # request a text-based progress bar ) print(p1) # display plot diff --git a/vignettes/vip.Rmd b/vignettes/vip.Rmd index a28b3b44..970f5750 100644 --- a/vignettes/vip.Rmd +++ 
b/vignettes/vip.Rmd @@ -279,20 +279,15 @@ grid.arrange(p1, p2, ncol = 2) #### Permutation method -The permutation method exists in various forms and was made popular in @random-breiman-2001 for random forests. A more general approach to the permutation method is described in [Assessing Variable Importance for Predictive Models of Arbitrary Type](https://cran.r-project.org/web/packages/datarobot/vignettes/VariableImportance.html), an R package vignette by DataRobot. The permutation approach used in `vip` is quite simple. The idea is that if we randomly permute the values of an important feature in the training data, the training performance would degrade (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable). This of course assumes that the model has been properly tuned (e.g., using cross-validation) and is not [over fitting](https://en.wikipedia.org/wiki/Overfitting). The permutation approach uses the difference between some baseline performance measure (e.g., training $R^2$ or RMSE) and the same performance measure obtained after permuting the values of a particular feature in the training data (**Note**: the model is NOT refit to the training data after randomly permuting the values of a feature). To use the permutation approach, specify `method = "permute"` in the call to `vi()` or `vip()`. **Note**: using `method = "permute"` requires specifying a few additional arguments (some of which will be automated in the future): - -* `obs` a vector containing the response values from the training data; -* `metric` the performance metric to use (e.g., `"rmse"` for regression or `"logloss"` for binary classification); -* `pred_fun` a prediction function with two arguments, `object` and `newdata`, that returns a vector of predicted values or class labels (regression or classification), or a matrix of predicted probabilities (classification). 
The output of `pred_fun` depends upon which performance metric is used (for example, `metric = "mauc"` (for multiclass area under the curve requires that `pred_fun` return a matrix of predicted probabilities---one for each class). -* `pos_class` character string specifying which category in `obs` represents the "positive" class in binary classification problems (i.e., the class for which the predicted class probabilities correspond to) +The permutation method exists in various forms and was made popular in @random-breiman-2001 for random forests. A more general approach to the permutation method is described in [Assessing Variable Importance for Predictive Models of Arbitrary Type](https://cran.r-project.org/web/packages/datarobot/vignettes/VariableImportance.html), an R package vignette by DataRobot. The permutation approach used in `vip` is quite simple. The idea is that if we randomly permute the values of an important feature in the training data, the training performance would degrade (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable). This of course assumes that the model has been properly tuned (e.g., using cross-validation) and is not [over fitting](https://en.wikipedia.org/wiki/Overfitting). The permutation approach uses the difference between some baseline performance measure (e.g., training $R^2$ or RMSE) and the same performance measure obtained after permuting the values of a particular feature in the training data (**Note**: the model is NOT refit to the training data after randomly permuting the values of a feature). To use the permutation approach, specify `method = "permute"` in the call to `vi()` or `vip()`. Note that using `method = "permute"` requires specifying a few additional arguments; see `?vi_permute` for details. An example is given below for the previously fitted PPR and NN models. 
```{r ppr-permute} # Plot VI scores set.seed(2021) # for reproducibility -p1 <- vip(pp, method = "permute", obs = trn$y, metric = "rsquared", +p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared", pred_fun = predict) + ggtitle("PPR") -p2 <- vip(nn, method = "permute", obs = trn$y, metric = "rsquared", +p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared", pred_fun = predict) + ggtitle("NN") grid.arrange(p1, p2, ncol = 2) ``` @@ -300,7 +295,7 @@ grid.arrange(p1, p2, ncol = 2) ### The Pima indians diabetes data -As a final example, we'll consider the well-known Pima Indians diabetes data; see `?pdp::pima` for details. These data contain diabetes test results collected by the the US National Institute of Diabetes and Digestive and Kidney Diseases from a population of women who were at least 21 years old, of Pima Indian heritage, and living near Phoenix, Arizona. The target variable, `diabetes`, is a factor indicating the diabetes test result (`pos`/`neg`). In the code chunk below, we fit a random forest to the Pima Indians data using the fantastic [ranger](https://github.com/imbs-hl/ranger) package. **Note**: we fit two different random forests: `rfo1` and `rfo2`. The only difference is that we would use `rfo1` if we wanted predicted class labels and we would use `rfo2` for predicted class probabilities. The distinction is important when using `method = "permute"` since the performance metric being used requires the predicted outcome to be either the class labels (e.g., `metric = "error"` for classification error) or predicted class labels (e.g., `"auc"` for area under the curve). We'll illustrate both below. We should point out that there is more built-in support for `"ranger"` objects, so it is not necessary to supply `pred_fun` or specify a specific metric (the default is `metric = "auto"`), but for completeness, we explicitly specify all the options. 
+As a final example, we'll consider the well-known Pima Indians diabetes data; see `?pdp::pima` for details. These data contain diabetes test results collected by the US National Institute of Diabetes and Digestive and Kidney Diseases from a population of women who were at least 21 years old, of Pima Indian heritage, and living near Phoenix, Arizona. The target variable, `diabetes`, is a factor indicating the diabetes test result (`pos`/`neg`). In the code chunk below, we fit a random forest to the Pima Indians data using the fantastic [ranger](https://github.com/imbs-hl/ranger) package. Note that we fit two different random forests: `rfo1` and `rfo2`. The only difference is that we would use `rfo1` if we wanted predicted class labels and we would use `rfo2` for predicted class probabilities. The distinction is important when using `method = "permute"` since the performance metric being used requires the predicted outcome to be either the class labels (e.g., `metric = "error"` for classification error) or the predicted class probabilities (e.g., `"auc"` for area under the curve). We'll illustrate both below. We should point out that there is more built-in support for `"ranger"` objects, so it is not necessary to supply `pred_fun` or specify a specific metric (the default is `metric = "auto"`), but for completeness, we explicitly specify all the options. ```{r pima-ranger, fig.width=7, fig.height=5, out.width="100%"} # Load required packages @@ -322,9 +317,9 @@ p2 <- vip(rfo2) # model-specific set.seed(1329) # for reproducibility pfun <- function(object, newdata) predict(object, data = newdata)$predictions p3 <- vip(rfo1, method = "permute", metric = "error", pred_fun = pfun, - obs = pima$diabetes) + target = "diabetes") p4 <- vip(rfo2, method = "permute", metric = "auc", pred_fun = pfun, - obs = pima$diabetes, pos_class = "neg") + target = "diabetes", reference_class = "neg") grid.arrange(p1, p2, p3, p4, ncol = 2) ```