Merge pull request #5 from animusnaturae/merge_imbs

Merge new ranger version
imbs-hl · May 26, 2016 · d0e82e0 · d0e82e0
2 parents d09c5d9 + 736c973
commit d0e82e0
Show file tree

Hide file tree

Showing 44 changed files with 2,070 additions and 232 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,25 @@
+##### Version 0.4.4
+* Add p-values for variable importance
+* Bug fixes
+
+##### Version 0.4.3
+* Add splitting by maximally selected rank statistics for survival forests
+* Bug fixes
+
+##### Version 0.4.2
+* Add Windows multithreading support for new toolchain
+
+##### Version 0.4.1
+* Runtime improvement for regression forests on classification data
+
+##### Version 0.4.0
+* New CRAN version. New CRAN versions will be 0.x.0, development versions 0.x.y
+
 ##### Version 0.3.9
 * Reduce memory usage of saved forest objects (changed child.nodeIDs interface)
 
 ##### Version 0.3.8
-* Remove tuning functions, please use mlr or caret.
+* Remove tuning functions, please use mlr or caret
 
 ##### Version 0.3.7
 * Fix bug with alternative interface and prediction

diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 [![Build Status](https://travis-ci.org/imbs-hl/ranger.svg?branch=master)](https://travis-ci.org/imbs-hl/ranger)
-[![Coverage Status](https://coveralls.io/repos/github/imbs-hl/ranger/badge.svg?branch=coveralls_new)](https://coveralls.io/github/imbs-hl/ranger?branch=coveralls_new)
+[![Coverage Status](https://coveralls.io/repos/github/imbs-hl/ranger/badge.svg?branch=master)](https://coveralls.io/github/imbs-hl/ranger?branch=master)
 ![CRAN Downloads month](http://cranlogs.r-pkg.org/badges/ranger?color=brightgreen)
 ![CRAN Downloads overall](http://cranlogs.r-pkg.org/badges/grand-total/ranger?color=brightgreen)
 ## ranger: A Fast Implementation of Random Forests
@@ -17,7 +17,7 @@ To install the Ranger R package from CRAN, just run
 install.packages("ranger")
 ```
 
-R version >= 3.1 is required. Note that, for now, no multithreading is supported in the R version on Windows platforms (the compiler in RTools is too old).
+R version >= 3.1 is required. Note that, for now, R-devel and the new RTools toolchain are required for multithreading on Windows platforms (or install a binary version).
 
 To install the C++ version of Ranger in Linux or Mac OS X you will need a compiler supporting C++11 (i.e. gcc >= 4.7 or Clang >= 3.0) and CMake. To build, start a terminal from the Ranger main directory and run the following commands:
 
@@ -55,6 +55,9 @@ ranger --verbose --file data.dat --depvarname Species --treetype 1 --ntree 1000
 If you find any bugs, or if you experience any crashes, please report to us. If you have any questions just ask, we won't bite. 
 
 ### References
+* Wright, M. N. & Ziegler, A. (2016). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. Journal of Statistical Software, in press. http://arxiv.org/abs/1508.04409.
+* Schmid, M., Wright, M. N. & Ziegler, A. (2015). On the Use of Harrell's C for Node Splitting in Random Survival Forests. Technical Report. http://arxiv.org/abs/1507.03092.
+* Wright, M. N., Dankowski, T. & Ziegler, A. (2016). Random forests for survival analysis using maximally selected rank statistics. Technical Report. http://arxiv.org/abs/1605.03391.
 * Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.
 * Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008). Random survival forests. The Annals of Applied Statistics, 841-860.
 * Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods Inf Med, 51(1), 74.
diff --git a/ranger-r-package/ranger/DESCRIPTION b/ranger-r-package/ranger/DESCRIPTION
@@ -1,17 +1,19 @@
 Package: ranger
 Type: Package
 Title: A Fast Implementation of Random Forests
-Version: 0.3.9
-Date: 2016-03-22
+Version: 0.4.4
+Date: 2016-05-24
 Author: Marvin N. Wright
 Maintainer: Marvin N. Wright <wright@imbs.uni-luebeck.de>
-Description: A fast implementation of Random Forests, particularly suited for high dimensional data. Ensembles of
-             classification, regression, survival and probability prediction trees are supported. Data from
-             genome-wide association studies can be analyzed efficiently. In addition to data frames, datasets of
-             class 'gwaa.data' (R package GenABEL) can be directly analyzed.
+Description: A fast implementation of Random Forests, particularly suited for high dimensional data. Ensembles
+            of classification, regression, survival and probability prediction trees are supported. Data from
+            genome-wide association studies can be analyzed efficiently. In addition to data frames, datasets
+            of class 'gwaa.data' (R package 'GenABEL') can be directly analyzed.
 License: GPL-3
 Imports: Rcpp (>= 0.11.2)
 LinkingTo: Rcpp
 Depends: R (>= 3.1)
-Suggests: survival, testthat
+Suggests: survival, testthat, GenABEL
 RoxygenNote: 5.0.1
+URL: https://github.com/imbs-hl/ranger
+BugReports: https://github.com/imbs-hl/ranger/issues
diff --git a/ranger-r-package/ranger/NAMESPACE b/ranger-r-package/ranger/NAMESPACE
@@ -12,7 +12,9 @@ S3method(timepoints,ranger)
 S3method(timepoints,ranger.prediction)
 export(csrf)
 export(getTerminalNodeIDs)
+export(holdoutRF)
 export(importance)
+export(importance_pvalues)
 export(predictions)
 export(ranger)
 export(timepoints)

diff --git a/ranger-r-package/ranger/NEWS b/ranger-r-package/ranger/NEWS
@@ -1,32 +1,28 @@
-##### Version 0.3.9
-* Reduce memory usage of savest forest objects (changed child.nodeIDs interface)
+##### Version 0.4.4
+* Add p-values for variable importance
+* Bug fixes
+
+##### Version 0.4.3
+* Add splitting by maximally selected rank statistics for survival forests
+* Bug fixes
 
-##### Version 0.3.8
-* Remove tuning functions, please use mlr or caret.
+##### Version 0.4.2
+* Add Windows multithreading support for new toolchain
 
-##### Version 0.3.7
-* Fix bug with alternative interface and prediction
-* Small fixes
+##### Version 0.4.1
+* Runtime improvement for regression forests on classification data
 
-##### Version 0.3.6
+##### Version 0.4.0
+* Reduce memory usage of saved forest objects (changed child.nodeIDs interface)
 * Add keep.inbag option to track in-bag counts
 * Add option sample.fraction for fraction of sampled observations
-
-##### Version 0.3.5
 * Add tree-wise split.select.weights
-
-##### Version 0.3.4
 * Add predict.all option in predict() to get individual predictions for each tree for classification and regression
-* Small changes in documentation
-
-##### Version 0.3.3
 * Add case-specific random forests
-
-##### Version 0.3.2
 * Add case weights (weighted bootstrapping or subsampling)
-
-##### Version 0.3.1
+* Remove tuning functions, please use mlr or caret
 * Catch error of outdated gcc not supporting C++11 completely
+* Bug fixes
 
 ##### Version 0.3.0
 * Allow the user to interrupt computation from R

diff --git a/ranger-r-package/ranger/R/RcppExports.R b/ranger-r-package/ranger/R/RcppExports.R
@@ -1,7 +1,7 @@
 # This file was generated by Rcpp::compileAttributes
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-rangerCpp <- function(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, sparse_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, predict_all, keep_inbag, sample_fraction) {
-    .Call('ranger_rangerCpp', PACKAGE = 'ranger', treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, sparse_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, predict_all, keep_inbag, sample_fraction)
+rangerCpp <- function(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, sparse_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout) {
+    .Call('ranger_rangerCpp', PACKAGE = 'ranger', treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, sparse_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout)
 }
 
diff --git a/ranger-r-package/ranger/R/getTerminalNodeIDs.R b/ranger-r-package/ranger/R/getTerminalNodeIDs.R
@@ -36,7 +36,7 @@
 ##'
 ##' @examples
 ##' library(ranger)
-##' rf <- ranger(Species ~ ., data = iris, write.forest = TRUE)
+##' rf <- ranger(Species ~ ., data = iris, num.trees = 5, write.forest = TRUE)
 ##' getTerminalNodeIDs(rf, iris)
 ##' @export
 getTerminalNodeIDs <- function(rf, dat) {

diff --git a/ranger-r-package/ranger/R/holdoutRF.R b/ranger-r-package/ranger/R/holdoutRF.R
@@ -0,0 +1,68 @@
+# -------------------------------------------------------------------------------
+#   This file is part of Ranger.
+#
+# Ranger is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Ranger is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Ranger. If not, see <http://www.gnu.org/licenses/>.
+#
+# Written by:
+#
+#   Marvin N. Wright
+# Institut fuer Medizinische Biometrie und Statistik
+# Universitaet zu Luebeck
+# Ratzeburger Allee 160
+# 23562 Luebeck
+# Germany
+#
+# http://www.imbs-luebeck.de
+# wright@imbs.uni-luebeck.de
+# -------------------------------------------------------------------------------
+
+##' Grow two random forests on two cross-validation folds. 
+##' Instead of out-of-bag data, the other fold is used to compute permutation importance.
+##' Related to the novel permutation variable importance by Janitza et al. (2015).
+##'
+##' @title Hold-out random forests
+##' @param formula Object of class \code{formula} or \code{character} describing the model to fit.
+##' @param data Training data of class \code{data.frame}, \code{matrix} or \code{gwaa.data} (GenABEL).
+##' @param ... Further arguments passed to ranger(). The arguments \code{importance}, \code{case.weights}, \code{replace} and \code{holdout} are set by this function and must not be supplied.
+##' @return Object of class \code{holdoutRF}: a list with the two forests \code{rf1} and \code{rf2}, the averaged \code{variable.importance} of both forests, and the \code{treetype} and \code{importance.mode} taken from \code{rf1}.
+##' @seealso \code{\link{ranger}}
+##' @author Marvin N. Wright
+##' @references
+##'   Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forest for high dimensional data, Technical Report 185, University of Munich, \url{https://epub.ub.uni-muenchen.de/25587}. \cr
+##' @export 
+holdoutRF <- function(formula, data, ...) {
+  ## These arguments are fixed below; supplying them through '...' as well
+  ## would fail inside ranger() with an unhelpful "matched by multiple actual
+  ## arguments" error, so reject them here with a clear message.
+  reserved <- c("importance", "case.weights", "replace", "holdout")
+  clash <- intersect(names(list(...)), reserved)
+  if (length(clash) > 0) {
+    stop("Argument(s) not supported in holdoutRF: ", paste(clash, collapse = ", "), ".")
+  }
+
+  ## Number of observations (gwaa.data objects keep them in the phdata slot)
+  if (inherits(data, "gwaa.data")) {
+    n <- nrow(data@phdata)
+  } else {
+    n <- nrow(data)
+  }
+
+  ## Split data: every observation is assigned to fold 1 (weight 1) or
+  ## fold 2 (weight 0) with probability 0.5 each
+  weights <- rbinom(n, 1, 0.5)
+
+  ## Grow RFs, each one using the complementary 0/1 weights of the other
+  res <- list(
+    rf1 = ranger(formula = formula, data = data, importance = "permutation",
+                 case.weights = weights, replace = FALSE, holdout = TRUE, ...),
+    rf2 = ranger(formula = formula, data = data, importance = "permutation",
+                 case.weights = 1 - weights, replace = FALSE, holdout = TRUE, ...)
+  )
+
+  ## Compute importance as the mean of both forests
+  res$variable.importance <- (res$rf1$variable.importance + res$rf2$variable.importance) / 2
+  res$treetype <- res$rf1$treetype
+  res$importance.mode <- res$rf1$importance.mode
+  class(res) <- "holdoutRF"
+
+  res
+}
diff --git a/ranger-r-package/ranger/R/importance.R b/ranger-r-package/ranger/R/importance.R
@@ -30,11 +30,11 @@
 ##' @export
 importance <- function(x, ...)  UseMethod("importance")
 
-##' Extract variable importance of Ranger object.
+##' Extract variable importance of ranger object.
 ##'
 ##'
-##' @title Ranger variable importance
-##' @param x Ranger object.
+##' @title ranger variable importance
+##' @param x ranger object.
 ##' @param ... Further arguments passed to or from other methods.
 ##' @return Variable importance measures.
 ##' @seealso \code{\link{ranger}}
@@ -50,3 +50,86 @@ importance.ranger <- function(x, ...) {
   }
   return(x$variable.importance)
 }
+
+##' Compute variable importance with confidence intervals and p-values.
+##'
+##'
+##' @title ranger variable importance confidence intervals and p-values
+##' @param x ranger or holdoutRF object.
+##' @param method Method to compute p-values. Use "janitza" for the method by Janitza et al. (2015) or "altmann" for the non-parametric method by Altmann et al. (2010). If not specified, "janitza" is used.
+##' @param conf.level Confidence level for confidence intervals.
+##' @param num.permutations Number of permutations. Used in the "altmann" method only.
+##' @param formula Object of class formula or character describing the model to fit. Used in the "altmann" method only.
+##' @param data Training data of class data.frame or matrix. Used in the "altmann" method only.
+##' @param ... Further arguments passed to ranger(). Used in the "altmann" method only. The arguments \code{num.trees}, \code{mtry}, \code{min.node.size} and \code{importance} are taken from \code{x} and must not be supplied.
+##' @return Matrix with one row per variable and the columns \code{importance}, \code{CI_lower}, \code{CI_upper} and \code{pvalue}.
+##' @seealso \code{\link{ranger}}
+##' @author Marvin N. Wright
+##' @references
+##'   Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forest for high dimensional data, Technical Report 185, University of Munich, \url{https://epub.ub.uni-muenchen.de/25587}. \cr
+##'   Altmann, A., Tolosi, L., Sander, O. & Lengauer, T. (2010). Permutation importance: a corrected feature importance measure, Bioinformatics 26(10):1340-1347.
+##' @export 
+importance_pvalues <- function(x, method = c("janitza", "altmann"), conf.level = 0.95, num.permutations = 100, formula = NULL, data = NULL, ...) {
+  if (!inherits(x, c("ranger", "holdoutRF"))) {
+    stop("Object is no ranger or holdoutRF object.")
+  }
+  if (length(x$variable.importance) < 1 || identical(x$importance.mode, "none")) {
+    stop("No variable importance found. Please use 'importance' option when growing the forest.")
+  }
+
+  ## The default of 'method' is the full vector of choices. Reduce it to a
+  ## single string before using it in 'if' conditions (a condition of length
+  ## > 1 is an error in current R versions).
+  method <- method[1]
+  if (!is.character(method) || is.na(method) || !(method %in% c("janitza", "altmann"))) {
+    stop("Unknown p-value method. Available methods are: 'janitza' and 'altmann'.")
+  }
+
+  if (method == "janitza") {
+    if (identical(x$importance.mode, "impurity")) {
+      stop("Impurity variable importance found. Please use (hold-out) permutation importance to use this method.")
+    }
+    if (!inherits(x, "holdoutRF") && identical(x$importance.mode, "permutation")) {
+      warning("Permutation variable importance found, inaccurate p-values. Please use hold-out permutation importance to use this method.")
+    }
+    if (!identical(x$treetype, "Classification")) {
+      warning("This method is tested for classification only, use with care.")
+    }
+
+    ## Null distribution: negative importance values, their mirror image on
+    ## the positive side, and the zero importance values
+    m1 <- x$variable.importance[x$variable.importance < 0]
+    m2 <- x$variable.importance[x$variable.importance == 0]
+
+    ## TODO: 100 ok? increase? 
+    if (length(m1) == 0) {
+      stop("No negative importance values found. Consider the 'altmann' approach.")
+    }
+    if (length(m1) < 100) {
+      warning("Only few negative importance values found, inaccurate p-values. Consider the 'altmann' approach.")
+    }
+    vimp <- c(m1, -m1, m2)
+  } else {
+    ## method == "altmann"
+    if (!inherits(x, "ranger")) {
+      stop("Altmann method not available for holdoutRF objects.")
+    }
+    if (is.null(formula) || is.null(data)) {
+      stop("Formula and data required for the 'altmann' method.")
+    }
+    ## all.vars() finds no variables in a plain character string
+    if (is.character(formula)) {
+      formula <- as.formula(formula)
+    }
+
+    ## Permute and compute importance again
+    ## NOTE(review): only the first variable of the formula is permuted; for a
+    ## two-variable response such as Surv(time, status) this permutes 'time'
+    ## only -- confirm whether that is intended.
+    dependent.variable.name <- all.vars(formula)[1]
+    ## sapply() with an explicit function is used instead of replicate():
+    ## replicate() wraps its expression in function(...), which would hide the
+    ## '...' of this function.
+    vimp <- sapply(seq_len(num.permutations), function(i) {
+      dat <- data
+      dat[, dependent.variable.name] <- sample(dat[, dependent.variable.name])
+      ranger(formula, dat, num.trees = x$num.trees, mtry = x$mtry, min.node.size = x$min.node.size,
+             importance = x$importance.mode, ...)$variable.importance
+    })
+  }
+
+  ## Compute p-value: share of the null distribution above the observed value
+  pval <- 1 - ecdf(vimp)(x$variable.importance)
+
+  ## Compute CI (normal approximation, standard deviation of the null values)
+  width <- qnorm((1 + conf.level) / 2) * sd(vimp)
+  ci <- cbind(x$variable.importance - width,
+              x$variable.importance + width)
+
+  ## Return VIMP and p-values
+  res <- cbind(x$variable.importance, ci, pval)
+  colnames(res) <- c("importance", "CI_lower", "CI_upper", "pvalue")
+  res
+}
diff --git a/ranger-r-package/ranger/R/predict.R b/ranger-r-package/ranger/R/predict.R
@@ -194,10 +194,13 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
   use.unordered.factor.variables <- FALSE
   save.memory <- FALSE
   splitrule <- 1
+  alpha <- 0
+  minprop <- 0
   case.weights <- c(0, 0)
   use.case.weights <- FALSE
   keep.inbag <- FALSE
   sample.fraction <- 1
+  holdout <- FALSE
 
   ## Call Ranger
   result <- rangerCpp(treetype, dependent.variable.name, data.final, variable.names, mtry,
@@ -206,7 +209,8 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
                       always.split.variables, use.always.split.variables,
                       status.variable.name, prediction.mode, forest, sparse.data, replace, probability,
                       unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule, 
-                      case.weights, use.case.weights, predict.all, keep.inbag, sample.fraction)
+                      case.weights, use.case.weights, predict.all, keep.inbag, sample.fraction, 
+                      alpha, minprop, holdout)
 
   if (length(result) == 0) {
     stop("User interrupt or internal error.")

diff --git a/ranger-r-package/ranger/R/print.R b/ranger-r-package/ranger/R/print.R
@@ -47,6 +47,7 @@ print.ranger <- function(x, ...) {
   cat("Target node size:                ", x$min.node.size, "\n")
   cat("Variable importance mode:        ", x$importance.mode, "\n")
   if (x$treetype == "Survival") {
+    cat("Splitrule:                       ", x$splitrule, "\n")
     cat("Number of unique death times:    ", length(x$unique.death.times), "\n")
   }
   if (x$treetype == "Classification") {