Merge e135077 into 3c25198

imbs-hl · Aug 29, 2019 · 970ecc9 · 970ecc9
2 parents 3c25198 + e135077
commit 970ecc9
Show file tree

Hide file tree

Showing 57 changed files with 1,369 additions and 1,102 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: ranger
 Type: Package
 Title: A Fast Implementation of Random Forests
-Version: 0.11.4
-Date: 2019-08-15
+Version: 0.11.5
+Date: 2019-08-27
 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb]
 Maintainer: Marvin N. Wright <cran@wrig.de>
 Description: A fast implementation of Random Forests, particularly suited for high

diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+##### Version 0.11.5
+* Add x/y interface
+* Internal changes (seed differences possible, prediction incompatible with older versions)
+
 ##### Version 0.11.4
 * Add "beta" splitrule for bounded outcomes
 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+##### Version 0.11.5
+* Add x/y interface
+* Internal changes (seed differences possible, prediction incompatible with older versions)
+
 ##### Version 0.11.4
 * Add "beta" splitrule for bounded outcomes
 

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,8 +1,8 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-rangerCpp <- function(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) {
-    .Call(`_ranger_rangerCpp`, treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag)
+rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) {
+    .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag)
 }
 
 numSmaller <- function(values, reference) {

diff --git a/R/predict.R b/R/predict.R
@@ -81,11 +81,9 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
     snp.data <- data@gtdata@gtps@.Data
     data <- data@phdata[, -1, drop = FALSE]
     gwa.mode <- TRUE
-    variable.names <- c(names(data), snp.names)
   } else {
     snp.data <- as.matrix(0)
     gwa.mode <- FALSE
-    variable.names <- colnames(data)
   }
 
   ## Check forest argument
@@ -94,21 +92,23 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
   } else {
     forest <- object
   }
-  if (is.null(forest$dependent.varID) || is.null(forest$num.trees) ||
+  if (is.null(forest$num.trees) ||
         is.null(forest$child.nodeIDs) || is.null(forest$split.varIDs) ||
         is.null(forest$split.values) || is.null(forest$independent.variable.names) ||
         is.null(forest$treetype)) {
     stop("Error: Invalid forest object.")
   }
-  if (forest$treetype == "Survival" && (is.null(forest$status.varID)  ||
-                                        is.null(forest$chf) || is.null(forest$unique.death.times))) {
+  if (forest$treetype == "Survival" && (is.null(forest$chf) || is.null(forest$unique.death.times))) {
     stop("Error: Invalid forest object.")
   }
 
   ## Check for old ranger version
   if (length(forest$child.nodeIDs) != forest$num.trees || length(forest$child.nodeIDs[[1]]) != 2) {
     stop("Error: Invalid forest object. Is the forest grown in ranger version <0.3.9? Try to predict with the same version the forest was grown.")
   }
+  if (!is.null(forest$dependent.varID)) {
+    stop("Error: Invalid forest object. Is the forest grown in ranger version <0.11.5? Try to predict with the same version the forest was grown.")
+  }
 
   ## Prediction type
   if (type == "response" || type == "se") {
@@ -142,106 +142,53 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
   if (type == "se") {
     predict.all <- TRUE
   }
+
+  x <- data
+
+  if (sum(!(forest$independent.variable.names %in% colnames(x))) > 0) {
+    stop("Error: One or more independent variables not found in data.")
+  }
 
-  ## Create final data
-  if (forest$treetype == "Survival") {
-    if (forest$dependent.varID > 0 && forest$status.varID > 1) {
-      if (ncol(data) == length(forest$independent.variable.names)+2) {
-        ## If alternative interface used and same data structure, don't subset data
-        data.used <- data
-      } else if (ncol(data) == length(forest$independent.variable.names)) {
-        data.selected <- data[, forest$independent.variable.names, drop = FALSE]
-        data.used <- cbind(0, 0, data.selected)
-        variable.names <- c("time", "status", forest$independent.variable.names)
-        forest$dependent.varID <- 0
-        forest$status.varID <- 1
-      } else {
-        stop("Invalid prediction data. Include both time and status variable or none.")
-      }
-    } else {
-      ## If formula interface used, subset data
-      data.selected <- data[, forest$independent.variable.names, drop = FALSE]
-
-      ## Arange data as in original data
-      data.used <- cbind(0, 0, data.selected)
-      variable.names <- c("time", "status", forest$independent.variable.names)
-    }
-
-  ## Index of no-recode variables
-  idx.norecode <- c(-(forest$dependent.varID+1), -(forest$status.varID+1))
-
-  } else {
-    ## No survival
-    if (ncol(data) == length(forest$independent.variable.names)+1 && forest$dependent.varID > 0) {
-      ## If alternative interface used and same data structure, don't subset data
-      data.used <- data
-    } else {
-      ## If formula interface used, subset data
-      data.selected <- data[, forest$independent.variable.names, drop = FALSE]
-
-      ## Arange data as in original data
-      if (forest$dependent.varID == 0) {
-        data.used <- cbind(0, data.selected)
-        variable.names <- c("dependent", forest$independent.variable.names)
-      } else if (forest$dependent.varID >= ncol(data)) {
-        data.used <- cbind(data.selected, 0)
-        variable.names <- c(forest$independent.variable.names, "dependent")
-      } else {
-        data.used <- cbind(data.selected[, 1:forest$dependent.varID],
-                           0,
-                           data.selected[, (forest$dependent.varID+1):ncol(data.selected)])
-        variable.names <- c(forest$independent.variable.names[1:forest$dependent.varID],
-                            "dependent",
-                            forest$independent.variable.names[(forest$dependent.varID+1):length(forest$independent.variable.names)])
-      }
-    }
-
-    ## Index of no-recode variables
-    idx.norecode <- -(forest$dependent.varID+1)
+  ## Subset to same column as in training if necessary
+  if (length(colnames(x)) != length(forest$independent.variable.names) || any(colnames(x) != forest$independent.variable.names)) {
+    x <- x[, forest$independent.variable.names, drop = FALSE]
   }
 
   ## Recode characters
-  if (!is.matrix(data.used) && !inherits(data.used, "Matrix")) {
-    char.columns <- sapply(data.used, is.character)
-    data.used[char.columns] <- lapply(data.used[char.columns], factor)
+  if (!is.matrix(x) && !inherits(x, "Matrix")) {
+    char.columns <- sapply(x, is.character)
+    if (length(char.columns) > 0) {
+      x[char.columns] <- lapply(x[char.columns], factor)
+    }
   }
-
+  
   ## Recode factors if forest grown 'order' mode
   if (!is.null(forest$covariate.levels) && !all(sapply(forest$covariate.levels, is.null))) {
-    data.used[, idx.norecode] <- mapply(function(x, y) {
-      if(is.null(y)) {
-        x
+    x <- mapply(function(xx, yy) {
+      if(is.null(yy)) {
+        xx
       } else {
-        new.levels <- setdiff(levels(x), y)
-        factor(x, levels = c(y, new.levels), exclude = NULL)
+        new.levels <- setdiff(levels(xx), yy)
+        factor(xx, levels = c(yy, new.levels), exclude = NULL)
       }
-    }, data.used[, idx.norecode], forest$covariate.levels, SIMPLIFY = !is.data.frame(data.used[, idx.norecode]))
+    }, x, forest$covariate.levels, SIMPLIFY = !is.data.frame(x))
   }
-
-  ## Convert to data matrix
-  if (is.matrix(data.used) || inherits(data.used, "Matrix")) {
-    data.final <- data.used
-  } else {
-    data.final <- data.matrix(data.used)
+  if (is.list(x) && !is.data.frame(x)) {
+    x <- as.data.frame(x)
   }
-
 
-  ## If gwa mode, add snp variable names
-  if (gwa.mode) {
-    variable.names <- c(variable.names, snp.names)
+  ## Convert to data matrix
+  if (!is.matrix(x) & !inherits(x, "Matrix")) {
+    x <- data.matrix(x)
   }
 
   ## Check missing values
-  if (any(is.na(data.final))) {
-    offending_columns <- colnames(data.final)[colSums(is.na(data.final)) > 0]
+  if (any(is.na(x))) {
+    offending_columns <- colnames(x)[colSums(is.na(x)) > 0]
     stop("Missing data in columns: ",
          paste0(offending_columns, collapse = ", "), ".", call. = FALSE)
   }
 
-  if (sum(!(forest$independent.variable.names %in% variable.names)) > 0) {
-    stop("Error: One or more independent variables not found in data.")
-  }
-
   ## Num threads
   ## Default 0 -> detect from system in C++.
   if (is.null(num.threads)) {
@@ -268,15 +215,13 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
   }
 
   ## Defaults for variables not needed
-  dependent.variable.name <- ""
   mtry <- 0
   importance <- 0
   min.node.size <- 0
   split.select.weights <- list(c(0, 0))
   use.split.select.weights <- FALSE
   always.split.variables <- c("0", "0")
   use.always.split.variables <- FALSE
-  status.variable.name <- "status"
   prediction.mode <- TRUE
   write.forest <- FALSE
   replace <- TRUE
@@ -299,35 +244,37 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
   max.depth <- 0
   inbag <- list(c(0,0))
   use.inbag <- FALSE
+  y <- matrix(c(0, 0))
 
   ## Use sparse matrix
-  if ("dgCMatrix" %in% class(data.final)) {
-    sparse.data <- data.final
-    data.final <- matrix(c(0, 0))
+  if ("dgCMatrix" %in% class(x)) {
+    sparse.x <- x
+    x <- matrix(c(0, 0))
     use.sparse.data <- TRUE
   } else {
-    sparse.data <- Matrix(matrix(c(0, 0)))
+    sparse.x <- Matrix(matrix(c(0, 0)))
     use.sparse.data <- FALSE
+    x <- data.matrix(x)
   }
 
   ## Call Ranger
-  result <- rangerCpp(treetype, dependent.variable.name, data.final, variable.names, mtry,
+  result <- rangerCpp(treetype, x, y, forest$independent.variable.names, mtry,
                       num.trees, verbose, seed, num.threads, write.forest, importance,
                       min.node.size, split.select.weights, use.split.select.weights,
                       always.split.variables, use.always.split.variables,
-                      status.variable.name, prediction.mode, forest, snp.data, replace, probability,
+                      prediction.mode, forest, snp.data, replace, probability,
                       unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule,
                       case.weights, use.case.weights, class.weights, 
                       predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, 
-                      prediction.type, num.random.splits, sparse.data, use.sparse.data,
+                      prediction.type, num.random.splits, sparse.x, use.sparse.data,
                       order.snps, oob.error, max.depth, inbag, use.inbag)
 
   if (length(result) == 0) {
     stop("User interrupt or internal error.")
   }
 
   ## Prepare results
-  result$num.samples <- nrow(data.final)
+  result$num.samples <- nrow(x)
   result$treetype <- forest$treetype
 
   if (predict.all) {