Skip to content

Commit

Permalink
Merge e135077 into 3c25198
Browse files Browse the repository at this point in the history
  • Loading branch information
mnwright committed Aug 29, 2019
2 parents 3c25198 + e135077 commit 970ecc9
Show file tree
Hide file tree
Showing 57 changed files with 1,369 additions and 1,102 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: ranger
Type: Package
Title: A Fast Implementation of Random Forests
Version: 0.11.4
Date: 2019-08-15
Version: 0.11.5
Date: 2019-08-27
Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb]
Maintainer: Marvin N. Wright <cran@wrig.de>
Description: A fast implementation of Random Forests, particularly suited for high
Expand Down
4 changes: 4 additions & 0 deletions NEWS
@@ -1,3 +1,7 @@
##### Version 0.11.5
* Add x/y interface
* Internal changes (seed differences possible, prediction incompatible with older versions)

##### Version 0.11.4
* Add "beta" splitrule for bounded outcomes

Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,3 +1,7 @@
##### Version 0.11.5
* Add x/y interface
* Internal changes (seed differences possible, prediction incompatible with older versions)

##### Version 0.11.4
* Add "beta" splitrule for bounded outcomes

Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
@@ -1,8 +1,8 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rangerCpp <- function(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) {
.Call(`_ranger_rangerCpp`, treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag)
rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) {
.Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag)
}

numSmaller <- function(values, reference) {
Expand Down
139 changes: 43 additions & 96 deletions R/predict.R
Expand Up @@ -81,11 +81,9 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
snp.data <- data@gtdata@gtps@.Data
data <- data@phdata[, -1, drop = FALSE]
gwa.mode <- TRUE
variable.names <- c(names(data), snp.names)
} else {
snp.data <- as.matrix(0)
gwa.mode <- FALSE
variable.names <- colnames(data)
}

## Check forest argument
Expand All @@ -94,21 +92,23 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
} else {
forest <- object
}
if (is.null(forest$dependent.varID) || is.null(forest$num.trees) ||
if (is.null(forest$num.trees) ||
is.null(forest$child.nodeIDs) || is.null(forest$split.varIDs) ||
is.null(forest$split.values) || is.null(forest$independent.variable.names) ||
is.null(forest$treetype)) {
stop("Error: Invalid forest object.")
}
if (forest$treetype == "Survival" && (is.null(forest$status.varID) ||
is.null(forest$chf) || is.null(forest$unique.death.times))) {
if (forest$treetype == "Survival" && (is.null(forest$chf) || is.null(forest$unique.death.times))) {
stop("Error: Invalid forest object.")
}

## Check for old ranger version
if (length(forest$child.nodeIDs) != forest$num.trees || length(forest$child.nodeIDs[[1]]) != 2) {
stop("Error: Invalid forest object. Is the forest grown in ranger version <0.3.9? Try to predict with the same version the forest was grown.")
}
if (!is.null(forest$dependent.varID)) {
stop("Error: Invalid forest object. Is the forest grown in ranger version <0.11.5? Try to predict with the same version the forest was grown.")
}

## Prediction type
if (type == "response" || type == "se") {
Expand Down Expand Up @@ -142,106 +142,53 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
if (type == "se") {
predict.all <- TRUE
}

x <- data

if (sum(!(forest$independent.variable.names %in% colnames(x))) > 0) {
stop("Error: One or more independent variables not found in data.")
}

## Create final data
if (forest$treetype == "Survival") {
if (forest$dependent.varID > 0 && forest$status.varID > 1) {
if (ncol(data) == length(forest$independent.variable.names)+2) {
## If alternative interface used and same data structure, don't subset data
data.used <- data
} else if (ncol(data) == length(forest$independent.variable.names)) {
data.selected <- data[, forest$independent.variable.names, drop = FALSE]
data.used <- cbind(0, 0, data.selected)
variable.names <- c("time", "status", forest$independent.variable.names)
forest$dependent.varID <- 0
forest$status.varID <- 1
} else {
stop("Invalid prediction data. Include both time and status variable or none.")
}
} else {
## If formula interface used, subset data
data.selected <- data[, forest$independent.variable.names, drop = FALSE]

## Arange data as in original data
data.used <- cbind(0, 0, data.selected)
variable.names <- c("time", "status", forest$independent.variable.names)
}

## Index of no-recode variables
idx.norecode <- c(-(forest$dependent.varID+1), -(forest$status.varID+1))

} else {
## No survival
if (ncol(data) == length(forest$independent.variable.names)+1 && forest$dependent.varID > 0) {
## If alternative interface used and same data structure, don't subset data
data.used <- data
} else {
## If formula interface used, subset data
data.selected <- data[, forest$independent.variable.names, drop = FALSE]

## Arange data as in original data
if (forest$dependent.varID == 0) {
data.used <- cbind(0, data.selected)
variable.names <- c("dependent", forest$independent.variable.names)
} else if (forest$dependent.varID >= ncol(data)) {
data.used <- cbind(data.selected, 0)
variable.names <- c(forest$independent.variable.names, "dependent")
} else {
data.used <- cbind(data.selected[, 1:forest$dependent.varID],
0,
data.selected[, (forest$dependent.varID+1):ncol(data.selected)])
variable.names <- c(forest$independent.variable.names[1:forest$dependent.varID],
"dependent",
forest$independent.variable.names[(forest$dependent.varID+1):length(forest$independent.variable.names)])
}
}

## Index of no-recode variables
idx.norecode <- -(forest$dependent.varID+1)
## Subset to same column as in training if necessary
if (length(colnames(x)) != length(forest$independent.variable.names) || any(colnames(x) != forest$independent.variable.names)) {
x <- x[, forest$independent.variable.names, drop = FALSE]
}

## Recode characters
if (!is.matrix(data.used) && !inherits(data.used, "Matrix")) {
char.columns <- sapply(data.used, is.character)
data.used[char.columns] <- lapply(data.used[char.columns], factor)
if (!is.matrix(x) && !inherits(x, "Matrix")) {
char.columns <- sapply(x, is.character)
if (length(char.columns) > 0) {
x[char.columns] <- lapply(x[char.columns], factor)
}
}

## Recode factors if forest grown 'order' mode
if (!is.null(forest$covariate.levels) && !all(sapply(forest$covariate.levels, is.null))) {
data.used[, idx.norecode] <- mapply(function(x, y) {
if(is.null(y)) {
x
x <- mapply(function(xx, yy) {
if(is.null(yy)) {
xx
} else {
new.levels <- setdiff(levels(x), y)
factor(x, levels = c(y, new.levels), exclude = NULL)
new.levels <- setdiff(levels(xx), yy)
factor(xx, levels = c(yy, new.levels), exclude = NULL)
}
}, data.used[, idx.norecode], forest$covariate.levels, SIMPLIFY = !is.data.frame(data.used[, idx.norecode]))
}, x, forest$covariate.levels, SIMPLIFY = !is.data.frame(x))
}

## Convert to data matrix
if (is.matrix(data.used) || inherits(data.used, "Matrix")) {
data.final <- data.used
} else {
data.final <- data.matrix(data.used)
if (is.list(x) && !is.data.frame(x)) {
x <- as.data.frame(x)
}


## If gwa mode, add snp variable names
if (gwa.mode) {
variable.names <- c(variable.names, snp.names)
## Convert to data matrix
if (!is.matrix(x) & !inherits(x, "Matrix")) {
x <- data.matrix(x)
}

## Check missing values
if (any(is.na(data.final))) {
offending_columns <- colnames(data.final)[colSums(is.na(data.final)) > 0]
if (any(is.na(x))) {
offending_columns <- colnames(x)[colSums(is.na(x)) > 0]
stop("Missing data in columns: ",
paste0(offending_columns, collapse = ", "), ".", call. = FALSE)
}

if (sum(!(forest$independent.variable.names %in% variable.names)) > 0) {
stop("Error: One or more independent variables not found in data.")
}

## Num threads
## Default 0 -> detect from system in C++.
if (is.null(num.threads)) {
Expand All @@ -268,15 +215,13 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
}

## Defaults for variables not needed
dependent.variable.name <- ""
mtry <- 0
importance <- 0
min.node.size <- 0
split.select.weights <- list(c(0, 0))
use.split.select.weights <- FALSE
always.split.variables <- c("0", "0")
use.always.split.variables <- FALSE
status.variable.name <- "status"
prediction.mode <- TRUE
write.forest <- FALSE
replace <- TRUE
Expand All @@ -299,35 +244,37 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE,
max.depth <- 0
inbag <- list(c(0,0))
use.inbag <- FALSE
y <- matrix(c(0, 0))

## Use sparse matrix
if ("dgCMatrix" %in% class(data.final)) {
sparse.data <- data.final
data.final <- matrix(c(0, 0))
if ("dgCMatrix" %in% class(x)) {
sparse.x <- x
x <- matrix(c(0, 0))
use.sparse.data <- TRUE
} else {
sparse.data <- Matrix(matrix(c(0, 0)))
sparse.x <- Matrix(matrix(c(0, 0)))
use.sparse.data <- FALSE
x <- data.matrix(x)
}

## Call Ranger
result <- rangerCpp(treetype, dependent.variable.name, data.final, variable.names, mtry,
result <- rangerCpp(treetype, x, y, forest$independent.variable.names, mtry,
num.trees, verbose, seed, num.threads, write.forest, importance,
min.node.size, split.select.weights, use.split.select.weights,
always.split.variables, use.always.split.variables,
status.variable.name, prediction.mode, forest, snp.data, replace, probability,
prediction.mode, forest, snp.data, replace, probability,
unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule,
case.weights, use.case.weights, class.weights,
predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout,
prediction.type, num.random.splits, sparse.data, use.sparse.data,
prediction.type, num.random.splits, sparse.x, use.sparse.data,
order.snps, oob.error, max.depth, inbag, use.inbag)

if (length(result) == 0) {
stop("User interrupt or internal error.")
}

## Prepare results
result$num.samples <- nrow(data.final)
result$num.samples <- nrow(x)
result$treetype <- forest$treetype

if (predict.all) {
Expand Down

0 comments on commit 970ecc9

Please sign in to comment.