diff --git a/DESCRIPTION b/DESCRIPTION index a610da806..36e1a101a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ranger Type: Package Title: A Fast Implementation of Random Forests -Version: 0.11.4 +Version: 0.11.5 Date: 2019-09-25 Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb] Maintainer: Marvin N. Wright diff --git a/NEWS b/NEWS index a46075005..3f2d244c4 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +##### Version 0.11.5 +* Add x/y interface +* Internal changes (seed differences possible, prediction incompatible with older versions) + ##### Version 0.11.4 * Add "beta" splitrule for bounded outcomes diff --git a/NEWS.md b/NEWS.md index ee7b103df..b2d62e431 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +##### Version 0.11.5 +* Add x/y interface +* Internal changes (seed differences possible, prediction incompatible with older versions) + ##### Version 0.11.4 * Add "beta" splitrule for bounded outcomes diff --git a/R/RcppExports.R b/R/RcppExports.R index 4312eaaef..9d4618600 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -rangerCpp <- function(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) { - .Call(`_ranger_rangerCpp`, treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) +rangerCpp <- function(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) { + .Call(`_ranger_rangerCpp`, treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) } numSmaller <- function(values, reference) { diff --git a/R/predict.R b/R/predict.R index 90e1582bd..8b423eb98 100644 --- a/R/predict.R +++ b/R/predict.R @@ -81,11 +81,9 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, snp.data <- data@gtdata@gtps@.Data data <- data@phdata[, -1, drop = FALSE] gwa.mode <- TRUE - variable.names <- c(names(data), snp.names) } else { snp.data <- as.matrix(0) gwa.mode <- FALSE - variable.names <- colnames(data) } ## Check forest argument @@ -94,14 +92,13 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, } else { forest <- object } - if (is.null(forest$dependent.varID) || is.null(forest$num.trees) || + if (is.null(forest$num.trees) || is.null(forest$child.nodeIDs) || is.null(forest$split.varIDs) || is.null(forest$split.values) || is.null(forest$independent.variable.names) || is.null(forest$treetype)) { stop("Error: Invalid forest object.") } - if (forest$treetype == "Survival" && (is.null(forest$status.varID) || - is.null(forest$chf) || is.null(forest$unique.death.times))) { + if (forest$treetype == "Survival" && (is.null(forest$chf) || is.null(forest$unique.death.times))) { stop("Error: Invalid forest object.") } @@ -109,6 +106,9 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, if (length(forest$child.nodeIDs) != forest$num.trees || length(forest$child.nodeIDs[[1]]) != 2) { stop("Error: Invalid forest object. Is the forest grown in ranger version <0.3.9? Try to predict with the same version the forest was grown.") } + if (!is.null(forest$dependent.varID)) { + stop("Error: Invalid forest object. Is the forest grown in ranger version <0.11.5? Try to predict with the same version the forest was grown.") + } ## Prediction type if (type == "response" || type == "se") { @@ -142,106 +142,53 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, if (type == "se") { predict.all <- TRUE } + + x <- data + + if (sum(!(forest$independent.variable.names %in% colnames(x))) > 0) { + stop("Error: One or more independent variables not found in data.") + } - ## Create final data - if (forest$treetype == "Survival") { - if (forest$dependent.varID > 0 && forest$status.varID > 1) { - if (ncol(data) == length(forest$independent.variable.names)+2) { - ## If alternative interface used and same data structure, don't subset data - data.used <- data - } else if (ncol(data) == length(forest$independent.variable.names)) { - data.selected <- data[, forest$independent.variable.names, drop = FALSE] - data.used <- cbind(0, 0, data.selected) - variable.names <- c("time", "status", forest$independent.variable.names) - forest$dependent.varID <- 0 - forest$status.varID <- 1 - } else { - stop("Invalid prediction data. Include both time and status variable or none.") - } - } else { - ## If formula interface used, subset data - data.selected <- data[, forest$independent.variable.names, drop = FALSE] - - ## Arange data as in original data - data.used <- cbind(0, 0, data.selected) - variable.names <- c("time", "status", forest$independent.variable.names) - } - - ## Index of no-recode variables - idx.norecode <- c(-(forest$dependent.varID+1), -(forest$status.varID+1)) - - } else { - ## No survival - if (ncol(data) == length(forest$independent.variable.names)+1 && forest$dependent.varID > 0) { - ## If alternative interface used and same data structure, don't subset data - data.used <- data - } else { - ## If formula interface used, subset data - data.selected <- data[, forest$independent.variable.names, drop = FALSE] - - ## Arange data as in original data - if (forest$dependent.varID == 0) { - data.used <- cbind(0, data.selected) - variable.names <- c("dependent", forest$independent.variable.names) - } else if (forest$dependent.varID >= ncol(data)) { - data.used <- cbind(data.selected, 0) - variable.names <- c(forest$independent.variable.names, "dependent") - } else { - data.used <- cbind(data.selected[, 1:forest$dependent.varID], - 0, - data.selected[, (forest$dependent.varID+1):ncol(data.selected)]) - variable.names <- c(forest$independent.variable.names[1:forest$dependent.varID], - "dependent", - forest$independent.variable.names[(forest$dependent.varID+1):length(forest$independent.variable.names)]) - } - } - - ## Index of no-recode variables - idx.norecode <- -(forest$dependent.varID+1) + ## Subset to same column as in training if necessary + if (length(colnames(x)) != length(forest$independent.variable.names) || any(colnames(x) != forest$independent.variable.names)) { + x <- x[, forest$independent.variable.names, drop = FALSE] } ## Recode characters - if (!is.matrix(data.used) && !inherits(data.used, "Matrix")) { - char.columns <- sapply(data.used, is.character) - data.used[char.columns] <- lapply(data.used[char.columns], factor) + if (!is.matrix(x) && !inherits(x, "Matrix")) { + char.columns <- sapply(x, is.character) + if (length(char.columns) > 0) { + x[char.columns] <- lapply(x[char.columns], factor) + } } - + ## Recode factors if forest grown 'order' mode if (!is.null(forest$covariate.levels) && !all(sapply(forest$covariate.levels, is.null))) { - data.used[, idx.norecode] <- mapply(function(x, y) { - if(is.null(y)) { - x + x <- mapply(function(xx, yy) { + if(is.null(yy)) { + xx } else { - new.levels <- setdiff(levels(x), y) - factor(x, levels = c(y, new.levels), exclude = NULL) + new.levels <- setdiff(levels(xx), yy) + factor(xx, levels = c(yy, new.levels), exclude = NULL) } - }, data.used[, idx.norecode], forest$covariate.levels, SIMPLIFY = !is.data.frame(data.used[, idx.norecode])) + }, x, forest$covariate.levels, SIMPLIFY = !is.data.frame(x)) } - - ## Convert to data matrix - if (is.matrix(data.used) || inherits(data.used, "Matrix")) { - data.final <- data.used - } else { - data.final <- data.matrix(data.used) + if (is.list(x) && !is.data.frame(x)) { + x <- as.data.frame(x) } - - ## If gwa mode, add snp variable names - if (gwa.mode) { - variable.names <- c(variable.names, snp.names) + ## Convert to data matrix + if (!is.matrix(x) & !inherits(x, "Matrix")) { + x <- data.matrix(x) } ## Check missing values - if (any(is.na(data.final))) { - offending_columns <- colnames(data.final)[colSums(is.na(data.final)) > 0] + if (any(is.na(x))) { + offending_columns <- colnames(x)[colSums(is.na(x)) > 0] stop("Missing data in columns: ", paste0(offending_columns, collapse = ", "), ".", call. = FALSE) } - if (sum(!(forest$independent.variable.names %in% variable.names)) > 0) { - stop("Error: One or more independent variables not found in data.") - } - ## Num threads ## Default 0 -> detect from system in C++. if (is.null(num.threads)) { @@ -268,7 +215,6 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, } ## Defaults for variables not needed - dependent.variable.name <- "" mtry <- 0 importance <- 0 min.node.size <- 0 @@ -276,7 +222,6 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, use.split.select.weights <- FALSE always.split.variables <- c("0", "0") use.always.split.variables <- FALSE - status.variable.name <- "status" prediction.mode <- TRUE write.forest <- FALSE replace <- TRUE @@ -299,27 +244,29 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, max.depth <- 0 inbag <- list(c(0,0)) use.inbag <- FALSE + y <- matrix(c(0, 0)) ## Use sparse matrix - if ("dgCMatrix" %in% class(data.final)) { - sparse.data <- data.final - data.final <- matrix(c(0, 0)) + if ("dgCMatrix" %in% class(x)) { + sparse.x <- x + x <- matrix(c(0, 0)) use.sparse.data <- TRUE } else { - sparse.data <- Matrix(matrix(c(0, 0))) + sparse.x <- Matrix(matrix(c(0, 0))) use.sparse.data <- FALSE + x <- data.matrix(x) } ## Call Ranger - result <- rangerCpp(treetype, dependent.variable.name, data.final, variable.names, mtry, + result <- rangerCpp(treetype, x, y, forest$independent.variable.names, mtry, num.trees, verbose, seed, num.threads, write.forest, importance, min.node.size, split.select.weights, use.split.select.weights, always.split.variables, use.always.split.variables, - status.variable.name, prediction.mode, forest, snp.data, replace, probability, + prediction.mode, forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule, case.weights, use.case.weights, class.weights, predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, - prediction.type, num.random.splits, sparse.data, use.sparse.data, + prediction.type, num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag) if (length(result) == 0) { @@ -327,7 +274,7 @@ predict.ranger.forest <- function(object, data, predict.all = FALSE, } ## Prepare results - result$num.samples <- nrow(data.final) + result$num.samples <- nrow(x) result$treetype <- forest$treetype if (predict.all) { diff --git a/R/ranger.R b/R/ranger.R index 6fdbc8811..141cdcafe 100644 --- a/R/ranger.R +++ b/R/ranger.R @@ -66,7 +66,8 @@ ##' This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. ##' ##' For a large number of variables and data frames as input data the formula interface can be slow or impossible to use. -##' Alternatively \code{dependent.variable.name} (and \code{status.variable.name} for survival) can be used. +##' Alternatively \code{dependent.variable.name} (and \code{status.variable.name} for survival) or \code{x} and \code{y} can be used. +##' Use \code{x} and \code{y} with a matrix for \code{x} to avoid conversions and save memory. ##' Consider setting \code{save.memory = TRUE} if you encounter memory problems for very large datasets, but be aware that this option slows down the tree growing. ##' ##' For GWAS data consider combining \code{ranger} with the \code{GenABEL} package. @@ -114,6 +115,8 @@ ##' @param dependent.variable.name Name of dependent variable, needed if no formula given. For survival forests this is the time variable. ##' @param status.variable.name Name of status variable, only applicable to survival data and needed if no formula given. Use 1 for event and 0 for censoring. ##' @param classification Only needed if data is a matrix. Set to \code{TRUE} to grow a classification forest. +##' @param x Predictor data (independent variables), alternative interface to data with formula or dependent.variable.name. +##' @param y Response vector (dependent variable), alternative interface to data with formula or dependent.variable.name. For survival use a \code{Surv()} object or a matrix with time and status. ##' @return Object of class \code{ranger} with elements ##' \item{\code{forest}}{Saved forest (If write.forest set to TRUE). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.} ##' \item{\code{predictions}}{Predicted classes/values, based on out of bag samples (classification and regression only).} @@ -161,8 +164,9 @@ ##' rg.veteran <- ranger(Surv(time, status) ~ ., data = veteran) ##' plot(rg.veteran$unique.death.times, rg.veteran$survival[1,]) ##' -##' ## Alternative interface +##' ## Alternative interfaces (same results) ##' ranger(dependent.variable.name = "Species", data = iris) +##' ranger(y = iris[, 5], x = iris[, -5]) ##' ##' \dontrun{ ##' ## Use GenABEL interface to read Plink data into R and grow a classification forest @@ -212,79 +216,93 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, num.threads = NULL, save.memory = FALSE, verbose = TRUE, seed = NULL, dependent.variable.name = NULL, status.variable.name = NULL, - classification = NULL) { - - ## GenABEL GWA data - if ("gwaa.data" %in% class(data)) { - snp.names <- data@gtdata@snpnames - snp.data <- data@gtdata@gtps@.Data - data <- data@phdata - if ("id" %in% names(data)) { - data$"id" <- NULL - } - gwa.mode <- TRUE - save.memory <- FALSE - } else { - snp.data <- as.matrix(0) - gwa.mode <- FALSE + classification = NULL, x = NULL, y = NULL) { + + ## By default not in GWAS mode + snp.data <- as.matrix(0) + gwa.mode <- FALSE + + if (is.null(data)) { + ## x/y interface + if (is.null(x) | is.null(y)) { + stop("Error: Either data or x and y is required.") + } + } else { + ## GenABEL GWA data + if ("gwaa.data" %in% class(data)) { + snp.names <- data@gtdata@snpnames + snp.data <- data@gtdata@gtps@.Data + data <- data@phdata + if ("id" %in% names(data)) { + data$"id" <- NULL + } + gwa.mode <- TRUE + save.memory <- FALSE + } + + ## Formula interface. Use whole data frame if no formula provided and depvarname given + if (is.null(formula)) { + if (is.null(dependent.variable.name)) { + if (is.null(y) | is.null(x)) { + stop("Error: Please give formula, dependent variable name or x/y.") + } + } else { + if (is.null(status.variable.name)) { + y <- data[, dependent.variable.name, drop = TRUE] + x <- data[, !(colnames(data) %in% dependent.variable.name), drop = FALSE] + } else { + y <- survival::Surv(data[, dependent.variable.name], data[, status.variable.name]) + x <- data[, !(colnames(data) %in% c(dependent.variable.name, status.variable.name)), drop = FALSE] + } + } + } else { + formula <- formula(formula) + if (class(formula) != "formula") { + stop("Error: Invalid formula.") + } + data.selected <- parse.formula(formula, data, env = parent.frame()) + y <- data.selected[, 1] + x <- data.selected[, -1, drop = FALSE] + } } ## Sparse matrix data - if (inherits(data, "Matrix")) { - if (!("dgCMatrix" %in% class(data))) { + if (inherits(x, "Matrix")) { + if (!("dgCMatrix" %in% class(x))) { stop("Error: Currently only sparse data of class 'dgCMatrix' supported.") - } - + } if (!is.null(formula)) { - stop("Error: Sparse matrices only supported with alternative interface. Use dependent.variable.name instead of formula.") - } - } - - ## Formula interface. Use whole data frame is no formula provided and depvarname given - if (is.null(formula)) { - if (is.null(dependent.variable.name)) { - stop("Error: Please give formula or dependent variable name.") - } - if (is.null(status.variable.name)) { - status.variable.name <- "" - response <- data[, dependent.variable.name, drop = TRUE] - } else { - response <- survival::Surv(data[, dependent.variable.name], data[, status.variable.name]) #data[, c(dependent.variable.name, status.variable.name)] - } - data.selected <- data - } else { - formula <- formula(formula) - if (class(formula) != "formula") { - stop("Error: Invalid formula.") + stop("Error: Sparse matrices only supported with alternative interface. Use dependent.variable.name or x/y instead of formula.") } - data.selected <- parse.formula(formula, data, env = parent.frame()) - response <- data.selected[, 1] } ## Check missing values - if (any(is.na(data.selected))) { - offending_columns <- colnames(data.selected)[colSums(is.na(data.selected)) > 0] + if (any(is.na(x))) { + offending_columns <- colnames(x)[colSums(is.na(x)) > 0] stop("Missing data in columns: ", paste0(offending_columns, collapse = ", "), ".", call. = FALSE) } + if (any(is.na(y))) { + stop("Missing data in dependent variable.", call. = FALSE) + } ## Check response levels - if (is.factor(response)) { - if (nlevels(response) != nlevels(droplevels(response))) { - dropped_levels <- setdiff(levels(response), levels(droplevels(response))) + if (is.factor(y)) { + if (nlevels(y) != nlevels(droplevels(y))) { + dropped_levels <- setdiff(levels(y), levels(droplevels(y))) warning("Dropped unused factor level(s) in dependent variable: ", paste0(dropped_levels, collapse = ", "), ".", call. = FALSE) } } ## Treetype - if (is.factor(response)) { + if (is.factor(y)) { if (probability) { treetype <- 9 } else { treetype <- 1 } - } else if (is.numeric(response) && (is.null(ncol(response)) || ncol(response) == 1)) { + } else if (is.numeric(y) && (is.null(ncol(y)) || ncol(y) == 1)) { if (!is.null(classification) && classification && !probability) { treetype <- 1 } else if (probability) { @@ -292,7 +310,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } else { treetype <- 3 } - } else if (class(response) == "Surv" || is.data.frame(response) || is.matrix(response)) { + } else if (class(y) == "Surv" || is.data.frame(y) || is.matrix(y)) { treetype <- 5 } else { stop("Error: Unsupported type of dependent variable.") @@ -302,21 +320,8 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (quantreg && treetype != 3) { stop("Error: Quantile prediction implemented only for regression outcomes.") } - - ## Dependent and status variable name. For non-survival dummy status variable name. - if (!is.null(formula)) { - if (treetype == 5) { - dependent.variable.name <- dimnames(response)[[2]][1] - status.variable.name <- dimnames(response)[[2]][2] - } else { - dependent.variable.name <- names(data.selected)[1] - status.variable.name <- "" - } - independent.variable.names <- names(data.selected)[-1] - } else { - independent.variable.names <- colnames(data.selected)[colnames(data.selected) != dependent.variable.name & - colnames(data.selected) != status.variable.name] - } + + independent.variable.names <- colnames(x) ## respect.unordered.factors if (is.null(respect.unordered.factors)) { @@ -335,82 +340,66 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } ## Recode characters as factors and recode factors if 'order' mode - if (!is.matrix(data.selected) && !inherits(data.selected, "Matrix")) { - character.idx <- sapply(data.selected, is.character) + if (!is.matrix(x) && !inherits(x, "Matrix") && ncol(x) > 0) { + character.idx <- sapply(x, is.character) if (respect.unordered.factors == "order") { ## Recode characters and unordered factors - names.selected <- names(data.selected) - ordered.idx <- sapply(data.selected, is.ordered) - factor.idx <- sapply(data.selected, is.factor) - independent.idx <- names.selected %in% independent.variable.names - recode.idx <- independent.idx & (character.idx | (factor.idx & !ordered.idx)) + ordered.idx <- sapply(x, is.ordered) + factor.idx <- sapply(x, is.factor) + recode.idx <- character.idx | (factor.idx & !ordered.idx) if (any(recode.idx) & (importance == "impurity_corrected" || importance == "impurity_unbiased")) { warning("Corrected impurity importance may not be unbiased for re-ordered factor levels. Consider setting respect.unordered.factors to 'ignore' or 'partition' or manually compute corrected importance.") } ## Numeric response - if (is.factor(response)) { - num.response <- as.numeric(response) + if (is.factor(y)) { + num.y <- as.numeric(y) } else { - num.response <- response + num.y <- y } ## Recode each column - data.selected[recode.idx] <- lapply(data.selected[recode.idx], function(x) { - if (!is.factor(x)) { - x <- as.factor(x) + x[recode.idx] <- lapply(x[recode.idx], function(xx) { + if (!is.factor(xx)) { + xx <- as.factor(xx) } - if (length(levels(x)) == 1) { + if (length(levels(xx)) == 1) { ## Don't order if only one level - levels.ordered <- levels(x) - } else if ("Surv" %in% class(response)) { + levels.ordered <- levels(xx) + } else if ("Surv" %in% class(y)) { ## Use median survival if available or largest quantile available in all strata if median not available - levels.ordered <- largest.quantile(response ~ x) + levels.ordered <- largest.quantile(y ~ xx) ## Get all levels not in node - levels.missing <- setdiff(levels(x), levels.ordered) + levels.missing <- setdiff(levels(xx), levels.ordered) levels.ordered <- c(levels.missing, levels.ordered) - } else if (is.factor(response) & nlevels(response) > 2) { - levels.ordered <- pca.order(y = response, x = x) + } else if (is.factor(y) & nlevels(y) > 2) { + levels.ordered <- pca.order(y = y, x = xx) } else { ## Order factor levels by mean response - means <- sapply(levels(x), function(y) { - mean(num.response[x == y]) + means <- sapply(levels(xx), function(y) { + mean(num.y[xx == y]) }) - levels.ordered <- as.character(levels(x)[order(means)]) + levels.ordered <- as.character(levels(xx)[order(means)]) } ## Return reordered factor - factor(x, levels = levels.ordered, ordered = TRUE, exclude = NULL) + factor(xx, levels = levels.ordered, ordered = TRUE, exclude = NULL) }) ## Save levels - covariate.levels <- lapply(data.selected[independent.idx], levels) + covariate.levels <- lapply(x, levels) } else { ## Recode characters only - data.selected[character.idx] <- lapply(data.selected[character.idx], factor) + x[character.idx] <- lapply(x[character.idx], factor) } } - ## Input data and variable names, create final data matrix - if (!is.null(formula) && treetype == 5) { - data.final <- data.matrix(cbind(response[, 1], response[, 2], - data.selected[-1])) - colnames(data.final) <- c(dependent.variable.name, status.variable.name, - independent.variable.names) - } else if (is.matrix(data.selected) || inherits(data.selected, "Matrix")) { - data.final <- data.selected - } else { - data.final <- data.matrix(data.selected) - } - variable.names <- colnames(data.final) - ## If gwa mode, add snp variable names if (gwa.mode) { - variable.names <- c(variable.names, snp.names) all.independent.variable.names <- c(independent.variable.names, snp.names) } else { all.independent.variable.names <- independent.variable.names @@ -507,14 +496,14 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (sum(sample.fraction) <= 0) { stop("Error: Invalid value for sample.fraction. Sum of values must be >0.") } - if (length(sample.fraction) != nlevels(response)) { - stop("Error: Invalid value for sample.fraction. Expecting ", nlevels(response), " values, provided ", length(sample.fraction), ".") + if (length(sample.fraction) != nlevels(y)) { + stop("Error: Invalid value for sample.fraction. Expecting ", nlevels(y), " values, provided ", length(sample.fraction), ".") } - if (!replace & any(sample.fraction * length(response) > table(response))) { - idx <- which(sample.fraction * length(response) > table(response))[1] + if (!replace & any(sample.fraction * length(y) > table(y))) { + idx <- which(sample.fraction * length(y) > table(y))[1] stop("Error: Not enough samples in class ", names(idx), - "; available: ", table(response)[idx], - ", requested: ", (sample.fraction * length(response))[idx], ".") + "; available: ", table(y)[idx], + ", requested: ", (sample.fraction * length(y))[idx], ".") } if (!is.null(case.weights)) { stop("Error: Combination of case.weights and class-wise sampling not supported.") @@ -557,7 +546,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, sample.fraction <- sample.fraction * mean(case.weights > 0) } - if (!replace && sum(case.weights > 0) < sample.fraction * nrow(data.final)) { + if (!replace && sum(case.weights > 0) < sample.fraction * nrow(x)) { stop("Error: Fewer non-zero case weights than observations to sample.") } } @@ -583,7 +572,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Class weights: NULL for no weights (all 1) if (is.null(class.weights)) { - class.weights <- rep(1, nlevels(response)) + class.weights <- rep(1, nlevels(y)) } else { if (!(treetype %in% c(1, 9))) { stop("Error: Argument class.weights only valid for classification forests.") @@ -591,12 +580,12 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (!is.numeric(class.weights) || any(class.weights < 0)) { stop("Error: Invalid value for class.weights. Please give a vector of non-negative values.") } - if (length(class.weights) != nlevels(response)) { + if (length(class.weights) != nlevels(y)) { stop("Error: Number of class weights not equal to number of classes.") } ## Reorder (C++ expects order as appearing in the data) - class.weights <- class.weights[unique(as.numeric(response))] + class.weights <- class.weights[unique(as.numeric(y))] } ## Split select weights: NULL for no weights @@ -686,7 +675,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } ## Check for 0..1 outcome - if (min(response) < 0 || max(response) > 1) { + if (min(y) < 0 || max(y) > 1) { stop("Error: beta splitrule applicable to regression data with outcome between 0 and 1 only.") } } else { @@ -711,16 +700,14 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Unordered factors if (respect.unordered.factors == "partition") { - names.selected <- names(data.selected) - ordered.idx <- sapply(data.selected, is.ordered) - factor.idx <- sapply(data.selected, is.factor) - independent.idx <- names.selected != dependent.variable.name & names.selected != status.variable.name - unordered.factor.variables <- names.selected[factor.idx & !ordered.idx & independent.idx] + ordered.idx <- sapply(x, is.ordered) + factor.idx <- sapply(x, is.factor) + unordered.factor.variables <- independent.variable.names[factor.idx & !ordered.idx] if (length(unordered.factor.variables) > 0) { use.unordered.factor.variables <- TRUE ## Check level count - num.levels <- sapply(data.selected[, factor.idx & !ordered.idx & independent.idx, drop = FALSE], nlevels) + num.levels <- sapply(x[, factor.idx & !ordered.idx, drop = FALSE], nlevels) max.level.count <- .Machine$double.digits if (max(num.levels) > max.level.count) { stop(paste("Too many levels in unordered categorical variable ", unordered.factor.variables[which.max(num.levels)], @@ -754,12 +741,11 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, if (treetype == 3 && splitrule == "maxstat") { warning("Warning: The 'order' mode for unordered factor handling with the 'maxstat' splitrule is experimental.") } - if (gwa.mode & ((treetype %in% c(1,9) & nlevels(response) > 2) | treetype == 5)) { + if (gwa.mode & ((treetype %in% c(1,9) & nlevels(y) > 2) | treetype == 5)) { stop("Error: Ordering of SNPs currently only implemented for regression and binary outcomes.") } } - ## Prediction mode always false. Use predict.ranger() method. prediction.mode <- FALSE predict.all <- FALSE @@ -769,13 +755,22 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, loaded.forest <- list() ## Use sparse matrix - if ("dgCMatrix" %in% class(data.final)) { - sparse.data <- data.final - data.final <- matrix(c(0, 0)) + if ("dgCMatrix" %in% class(x)) { + sparse.x <- x + x <- matrix(c(0, 0)) use.sparse.data <- TRUE } else { - sparse.data <- Matrix(matrix(c(0, 0))) + sparse.x <- Matrix(matrix(c(0, 0))) use.sparse.data <- FALSE + if (is.data.frame(x)) { + x <- data.matrix(x) + } + } + + if (treetype == 5) { + y.mat <- as.matrix(y) + } else { + y.mat <- as.matrix(as.numeric(y)) } if (respect.unordered.factors == "order"){ @@ -784,20 +779,16 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, order.snps <- FALSE } - - ## Clean up - rm("data.selected") - ## Call Ranger - result <- rangerCpp(treetype, dependent.variable.name, data.final, variable.names, mtry, + result <- rangerCpp(treetype, x, y.mat, independent.variable.names, mtry, num.trees, verbose, seed, num.threads, write.forest, importance.mode, min.node.size, split.select.weights, use.split.select.weights, always.split.variables, use.always.split.variables, - status.variable.name, prediction.mode, loaded.forest, snp.data, + prediction.mode, loaded.forest, snp.data, replace, probability, unordered.factor.variables, use.unordered.factor.variables, save.memory, splitrule.num, case.weights, use.case.weights, class.weights, predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, - num.random.splits, sparse.data, use.sparse.data, order.snps, oob.error, max.depth, + num.random.splits, sparse.x, use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag) if (length(result) == 0) { @@ -810,12 +801,12 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } ## Set predictions - if (treetype == 1 && is.factor(response) && oob.error) { - result$predictions <- integer.to.factor(result$predictions, - levels(response)) - true.values <- integer.to.factor(unlist(data.final[, dependent.variable.name]), - levels(response)) - result$confusion.matrix <- table(true.values, result$predictions, + if (treetype == 1 && oob.error) { + if (is.factor(y)) { + result$predictions <- integer.to.factor(result$predictions, + levels(y)) + } + result$confusion.matrix <- table(y, result$predictions, dnn = c("true", "predicted"), useNA = "ifany") } else if (treetype == 5 && oob.error) { if (is.list(result$predictions)) { @@ -827,7 +818,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, result$chf <- result$predictions result$predictions <- NULL result$survival <- exp(-result$chf) - } else if (treetype == 9 && !is.matrix(data) && oob.error) { + } else if (treetype == 9 && oob.error) { if (is.list(result$predictions)) { result$predictions <- do.call(rbind, result$predictions) } @@ -836,9 +827,9 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, } ## Set colnames and sort by levels - colnames(result$predictions) <- unique(response) - if (is.factor(response)) { - result$predictions <- result$predictions[, levels(droplevels(response)), drop = FALSE] + colnames(result$predictions) <- unique(y) + if (is.factor(y)) { + result$predictions <- result$predictions[, levels(droplevels(y)), drop = FALSE] } } @@ -856,24 +847,24 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, result$treetype <- "Probability estimation" } if (treetype == 3) { - result$r.squared <- 1 - result$prediction.error / var(response) + result$r.squared <- 1 - result$prediction.error / var(y) } result$call <- sys.call() result$importance.mode <- importance - result$num.samples <- nrow(data.final) + result$num.samples <- nrow(x) result$replace <- replace ## Write forest object if (write.forest) { - if (is.factor(response)) { - result$forest$levels <- levels(response) + if (is.factor(y)) { + result$forest$levels <- levels(y) } result$forest$independent.variable.names <- independent.variable.names result$forest$treetype <- result$treetype class(result$forest) <- "ranger.forest" ## In 'ordered' mode, save covariate levels - if (respect.unordered.factors == "order" && !is.matrix(data)) { + if (respect.unordered.factors == "order" && ncol(x) > 0) { result$forest$covariate.levels <- covariate.levels } } @@ -882,14 +873,14 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, ## Prepare quantile prediction if (quantreg) { - terminal.nodes <- predict(result, data, type = "terminalNodes")$predictions + 1 + terminal.nodes <- predict(result, x, type = "terminalNodes")$predictions + 1 n <- result$num.samples result$random.node.values <- matrix(nrow = max(terminal.nodes), ncol = num.trees) ## Select one random obs per node and tree for (tree in 1:num.trees){ idx <- sample(1:n, n) - result$random.node.values[terminal.nodes[idx, tree], tree] <- response[idx] + result$random.node.values[terminal.nodes[idx, tree], tree] <- y[idx] } ## Prepare out-of-bag quantile regression @@ -909,7 +900,7 @@ ranger <- function(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, for (j in 1:num.oob) { idx <- terminal.nodes[, tree] == oob.nodes[j] idx[oob.obs[j]] <- FALSE - random.node.values.oob[oob.obs[j], tree] <- save.sample(response[idx], size = 1) + random.node.values.oob[oob.obs[j], tree] <- save.sample(y[idx], size = 1) } } } diff --git a/R/treeInfo.R b/R/treeInfo.R index e197ed1f2..bcff7a27c 100644 --- a/R/treeInfo.R +++ b/R/treeInfo.R @@ -68,14 +68,13 @@ treeInfo <- function(object, tree = 1) { if (is.null(forest)) { stop("Error: No saved forest in ranger object. Please set write.forest to TRUE when calling ranger.") } - if (is.null(forest$dependent.varID) || is.null(forest$num.trees) || + if (is.null(forest$num.trees) || is.null(forest$child.nodeIDs) || is.null(forest$split.varIDs) || is.null(forest$split.values) || is.null(forest$independent.variable.names) || is.null(forest$treetype)) { stop("Error: Invalid forest object.") } - if (forest$treetype == "Survival" && (is.null(forest$status.varID) || - is.null(forest$chf) || is.null(forest$unique.death.times))) { + if (forest$treetype == "Survival" && (is.null(forest$chf) || is.null(forest$unique.death.times))) { stop("Error: Invalid forest object.") } if (length(forest$child.nodeIDs) != forest$num.trees || length(forest$child.nodeIDs[[1]]) != 2) { @@ -99,25 +98,19 @@ treeInfo <- function(object, tree = 1) { result$splitvarID[result$terminal] <- NA result$splitvarName[result$terminal] <- NA result$splitval[result$terminal] <- NA - - ## Get names of splitting variables - # should be -1 for all >= dependent.varID but +1 change for 1-index - # for survival another -1 if >= status.varID - independent.varID <- result$splitvarID - idx <- !is.na(result$splitvarID) & result$splitvarID < forest$dependent.varID - independent.varID[idx] <- result$splitvarID[idx] + 1 - if (forest$treetype == "Survival") { - idx <- !is.na(result$splitvarID) & result$splitvarID >= forest$status.varID - independent.varID[idx] <- independent.varID[idx] - 1 - } - result$splitvarName <- forest$independent.variable.names[independent.varID] + result$splitvarName <- forest$independent.variable.names[result$splitvarID + 1] ## Unordered splitting idx.unordered <- !result$terminal & !forest$is.ordered[result$splitvarID + 1] if (any(idx.unordered)) { - result$splitval[idx.unordered] <- sapply(result$splitval[idx.unordered], function(x) { - paste(which(as.logical(intToBits(x))), collapse = ",") - }) + if (any(result$splitval[idx.unordered] > (2^31 - 1))) { + warning("Unordered splitting levels can only be shown for up to 31 levels.") + result$splitval[idx.unordered] <- NA + } else { + result$splitval[idx.unordered] <- sapply(result$splitval[idx.unordered], function(x) { + paste(which(as.logical(intToBits(x))), collapse = ",") + }) + } } ## Prediction diff --git a/cpp_version/.cproject b/cpp_version/.cproject index 7c0cd1c8d..b4e456f7f 100644 --- a/cpp_version/.cproject +++ b/cpp_version/.cproject @@ -5,12 +5,12 @@ + - diff --git a/cpp_version/src/utility/ArgumentHandler.cpp b/cpp_version/src/utility/ArgumentHandler.cpp index cc2afddc0..15f1a9cb8 100644 --- a/cpp_version/src/utility/ArgumentHandler.cpp +++ b/cpp_version/src/utility/ArgumentHandler.cpp @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------------- -This file is part of ranger. + This file is part of ranger. -Copyright (c) [2014-2018] [Marvin N. Wright] + Copyright (c) [2014-2018] [Marvin N. Wright] -This software may be modified and distributed under the terms of the MIT license. + This software may be modified and distributed under the terms of the MIT license. -Please note that the C++ core of ranger is distributed under MIT license and the -R package "ranger" under GPL3 license. -#-------------------------------------------------------------------------------*/ + Please note that the C++ core of ranger is distributed under MIT license and the + R package "ranger" under GPL3 license. + #-------------------------------------------------------------------------------*/ #include #include @@ -36,46 +36,24 @@ int ArgumentHandler::processArguments() { char const *short_options = "A:C:D:F:HM:NOP:Q:R:S:U:XZa:b:c:d:f:hil::m:o:pr:s:t:uvwy:z:"; // long options: longname, no/optional/required argument?, flag(not used!), shortname - const struct option long_options[] = { - - { "alwayssplitvars", required_argument, 0, 'A'}, - { "caseweights", required_argument, 0, 'C'}, - { "depvarname", required_argument, 0, 'D'}, - { "fraction", required_argument, 0, 'F'}, - { "holdout", no_argument, 0, 'H'}, - { "memmode", required_argument, 0, 'M'}, - { "savemem", no_argument, 0, 'N'}, - { "skipoob", no_argument, 0, 'O'}, - { "predict", required_argument, 0, 'P'}, - { "predictiontype", required_argument, 0, 'Q'}, - { "randomsplits", required_argument, 0, 'R'}, - { "splitweights", required_argument, 0, 'S'}, - { "nthreads", required_argument, 0, 'U'}, - { "predall", no_argument, 0, 'X'}, - { "version", no_argument, 0, 'Z'}, - - { "alpha", required_argument, 0, 'a'}, - { "minprop", required_argument, 0, 'b'}, - { "catvars", required_argument, 0, 'c'}, - { "maxdepth", required_argument, 0, 'd'}, - { "file", required_argument, 0, 'f'}, - { "help", no_argument, 0, 'h'}, - { "impmeasure", required_argument, 0, 'i'}, - { "targetpartitionsize", required_argument, 0, 'l'}, - { "mtry", required_argument, 0, 'm'}, - { "outprefix", required_argument, 0, 'o'}, - { "probability", no_argument, 0, 'p'}, - { "splitrule", required_argument, 0, 'r'}, - { "statusvarname", required_argument, 0, 's'}, - { "ntree", required_argument, 0, 't'}, - { "noreplace", no_argument, 0, 'u'}, - { "verbose", no_argument, 0, 'v'}, - { "write", no_argument, 0, 'w'}, - { "treetype", required_argument, 0, 'y'}, - { "seed", required_argument, 0, 'z'}, - - { 0, 0, 0, 0} - }; + const struct option long_options[] = { + + { "alwayssplitvars", required_argument, 0, 'A' }, { "caseweights", required_argument, 0, 'C' }, { "depvarname", + required_argument, 0, 'D' }, { "fraction", required_argument, 0, 'F' }, { "holdout", no_argument, 0, 'H' }, { + "memmode", required_argument, 0, 'M' }, { "savemem", no_argument, 0, 'N' }, { "skipoob", no_argument, 0, 'O' }, { + "predict", required_argument, 0, 'P' }, { "predictiontype", required_argument, 0, 'Q' }, { "randomsplits", + required_argument, 0, 'R' }, { "splitweights", required_argument, 0, 'S' }, { "nthreads", required_argument, 0, + 'U' }, { "predall", no_argument, 0, 'X' }, { "version", no_argument, 0, 'Z' }, + + { "alpha", required_argument, 0, 'a' }, { "minprop", required_argument, 0, 'b' }, { "catvars", required_argument, 0, + 'c' }, { "maxdepth", required_argument, 0, 'd' }, { "file", required_argument, 0, 'f' }, { "help", no_argument, 0, + 'h' }, { "impmeasure", required_argument, 0, 'i' }, { "targetpartitionsize", required_argument, 0, 'l' }, { + "mtry", required_argument, 0, 'm' }, { "outprefix", required_argument, 0, 'o' }, { "probability", no_argument, 0, + 'p' }, { "splitrule", required_argument, 0, 'r' }, { "statusvarname", required_argument, 0, 's' }, { "ntree", + required_argument, 0, 't' }, { "noreplace", no_argument, 0, 'u' }, { "verbose", no_argument, 0, 'v' }, { "write", + no_argument, 0, 'w' }, { "treetype", required_argument, 0, 'y' }, { "seed", required_argument, 0, 'z' }, + + { 0, 0, 0, 0 } }; while (1) { int option_index = 0; @@ -142,22 +120,22 @@ int ArgumentHandler::processArguments() { break; case 'Q': - try { - switch (std::stoi(optarg)) { - case 1: - predictiontype = RESPONSE; - break; - case 2: - predictiontype = TERMINALNODES; - break; - default: - throw std::runtime_error(""); - break; - } - } catch (...) { - throw std::runtime_error("Illegal prediction type selected. See '--help' for details."); - } + try { + switch (std::stoi(optarg)) { + case 1: + predictiontype = RESPONSE; + break; + case 2: + predictiontype = TERMINALNODES; break; + default: + throw std::runtime_error(""); + break; + } + } catch (...) { + throw std::runtime_error("Illegal prediction type selected. See '--help' for details."); + } + break; case 'R': try { @@ -424,7 +402,7 @@ void ArgumentHandler::checkArguments() { throw std::runtime_error("Please specify a dependent variable name with '--depvarname'. See '--help' for details."); } - if (treetype == TREE_SURVIVAL && statusvarname.empty()) { + if (predict.empty() && treetype == TREE_SURVIVAL && statusvarname.empty()) { throw std::runtime_error("Please specify a status variable name with '--statusvarname'. See '--help' for details."); } if (treetype != TREE_SURVIVAL && !statusvarname.empty()) { @@ -448,8 +426,8 @@ void ArgumentHandler::checkArguments() { throw std::runtime_error("Could not read from input file: " + predict + "."); } - // Do not read dependent_varID, num_variables, num_trees and is_ordered_variable - infile.seekg(2 * sizeof(size_t)); + // Do not read num_variables, num_trees and is_ordered_variable + infile.seekg(sizeof(size_t)); size_t length; infile.read((char*) &length, sizeof(length)); infile.seekg(4 * sizeof(size_t) + length * sizeof(bool)); @@ -469,7 +447,8 @@ void ArgumentHandler::checkArguments() { // Check splitrule if (((splitrule == AUC || splitrule == AUC_IGNORE_TIES) && treetype != TREE_SURVIVAL) - || (splitrule == MAXSTAT && (treetype != TREE_SURVIVAL && treetype != TREE_REGRESSION))) { + || (splitrule == MAXSTAT && (treetype != TREE_SURVIVAL && treetype != TREE_REGRESSION)) + || (splitrule == BETA && treetype != TREE_REGRESSION)) { throw std::runtime_error("Illegal splitrule selected. See '--help' for details."); } @@ -503,64 +482,119 @@ void ArgumentHandler::displayHelp() { std::cout << " " << "--help Print this help." << std::endl; std::cout << " " << "--version Print version and citation information." << std::endl; std::cout << " " << "--verbose Turn on verbose mode." << std::endl; - std::cout << " " << "--file FILE Filename of input data. Only numerical values are supported." << std::endl; + std::cout << " " << "--file FILE Filename of input data. Only numerical values are supported." + << std::endl; std::cout << " " << "--treetype TYPE Set tree type to:" << std::endl; std::cout << " " << " TYPE = 1: Classification." << std::endl; std::cout << " " << " TYPE = 3: Regression." << std::endl; std::cout << " " << " TYPE = 5: Survival." << std::endl; std::cout << " " << " (Default: 1)" << std::endl; - std::cout << " " << "--probability Grow a Classification forest with probability estimation for the classes." << std::endl; + std::cout << " " + << "--probability Grow a Classification forest with probability estimation for the classes." + << std::endl; std::cout << " " << " Use in combination with --treetype 1." << std::endl; - std::cout << " " << "--depvarname NAME Name of dependent variable. For survival trees this is the time variable." << std::endl; - std::cout << " " << "--statusvarname NAME Name of status variable, only applicable for survival trees." << std::endl; + std::cout << " " + << "--depvarname NAME Name of dependent variable. For survival trees this is the time variable." + << std::endl; + std::cout << " " << "--statusvarname NAME Name of status variable, only applicable for survival trees." + << std::endl; std::cout << " " << " Coding is 1 for event and 0 for censored." << std::endl; std::cout << " " << "--ntree N Set number of trees to N." << std::endl; std::cout << " " << " (Default: 500)" << std::endl; - std::cout << " " << "--mtry N Number of variables to possibly split at in each node." << std::endl; - std::cout << " " << " (Default: sqrt(p) with p = number of independent variables)" << std::endl; + std::cout << " " << "--mtry N Number of variables to possibly split at in each node." + << std::endl; + std::cout << " " << " (Default: sqrt(p) with p = number of independent variables)" + << std::endl; std::cout << " " << "--targetpartitionsize N Set minimal node size to N." << std::endl; - std::cout << " " << " For Classification and Regression growing is stopped if a node reaches a size smaller than N." << std::endl; - std::cout << " " << " For Survival growing is stopped if one child would reach a size smaller than N." << std::endl; - std::cout << " " << " This means nodes with size smaller N can occur for Classification and Regression." << std::endl; - std::cout << " " << " (Default: 1 for Classification, 5 for Regression, and 3 for Survival)" << std::endl; + std::cout << " " + << " For Classification and Regression growing is stopped if a node reaches a size smaller than N." + << std::endl; + std::cout << " " + << " For Survival growing is stopped if one child would reach a size smaller than N." + << std::endl; + std::cout << " " + << " This means nodes with size smaller N can occur for Classification and Regression." + << std::endl; + std::cout << " " + << " (Default: 1 for Classification, 5 for Regression, and 3 for Survival)" + << std::endl; std::cout << " " << "--maxdepth N Set maximal tree depth to N." << std::endl; - std::cout << " " << " Set to 0 for unlimited depth. A value of 1 corresponds to tree stumps (1 split)." << std::endl; - std::cout << " " << "--catvars V1,V2,.. Comma separated list of names of (unordered) categorical variables. " << std::endl; - std::cout << " " << " Categorical variables must contain only positive integer values." << std::endl; + std::cout << " " + << " Set to 0 for unlimited depth. A value of 1 corresponds to tree stumps (1 split)." + << std::endl; + std::cout << " " + << "--catvars V1,V2,.. Comma separated list of names of (unordered) categorical variables. " + << std::endl; + std::cout << " " + << " Categorical variables must contain only positive integer values." << std::endl; std::cout << " " << "--write Save forest to file .forest." << std::endl; - std::cout << " " << "--predict FILE Load forest from FILE and predict with new data. The new data is expected in the exact same " << std::endl; - std::cout << " " << " shape as the training data. If the outcome of your new dataset is unknown, add a dummy column." << std::endl; - std::cout << " " << "--predall Return a matrix with individual predictions for each tree instead of aggregated " << std::endl; - std::cout << " " << " predictions for all trees (classification and regression only)." << std::endl; + std::cout << " " + << "--predict FILE Load forest from FILE and predict with new data. The new data is expected in the exact same " + << std::endl; + std::cout << " " + << " shape as the training data. If the outcome of your new dataset is unknown, add a dummy column." + << std::endl; + std::cout << " " + << "--predall Return a matrix with individual predictions for each tree instead of aggregated " + << std::endl; + std::cout << " " << " predictions for all trees (classification and regression only)." + << std::endl; std::cout << " " << "--predictiontype TYPE Set type of prediction to:" << std::endl; std::cout << " " << " TYPE = 1: Return predicted classes or values." << std::endl; - std::cout << " " << " TYPE = 2: Return terminal node IDs per tree for new observations." << std::endl; + std::cout << " " + << " TYPE = 2: Return terminal node IDs per tree for new observations." << std::endl; std::cout << " " << " (Default: 1)" << std::endl; std::cout << " " << "--impmeasure TYPE Set importance mode to:" << std::endl; std::cout << " " << " TYPE = 0: none." << std::endl; - std::cout << " " << " TYPE = 1: Node impurity: Gini for Classification, variance for Regression, sum of test statistic for Survival." << std::endl; - std::cout << " " << " TYPE = 2: Permutation importance, scaled by standard errors." << std::endl; + std::cout << " " + << " TYPE = 1: Node impurity: Gini for Classification, variance for Regression, sum of test statistic for Survival." + << std::endl; + std::cout << " " << " TYPE = 2: Permutation importance, scaled by standard errors." + << std::endl; std::cout << " " << " TYPE = 3: Permutation importance, no scaling." << std::endl; - std::cout << " " << " TYPE = 5: Corrected node impurity: Bias-corrected version of node impurity importance." << std::endl; + std::cout << " " + << " TYPE = 5: Corrected node impurity: Bias-corrected version of node impurity importance." + << std::endl; std::cout << " " << " (Default: 0)" << std::endl; std::cout << " " << "--noreplace Sample without replacement." << std::endl; - std::cout << " " << "--fraction X Fraction of observations to sample. Default is 1 for sampling with replacement " << std::endl; + std::cout << " " + << "--fraction X Fraction of observations to sample. Default is 1 for sampling with replacement " + << std::endl; std::cout << " " << " and 0.632 for sampling without replacement." << std::endl; std::cout << " " << "--splitrule RULE Splitting rule:" << std::endl; - std::cout << " " << " RULE = 1: Gini for Classification, variance for Regression, logrank for Survival." << std::endl; - std::cout << " " << " RULE = 2: AUC for Survival, not available for Classification and Regression." << std::endl; - std::cout << " " << " RULE = 3: AUC (ignore ties) for Survival, not available for Classification and Regression." << std::endl; - std::cout << " " << " RULE = 4: MAXSTAT for Survival and Regression, not available for Classification." << std::endl; + std::cout << " " + << " RULE = 1: Gini for Classification, variance for Regression, logrank for Survival." + << std::endl; + std::cout << " " + << " RULE = 2: AUC for Survival, not available for Classification and Regression." + << std::endl; + std::cout << " " + << " RULE = 3: AUC (ignore ties) for Survival, not available for Classification and Regression." + << std::endl; + std::cout << " " + << " RULE = 4: MAXSTAT for Survival and Regression, not available for Classification." + << std::endl; std::cout << " " << " RULE = 5: ExtraTrees for all tree types." << std::endl; + std::cout << " " << " RULE = 6: BETA for regression." << std::endl; std::cout << " " << " (Default: 1)" << std::endl; - std::cout << " " << "--randomsplits N Number of random splits to consider for each splitting variable (ExtraTrees splitrule only)." << std::endl; - std::cout << " " << "--alpha VAL Significance threshold to allow splitting (MAXSTAT splitrule only)." << std::endl; - std::cout << " " << "--minprop VAL Lower quantile of covariate distribtuion to be considered for splitting (MAXSTAT splitrule only)." << std::endl; + std::cout << " " + << "--randomsplits N Number of random splits to consider for each splitting variable (ExtraTrees splitrule only)." + << std::endl; + std::cout << " " + << "--alpha VAL Significance threshold to allow splitting (MAXSTAT splitrule only)." + << std::endl; + std::cout << " " + << "--minprop VAL Lower quantile of covariate distribtuion to be considered for splitting (MAXSTAT splitrule only)." + << std::endl; std::cout << " " << "--caseweights FILE Filename of case weights file." << std::endl; - std::cout << " " << "--holdout Hold-out mode. Hold-out all samples with case weight 0 and use these for variable " << std::endl; + std::cout << " " + << "--holdout Hold-out mode. Hold-out all samples with case weight 0 and use these for variable " + << std::endl; std::cout << " " << " importance and prediction error." << std::endl; std::cout << " " << "--splitweights FILE Filename of split select weights file." << std::endl; - std::cout << " " << "--alwayssplitvars V1,V2,.. Comma separated list of variable names to be always considered for splitting." << std::endl; + std::cout << " " + << "--alwayssplitvars V1,V2,.. Comma separated list of variable names to be always considered for splitting." + << std::endl; std::cout << " " << "--skipoob Skip computation of OOB error." << std::endl; std::cout << " " << "--nthreads N Set number of parallel threads to N." << std::endl; std::cout << " " << " (Default: Number of CPUs available)" << std::endl; @@ -582,11 +616,15 @@ void ArgumentHandler::displayVersion() { std::cout << "Ranger version: " << RANGER_VERSION << std::endl; std::cout << std::endl; std::cout << "Please cite Ranger: " << std::endl; - std::cout << "Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. Journal of Statistical Software 77:1-17." << std::endl; + std::cout + << "Wright, M. N. & Ziegler, A. (2017). ranger: A Fast Implementation of Random Forests for High Dimensional Data in C++ and R. Journal of Statistical Software 77:1-17." + << std::endl; std::cout << std::endl; std::cout << "BibTeX:" << std::endl; std::cout << "@Article{," << std::endl; - std::cout << " title = {{ranger}: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}," << std::endl; + std::cout + << " title = {{ranger}: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}," + << std::endl; std::cout << " author = {Wright, Marvin N. and Ziegler, Andreas}," << std::endl; std::cout << " journal = {Journal of Statistical Software}," << std::endl; std::cout << " year = {2017}," << std::endl; diff --git a/cpp_version/src/utility/DataChar.cpp b/cpp_version/src/utility/DataChar.cpp deleted file mode 120000 index 364b3291d..000000000 --- a/cpp_version/src/utility/DataChar.cpp +++ /dev/null @@ -1 +0,0 @@ -../../../src/DataChar.cpp \ No newline at end of file diff --git a/cpp_version/src/utility/DataFloat.cpp b/cpp_version/src/utility/DataFloat.cpp deleted file mode 120000 index 7b07157f7..000000000 --- a/cpp_version/src/utility/DataFloat.cpp +++ /dev/null @@ -1 +0,0 @@ -../../../src/DataFloat.cpp \ No newline at end of file diff --git a/cpp_version/src/version.h b/cpp_version/src/version.h index 2ebfb79b4..5c6aeecbb 100644 --- a/cpp_version/src/version.h +++ b/cpp_version/src/version.h @@ -1,3 +1,3 @@ #ifndef RANGER_VERSION -#define RANGER_VERSION "0.11.4" +#define RANGER_VERSION "0.11.5" #endif diff --git a/cpp_version/test/test_maxstat.R b/cpp_version/test/test_maxstat.R deleted file mode 100644 index c9a1d4066..000000000 --- a/cpp_version/test/test_maxstat.R +++ /dev/null @@ -1,105 +0,0 @@ -library(ranger) -library(maxstatRF) -library(survival) -library(reshape2) -library(ggplot2) - -# Compare survival curves ------------------------------------------------- -idx <- sample(nrow(veteran), 2/3*nrow(veteran), replace = FALSE) -train_data <- veteran[idx, ] -test_data <- veteran[-idx, ] - -num_trees <- 50 -mtry <- 3 -minprop <- 0.1 -alpha <- 0.5 -min_node_size <- 5 -replace <- TRUE -pmethod <- "minLau" - -rf_maxstatRF <- maxstatRF(Surv(time, status) ~., train_data, - num_trees = num_trees, mtry = mtry, minprop = minprop, - alpha = alpha, min_node_size = min_node_size, replace = replace, - pmethod = pmethod) - -rf_ranger <- ranger(Surv(time, status) ~., train_data, splitrule = "maxstat", - write.forest = TRUE, - num.trees = num_trees, mtry = mtry, minprop = minprop, - alpha = alpha, min.node.size = min_node_size, replace = replace) - -pred_maxstatRF <- rf_maxstatRF$predict(test_data) -pred_ranger <- predict(rf_ranger, test_data)$survival - -colnames(pred_maxstatRF) <- rf_maxstatRF$timepoints -colnames(pred_ranger) <- timepoints(rf_ranger) -df <- rbind(data.frame(Package = "maxstatRF", melt(pred_maxstatRF, value.name = "Survival", varnames = c("ID", "Time"))), - data.frame(Package = "ranger", melt(pred_ranger, value.name = "Survival", varnames = c("ID", "Time")))) - -ggplot(df, aes(x = Time, y = Survival, color = Package)) + - geom_line() + - facet_wrap(~ID) - -# Compare Brier score ------------------------------------------------- -library(ranger) -library(maxstatRF) -library(survival) -library(prodlim) -library(pec) - -## Pec function for maxstatRF -predictSurvProb.ForestSurvival <- function(object, newdata, times, ...) { - pred <- object$predict(newdata) - pos <- sapply(times, function(x) { - max(c(0, which(x >= object$timepoints))) - }) - p <- cbind(1, pred)[, pos+1, drop = FALSE] - return(p) -} - -## Pec function for ranger -predictSurvProb.ranger <- function(object, newdata, times, ...) { - pred <- predict(object, newdata) - pos <- sapply(times, function(x) { - max(c(0, which(x >= pred$unique.death.times))) - }) - p <- cbind(1, predictions(pred))[, pos+1, drop = FALSE] - return(p) -} - -## Parameters -num_trees <- 50 -mtry <- 3 -minprop <- 0.1 -alpha <- 0.5 -min_node_size <- 5 -replace <- TRUE -pmethod <- "minLau" -formula <- Surv(time, status) ~. - -compare_packages <- function() { - ## Data - idx <- sample(nrow(veteran), 2/3*nrow(veteran), replace = FALSE) - train_data <- veteran[idx, ] - test_data <- veteran[-idx, ] - time <- sort(unique(train_data$time)) - - ## Run - rf_maxstatRF <- maxstatRF(formula, train_data, - num_trees = num_trees, mtry = mtry, minprop = minprop, - alpha = alpha, min_node_size = min_node_size, replace = replace, - pmethod = pmethod) - - rf_ranger <- ranger(formula, train_data, splitrule = "maxstat", - write.forest = TRUE, - num.trees = num_trees, mtry = mtry, minprop = minprop, - alpha = alpha, min.node.size = min_node_size, replace = replace) - - ## Compare - fit.pec <- pec(list(maxstatRF = rf_maxstatRF, ranger = rf_ranger), - formula = formula, data = test_data, - times = time, cens.model = "marginal", reference = FALSE) - crps(fit.pec)[, 1] -} - -res <- replicate(100, compare_packages()) -boxplot(t(res)) diff --git a/cpp_version/test/testthat.R b/cpp_version/test/testthat.R new file mode 100644 index 000000000..5aea37d66 --- /dev/null +++ b/cpp_version/test/testthat.R @@ -0,0 +1,28 @@ + +library(ranger) +library(data.table) +library(testthat) +library(survival) + +# Function to call C++ version from R +ranger_cpp <- function(data, ...) { + if (is.data.frame(data) && any(sapply(data, is.numeric))) { + idx_numeric <- sapply(data, is.numeric) + data[, !idx_numeric] <- lapply(data[, !idx_numeric, drop = FALSE], as.numeric) + } + fwrite(data, "temp_data.csv") + ret <- system2("../../build/ranger", + args = c("--verbose", "--file temp_data.csv", paste0("--", names(list(...)), " ", list(...))), + stdout = TRUE, stderr = TRUE) + if (length(ret) == 1 && nchar(ret) >= 5 && substr(ret, 1, 5) == "Error") { + stop(ret) + } + unlink("temp_data.csv") + ret +} + +test_dir("testthat") + + + + diff --git a/cpp_version/test/testthat/.gitignore b/cpp_version/test/testthat/.gitignore new file mode 100644 index 000000000..b1873686f --- /dev/null +++ b/cpp_version/test/testthat/.gitignore @@ -0,0 +1 @@ +ranger_out.* \ No newline at end of file diff --git a/cpp_version/test/testthat/test_arguments.R b/cpp_version/test/testthat/test_arguments.R new file mode 100644 index 000000000..23a1786d2 --- /dev/null +++ b/cpp_version/test/testthat/test_arguments.R @@ -0,0 +1,11 @@ + +context("ranger_cpp_arguments") + +test_that("Error if sample fraction is 0 or >1", { + expect_warning( + expect_error(ranger_cpp(data = iris, depvarname = "Species", ntree = 5, fraction = 0), + "Error: Illegal argument for option 'fraction'\\. Please give a value in \\(0,1\\]\\. See '--help' for details\\. Ranger will EXIT now\\.")) + expect_warning( + expect_error(ranger_cpp(data = iris, depvarname = "Species", ntree = 5, fraction = 1.1), + "Error: Illegal argument for option 'fraction'\\. Please give a value in \\(0,1\\]\\. See '--help' for details\\. Ranger will EXIT now\\.")) +}) diff --git a/cpp_version/test/testthat/test_classification.R b/cpp_version/test/testthat/test_classification.R new file mode 100644 index 000000000..b48ede461 --- /dev/null +++ b/cpp_version/test/testthat/test_classification.R @@ -0,0 +1,49 @@ + +context("ranger_cpp_classification") + +test_that("Prediction is equal to R version", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Species", ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = iris, predict = "ranger_out.forest", seed = 20) + preds_cpp <- as.data.frame(fread("ranger_out.prediction"))[, 1] + + # R version + rf <- ranger(Species ~ ., iris, num.trees = 5, seed = 10) + preds_r <- as.numeric(predict(rf, iris, seed = 20)$predictions) + + expect_equal(preds_cpp, preds_r) +}) + +test_that("Predictions are positive numbers", { + rf <- ranger_cpp(data = iris, depvarname = "Species", ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = iris, predict = "ranger_out.forest") + preds_cpp <- as.data.frame(fread("ranger_out.prediction"))[, 1] + expect_is(preds_cpp, "integer") + expect_true(all(preds_cpp > 0)) +}) + +test_that("Same result with default splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Species", ntree = 5, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Species ~ ., iris, num.trees = 5, seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with extratrees splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Species", ntree = 5, splitrule = 5, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Species ~ ., iris, num.trees = 5, splitrule = "extratrees", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) diff --git a/cpp_version/test/testthat/test_probability.R b/cpp_version/test/testthat/test_probability.R new file mode 100644 index 000000000..a2a2c0b3a --- /dev/null +++ b/cpp_version/test/testthat/test_probability.R @@ -0,0 +1,53 @@ + +context("ranger_cpp_probability") + +test_that("Prediction is equal to R version", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Species", probability = "", ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = iris, predict = "ranger_out.forest", probability = "", seed = 20) + preds_cpp <- as.matrix(fread("ranger_out.prediction")) + colnames(preds_cpp) <- NULL + + # R version + rf <- ranger(Species ~ ., iris, probability = TRUE, num.trees = 5, seed = 10) + preds_r <- predict(rf, iris, seed = 20)$predictions + colnames(preds_r) <- NULL + + expect_equal(round(preds_cpp, 4), round(preds_r, 4)) +}) + +test_that("Predictions are probabilites", { + rf <- ranger_cpp(data = iris, depvarname = "Species", probability = "", ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = iris, predict = "ranger_out.forest", probability = "") + preds_cpp <- as.matrix(fread("ranger_out.prediction")) + expect_is(preds_cpp, "matrix") + expect_equal(dim(preds_cpp), c(150, 3)) + expect_true(all(preds_cpp >= 0)) + expect_true(all(preds_cpp <= 1)) +}) + +test_that("Same result with default splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Species", probability = "", ntree = 5, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Species ~ ., iris, probability = TRUE, num.trees = 5, seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with extratrees splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Species", probability = "", ntree = 5, splitrule = 5, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Species ~ ., iris, probability = TRUE, num.trees = 5, splitrule = "extratrees", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) diff --git a/cpp_version/test/testthat/test_regression.R b/cpp_version/test/testthat/test_regression.R new file mode 100644 index 000000000..015955bc0 --- /dev/null +++ b/cpp_version/test/testthat/test_regression.R @@ -0,0 +1,98 @@ + +context("ranger_cpp_regression") + +test_that("Prediction is equal to R version", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Sepal.Length", treetype = 3, ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = iris, treetype = 3, predict = "ranger_out.forest", seed = 20) + preds_cpp <- as.data.frame(fread("ranger_out.prediction"))[, 1] + + # R version + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5, seed = 10) + preds_r <- as.numeric(predict(rf, iris, seed = 20)$predictions) + + expect_equal(round(preds_cpp, 4), round(preds_r, 4)) +}) + +test_that("Predictions are in range of original data", { + rf <- ranger_cpp(data = iris, depvarname = "Sepal.Width", treetype = 3, ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = iris, predict = "ranger_out.forest", treetype = 3) + preds_cpp <- as.data.frame(fread("ranger_out.prediction"))[, 1] + expect_is(preds_cpp, "numeric") + expect_true(all(preds_cpp >= min(iris$Sepal.Width))) + expect_true(all(preds_cpp <= max(iris$Sepal.Width))) +}) + +test_that("Same result with default splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Petal.Length", treetype = 3, ntree = 5, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Petal.Length ~ ., iris, num.trees = 5, seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with extratrees splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Sepal.Length", treetype = 3, ntree = 5, splitrule = 5, catvars = "Species", seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5, splitrule = "extratrees", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with maxstat splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Sepal.Length", treetype = 3, ntree = 5, splitrule = 4, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5, splitrule = "maxstat", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with beta splitting", { + # Generate data with 0..1 outcome + n <- 100 + p <- 4 + beta <- c(0, 1, 2, 3) + x <- round(replicate(p, runif(n)), 3) + y <- as.vector(x %*% beta) + y <- round((y-min(y))/(max(y)-min(y)), 3) + dat <- data.frame(y = y, x) + + # C++ version + rf <- ranger_cpp(data = dat, depvarname = "y", treetype = 3, ntree = 5, splitrule = 6, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(y ~ ., dat, num.trees = 5, splitrule = "beta", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with unordered splitting", { + # C++ version + rf <- ranger_cpp(data = iris, depvarname = "Sepal.Length", treetype = 3, ntree = 5, catvars = "Species", seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Sepal.Length ~ ., iris, num.trees = 5, respect.unordered.factors = "partition", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) diff --git a/cpp_version/test/testthat/test_survival.R b/cpp_version/test/testthat/test_survival.R new file mode 100644 index 000000000..0beb0e20d --- /dev/null +++ b/cpp_version/test/testthat/test_survival.R @@ -0,0 +1,76 @@ + +context("ranger_cpp_regression") + +test_that("Prediction is equal to R version", { + # C++ version + rf <- ranger_cpp(data = veteran, depvarname = "time", statusvarname = "status", treetype = 5, ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = veteran, treetype = 5, predict = "ranger_out.forest", seed = 20) + preds_cpp <- as.matrix(fread("ranger_out.prediction", skip = 4)) + dimnames(preds_cpp) <- NULL + + # R version + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, seed = 10) + preds_r <- predict(rf, veteran, seed = 20)$chf + + expect_equal(round(preds_cpp, 2), round(preds_r, 2)) +}) + +test_that("Predictions are positive numerics", { + rf <- ranger_cpp(data = veteran, depvarname = "time", statusvarname = "status", treetype = 5, ntree = 5, write = "", seed = 10) + pred <- ranger_cpp(data = veteran, predict = "ranger_out.forest", treetype = 5) + preds_cpp <- as.matrix(fread("ranger_out.prediction", skip = 4)) + expect_is(preds_cpp, "matrix") + expect_true(all(preds_cpp >= 0)) +}) + +test_that("Same result with default splitting", { + # C++ version + rf <- ranger_cpp(data = veteran, depvarname = "time", statusvarname = "status", treetype = 5, ntree = 5, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with extratrees splitting", { + # C++ version + rf <- ranger_cpp(data = veteran, depvarname = "time", statusvarname = "status", treetype = 5, ntree = 5, splitrule = 5, catvars = "celltype", seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, splitrule = "extratrees", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with maxstat splitting", { + # C++ version + rf <- ranger_cpp(data = veteran, depvarname = "time", statusvarname = "status", treetype = 5, ntree = 5, splitrule = 4, seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, splitrule = "maxstat", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) + +test_that("Same result with unordered splitting", { + # C++ version + rf <- ranger_cpp(data = veteran, depvarname = "time", statusvarname = "status", treetype = 5, ntree = 5, catvars = "celltype", seed = 10) + err_cpp <- grep("Overall OOB prediction error", rf, value = TRUE) + err_cpp <- as.numeric(gsub("[^0-9.]", "", err_cpp)) + + # R version + rf <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5, respect.unordered.factors = "partition", seed = 10) + err_r <- rf$prediction.error + + expect_equal(round(err_cpp, 4), round(err_r, 4)) +}) diff --git a/man/ranger.Rd b/man/ranger.Rd index f1a5bd91a..199866408 100644 --- a/man/ranger.Rd +++ b/man/ranger.Rd @@ -16,7 +16,8 @@ ranger(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, inbag = NULL, holdout = FALSE, quantreg = FALSE, oob.error = TRUE, num.threads = NULL, save.memory = FALSE, verbose = TRUE, seed = NULL, dependent.variable.name = NULL, - status.variable.name = NULL, classification = NULL) + status.variable.name = NULL, classification = NULL, x = NULL, + y = NULL) } \arguments{ \item{formula}{Object of class \code{formula} or \code{character} describing the model to fit. Interaction terms supported only for numerical variables.} @@ -84,6 +85,10 @@ ranger(formula = NULL, data = NULL, num.trees = 500, mtry = NULL, \item{status.variable.name}{Name of status variable, only applicable to survival data and needed if no formula given. Use 1 for event and 0 for censoring.} \item{classification}{Only needed if data is a matrix. Set to \code{TRUE} to grow a classification forest.} + +\item{x}{Predictor data (independent variables), alternative interface to data with formula or dependent.variable.name.} + +\item{y}{Response vector (dependent variable), alternative interface to data with formula or dependent.variable.name. For survival use a \code{Surv()} object or a matrix with time and status.} } \value{ Object of class \code{ranger} with elements @@ -148,7 +153,8 @@ See Nembrini et al. (2018) for details. This importance measure can be combined with the methods to estimate p-values in \code{\link{importance_pvalues}}. For a large number of variables and data frames as input data the formula interface can be slow or impossible to use. -Alternatively \code{dependent.variable.name} (and \code{status.variable.name} for survival) can be used. +Alternatively \code{dependent.variable.name} (and \code{status.variable.name} for survival) or \code{x} and \code{y} can be used. +Use \code{x} and \code{y} with a matrix for \code{x} to avoid conversions and save memory. Consider setting \code{save.memory = TRUE} if you encounter memory problems for very large datasets, but be aware that this option slows down the tree growing. For GWAS data consider combining \code{ranger} with the \code{GenABEL} package. @@ -190,8 +196,9 @@ require(survival) rg.veteran <- ranger(Surv(time, status) ~ ., data = veteran) plot(rg.veteran$unique.death.times, rg.veteran$survival[1,]) -## Alternative interface +## Alternative interfaces (same results) ranger(dependent.variable.name = "Species", data = iris) +ranger(y = iris[, 5], x = iris[, -5]) \dontrun{ ## Use GenABEL interface to read Plink data into R and grow a classification forest diff --git a/src/Data.cpp b/src/Data.cpp index f3ca090d6..ca85ba166 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -33,14 +33,16 @@ size_t Data::getVariableID(const std::string& variable_name) const { return (std::distance(variable_names.cbegin(), it)); } +// #nocov start (cannot be tested anymore because GenABEL not on CRAN) void Data::addSnpData(unsigned char* snp_data, size_t num_cols_snp) { num_cols = num_cols_no_snp + num_cols_snp; num_rows_rounded = roundToNextMultiple(num_rows, 4); this->snp_data = snp_data; } +// #nocov end // #nocov start -bool Data::loadFromFile(std::string filename) { +bool Data::loadFromFile(std::string filename, std::vector& dependent_variable_names) { bool result; @@ -67,11 +69,11 @@ bool Data::loadFromFile(std::string filename) { // Find out if comma, semicolon or whitespace seperated and call appropriate method if (header_line.find(",") != std::string::npos) { - result = loadFromFileOther(input_file, header_line, ','); + result = loadFromFileOther(input_file, header_line, dependent_variable_names, ','); } else if (header_line.find(";") != std::string::npos) { - result = loadFromFileOther(input_file, header_line, ';'); + result = loadFromFileOther(input_file, header_line, dependent_variable_names, ';'); } else { - result = loadFromFileWhitespace(input_file, header_line); + result = loadFromFileWhitespace(input_file, header_line, dependent_variable_names); } externalData = false; @@ -79,19 +81,36 @@ bool Data::loadFromFile(std::string filename) { return result; } -bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_line) { +bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_line, + std::vector& dependent_variable_names) { + + size_t num_dependent_variables = dependent_variable_names.size(); + std::vector dependent_varIDs; + dependent_varIDs.resize(num_dependent_variables); // Read header std::string header_token; std::stringstream header_line_stream(header_line); + size_t col = 0; while (header_line_stream >> header_token) { - variable_names.push_back(header_token); + bool is_dependent_var = false; + for (size_t i = 0; i < dependent_variable_names.size(); ++i) { + if (header_token == dependent_variable_names[i]) { + dependent_varIDs[i] = col; + is_dependent_var = true; + } + } + if (!is_dependent_var) { + variable_names.push_back(header_token); + } + ++col; } + num_cols = variable_names.size(); num_cols_no_snp = num_cols; // Read body - reserveMemory(); + reserveMemory(num_dependent_variables); bool error = false; std::string line; size_t row = 0; @@ -100,13 +119,26 @@ bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_ std::stringstream line_stream(line); size_t column = 0; while (readFromStream(line_stream, token)) { - set(column, row, token, error); + size_t column_x = column; + bool is_dependent_var = false; + for (size_t i = 0; i < dependent_varIDs.size(); ++i) { + if (column == dependent_varIDs[i]) { + set_y(i, row, token, error); + is_dependent_var = true; + break; + } else if (column > dependent_varIDs[i]) { + --column_x; + } + } + if (!is_dependent_var) { + set_x(column_x, row, token, error); + } ++column; } - if (column > num_cols) { + if (column > (num_cols + num_dependent_variables)) { throw std::runtime_error( std::string("Could not open input file. Too many columns in row ") + std::to_string(row) + std::string(".")); - } else if (column < num_cols) { + } else if (column < (num_cols + num_dependent_variables)) { throw std::runtime_error( std::string("Could not open input file. Too few columns in row ") + std::to_string(row) + std::string(". Are all values numeric?")); @@ -117,19 +149,36 @@ bool Data::loadFromFileWhitespace(std::ifstream& input_file, std::string header_ return error; } -bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator) { +bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, + std::vector& dependent_variable_names, char seperator) { + + size_t num_dependent_variables = dependent_variable_names.size(); + std::vector dependent_varIDs; + dependent_varIDs.resize(num_dependent_variables); // Read header std::string header_token; std::stringstream header_line_stream(header_line); + size_t col = 0; while (getline(header_line_stream, header_token, seperator)) { - variable_names.push_back(header_token); + bool is_dependent_var = false; + for (size_t i = 0; i < dependent_variable_names.size(); ++i) { + if (header_token == dependent_variable_names[i]) { + dependent_varIDs[i] = col; + is_dependent_var = true; + } + } + if (!is_dependent_var) { + variable_names.push_back(header_token); + } + ++col; } + num_cols = variable_names.size(); num_cols_no_snp = num_cols; // Read body - reserveMemory(); + reserveMemory(num_dependent_variables); bool error = false; std::string line; size_t row = 0; @@ -141,7 +190,21 @@ bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, while (getline(line_stream, token_string, seperator)) { std::stringstream token_stream(token_string); readFromStream(token_stream, token); - set(column, row, token, error); + + size_t column_x = column; + bool is_dependent_var = false; + for (size_t i = 0; i < dependent_varIDs.size(); ++i) { + if (column == dependent_varIDs[i]) { + set_y(i, row, token, error); + is_dependent_var = true; + break; + } else if (column > dependent_varIDs[i]) { + --column_x; + } + } + if (!is_dependent_var) { + set_x(column_x, row, token, error); + } ++column; } ++row; @@ -151,14 +214,15 @@ bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line, } // #nocov end -void Data::getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID, size_t start, size_t end) const { +void Data::getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID, size_t start, + size_t end) const { // All values for varID (no duplicates) for given sampleIDs if (getUnpermutedVarID(varID) < num_cols_no_snp) { - all_values.reserve(end-start); + all_values.reserve(end - start); for (size_t pos = start; pos < end; ++pos) { - all_values.push_back(get(sampleIDs[pos], varID)); + all_values.push_back(get_x(sampleIDs[pos], varID)); } std::sort(all_values.begin(), all_values.end()); all_values.erase(std::unique(all_values.begin(), all_values.end()), all_values.end()); @@ -168,13 +232,14 @@ void Data::getAllValues(std::vector& all_values, std::vector& sa } } -void Data::getMinMaxValues(double& min, double&max, std::vector& sampleIDs, size_t varID, size_t start, size_t end) const { +void Data::getMinMaxValues(double& min, double&max, std::vector& sampleIDs, size_t varID, size_t start, + size_t end) const { if (sampleIDs.size() > 0) { - min = get(sampleIDs[start], varID); + min = get_x(sampleIDs[start], varID); max = min; } for (size_t pos = start; pos < end; ++pos) { - double value = get(sampleIDs[pos], varID); + double value = get_x(sampleIDs[pos], varID); if (value < min) { min = value; } @@ -195,14 +260,15 @@ void Data::sort() { // Get all unique values std::vector unique_values(num_rows); for (size_t row = 0; row < num_rows; ++row) { - unique_values[row] = get(row, col); + unique_values[row] = get_x(row, col); } std::sort(unique_values.begin(), unique_values.end()); unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); // Get index of unique value for (size_t row = 0; row < num_rows; ++row) { - size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); + size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get_x(row, col)) + - unique_values.begin(); index_data[col * num_rows + row] = idx; } @@ -215,13 +281,13 @@ void Data::sort() { } // TODO: Implement ordering for multiclass and survival -void Data::orderSnpLevels(std::string dependent_variable_name, bool corrected_importance) { +// #nocov start (cannot be tested anymore because GenABEL not on CRAN) +void Data::orderSnpLevels(bool corrected_importance) { // Stop if now SNP data if (snp_data == 0) { return; } - size_t dependent_varID = getVariableID(dependent_variable_name); size_t num_snps; if (corrected_importance) { num_snps = 2 * (num_cols - num_cols_no_snp); @@ -256,7 +322,7 @@ void Data::orderSnpLevels(std::string dependent_variable_name, bool corrected_im value = 0; } - means[value] += get(row, dependent_varID); + means[value] += get_y(row, 0); ++counts[value]; } @@ -270,6 +336,7 @@ void Data::orderSnpLevels(std::string dependent_variable_name, bool corrected_im order_snps = true; } +// #nocov end } // namespace ranger diff --git a/src/Data.h b/src/Data.h index b4519e229..7c0ebc993 100644 --- a/src/Data.h +++ b/src/Data.h @@ -31,22 +31,29 @@ class Data { virtual ~Data() = default; - virtual double get(size_t row, size_t col) const = 0; + virtual double get_x(size_t row, size_t col) const = 0; + virtual double get_y(size_t row, size_t col) const = 0; size_t getVariableID(const std::string& variable_name) const; - virtual void reserveMemory() = 0; - virtual void set(size_t col, size_t row, double value, bool& error) = 0; + virtual void reserveMemory(size_t y_cols) = 0; + + virtual void set_x(size_t col, size_t row, double value, bool& error) = 0; + virtual void set_y(size_t col, size_t row, double value, bool& error) = 0; void addSnpData(unsigned char* snp_data, size_t num_cols_snp); - bool loadFromFile(std::string filename); - bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line); - bool loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator); + bool loadFromFile(std::string filename, std::vector& dependent_variable_names); + bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line, + std::vector& dependent_variable_names); + bool loadFromFileOther(std::ifstream& input_file, std::string header_line, + std::vector& dependent_variable_names, char seperator); - void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID, size_t start, size_t end) const; + void getAllValues(std::vector& all_values, std::vector& sampleIDs, size_t varID, size_t start, + size_t end) const; - void getMinMaxValues(double& min, double&max, std::vector& sampleIDs, size_t varID, size_t start, size_t end) const; + void getMinMaxValues(double& min, double&max, std::vector& sampleIDs, size_t varID, size_t start, + size_t end) const; size_t getIndex(size_t row, size_t col) const { // Use permuted data for corrected impurity importance @@ -63,6 +70,7 @@ class Data { } } + // #nocov start (cannot be tested anymore because GenABEL not on CRAN) size_t getSnp(size_t row, size_t col, size_t col_permuted) const { // Get data out of snp storage. -1 because of GenABEL coding. size_t idx = (col - num_cols_no_snp) * num_rows_rounded + row; @@ -76,13 +84,14 @@ class Data { // Order SNPs if (order_snps) { if (col_permuted >= num_cols) { - result = snp_order[col_permuted + no_split_variables.size() - 2 * num_cols_no_snp][result]; + result = snp_order[col_permuted - 2 * num_cols_no_snp][result]; } else { result = snp_order[col - num_cols_no_snp][result]; } } return result; } + // #nocov end double getUniqueDataValue(size_t varID, size_t index) const { // Use permuted data for corrected impurity importance @@ -114,7 +123,7 @@ class Data { void sort(); - void orderSnpLevels(std::string dependent_variable_name, bool corrected_importance); + void orderSnpLevels(bool corrected_importance); const std::vector& getVariableNames() const { return variable_names; @@ -136,15 +145,6 @@ class Data { } } - const std::vector& getNoSplitVariables() const noexcept { - return no_split_variables; - } - - void addNoSplitVariable(size_t varID) { - no_split_variables.push_back(varID); - std::sort(no_split_variables.begin(), no_split_variables.end()); - } - std::vector& getIsOrderedVariable() noexcept { return is_ordered_variable; } @@ -182,16 +182,11 @@ class Data { size_t getUnpermutedVarID(size_t varID) const { if (varID >= num_cols) { varID -= num_cols; - - for (auto& skip : no_split_variables) { - if (varID >= skip) { - ++varID; - } - } } return varID; } + // #nocov start (cannot be tested anymore because GenABEL not on CRAN) const std::vector>& getSnpOrder() const { return snp_order; } @@ -200,6 +195,7 @@ class Data { this->snp_order = snp_order; order_snps = true; } + // #nocov end protected: std::vector variable_names; @@ -216,9 +212,6 @@ class Data { std::vector> unique_data_values; size_t max_num_unique_values; - // Variable to not split at (only dependent_varID for non-survival trees) - std::vector no_split_variables; - // For each varID true if ordered std::vector is_ordered_variable; diff --git a/src/DataChar.cpp b/src/DataChar.cpp deleted file mode 100644 index 24169c880..000000000 --- a/src/DataChar.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/*------------------------------------------------------------------------------- - This file is part of ranger. - - Copyright (c) [2014-2018] [Marvin N. Wright] - - This software may be modified and distributed under the terms of the MIT license. - - Please note that the C++ core of ranger is distributed under MIT license and the - R package "ranger" under GPL3 license. - #-------------------------------------------------------------------------------*/ - -// Ignore in coverage report (not used in R package) -// #nocov start -#include -#include -#include - -#include "DataChar.h" - -namespace ranger { - -DataChar::DataChar(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols, - bool& error) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_snp = num_cols; - - reserveMemory(); - - // Save data and report errors - for (size_t i = 0; i < num_cols; ++i) { - for (size_t j = 0; j < num_rows; ++j) { - double value = data_double[i * num_rows + j]; - if (value > CHAR_MAX || value < CHAR_MIN) { - error = true; - } - if (floor(value) != ceil(value)) { - error = true; - } - data[i * num_rows + j] = value; - } - } -} - -} // namespace ranger - -// #nocov end diff --git a/src/DataChar.h b/src/DataChar.h index b0f4e0dae..2302c2d48 100644 --- a/src/DataChar.h +++ b/src/DataChar.h @@ -15,9 +15,10 @@ #define DATACHAR_H_ #include -#include +#include #include "globals.h" +#include "utility.h" #include "Data.h" namespace ranger { @@ -25,14 +26,13 @@ namespace ranger { class DataChar: public Data { public: DataChar() = default; - DataChar(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols, bool& error); DataChar(const DataChar&) = delete; DataChar& operator=(const DataChar&) = delete; virtual ~DataChar() override = default; - double get(size_t row, size_t col) const override { + double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { @@ -41,28 +41,32 @@ class DataChar: public Data { } if (col < num_cols_no_snp) { - return data[col * num_rows + row]; + return x[col * num_rows + row]; } else { return getSnp(row, col, col_permuted); } } - void reserveMemory() override { - data.resize(num_cols * num_rows); + double get_y(size_t row, size_t col) const override { + return y[col * num_rows + row]; } - void set(size_t col, size_t row, double value, bool& error) override { - if (value > CHAR_MAX || value < CHAR_MIN) { - error = true; - } - if (floor(value) != ceil(value)) { - error = true; - } - data[col * num_rows + row] = value; + void reserveMemory(size_t y_cols) override { + x.resize(num_cols * num_rows); + y.resize(y_cols * num_rows); + } + + void set_x(size_t col, size_t row, double value, bool& error) override { + x[col * num_rows + row] = value; + } + + void set_y(size_t col, size_t row, double value, bool& error) override { + y[col * num_rows + row] = value; } private: - std::vector data; + std::vector x; + std::vector y; }; } // namespace ranger diff --git a/src/DataDouble.h b/src/DataDouble.h index aa8713053..aa2e60bdb 100644 --- a/src/DataDouble.h +++ b/src/DataDouble.h @@ -9,6 +9,8 @@ R package "ranger" under GPL3 license. #-------------------------------------------------------------------------------*/ +// Ignore in coverage report (not used in R package) +// #nocov start #ifndef DATADOUBLE_H_ #define DATADOUBLE_H_ @@ -24,20 +26,13 @@ namespace ranger { class DataDouble: public Data { public: DataDouble() = default; - DataDouble(std::vector data, std::vector variable_names, size_t num_rows, size_t num_cols) : - data { std::move(data) } { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_snp = num_cols; - } DataDouble(const DataDouble&) = delete; DataDouble& operator=(const DataDouble&) = delete; virtual ~DataDouble() override = default; - double get(size_t row, size_t col) const override { + double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { @@ -46,24 +41,35 @@ class DataDouble: public Data { } if (col < num_cols_no_snp) { - return data[col * num_rows + row]; + return x[col * num_rows + row]; } else { return getSnp(row, col, col_permuted); } } - void reserveMemory() override { - data.resize(num_cols * num_rows); + double get_y(size_t row, size_t col) const override { + return y[col * num_rows + row]; + } + + void reserveMemory(size_t y_cols) override { + x.resize(num_cols * num_rows); + y.resize(y_cols * num_rows); + } + + void set_x(size_t col, size_t row, double value, bool& error) override { + x[col * num_rows + row] = value; } - void set(size_t col, size_t row, double value, bool& error) override { - data[col * num_rows + row] = value; + void set_y(size_t col, size_t row, double value, bool& error) override { + y[col * num_rows + row] = value; } private: - std::vector data; + std::vector x; + std::vector y; }; } // namespace ranger #endif /* DATADOUBLE_H_ */ +// #nocov end diff --git a/src/DataFloat.cpp b/src/DataFloat.cpp deleted file mode 100644 index a8027abbb..000000000 --- a/src/DataFloat.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/*------------------------------------------------------------------------------- - This file is part of ranger. - - Copyright (c) [2014-2018] [Marvin N. Wright] - - This software may be modified and distributed under the terms of the MIT license. - - Please note that the C++ core of ranger is distributed under MIT license and the - R package "ranger" under GPL3 license. - #-------------------------------------------------------------------------------*/ - -// Ignore in coverage report (not used in R package) -// #nocov start -#include - -#include "DataFloat.h" - -namespace ranger { - -DataFloat::DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_snp = num_cols; - - reserveMemory(); - for (size_t i = 0; i < num_cols; ++i) { - for (size_t j = 0; j < num_rows; ++j) { - data[i * num_rows + j] = (float) data_double[i * num_rows + j]; - } - } -} - -// #nocov end - -}// namespace ranger diff --git a/src/DataFloat.h b/src/DataFloat.h index a3c4da4f2..67989bb72 100644 --- a/src/DataFloat.h +++ b/src/DataFloat.h @@ -15,8 +15,10 @@ #define DATAFLOAT_H_ #include +#include #include "globals.h" +#include "utility.h" #include "Data.h" namespace ranger { @@ -24,14 +26,13 @@ namespace ranger { class DataFloat: public Data { public: DataFloat() = default; - DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols); DataFloat(const DataFloat&) = delete; DataFloat& operator=(const DataFloat&) = delete; virtual ~DataFloat() override = default; - double get(size_t row, size_t col) const override { + double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { @@ -40,22 +41,32 @@ class DataFloat: public Data { } if (col < num_cols_no_snp) { - return data[col * num_rows + row]; + return x[col * num_rows + row]; } else { return getSnp(row, col, col_permuted); } } - void reserveMemory() override { - data.resize(num_cols * num_rows); + double get_y(size_t row, size_t col) const override { + return y[col * num_rows + row]; } - void set(size_t col, size_t row, double value, bool& error) override { - data[col * num_rows + row] = value; + void reserveMemory(size_t y_cols) override { + x.resize(num_cols * num_rows); + y.resize(y_cols * num_rows); + } + + void set_x(size_t col, size_t row, double value, bool& error) override { + x[col * num_rows + row] = value; + } + + void set_y(size_t col, size_t row, double value, bool& error) override { + y[col * num_rows + row] = value; } private: - std::vector data; + std::vector x; + std::vector y; }; } // namespace ranger diff --git a/src/DataRcpp.h b/src/DataRcpp.h index 5b3c4c714..a4b8db08f 100644 --- a/src/DataRcpp.h +++ b/src/DataRcpp.h @@ -39,8 +39,9 @@ namespace ranger { class DataRcpp: public Data { public: DataRcpp() = default; - DataRcpp(Rcpp::NumericMatrix& data, std::vector variable_names, size_t num_rows, size_t num_cols) { - this->data = data; + DataRcpp(Rcpp::NumericMatrix& x, Rcpp::NumericMatrix& y, std::vector variable_names, size_t num_rows, size_t num_cols) { + this->x = x; + this->y = y; this->variable_names = variable_names; this->num_rows = num_rows; this->num_cols = num_cols; @@ -52,7 +53,7 @@ class DataRcpp: public Data { virtual ~DataRcpp() override = default; - double get(size_t row, size_t col) const override { + double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance size_t col_permuted = col; if (col >= num_cols) { @@ -61,22 +62,33 @@ class DataRcpp: public Data { } if (col < num_cols_no_snp) { - return data[col * num_rows + row]; + return x[col * num_rows + row]; } else { return getSnp(row, col, col_permuted); } } - void reserveMemory() override { + double get_y(size_t row, size_t col) const override { + return y[col * num_rows + row]; + } + + // #nocov start + void reserveMemory(size_t y_cols) override { // Not needed } - void set(size_t col, size_t row, double value, bool& error) override { - data[col * num_rows + row] = value; + void set_x(size_t col, size_t row, double value, bool& error) override { + x[col * num_rows + row] = value; + } + + void set_y(size_t col, size_t row, double value, bool& error) override { + y[col * num_rows + row] = value; } + // #nocov end private: - Rcpp::NumericMatrix data; + Rcpp::NumericMatrix x; + Rcpp::NumericMatrix y; }; } // namespace ranger diff --git a/src/DataSparse.cpp b/src/DataSparse.cpp index 3b3d0add1..779a54d6b 100644 --- a/src/DataSparse.cpp +++ b/src/DataSparse.cpp @@ -30,10 +30,11 @@ namespace ranger { -DataSparse::DataSparse(Eigen::SparseMatrix& data, std::vector variable_names, size_t num_rows, +DataSparse::DataSparse(Eigen::SparseMatrix& x, Rcpp::NumericMatrix& y, std::vector variable_names, size_t num_rows, size_t num_cols) : - data { } { - this->data.swap(data); + x { }{ + this->x.swap(x); + this->y = y; this->variable_names = variable_names; this->num_rows = num_rows; this->num_cols = num_cols; diff --git a/src/DataSparse.h b/src/DataSparse.h index a56648f0e..af5032c54 100644 --- a/src/DataSparse.h +++ b/src/DataSparse.h @@ -40,7 +40,7 @@ class DataSparse: public Data { public: DataSparse() = default; - DataSparse(Eigen::SparseMatrix& data, std::vector variable_names, size_t num_rows, + DataSparse(Eigen::SparseMatrix& x, Rcpp::NumericMatrix& y, std::vector variable_names, size_t num_rows, size_t num_cols); DataSparse(const DataSparse&) = delete; @@ -48,26 +48,36 @@ class DataSparse: public Data { virtual ~DataSparse() override = default; - double get(size_t row, size_t col) const override { + double get_x(size_t row, size_t col) const override { // Use permuted data for corrected impurity importance - size_t col_permuted = col; if (col >= num_cols) { col = getUnpermutedVarID(col); row = getPermutedSampleID(row); } - return data.coeff(row, col); + return x.coeff(row, col); + } + + double get_y(size_t row, size_t col) const override { + return y[col * num_rows + row]; } - void reserveMemory() override { - data.resize(num_rows, num_cols); + // #nocov start + void reserveMemory(size_t y_cols) override { + // Not needed } - void set(size_t col, size_t row, double value, bool& error) override { - data.coeffRef(row, col) = value; + void set_x(size_t col, size_t row, double value, bool& error) override { + x.coeffRef(row, col) = value; + } + + void set_y(size_t col, size_t row, double value, bool& error) override { + y[col * num_rows + row] = value; } + // #nocov end private: - Eigen::SparseMatrix data; + Eigen::SparseMatrix x; + Rcpp::NumericMatrix y; }; } // namespace ranger diff --git a/src/Forest.cpp b/src/Forest.cpp index 12b13a7ad..89341738b 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -29,40 +29,15 @@ namespace ranger { Forest::Forest() : - verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_variables(0), num_independent_variables( - 0), seed(0), dependent_varID(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement( - true), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( - { 1 }), holdout(false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth( + verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_independent_variables(0), seed(0), num_samples( + 0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement(true), memory_saving_splitting( + false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction( { 1 }), holdout( + false), prediction_type(DEFAULT_PREDICTIONTYPE), num_random_splits(DEFAULT_NUM_RANDOM_SPLITS), max_depth( DEFAULT_MAXDEPTH), alpha(DEFAULT_ALPHA), minprop(DEFAULT_MINPROP), num_threads(DEFAULT_NUM_THREADS), data { }, overall_prediction_error( NAN), importance_mode(DEFAULT_IMPORTANCE_MODE), progress(0) { } // #nocov start -std::unique_ptr load_data_from_file(const std::string& data_path, const MemoryMode memory_mode, - std::ostream* verbose_out = nullptr) { - std::unique_ptr result { }; - switch (memory_mode) { - case MEM_DOUBLE: - result = make_unique(); - break; - case MEM_FLOAT: - result = make_unique(); - break; - case MEM_CHAR: - result = make_unique(); - break; - } - - if (verbose_out) - *verbose_out << "Loading input file: " << data_path << "." << std::endl; - bool found_rounding_error = result->loadFromFile(data_path); - if (found_rounding_error && verbose_out) { - *verbose_out << "Warning: Rounding or Integer overflow occurred. Use FLOAT or DOUBLE precision to avoid this." - << std::endl; - } - return result; -} - void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, @@ -74,6 +49,14 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode this->verbose_out = verbose_out; + if (!dependent_variable_name.empty()) { + if (status_variable_name.empty()) { + this->dependent_variable_names = {dependent_variable_name}; + } else { + this->dependent_variable_names = {dependent_variable_name, status_variable_name}; + } + } + // Set prediction mode bool prediction_mode = false; if (!load_forest_filename.empty()) { @@ -90,12 +73,15 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode } std::vector sample_fraction_vector = { sample_fraction }; + if (prediction_mode) { + loadDependentVariableNamesFromFile(load_forest_filename); + } + // Call other init function - init(dependent_variable_name, memory_mode, load_data_from_file(input_file, memory_mode, verbose_out), mtry, - output_prefix, num_trees, seed, num_threads, importance_mode, min_node_size, status_variable_name, - prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, - predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, false, - max_depth); + init(memory_mode, loadDataFromFile(input_file), mtry, output_prefix, num_trees, seed, num_threads, importance_mode, + min_node_size, prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, + splitrule, predict_all, sample_fraction_vector, alpha, minprop, holdout, prediction_type, num_random_splits, + false, max_depth); if (prediction_mode) { loadFromFile(load_forest_filename); @@ -111,7 +97,7 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode std::vector> split_select_weights; split_select_weights.resize(1); loadDoubleVectorFromFile(split_select_weights[0], split_select_weights_file); - if (split_select_weights[0].size() != num_variables - 1) { + if (split_select_weights[0].size() != num_independent_variables) { throw std::runtime_error("Number of split select weights is not equal to number of independent variables."); } setSplitWeightVector(split_select_weights); @@ -146,22 +132,21 @@ void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode } // #nocov end -void Forest::initR(std::string dependent_variable_name, std::unique_ptr input_data, uint mtry, uint num_trees, - std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, +void Forest::initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, + uint num_threads, ImportanceMode importance_mode, uint min_node_size, std::vector>& split_select_weights, const std::vector& always_split_variable_names, - std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, - bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, - PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth) { + bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, + bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, + std::vector>& manual_inbag, bool predict_all, bool keep_inbag, + std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, + uint num_random_splits, bool order_snps, uint max_depth) { this->verbose_out = verbose_out; // Call other init function - init(dependent_variable_name, MEM_DOUBLE, std::move(input_data), mtry, "", num_trees, seed, num_threads, - importance_mode, min_node_size, status_variable_name, prediction_mode, sample_with_replacement, - unordered_variable_names, memory_saving_splitting, splitrule, predict_all, sample_fraction, alpha, minprop, - holdout, prediction_type, num_random_splits, order_snps, max_depth); + init(MEM_DOUBLE, std::move(input_data), mtry, "", num_trees, seed, num_threads, importance_mode, min_node_size, + prediction_mode, sample_with_replacement, unordered_variable_names, memory_saving_splitting, splitrule, + predict_all, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth); // Set variables to be always considered for splitting if (!always_split_variable_names.empty()) { @@ -190,12 +175,12 @@ void Forest::initR(std::string dependent_variable_name, std::unique_ptr in this->keep_inbag = keep_inbag; } -void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, std::unique_ptr input_data, - uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, - uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, - PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth) { +void Forest::init(MemoryMode memory_mode, std::unique_ptr input_data, uint mtry, std::string output_prefix, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, + bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, + double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, + uint max_depth) { // Initialize data with memmode this->data = std::move(input_data); @@ -242,23 +227,14 @@ void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, s // Set number of samples and variables num_samples = data->getNumRows(); - num_variables = data->getNumCols(); - - // Convert dependent variable name to ID - if (!prediction_mode && !dependent_variable_name.empty()) { - dependent_varID = data->getVariableID(dependent_variable_name); - } + num_independent_variables = data->getNumCols(); // Set unordered factor variables if (!prediction_mode) { data->setIsOrderedVariable(unordered_variable_names); } - data->addNoSplitVariable(dependent_varID); - - initInternal(status_variable_name); - - num_independent_variables = num_variables - data->getNoSplitVariables().size(); + initInternal(); // Init split select weights split_select_weights.push_back(std::vector()); @@ -267,7 +243,7 @@ void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, s manual_inbag.push_back(std::vector()); // Check if mtry is in valid range - if (this->mtry > num_variables - 1) { + if (this->mtry > num_independent_variables) { throw std::runtime_error("mtry can not be larger than number of variables in data."); } @@ -283,7 +259,7 @@ void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, s // Order SNP levels if in "order" splitting if (!prediction_mode && order_snps) { - data->orderSnpLevels(dependent_variable_name, (importance_mode == IMP_GINI_CORRECTED)); + data->orderSnpLevels((importance_mode == IMP_GINI_CORRECTED)); } } @@ -325,8 +301,9 @@ void Forest::writeOutput() { *verbose_out << std::endl; writeOutputInternal(); if (verbose_out) { - *verbose_out << "Dependent variable name: " << data->getVariableNames()[dependent_varID] << std::endl; - *verbose_out << "Dependent variable ID: " << dependent_varID << std::endl; + if (dependent_variable_names.size() >= 1) { + *verbose_out << "Dependent variable name: " << dependent_variable_names[0] << std::endl; + } *verbose_out << "Number of trees: " << num_trees << std::endl; *verbose_out << "Sample size: " << num_samples << std::endl; *verbose_out << "Number of independent variables: " << num_independent_variables << std::endl; @@ -375,13 +352,7 @@ void Forest::writeImportanceFile() { // Write importance to file for (size_t i = 0; i < variable_importance.size(); ++i) { - size_t varID = i; - for (auto& skip : data->getNoSplitVariables()) { - if (varID >= skip) { - ++varID; - } - } - std::string variable_name = data->getVariableNames()[varID]; + std::string variable_name = data->getVariableNames()[i]; importance_file << variable_name << ": " << variable_importance[i] << std::endl; } @@ -400,8 +371,18 @@ void Forest::saveToFile() { throw std::runtime_error("Could not write to output file: " + filename + "."); } - // Write dependent_varID - outfile.write((char*) &dependent_varID, sizeof(dependent_varID)); + // Write dependent variable names + uint num_dependent_variables = dependent_variable_names.size(); + if (num_dependent_variables >= 1) { + outfile.write((char*) &num_dependent_variables, sizeof(num_dependent_variables)); + for (auto& var_name : dependent_variable_names) { + size_t length = var_name.size(); + outfile.write((char*) &length, sizeof(length)); + outfile.write((char*) var_name.c_str(), length * sizeof(char)); + } + } else { + throw std::runtime_error("Missing dependent variable name."); + } // Write num_trees outfile.write((char*) &num_trees, sizeof(num_trees)); @@ -457,10 +438,10 @@ void Forest::grow() { tree_manual_inbag = &manual_inbag[0]; } - trees[i]->init(data.get(), mtry, dependent_varID, num_samples, tree_seed, &deterministic_varIDs, - &split_select_varIDs, tree_split_select_weights, importance_mode, min_node_size, sample_with_replacement, - memory_saving_splitting, splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, - minprop, holdout, num_random_splits, max_depth); + trees[i]->init(data.get(), mtry, num_samples, tree_seed, &deterministic_varIDs, &split_select_varIDs, + tree_split_select_weights, importance_mode, min_node_size, sample_with_replacement, memory_saving_splitting, + splitrule, &case_weights, tree_manual_inbag, keep_inbag, &sample_fraction, alpha, minprop, holdout, + num_random_splits, max_depth); } // Init variable importance @@ -468,6 +449,7 @@ void Forest::grow() { // Grow trees in multiple threads #ifdef OLD_WIN_R_BUILD + // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); @@ -476,6 +458,7 @@ void Forest::grow() { progress++; showProgress("Growing trees..", start_time, lap_time); } + // #nocov end #else progress = 0; #ifdef R_BUILD @@ -531,6 +514,7 @@ void Forest::predict() { // Predict trees in multiple threads and join the threads with the main thread #ifdef OLD_WIN_R_BUILD + // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); @@ -545,6 +529,7 @@ void Forest::predict() { for (size_t sample_idx = 0; sample_idx < data->getNumRows(); ++sample_idx) { predictInternal(sample_idx); } + // #nocov end #else progress = 0; #ifdef R_BUILD @@ -588,6 +573,7 @@ void Forest::computePredictionError() { // Predict trees in multiple threads #ifdef OLD_WIN_R_BUILD + // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); @@ -596,6 +582,7 @@ void Forest::computePredictionError() { progress++; showProgress("Predicting..", start_time, lap_time); } + // #nocov end #else std::vector threads; threads.reserve(num_threads); @@ -623,6 +610,7 @@ void Forest::computePermutationImportance() { // Compute tree permutation importance in multiple threads #ifdef OLD_WIN_R_BUILD + // #nocov start progress = 0; clock_t start_time = clock(); clock_t lap_time = clock(); @@ -640,6 +628,7 @@ void Forest::computePermutationImportance() { progress++; showProgress("Computing permutation importance..", start_time, lap_time); } + // #nocov end #else progress = 0; #ifdef R_BUILD @@ -812,28 +801,85 @@ void Forest::loadFromFile(std::string filename) { if (verbose_out) *verbose_out << "Loading forest from file " << filename << "." << std::endl; -// Open file for reading + // Open file for reading std::ifstream infile; infile.open(filename, std::ios::binary); if (!infile.good()) { throw std::runtime_error("Could not read from input file: " + filename + "."); } -// Read dependent_varID and num_trees - infile.read((char*) &dependent_varID, sizeof(dependent_varID)); + // Skip dependent variable names (already read) + uint num_dependent_variables; + infile.read((char*) &num_dependent_variables, sizeof(num_dependent_variables)); + for (size_t i = 0; i < num_dependent_variables; ++i) { + size_t length; + infile.read((char*) &length, sizeof(size_t)); + infile.ignore(length); + } + + // Read num_trees infile.read((char*) &num_trees, sizeof(num_trees)); -// Read is_ordered_variable + // Read is_ordered_variable readVector1D(data->getIsOrderedVariable(), infile); -// Read tree data. This is different for tree types -> virtual function + // Read tree data. This is different for tree types -> virtual function loadFromFileInternal(infile); infile.close(); -// Create thread ranges + // Create thread ranges equalSplit(thread_ranges, 0, num_trees - 1, num_threads); } + +void Forest::loadDependentVariableNamesFromFile(std::string filename) { + + // Open file for reading + std::ifstream infile; + infile.open(filename, std::ios::binary); + if (!infile.good()) { + throw std::runtime_error("Could not read from input file: " + filename + "."); + } + + // Read dependent variable names + uint num_dependent_variables = 0; + infile.read((char*) &num_dependent_variables, sizeof(num_dependent_variables)); + for (size_t i = 0; i < num_dependent_variables; ++i) { + size_t length; + infile.read((char*) &length, sizeof(size_t)); + char* temp = new char[length + 1]; + infile.read((char*) temp, length * sizeof(char)); + temp[length] = '\0'; + dependent_variable_names.push_back(temp); + delete[] temp; + } + + infile.close(); +} + +std::unique_ptr Forest::loadDataFromFile(const std::string& data_path) { + std::unique_ptr result { }; + switch (memory_mode) { + case MEM_DOUBLE: + result = make_unique(); + break; + case MEM_FLOAT: + result = make_unique(); + break; + case MEM_CHAR: + result = make_unique(); + break; + } + + if (verbose_out) + *verbose_out << "Loading input file: " << data_path << "." << std::endl; + bool found_rounding_error = result->loadFromFile(data_path, dependent_variable_names); + if (found_rounding_error && verbose_out) { + *verbose_out << "Warning: Rounding or Integer overflow occurred. Use FLOAT or DOUBLE precision to avoid this." + << std::endl; + } + return result; +} // #nocov end void Forest::setSplitWeightVector(std::vector>& split_select_weights) { @@ -870,17 +916,10 @@ void Forest::setSplitWeightVector(std::vector>& split_select double weight = split_select_weights[i][j]; if (i == 0) { - size_t varID = j; - for (auto& skip : data->getNoSplitVariables()) { - if (varID >= skip) { - ++varID; - } - } - if (weight == 1) { - deterministic_varIDs.push_back(varID); + deterministic_varIDs.push_back(j); } else if (weight < 1 && weight > 0) { - this->split_select_varIDs[j] = varID; + this->split_select_varIDs[j] = j; this->split_select_weights[i][j] = weight; } else if (weight == 0) { ++num_zero_weights; @@ -903,24 +942,19 @@ void Forest::setSplitWeightVector(std::vector>& split_select std::copy_n(sw->begin(), num_independent_variables, sw->begin() + num_independent_variables); for (size_t k = 0; k < num_independent_variables; ++k) { - split_select_varIDs[num_independent_variables + k] = num_variables + k; + split_select_varIDs[num_independent_variables + k] = num_independent_variables + k; } size_t num_deterministic_varIDs = deterministic_varIDs.size(); for (size_t k = 0; k < num_deterministic_varIDs; ++k) { - size_t varID = deterministic_varIDs[k]; - for (auto& skip : data->getNoSplitVariables()) { - if (varID >= skip) { - --varID; - } - } - deterministic_varIDs.push_back(varID + num_variables); + deterministic_varIDs.push_back(k + num_independent_variables); } } } if (num_weights - deterministic_varIDs.size() - num_zero_weights < mtry) { - throw std::runtime_error("Too many zeros or ones in split select weights. Need at least mtry variables to split at."); + throw std::runtime_error( + "Too many zeros or ones in split select weights. Need at least mtry variables to split at."); } } @@ -942,18 +976,13 @@ void Forest::setAlwaysSplitVariables(const std::vector& always_spli if (importance_mode == IMP_GINI_CORRECTED) { size_t num_deterministic_varIDs = deterministic_varIDs.size(); for (size_t k = 0; k < num_deterministic_varIDs; ++k) { - size_t varID = deterministic_varIDs[k]; - for (auto& skip : data->getNoSplitVariables()) { - if (varID >= skip) { - --varID; - } - } - deterministic_varIDs.push_back(varID + num_variables); + deterministic_varIDs.push_back(k + num_independent_variables); } } } #ifdef OLD_WIN_R_BUILD +// #nocov start void Forest::showProgress(std::string operation, clock_t start_time, clock_t& lap_time) { // Check for user interrupt @@ -973,6 +1002,7 @@ void Forest::showProgress(std::string operation, clock_t start_time, clock_t& la lap_time = clock(); } } +// #nocov end #else void Forest::showProgress(std::string operation, size_t max_progress) { using std::chrono::steady_clock; diff --git a/src/Forest.h b/src/Forest.h index b4f38f32e..b73c5bdbc 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -48,22 +48,21 @@ class Forest { const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, double sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, uint max_depth); - void initR(std::string dependent_variable_name, std::unique_ptr input_data, uint mtry, uint num_trees, - std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + void initR(std::unique_ptr input_data, uint mtry, uint num_trees, std::ostream* verbose_out, uint seed, + uint num_threads, ImportanceMode importance_mode, uint min_node_size, std::vector>& split_select_weights, - const std::vector& always_split_variable_names, std::string status_variable_name, - bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, - bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, - std::vector>& manual_inbag, bool predict_all, bool keep_inbag, - std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, - uint num_random_splits, bool order_snps, uint max_depth); - void init(std::string dependent_variable_name, MemoryMode memory_mode, std::unique_ptr input_data, uint mtry, - std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, - uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + const std::vector& always_split_variable_names, bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - bool predict_all, std::vector& sample_fraction, double alpha, double minprop, bool holdout, + std::vector& case_weights, std::vector>& manual_inbag, bool predict_all, + bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, bool order_snps, uint max_depth); - virtual void initInternal(std::string status_variable_name) = 0; + void init(MemoryMode memory_mode, std::unique_ptr input_data, uint mtry, std::string output_prefix, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + bool prediction_mode, bool sample_with_replacement, const std::vector& unordered_variable_names, + bool memory_saving_splitting, SplitRule splitrule, bool predict_all, std::vector& sample_fraction, + double alpha, double minprop, bool holdout, PredictionType prediction_type, uint num_random_splits, + bool order_snps, uint max_depth); + virtual void initInternal() = 0; // Grow or predict void run(bool verbose, bool compute_oob_error); @@ -109,9 +108,6 @@ class Forest { const std::vector>>& getPredictions() const { return predictions; } - size_t getDependentVarId() const { - return dependent_varID; - } size_t getNumTrees() const { return num_trees; } @@ -165,6 +161,10 @@ class Forest { // Load forest from file void loadFromFile(std::string filename); virtual void loadFromFileInternal(std::ifstream& infile) = 0; + void loadDependentVariableNamesFromFile(std::string filename); + + // Load data from file + std::unique_ptr loadDataFromFile(const std::string& data_path); // Set split select weights and variables to be always considered for splitting void setSplitWeightVector(std::vector>& split_select_weights); @@ -180,13 +180,12 @@ class Forest { // Verbose output stream, cout if verbose==true, logfile if not std::ostream* verbose_out; + std::vector dependent_variable_names; // time,status for survival size_t num_trees; uint mtry; uint min_node_size; - size_t num_variables; size_t num_independent_variables; uint seed; - size_t dependent_varID; size_t num_samples; bool prediction_mode; MemoryMode memory_mode; diff --git a/src/ForestClassification.cpp b/src/ForestClassification.cpp index e33144854..1ca6d92b5 100644 --- a/src/ForestClassification.cpp +++ b/src/ForestClassification.cpp @@ -24,12 +24,11 @@ namespace ranger { -void ForestClassification::loadForest(size_t dependent_varID, size_t num_trees, +void ForestClassification::loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& class_values, std::vector& is_ordered_variable) { - this->dependent_varID = dependent_varID; this->num_trees = num_trees; this->class_values = class_values; data->setIsOrderedVariable(is_ordered_variable); @@ -46,11 +45,11 @@ void ForestClassification::loadForest(size_t dependent_varID, size_t num_trees, equalSplit(thread_ranges, 0, num_trees - 1, num_threads); } -void ForestClassification::initInternal(std::string status_variable_name) { +void ForestClassification::initInternal() { // If mtry not set, use floored square root of number of independent variables. if (mtry == 0) { - unsigned long temp = sqrt((double) (num_variables - 1)); + unsigned long temp = sqrt((double) num_independent_variables); mtry = std::max((unsigned long) 1, temp); } @@ -62,7 +61,7 @@ void ForestClassification::initInternal(std::string status_variable_name) { // Create class_values and response_classIDs if (!prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { - double value = data->get(i, dependent_varID); + double value = data->get_y(i, 0); // If classID is already in class_values, use ID. Else create a new one. uint classID = find(class_values.begin(), class_values.end(), value) - class_values.begin(); @@ -168,7 +167,7 @@ void ForestClassification::computePredictionErrorInternal() { double predicted_value = predictions[0][0][i]; if (!std::isnan(predicted_value)) { ++num_predictions; - double real_value = data->get(i, dependent_varID); + double real_value = data->get_y(i, 0); if (predicted_value != real_value) { ++num_missclassifications; } @@ -268,7 +267,7 @@ void ForestClassification::writePredictionFile() { void ForestClassification::saveToFileInternal(std::ofstream& outfile) { // Write num_variables - outfile.write((char*) &num_variables, sizeof(num_variables)); + outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); // Write treetype TreeType treetype = TREE_CLASSIFICATION; @@ -304,13 +303,9 @@ void ForestClassification::loadFromFileInternal(std::ifstream& infile) { std::vector split_values; readVector1D(split_values, infile); - // If dependent variable not in test data, change variable IDs accordingly - if (num_variables_saved > num_variables) { - for (auto& varID : split_varIDs) { - if (varID >= dependent_varID) { - --varID; - } - } + // If dependent variable not in test data, throw error + if (num_variables_saved != num_independent_variables) { + throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree diff --git a/src/ForestClassification.h b/src/ForestClassification.h index fc23c70ea..5abb8f280 100644 --- a/src/ForestClassification.h +++ b/src/ForestClassification.h @@ -31,8 +31,7 @@ class ForestClassification: public Forest { virtual ~ForestClassification() override = default; - void loadForest(size_t dependent_varID, size_t num_trees, - std::vector> >& forest_child_nodeIDs, + void loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& class_values, std::vector& is_ordered_variable); @@ -45,7 +44,7 @@ class ForestClassification: public Forest { } protected: - void initInternal(std::string status_variable_name) override; + void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; diff --git a/src/ForestProbability.cpp b/src/ForestProbability.cpp index a5f7c8f55..56654990f 100644 --- a/src/ForestProbability.cpp +++ b/src/ForestProbability.cpp @@ -18,13 +18,12 @@ namespace ranger { -void ForestProbability::loadForest(size_t dependent_varID, size_t num_trees, +void ForestProbability::loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& class_values, std::vector>>& forest_terminal_class_counts, std::vector& is_ordered_variable) { - this->dependent_varID = dependent_varID; this->num_trees = num_trees; this->class_values = class_values; data->setIsOrderedVariable(is_ordered_variable); @@ -51,11 +50,11 @@ std::vector>> ForestProbability::getTerminalClas return result; } -void ForestProbability::initInternal(std::string status_variable_name) { +void ForestProbability::initInternal() { // If mtry not set, use floored square root of number of independent variables. if (mtry == 0) { - unsigned long temp = sqrt((double) (num_variables - 1)); + unsigned long temp = sqrt((double) num_independent_variables); mtry = std::max((unsigned long) 1, temp); } @@ -67,7 +66,7 @@ void ForestProbability::initInternal(std::string status_variable_name) { // Create class_values and response_classIDs if (!prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { - double value = data->get(i, dependent_varID); + double value = data->get_y(i, 0); // If classID is already in class_values, use ID. Else create a new one. uint classID = find(class_values.begin(), class_values.end(), value) - class_values.begin(); @@ -262,7 +261,7 @@ void ForestProbability::writePredictionFile() { void ForestProbability::saveToFileInternal(std::ofstream& outfile) { // Write num_variables - outfile.write((char*) &num_variables, sizeof(num_variables)); + outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); // Write treetype TreeType treetype = TREE_PROBABILITY; @@ -311,13 +310,9 @@ void ForestProbability::loadFromFileInternal(std::ifstream& infile) { terminal_class_counts[terminal_nodes[j]] = terminal_class_counts_vector[j]; } - // If dependent variable not in test data, change variable IDs accordingly - if (num_variables_saved > num_variables) { - for (auto& varID : split_varIDs) { - if (varID >= dependent_varID) { - --varID; - } - } + // If dependent variable not in test data, throw error + if (num_variables_saved != num_independent_variables) { + throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree diff --git a/src/ForestProbability.h b/src/ForestProbability.h index aa42c893f..47f0ac865 100644 --- a/src/ForestProbability.h +++ b/src/ForestProbability.h @@ -31,8 +31,7 @@ class ForestProbability: public Forest { virtual ~ForestProbability() override = default; - void loadForest(size_t dependent_varID, size_t num_trees, - std::vector> >& forest_child_nodeIDs, + void loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& class_values, std::vector>>& forest_terminal_class_counts, std::vector& is_ordered_variable); @@ -48,7 +47,7 @@ class ForestProbability: public Forest { } protected: - void initInternal(std::string status_variable_name) override; + void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; diff --git a/src/ForestRegression.cpp b/src/ForestRegression.cpp index dbc9e28af..f15c5693c 100644 --- a/src/ForestRegression.cpp +++ b/src/ForestRegression.cpp @@ -20,12 +20,11 @@ namespace ranger { -void ForestRegression::loadForest(size_t dependent_varID, size_t num_trees, +void ForestRegression::loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& is_ordered_variable) { - this->dependent_varID = dependent_varID; this->num_trees = num_trees; data->setIsOrderedVariable(is_ordered_variable); @@ -40,11 +39,11 @@ void ForestRegression::loadForest(size_t dependent_varID, size_t num_trees, equalSplit(thread_ranges, 0, num_trees - 1, num_threads); } -void ForestRegression::initInternal(std::string status_variable_name) { +void ForestRegression::initInternal() { // If mtry not set, use floored square root of number of independent variables if (mtry == 0) { - unsigned long temp = sqrt((double) (num_variables - 1)); + unsigned long temp = sqrt((double) num_independent_variables); mtry = std::max((unsigned long) 1, temp); } @@ -122,7 +121,7 @@ void ForestRegression::computePredictionErrorInternal() { ++num_predictions; predictions[0][0][i] /= (double) samples_oob_count[i]; double predicted_value = predictions[0][0][i]; - double real_value = data->get(i, dependent_varID); + double real_value = data->get_y(i, 0); overall_prediction_error += (predicted_value - real_value) * (predicted_value - real_value); } else { predictions[0][0][i] = NAN; @@ -196,7 +195,7 @@ void ForestRegression::writePredictionFile() { void ForestRegression::saveToFileInternal(std::ofstream& outfile) { // Write num_variables - outfile.write((char*) &num_variables, sizeof(num_variables)); + outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); // Write treetype TreeType treetype = TREE_REGRESSION; @@ -226,13 +225,9 @@ void ForestRegression::loadFromFileInternal(std::ifstream& infile) { std::vector split_values; readVector1D(split_values, infile); - // If dependent variable not in test data, change variable IDs accordingly - if (num_variables_saved > num_variables) { - for (auto& varID : split_varIDs) { - if (varID >= dependent_varID) { - --varID; - } - } + // If dependent variable not in test data, throw error + if (num_variables_saved != num_independent_variables) { + throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree diff --git a/src/ForestRegression.h b/src/ForestRegression.h index bd59d22dc..62689d38e 100644 --- a/src/ForestRegression.h +++ b/src/ForestRegression.h @@ -29,13 +29,12 @@ class ForestRegression: public Forest { virtual ~ForestRegression() override = default; - void loadForest(size_t dependent_varID, size_t num_trees, - std::vector> >& forest_child_nodeIDs, + void loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, std::vector& is_ordered_variable); private: - void initInternal(std::string status_variable_name) override; + void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; diff --git a/src/ForestSurvival.cpp b/src/ForestSurvival.cpp index 83c357ea9..cae6af865 100644 --- a/src/ForestSurvival.cpp +++ b/src/ForestSurvival.cpp @@ -21,14 +21,11 @@ namespace ranger { -void ForestSurvival::loadForest(size_t dependent_varID, size_t num_trees, - std::vector> >& forest_child_nodeIDs, +void ForestSurvival::loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, - size_t status_varID, std::vector> >& forest_chf, - std::vector& unique_timepoints, std::vector& is_ordered_variable) { + std::vector> >& forest_chf, std::vector& unique_timepoints, + std::vector& is_ordered_variable) { - this->dependent_varID = dependent_varID; - this->status_varID = status_varID; this->num_trees = num_trees; this->unique_timepoints = unique_timepoints; data->setIsOrderedVariable(is_ordered_variable); @@ -55,18 +52,11 @@ std::vector>> ForestSurvival::getChf() const { return result; } -void ForestSurvival::initInternal(std::string status_variable_name) { - - // Convert status variable name to ID - if (!prediction_mode && !status_variable_name.empty()) { - status_varID = data->getVariableID(status_variable_name); - } - - data->addNoSplitVariable(status_varID); +void ForestSurvival::initInternal() { // If mtry not set, use floored square root of number of independent variables. if (mtry == 0) { - unsigned long temp = ceil(sqrt((double) (num_variables - 2))); + unsigned long temp = ceil(sqrt((double) num_independent_variables)); mtry = std::max((unsigned long) 1, temp); } @@ -78,7 +68,7 @@ void ForestSurvival::initInternal(std::string status_variable_name) { // Create unique timepoints std::set unique_timepoint_set; for (size_t i = 0; i < num_samples; ++i) { - unique_timepoint_set.insert(data->get(i, dependent_varID)); + unique_timepoint_set.insert(data->get_y(i, 0)); } unique_timepoints.reserve(unique_timepoint_set.size()); for (auto& t : unique_timepoint_set) { @@ -88,7 +78,7 @@ void ForestSurvival::initInternal(std::string status_variable_name) { // Create response_timepointIDs if (!prediction_mode) { for (size_t i = 0; i < num_samples; ++i) { - double value = data->get(i, dependent_varID); + double value = data->get_y(i, 0); // If timepoint is already in unique_timepoints, use ID. Else create a new one. uint timepointID = find(unique_timepoints.begin(), unique_timepoints.end(), value) - unique_timepoints.begin(); @@ -105,7 +95,7 @@ void ForestSurvival::initInternal(std::string status_variable_name) { void ForestSurvival::growInternal() { trees.reserve(num_trees); for (size_t i = 0; i < num_trees; ++i) { - trees.push_back(make_unique(&unique_timepoints, status_varID, &response_timepointIDs)); + trees.push_back(make_unique(&unique_timepoints, &response_timepointIDs)); } } @@ -187,15 +177,16 @@ void ForestSurvival::computePredictionErrorInternal() { } // Use all samples which are OOB at least once - overall_prediction_error = 1 - computeConcordanceIndex(*data, sum_chf, dependent_varID, status_varID, oob_sampleIDs); + overall_prediction_error = 1 - computeConcordanceIndex(*data, sum_chf, oob_sampleIDs); } // #nocov start void ForestSurvival::writeOutputInternal() { if (verbose_out) { *verbose_out << "Tree type: " << "Survival" << std::endl; - *verbose_out << "Status variable name: " << data->getVariableNames()[status_varID] << std::endl; - *verbose_out << "Status variable ID: " << status_varID << std::endl; + if (dependent_variable_names.size() >= 2) { + *verbose_out << "Status variable name: " << dependent_variable_names[1] << std::endl; + } } } @@ -265,15 +256,12 @@ void ForestSurvival::writePredictionFile() { void ForestSurvival::saveToFileInternal(std::ofstream& outfile) { // Write num_variables - outfile.write((char*) &num_variables, sizeof(num_variables)); + outfile.write((char*) &num_independent_variables, sizeof(num_independent_variables)); // Write treetype TreeType treetype = TREE_SURVIVAL; outfile.write((char*) &treetype, sizeof(treetype)); - // Write status_varID - outfile.write((char*) &status_varID, sizeof(status_varID)); - // Write unique timepoints saveVector1D(unique_timepoints, outfile); } @@ -291,9 +279,6 @@ void ForestSurvival::loadFromFileInternal(std::ifstream& infile) { throw std::runtime_error("Wrong treetype. Loaded file is not a survival forest."); } - // Read status_varID - infile.read((char*) &status_varID, sizeof(status_varID)); - // Read unique timepoints unique_timepoints.clear(); readVector1D(unique_timepoints, infile); @@ -324,20 +309,9 @@ void ForestSurvival::loadFromFileInternal(std::ifstream& infile) { chf[terminal_nodes[j]] = chf_vector[j]; } - // If dependent variable not in test data, change variable IDs accordingly - if (num_variables_saved > num_variables) { - for (auto& varID : split_varIDs) { - if (varID >= dependent_varID) { - --varID; - } - } - } - if (num_variables_saved > num_variables + 1) { - for (auto& varID : split_varIDs) { - if (varID >= status_varID) { - --varID; - } - } + // If dependent variable not in test data, throw error + if (num_variables_saved != num_independent_variables) { + throw std::runtime_error("Number of independent variables in data does not match with the loaded forest."); } // Create tree diff --git a/src/ForestSurvival.h b/src/ForestSurvival.h index 4fc69171f..15b8a87fe 100644 --- a/src/ForestSurvival.h +++ b/src/ForestSurvival.h @@ -30,23 +30,19 @@ class ForestSurvival: public Forest { virtual ~ForestSurvival() override = default; - void loadForest(size_t dependent_varID, size_t num_trees, - std::vector> >& forest_child_nodeIDs, + void loadForest(size_t num_trees, std::vector> >& forest_child_nodeIDs, std::vector>& forest_split_varIDs, std::vector>& forest_split_values, - size_t status_varID, std::vector> >& forest_chf, - std::vector& unique_timepoints, std::vector& is_ordered_variable); + std::vector> >& forest_chf, std::vector& unique_timepoints, + std::vector& is_ordered_variable); std::vector>> getChf() const; - size_t getStatusVarId() const { - return status_varID; - } const std::vector& getUniqueTimepoints() const { return unique_timepoints; } private: - void initInternal(std::string status_variable_name) override; + void initInternal() override; void growInternal() override; void allocatePredictMemory() override; void predictInternal(size_t sample_idx) override; @@ -57,7 +53,6 @@ class ForestSurvival: public Forest { void saveToFileInternal(std::ofstream& outfile) override; void loadFromFileInternal(std::ifstream& infile) override; - size_t status_varID; std::vector unique_timepoints; std::vector response_timepointIDs; diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 614e7330a..ca7b8046f 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -8,14 +8,14 @@ using namespace Rcpp; // rangerCpp -Rcpp::List rangerCpp(uint treetype, std::string dependent_variable_name, Rcpp::NumericMatrix& input_data, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, std::string status_variable_name, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_data, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag); -RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP dependent_variable_nameSEXP, SEXP input_dataSEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP status_variable_nameSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_dataSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP) { +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, uint num_random_splits, Eigen::SparseMatrix& sparse_x, bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag); +RcppExport SEXP _ranger_rangerCpp(SEXP treetypeSEXP, SEXP input_xSEXP, SEXP input_ySEXP, SEXP variable_namesSEXP, SEXP mtrySEXP, SEXP num_treesSEXP, SEXP verboseSEXP, SEXP seedSEXP, SEXP num_threadsSEXP, SEXP write_forestSEXP, SEXP importance_mode_rSEXP, SEXP min_node_sizeSEXP, SEXP split_select_weightsSEXP, SEXP use_split_select_weightsSEXP, SEXP always_split_variable_namesSEXP, SEXP use_always_split_variable_namesSEXP, SEXP prediction_modeSEXP, SEXP loaded_forestSEXP, SEXP snp_dataSEXP, SEXP sample_with_replacementSEXP, SEXP probabilitySEXP, SEXP unordered_variable_namesSEXP, SEXP use_unordered_variable_namesSEXP, SEXP save_memorySEXP, SEXP splitrule_rSEXP, SEXP case_weightsSEXP, SEXP use_case_weightsSEXP, SEXP class_weightsSEXP, SEXP predict_allSEXP, SEXP keep_inbagSEXP, SEXP sample_fractionSEXP, SEXP alphaSEXP, SEXP minpropSEXP, SEXP holdoutSEXP, SEXP prediction_type_rSEXP, SEXP num_random_splitsSEXP, SEXP sparse_xSEXP, SEXP use_sparse_dataSEXP, SEXP order_snpsSEXP, SEXP oob_errorSEXP, SEXP max_depthSEXP, SEXP inbagSEXP, SEXP use_inbagSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< uint >::type treetype(treetypeSEXP); - Rcpp::traits::input_parameter< std::string >::type dependent_variable_name(dependent_variable_nameSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix& >::type input_data(input_dataSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericMatrix& >::type input_x(input_xSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericMatrix& >::type input_y(input_ySEXP); Rcpp::traits::input_parameter< std::vector >::type variable_names(variable_namesSEXP); Rcpp::traits::input_parameter< uint >::type mtry(mtrySEXP); Rcpp::traits::input_parameter< uint >::type num_trees(num_treesSEXP); @@ -29,7 +29,6 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type use_split_select_weights(use_split_select_weightsSEXP); Rcpp::traits::input_parameter< std::vector& >::type always_split_variable_names(always_split_variable_namesSEXP); Rcpp::traits::input_parameter< bool >::type use_always_split_variable_names(use_always_split_variable_namesSEXP); - Rcpp::traits::input_parameter< std::string >::type status_variable_name(status_variable_nameSEXP); Rcpp::traits::input_parameter< bool >::type prediction_mode(prediction_modeSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type loaded_forest(loaded_forestSEXP); Rcpp::traits::input_parameter< Rcpp::RawMatrix >::type snp_data(snp_dataSEXP); @@ -50,14 +49,14 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type holdout(holdoutSEXP); Rcpp::traits::input_parameter< uint >::type prediction_type_r(prediction_type_rSEXP); Rcpp::traits::input_parameter< uint >::type num_random_splits(num_random_splitsSEXP); - Rcpp::traits::input_parameter< Eigen::SparseMatrix& >::type sparse_data(sparse_dataSEXP); + Rcpp::traits::input_parameter< Eigen::SparseMatrix& >::type sparse_x(sparse_xSEXP); Rcpp::traits::input_parameter< bool >::type use_sparse_data(use_sparse_dataSEXP); Rcpp::traits::input_parameter< bool >::type order_snps(order_snpsSEXP); Rcpp::traits::input_parameter< bool >::type oob_error(oob_errorSEXP); Rcpp::traits::input_parameter< uint >::type max_depth(max_depthSEXP); Rcpp::traits::input_parameter< std::vector>& >::type inbag(inbagSEXP); Rcpp::traits::input_parameter< bool >::type use_inbag(use_inbagSEXP); - rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag)); + rcpp_result_gen = Rcpp::wrap(rangerCpp(treetype, input_x, input_y, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_x, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag)); return rcpp_result_gen; END_RCPP } @@ -75,7 +74,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 44}, + {"_ranger_rangerCpp", (DL_FUNC) &_ranger_rangerCpp, 43}, {"_ranger_numSmaller", (DL_FUNC) &_ranger_numSmaller, 2}, {NULL, NULL, 0} }; diff --git a/src/Tree.cpp b/src/Tree.cpp index c3dbf9399..1116c61d5 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -17,7 +17,7 @@ namespace ranger { Tree::Tree() : - dependent_varID(0), mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_varIDs( + mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_varIDs( 0), split_select_weights(0), case_weights(0), manual_inbag(0), oob_sampleIDs(0), holdout(false), keep_inbag( false), data(0), variable_importance(0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement( true), sample_fraction(0), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), alpha(DEFAULT_ALPHA), minprop( @@ -27,7 +27,7 @@ Tree::Tree() : Tree::Tree(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values) : - dependent_varID(0), mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_varIDs( + mtry(0), num_samples(0), num_samples_oob(0), min_node_size(0), deterministic_varIDs(0), split_select_varIDs( 0), split_select_weights(0), case_weights(0), manual_inbag(0), split_varIDs(split_varIDs), split_values( split_values), child_nodeIDs(child_nodeIDs), oob_sampleIDs(0), holdout(false), keep_inbag(false), data(0), variable_importance( 0), importance_mode(DEFAULT_IMPORTANCE_MODE), sample_with_replacement(true), sample_fraction(0), memory_saving_splitting( @@ -35,7 +35,7 @@ Tree::Tree(std::vector>& child_nodeIDs, std::vector& DEFAULT_NUM_RANDOM_SPLITS), max_depth(DEFAULT_MAXDEPTH), depth(0), last_left_nodeID(0) { } -void Tree::init(const Data* data, uint mtry, size_t dependent_varID, size_t num_samples, uint seed, +void Tree::init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, std::vector* split_select_varIDs, std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, std::vector* case_weights, @@ -44,7 +44,6 @@ void Tree::init(const Data* data, uint mtry, size_t dependent_varID, size_t num_ this->data = data; this->mtry = mtry; - this->dependent_varID = dependent_varID; this->num_samples = num_samples; this->memory_saving_splitting = memory_saving_splitting; @@ -164,7 +163,7 @@ void Tree::predict(const Data* prediction_data, bool oob_prediction) { // Move to child size_t split_varID = split_varIDs[nodeID]; - double value = prediction_data->get(sample_idx, split_varID); + double value = prediction_data->get_x(sample_idx, split_varID); if (prediction_data->isOrderedVariable(split_varID)) { if (value <= split_values[nodeID]) { // Move to left child @@ -194,7 +193,7 @@ void Tree::predict(const Data* prediction_data, bool oob_prediction) { void Tree::computePermutationImportance(std::vector& forest_importance, std::vector& forest_variance) { - size_t num_independent_variables = data->getNumCols() - data->getNoSplitVariables().size(); + size_t num_independent_variables = data->getNumCols(); // Compute normal prediction accuracy for each tree. Predictions already computed.. double accuracy_normal = computePredictionAccuracyInternal(); @@ -208,16 +207,8 @@ void Tree::computePermutationImportance(std::vector& forest_importance, // Randomly permute for all independent variables for (size_t i = 0; i < num_independent_variables; ++i) { - // Skip no split variables - size_t varID = i; - for (auto& skip : data->getNoSplitVariables()) { - if (varID >= skip) { - ++varID; - } - } - // Permute and compute prediction accuracy again for this permutation and save difference - permuteAndPredictOobSamples(varID, permutations); + permuteAndPredictOobSamples(i, permutations); double accuracy_permuted = computePredictionAccuracyInternal(); double accuracy_difference = accuracy_normal - accuracy_permuted; forest_importance[i] += accuracy_difference; @@ -231,6 +222,7 @@ void Tree::computePermutationImportance(std::vector& forest_importance, } } +// #nocov start void Tree::appendToFile(std::ofstream& file) { // Save general fields @@ -241,6 +233,7 @@ void Tree::appendToFile(std::ofstream& file) { // Call special functions for subclasses to save special fields. appendToFileInternal(file); } +// #nocov end void Tree::createPossibleSplitVarSubset(std::vector& result) { @@ -248,20 +241,15 @@ void Tree::createPossibleSplitVarSubset(std::vector& result) { // For corrected Gini importance add dummy variables if (importance_mode == IMP_GINI_CORRECTED) { - num_vars += data->getNumCols() - data->getNoSplitVariables().size(); + num_vars += data->getNumCols(); } // Randomly add non-deterministic variables (according to weights if needed) if (split_select_weights->empty()) { if (deterministic_varIDs->empty()) { - drawWithoutReplacementSkip(result, random_number_generator, num_vars, data->getNoSplitVariables(), mtry); + drawWithoutReplacement(result, random_number_generator, num_vars, mtry); } else { - std::vector skip; - std::copy(data->getNoSplitVariables().begin(), data->getNoSplitVariables().end(), - std::inserter(skip, skip.end())); - std::copy(deterministic_varIDs->begin(), deterministic_varIDs->end(), std::inserter(skip, skip.end())); - std::sort(skip.begin(), skip.end()); - drawWithoutReplacementSkip(result, random_number_generator, num_vars, skip, mtry); + drawWithoutReplacementSkip(result, random_number_generator, num_vars, (*deterministic_varIDs), mtry); } } else { drawWithoutReplacementWeighted(result, random_number_generator, *split_select_varIDs, mtry, *split_select_weights); @@ -307,7 +295,7 @@ bool Tree::splitNode(size_t nodeID) { size_t pos = start_pos[nodeID]; while (pos < start_pos[right_child_nodeID]) { size_t sampleID = sampleIDs[pos]; - if (data->get(sampleID, split_varID) <= split_value) { + if (data->get_x(sampleID, split_varID) <= split_value) { // If going to left, do nothing ++pos; } else { @@ -321,7 +309,7 @@ bool Tree::splitNode(size_t nodeID) { size_t pos = start_pos[nodeID]; while (pos < start_pos[right_child_nodeID]) { size_t sampleID = sampleIDs[pos]; - double level = data->get(sampleID, split_varID); + double level = data->get_x(sampleID, split_varID); size_t factorID = floor(level) - 1; size_t splitID = floor(split_value); @@ -370,7 +358,7 @@ size_t Tree::dropDownSamplePermuted(size_t permuted_varID, size_t sampleID, size } // Move to child - double value = data->get(sampleID_final, split_varID); + double value = data->get_x(sampleID_final, split_varID); if (data->isOrderedVariable(split_varID)) { if (value <= split_values[nodeID]) { // Move to left child diff --git a/src/Tree.h b/src/Tree.h index 0d0a9f7d0..7aea85ca9 100644 --- a/src/Tree.h +++ b/src/Tree.h @@ -35,7 +35,7 @@ class Tree { Tree(const Tree&) = delete; Tree& operator=(const Tree&) = delete; - void init(const Data* data, uint mtry, size_t dependent_varID, size_t num_samples, uint seed, + void init(const Data* data, uint mtry, size_t num_samples, uint seed, std::vector* deterministic_varIDs, std::vector* split_select_varIDs, std::vector* split_select_weights, ImportanceMode importance_mode, uint min_node_size, bool sample_with_replacement, bool memory_saving_splitting, SplitRule splitrule, @@ -102,7 +102,6 @@ class Tree { virtual void cleanUpInternal() = 0; - size_t dependent_varID; uint mtry; // Number of samples (all samples, not only inbag for this tree) diff --git a/src/TreeClassification.cpp b/src/TreeClassification.cpp index d9f7d7c1b..bdb9bf929 100644 --- a/src/TreeClassification.cpp +++ b/src/TreeClassification.cpp @@ -88,7 +88,7 @@ bool TreeClassification::splitNodeInternal(size_t nodeID, std::vector& p double pure_value = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, dependent_varID); + double value = data->get_y(sampleID, 0); if (pos != start_pos[nodeID] && value != pure_value) { pure = false; break; @@ -127,7 +127,7 @@ double TreeClassification::computePredictionAccuracyInternal() { for (size_t i = 0; i < num_predictions; ++i) { size_t terminal_nodeID = prediction_terminal_nodeIDs[i]; double predicted_value = split_values[terminal_nodeID]; - double real_value = data->get(oob_sampleIDs[i], dependent_varID); + double real_value = data->get_y(oob_sampleIDs[i], 0); if (predicted_value != real_value) { ++num_missclassifications; } @@ -229,7 +229,7 @@ void TreeClassification::findBestSplitValueSmallQ(size_t nodeID, size_t varID, s // Count samples in right child per class and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); uint sample_classID = (*response_classIDs)[sampleID]; // Count samples until split_value reached @@ -392,7 +392,7 @@ void TreeClassification::findBestSplitValueUnordered(size_t nodeID, size_t varID for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -518,7 +518,7 @@ void TreeClassification::findBestSplitValueExtraTrees(size_t nodeID, size_t varI // Count samples in right child per class and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); uint sample_classID = (*response_classIDs)[sampleID]; // Count samples until split_value reached @@ -632,7 +632,7 @@ void TreeClassification::findBestSplitValueExtraTreesUnordered(size_t nodeID, si for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -686,11 +686,6 @@ void TreeClassification::addGiniImportance(size_t nodeID, size_t varID, double d // No variable importance for no split variables size_t tempvarID = data->getUnpermutedVarID(varID); - for (auto& skip : data->getNoSplitVariables()) { - if (tempvarID >= skip) { - --tempvarID; - } - } // Subtract if corrected importance and permuted variable, else add if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) { diff --git a/src/TreeProbability.cpp b/src/TreeProbability.cpp index 81da10aed..dbc390fe1 100644 --- a/src/TreeProbability.cpp +++ b/src/TreeProbability.cpp @@ -92,7 +92,7 @@ bool TreeProbability::splitNodeInternal(size_t nodeID, std::vector& poss double pure_value = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, dependent_varID); + double value = data->get_y(sampleID, 0); if (pos != start_pos[nodeID] && value != pure_value) { pure = false; break; @@ -233,7 +233,7 @@ void TreeProbability::findBestSplitValueSmallQ(size_t nodeID, size_t varID, size // Count samples in right child per class and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); uint sample_classID = (*response_classIDs)[sampleID]; // Count samples until split_value reached @@ -396,7 +396,7 @@ void TreeProbability::findBestSplitValueUnordered(size_t nodeID, size_t varID, s for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -522,7 +522,7 @@ void TreeProbability::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, // Count samples in right child per class and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); uint sample_classID = (*response_classIDs)[sampleID]; // Count samples until split_value reached @@ -636,7 +636,7 @@ void TreeProbability::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_ for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; uint sample_classID = (*response_classIDs)[sampleID]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -690,11 +690,6 @@ void TreeProbability::addImpurityImportance(size_t nodeID, size_t varID, double // No variable importance for no split variables size_t tempvarID = data->getUnpermutedVarID(varID); - for (auto& skip : data->getNoSplitVariables()) { - if (tempvarID >= skip) { - --tempvarID; - } - } // Subtract if corrected importance and permuted variable, else add if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) { diff --git a/src/TreeRegression.cpp b/src/TreeRegression.cpp index 4545e52e9..53bac9322 100644 --- a/src/TreeRegression.cpp +++ b/src/TreeRegression.cpp @@ -48,7 +48,7 @@ double TreeRegression::estimate(size_t nodeID) { size_t num_samples_in_node = end_pos[nodeID] - start_pos[nodeID]; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - sum_responses_in_node += data->get(sampleID, dependent_varID); + sum_responses_in_node += data->get_y(sampleID, 0); } return (sum_responses_in_node / (double) num_samples_in_node); } @@ -72,7 +72,7 @@ bool TreeRegression::splitNodeInternal(size_t nodeID, std::vector& possi double pure_value = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, dependent_varID); + double value = data->get_y(sampleID, 0); if (pos != start_pos[nodeID] && value != pure_value) { pure = false; break; @@ -115,7 +115,7 @@ double TreeRegression::computePredictionAccuracyInternal() { for (size_t i = 0; i < num_predictions; ++i) { size_t terminal_nodeID = prediction_terminal_nodeIDs[i]; double predicted_value = split_values[terminal_nodeID]; - double real_value = data->get(oob_sampleIDs[i], dependent_varID); + double real_value = data->get_y(oob_sampleIDs[i], 0); if (predicted_value != real_value) { sum_of_squares += (predicted_value - real_value) * (predicted_value - real_value); } @@ -134,7 +134,7 @@ bool TreeRegression::findBestSplit(size_t nodeID, std::vector& possible_ double sum_node = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - sum_node += data->get(sampleID, dependent_varID); + sum_node += data->get_y(sampleID, 0); } // For all possible split variables @@ -212,8 +212,8 @@ void TreeRegression::findBestSplitValueSmallQ(size_t nodeID, size_t varID, doubl // Sum in right child and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); - double response = data->get(sampleID, dependent_varID); + double value = data->get_x(sampleID, varID); + double response = data->get_y(sampleID, 0); // Count samples until split_value reached for (size_t i = 0; i < num_splits; ++i) { @@ -265,7 +265,7 @@ void TreeRegression::findBestSplitValueLargeQ(size_t nodeID, size_t varID, doubl size_t sampleID = sampleIDs[pos]; size_t index = data->getIndex(sampleID, varID); - sums[index] += data->get(sampleID, dependent_varID); + sums[index] += data->get_y(sampleID, 0); ++counter[index]; } @@ -350,8 +350,8 @@ void TreeRegression::findBestSplitValueUnordered(size_t nodeID, size_t varID, do // Sum in right child for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double response = data->get(sampleID, dependent_varID); - double value = data->get(sampleID, varID); + double response = data->get_y(sampleID, 0); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -385,7 +385,7 @@ bool TreeRegression::findBestSplitMaxstat(size_t nodeID, std::vector& po response.reserve(num_samples_node); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - response.push_back(data->get(sampleID, dependent_varID)); + response.push_back(data->get_y(sampleID, 0)); } std::vector ranks = rank(response); @@ -407,7 +407,7 @@ bool TreeRegression::findBestSplitMaxstat(size_t nodeID, std::vector& po x.reserve(num_samples_node); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - x.push_back(data->get(sampleID, varID)); + x.push_back(data->get_x(sampleID, varID)); } // Order by x @@ -490,7 +490,7 @@ bool TreeRegression::findBestSplitExtraTrees(size_t nodeID, std::vector& double sum_node = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - sum_node += data->get(sampleID, dependent_varID); + sum_node += data->get_y(sampleID, 0); } // For all possible split variables @@ -567,8 +567,8 @@ void TreeRegression::findBestSplitValueExtraTrees(size_t nodeID, size_t varID, d // Sum in right child and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); - double response = data->get(sampleID, dependent_varID); + double value = data->get_x(sampleID, varID); + double response = data->get_y(sampleID, 0); // Count samples until split_value reached for (size_t i = 0; i < num_splits; ++i) { @@ -669,8 +669,8 @@ void TreeRegression::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t // Sum in right child for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double response = data->get(sampleID, dependent_varID); - double value = data->get(sampleID, varID); + double response = data->get_y(sampleID, 0); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -706,7 +706,7 @@ bool TreeRegression::findBestSplitBeta(size_t nodeID, std::vector& possi double sum_node = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - sum_node += data->get(sampleID, dependent_varID); + sum_node += data->get_y(sampleID, 0); } // For all possible split variables find best split value @@ -766,8 +766,8 @@ void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double // Sum in right child and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); - double response = data->get(sampleID, dependent_varID); + double value = data->get_x(sampleID, varID); + double response = data->get_y(sampleID, 0); // Count samples until split_value reached for (size_t i = 0; i < num_splits; ++i) { @@ -800,8 +800,8 @@ void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double double var_left = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); - double response = data->get(sampleID, dependent_varID); + double value = data->get_x(sampleID, varID); + double response = data->get_y(sampleID, 0); if (value > possible_split_values[i]) { var_right += (response - mean_right) * (response - mean_right); @@ -826,8 +826,8 @@ void TreeRegression::findBestSplitValueBeta(size_t nodeID, size_t varID, double double beta_loglik_left = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); - double response = data->get(sampleID, dependent_varID); + double value = data->get_x(sampleID, varID); + double response = data->get_y(sampleID, 0); if (value > possible_split_values[i]) { beta_loglik_right += betaLogLik(response, mean_right, phi_right); @@ -867,18 +867,13 @@ void TreeRegression::addImpurityImportance(size_t nodeID, size_t varID, double d double sum_node = 0; for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - sum_node += data->get(sampleID, dependent_varID); + sum_node += data->get_y(sampleID, 0); } best_decrease = decrease - sum_node * sum_node / (double) num_samples_node; } // No variable importance for no split variables size_t tempvarID = data->getUnpermutedVarID(varID); - for (auto& skip : data->getNoSplitVariables()) { - if (tempvarID >= skip) { - --tempvarID; - } - } // Subtract if corrected importance and permuted variable, else add if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) { diff --git a/src/TreeSurvival.cpp b/src/TreeSurvival.cpp index 762a2351c..f9525a160 100644 --- a/src/TreeSurvival.cpp +++ b/src/TreeSurvival.cpp @@ -21,17 +21,16 @@ namespace ranger { -TreeSurvival::TreeSurvival(std::vector* unique_timepoints, size_t status_varID, - std::vector* response_timepointIDs) : - status_varID(status_varID), unique_timepoints(unique_timepoints), response_timepointIDs(response_timepointIDs), num_deaths( - 0), num_samples_at_risk(0) { +TreeSurvival::TreeSurvival(std::vector* unique_timepoints, std::vector* response_timepointIDs) : + unique_timepoints(unique_timepoints), response_timepointIDs(response_timepointIDs), num_deaths(0), num_samples_at_risk( + 0) { this->num_timepoints = unique_timepoints->size(); } TreeSurvival::TreeSurvival(std::vector>& child_nodeIDs, std::vector& split_varIDs, std::vector& split_values, std::vector> chf, std::vector* unique_timepoints, std::vector* response_timepointIDs) : - Tree(child_nodeIDs, split_varIDs, split_values), status_varID(0), unique_timepoints(unique_timepoints), response_timepointIDs( + Tree(child_nodeIDs, split_varIDs, split_values), unique_timepoints(unique_timepoints), response_timepointIDs( response_timepointIDs), chf(chf), num_deaths(0), num_samples_at_risk(0) { this->num_timepoints = unique_timepoints->size(); } @@ -84,7 +83,7 @@ double TreeSurvival::computePredictionAccuracyInternal() { } // Return concordance index - return computeConcordanceIndex(*data, sum_chf, dependent_varID, status_varID, oob_sampleIDs); + return computeConcordanceIndex(*data, sum_chf, oob_sampleIDs); } bool TreeSurvival::splitNodeInternal(size_t nodeID, std::vector& possible_split_varIDs) { @@ -169,11 +168,10 @@ bool TreeSurvival::findBestSplitMaxstat(size_t nodeID, std::vector& poss status.reserve(num_samples_node); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - time.push_back(data->get(sampleID, dependent_varID)); - status.push_back(data->get(sampleID, status_varID)); + time.push_back(data->get_y(sampleID, 0)); + status.push_back(data->get_y(sampleID, 1)); } std::vector scores = logrankScores(time, status); - //std::vector scores = logrankScoresData(data, dependent_varID, status_varID, sampleIDs[nodeID]); // Save split stats std::vector pvalues; @@ -193,7 +191,7 @@ bool TreeSurvival::findBestSplitMaxstat(size_t nodeID, std::vector& poss x.reserve(num_samples_node); for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - x.push_back(data->get(sampleID, varID)); + x.push_back(data->get_x(sampleID, varID)); } // Order by x @@ -286,7 +284,7 @@ void TreeSurvival::computeDeathCounts(size_t nodeID) { for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double survival_time = data->get(sampleID, dependent_varID); + double survival_time = data->get_y(sampleID, 0); size_t t = 0; while (t < num_timepoints && (*unique_timepoints)[t] < survival_time) { @@ -297,7 +295,7 @@ void TreeSurvival::computeDeathCounts(size_t nodeID) { // Now t is the survival time, add to at risk and to death if death if (t < num_timepoints) { ++num_samples_at_risk[t]; - if (data->get(sampleID, status_varID) == 1) { + if (data->get_y(sampleID, 1) == 1) { ++num_deaths[t]; } } @@ -311,7 +309,7 @@ void TreeSurvival::computeChildDeathCounts(size_t nodeID, size_t varID, std::vec // Count deaths in right child per timepoint and possbile split for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); size_t survival_timeID = (*response_timepointIDs)[sampleID]; // Count deaths until split_value reached @@ -320,7 +318,7 @@ void TreeSurvival::computeChildDeathCounts(size_t nodeID, size_t varID, std::vec if (value > possible_split_values[i]) { ++num_samples_right_child[i]; ++delta_samples_at_risk_right_child[i * num_timepoints + survival_timeID]; - if (data->get(sampleID, status_varID) == 1) { + if (data->get_y(sampleID, 1) == 1) { ++num_deaths_right_child[i * num_timepoints + survival_timeID]; } } else { @@ -448,7 +446,7 @@ void TreeSurvival::findBestSplitValueLogRankUnordered(size_t nodeID, size_t varI for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t survival_timeID = (*response_timepointIDs)[sampleID]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -456,7 +454,7 @@ void TreeSurvival::findBestSplitValueLogRankUnordered(size_t nodeID, size_t varI if ((splitID & (1ULL << factorID))) { ++num_samples_right_child; ++delta_samples_at_risk_right_child[survival_timeID]; - if (data->get(sampleID, status_varID) == 1) { + if (data->get_y(sampleID, 1) == 1) { ++num_deaths_right_child[survival_timeID]; } } @@ -526,9 +524,9 @@ void TreeSurvival::findBestSplitValueAUC(size_t nodeID, size_t varID, double& be // For all pairs for (size_t k = start_pos[nodeID]; k < end_pos[nodeID]; ++k) { size_t sample_k = sampleIDs[k]; - double time_k = data->get(sample_k, dependent_varID); - double status_k = data->get(sample_k, status_varID); - double value_k = data->get(sample_k, varID); + double time_k = data->get_y(sample_k, 0); + double status_k = data->get_y(sample_k, 1); + double value_k = data->get_x(sample_k, varID); // Count samples in left node for (size_t i = 0; i < num_splits; ++i) { @@ -540,9 +538,9 @@ void TreeSurvival::findBestSplitValueAUC(size_t nodeID, size_t varID, double& be for (size_t l = k + 1; l < end_pos[nodeID]; ++l) { size_t sample_l = sampleIDs[l]; - double time_l = data->get(sample_l, dependent_varID); - double status_l = data->get(sample_l, status_varID); - double value_l = data->get(sample_l, varID); + double time_l = data->get_y(sample_l, 0); + double status_l = data->get_y(sample_l, 1); + double value_l = data->get_x(sample_l, varID); // Compute split computeAucSplit(time_k, time_l, status_k, status_l, value_k, value_l, num_splits, possible_split_values, @@ -838,7 +836,7 @@ void TreeSurvival::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t v for (size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) { size_t sampleID = sampleIDs[pos]; size_t survival_timeID = (*response_timepointIDs)[sampleID]; - double value = data->get(sampleID, varID); + double value = data->get_x(sampleID, varID); size_t factorID = floor(value) - 1; // If in right child, count @@ -846,7 +844,7 @@ void TreeSurvival::findBestSplitValueExtraTreesUnordered(size_t nodeID, size_t v if ((splitID & (1ULL << factorID))) { ++num_samples_right_child; ++delta_samples_at_risk_right_child[survival_timeID]; - if (data->get(sampleID, status_varID) == 1) { + if (data->get_y(sampleID, 1) == 1) { ++num_deaths_right_child[survival_timeID]; } } @@ -896,11 +894,6 @@ void TreeSurvival::addImpurityImportance(size_t nodeID, size_t varID, double dec // No variable importance for no split variables size_t tempvarID = data->getUnpermutedVarID(varID); - for (auto& skip : data->getNoSplitVariables()) { - if (tempvarID >= skip) { - --tempvarID; - } - } // Subtract if corrected importance and permuted variable, else add if (importance_mode == IMP_GINI_CORRECTED && varID >= data->getNumCols()) { diff --git a/src/TreeSurvival.h b/src/TreeSurvival.h index 613b476ff..5e0ed6781 100644 --- a/src/TreeSurvival.h +++ b/src/TreeSurvival.h @@ -21,7 +21,7 @@ namespace ranger { class TreeSurvival: public Tree { public: - TreeSurvival(std::vector* unique_timepoints, size_t status_varID, std::vector* response_timepointIDs); + TreeSurvival(std::vector* unique_timepoints, std::vector* response_timepointIDs); // Create from loaded forest TreeSurvival(std::vector>& child_nodeIDs, std::vector& split_varIDs, @@ -97,8 +97,6 @@ class TreeSurvival: public Tree { num_samples_at_risk.shrink_to_fit(); } - size_t status_varID; - // Unique time points for all individuals (not only this bootstrap), sorted const std::vector* unique_timepoints; size_t num_timepoints; diff --git a/src/rangerCpp.cpp b/src/rangerCpp.cpp index ef3e916fb..6dc59843a 100644 --- a/src/rangerCpp.cpp +++ b/src/rangerCpp.cpp @@ -48,19 +48,20 @@ using namespace ranger; // [[Rcpp::depends(RcppEigen)]] // [[Rcpp::export]] -Rcpp::List rangerCpp(uint treetype, std::string dependent_variable_name, Rcpp::NumericMatrix& input_data, +Rcpp::List rangerCpp(uint treetype, Rcpp::NumericMatrix& input_x, Rcpp::NumericMatrix& input_y, std::vector variable_names, uint mtry, uint num_trees, bool verbose, uint seed, uint num_threads, bool write_forest, uint importance_mode_r, uint min_node_size, std::vector>& split_select_weights, bool use_split_select_weights, std::vector& always_split_variable_names, bool use_always_split_variable_names, - std::string status_variable_name, bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, + bool prediction_mode, Rcpp::List loaded_forest, Rcpp::RawMatrix snp_data, bool sample_with_replacement, bool probability, std::vector& unordered_variable_names, bool use_unordered_variable_names, bool save_memory, uint splitrule_r, std::vector& case_weights, bool use_case_weights, std::vector& class_weights, bool predict_all, bool keep_inbag, std::vector& sample_fraction, double alpha, double minprop, bool holdout, uint prediction_type_r, - uint num_random_splits, Eigen::SparseMatrix& sparse_data, bool use_sparse_data, bool order_snps, - bool oob_error, uint max_depth, std::vector>& inbag, bool use_inbag) { - + uint num_random_splits, Eigen::SparseMatrix& sparse_x, + bool use_sparse_data, bool order_snps, bool oob_error, uint max_depth, + std::vector>& inbag, bool use_inbag) { + Rcpp::List result; try { @@ -94,19 +95,18 @@ Rcpp::List rangerCpp(uint treetype, std::string dependent_variable_name, Rcpp::N size_t num_rows; size_t num_cols; if (use_sparse_data) { - num_rows = sparse_data.rows(); - num_cols = sparse_data.cols(); + num_rows = sparse_x.rows(); + num_cols = sparse_x.cols(); } else { - num_rows = input_data.nrow(); - num_cols = input_data.ncol(); + num_rows = input_x.nrow(); + num_cols = input_x.ncol(); } // Initialize data if (use_sparse_data) { - data = make_unique(sparse_data, variable_names, num_rows, num_cols); + data = make_unique(sparse_x, input_y, variable_names, num_rows, num_cols); } else { - data = make_unique(input_data, variable_names, num_rows, - num_cols); + data = make_unique(input_x, input_y, variable_names, num_rows, num_cols); } // If there is snp data, add it @@ -144,16 +144,14 @@ Rcpp::List rangerCpp(uint treetype, std::string dependent_variable_name, Rcpp::N PredictionType prediction_type = (PredictionType) prediction_type_r; // Init Ranger - forest->initR(dependent_variable_name, std::move(data), mtry, num_trees, verbose_out, seed, num_threads, - importance_mode, min_node_size, split_select_weights, always_split_variable_names, status_variable_name, + forest->initR(std::move(data), mtry, num_trees, verbose_out, seed, num_threads, + importance_mode, min_node_size, split_select_weights, always_split_variable_names, prediction_mode, sample_with_replacement, unordered_variable_names, save_memory, splitrule, case_weights, inbag, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type, num_random_splits, order_snps, max_depth); // Load forest object if in prediction mode if (prediction_mode) { - size_t dependent_varID = loaded_forest["dependent.varID"]; - //size_t num_trees = loaded_forest["num.trees"]; std::vector> > child_nodeIDs = loaded_forest["child.nodeIDs"]; std::vector> split_varIDs = loaded_forest["split.varIDs"]; std::vector> split_values = loaded_forest["split.values"]; @@ -162,23 +160,22 @@ Rcpp::List rangerCpp(uint treetype, std::string dependent_variable_name, Rcpp::N if (treetype == TREE_CLASSIFICATION) { std::vector class_values = loaded_forest["class.values"]; auto& temp = dynamic_cast(*forest); - temp.loadForest(dependent_varID, num_trees, child_nodeIDs, split_varIDs, split_values, class_values, + temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, class_values, is_ordered); } else if (treetype == TREE_REGRESSION) { auto& temp = dynamic_cast(*forest); - temp.loadForest(dependent_varID, num_trees, child_nodeIDs, split_varIDs, split_values, is_ordered); + temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, is_ordered); } else if (treetype == TREE_SURVIVAL) { - size_t status_varID = loaded_forest["status.varID"]; std::vector> > chf = loaded_forest["chf"]; std::vector unique_timepoints = loaded_forest["unique.death.times"]; auto& temp = dynamic_cast(*forest); - temp.loadForest(dependent_varID, num_trees, child_nodeIDs, split_varIDs, split_values, status_varID, chf, + temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, chf, unique_timepoints, is_ordered); } else if (treetype == TREE_PROBABILITY) { std::vector class_values = loaded_forest["class.values"]; std::vector>> terminal_class_counts = loaded_forest["terminal.class.counts"]; auto& temp = dynamic_cast(*forest); - temp.loadForest(dependent_varID, num_trees, child_nodeIDs, split_varIDs, split_values, class_values, + temp.loadForest(num_trees, child_nodeIDs, split_varIDs, split_values, class_values, terminal_class_counts, is_ordered); } } else { @@ -238,7 +235,6 @@ Rcpp::List rangerCpp(uint treetype, std::string dependent_variable_name, Rcpp::N // Save forest if needed if (write_forest) { Rcpp::List forest_object; - forest_object.push_back(forest->getDependentVarId(), "dependent.varID"); forest_object.push_back(forest->getNumTrees(), "num.trees"); forest_object.push_back(forest->getChildNodeIDs(), "child.nodeIDs"); forest_object.push_back(forest->getSplitVarIDs(), "split.varIDs"); @@ -260,7 +256,6 @@ Rcpp::List rangerCpp(uint treetype, std::string dependent_variable_name, Rcpp::N forest_object.push_back(temp.getTerminalClassCounts(), "terminal.class.counts"); } else if (treetype == TREE_SURVIVAL) { auto& temp = dynamic_cast(*forest); - forest_object.push_back(temp.getStatusVarId(), "status.varID"); forest_object.push_back(temp.getChf(), "chf"); forest_object.push_back(temp.getUniqueTimepoints(), "unique.death.times"); } diff --git a/src/utility.cpp b/src/utility.cpp index 3725e5263..416ca7ede 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -81,6 +81,16 @@ void loadDoubleVectorFromFile(std::vector& result, std::string filename) } } // #nocov end +void drawWithoutReplacement(std::vector& result, std::mt19937_64& random_number_generator, size_t max, + size_t num_samples) { + if (num_samples < max / 10) { + drawWithoutReplacementSimple(result, random_number_generator, max, num_samples); + } else { + //drawWithoutReplacementKnuth(result, random_number_generator, max, skip, num_samples); + drawWithoutReplacementFisherYates(result, random_number_generator, max, num_samples); + } +} + void drawWithoutReplacementSkip(std::vector& result, std::mt19937_64& random_number_generator, size_t max, const std::vector& skip, size_t num_samples) { if (num_samples < max / 10) { @@ -91,6 +101,26 @@ void drawWithoutReplacementSkip(std::vector& result, std::mt19937_64& ra } } +void drawWithoutReplacementSimple(std::vector& result, std::mt19937_64& random_number_generator, size_t max, + size_t num_samples) { + + result.reserve(num_samples); + + // Set all to not selected + std::vector temp; + temp.resize(max, false); + + std::uniform_int_distribution unif_dist(0, max - 1); + for (size_t i = 0; i < num_samples; ++i) { + size_t draw; + do { + draw = unif_dist(random_number_generator); + } while (temp[draw]); + temp[draw] = true; + result.push_back(draw); + } +} + void drawWithoutReplacementSimple(std::vector& result, std::mt19937_64& random_number_generator, size_t max, const std::vector& skip, size_t num_samples) { @@ -116,6 +146,23 @@ void drawWithoutReplacementSimple(std::vector& result, std::mt19937_64& } } +void drawWithoutReplacementFisherYates(std::vector& result, std::mt19937_64& random_number_generator, + size_t max, size_t num_samples) { + + // Create indices + result.resize(max); + std::iota(result.begin(), result.end(), 0); + + // Draw without replacement using Fisher Yates algorithm + std::uniform_real_distribution distribution(0.0, 1.0); + for (size_t i = 0; i < num_samples; ++i) { + size_t j = i + distribution(random_number_generator) * (max - i); + std::swap(result[i], result[j]); + } + + result.resize(num_samples); +} + void drawWithoutReplacementFisherYates(std::vector& result, std::mt19937_64& random_number_generator, size_t max, const std::vector& skip, size_t num_samples) { @@ -203,8 +250,8 @@ double mostFrequentValue(const std::unordered_map& class_count, } } -double computeConcordanceIndex(const Data& data, const std::vector& sum_chf, size_t dependent_varID, - size_t status_varID, const std::vector& sample_IDs) { +double computeConcordanceIndex(const Data& data, const std::vector& sum_chf, + const std::vector& sample_IDs) { // Compute concordance index double concordance = 0; @@ -214,16 +261,16 @@ double computeConcordanceIndex(const Data& data, const std::vector& sum_ if (!sample_IDs.empty()) { sample_i = sample_IDs[i]; } - double time_i = data.get(sample_i, dependent_varID); - double status_i = data.get(sample_i, status_varID); + double time_i = data.get_y(sample_i, 0); + double status_i = data.get_y(sample_i, 1); for (size_t j = i + 1; j < sum_chf.size(); ++j) { size_t sample_j = j; if (!sample_IDs.empty()) { sample_j = sample_IDs[j]; } - double time_j = data.get(sample_j, dependent_varID); - double status_j = data.get(sample_j, status_varID); + double time_j = data.get_y(sample_j, 0); + double status_j = data.get_y(sample_j, 1); if (time_i < time_j && status_i == 0) { continue; @@ -295,6 +342,7 @@ std::string beautifyTime(uint seconds) { // #nocov start return result; } // #nocov end +// #nocov start size_t roundToNextMultiple(size_t value, uint multiple) { if (multiple == 0) { @@ -308,6 +356,7 @@ size_t roundToNextMultiple(size_t value, uint multiple) { return value + multiple - remainder; } +// #nocov end void splitString(std::vector& result, const std::string& input, char split_char) { // #nocov start @@ -579,12 +628,14 @@ std::vector numSamplesLeftOfCutpoint(std::vector& x, const std:: return num_samples_left; } +// #nocov start std::stringstream& readFromStream(std::stringstream& in, double& token) { if (!(in >> token) && (std::fpclassify(token) == FP_SUBNORMAL)) { in.clear(); } return in; } +// #nocov end double betaLogLik(double y, double mean, double phi) { diff --git a/src/utility.h b/src/utility.h index b151e53ce..04709e305 100644 --- a/src/utility.h +++ b/src/utility.h @@ -148,6 +148,16 @@ inline void readVector2D(std::vector>& result, std::ifstream& fil void loadDoubleVectorFromFile(std::vector& result, std::string filename); // #nocov end +/** + * Draw random numbers in a range without replacements. + * @param result Vector to add results to. Will not be cleaned before filling. + * @param random_number_generator Random number generator + * @param range_length Length of range. Interval to draw from: 0..max-1 + * @param num_samples Number of samples to draw + */ +void drawWithoutReplacement(std::vector& result, std::mt19937_64& random_number_generator, size_t range_length, + size_t num_samples); + /** * Draw random numbers in a range without replacement and skip values. * @param result Vector to add results to. Will not be cleaned before filling. @@ -164,6 +174,16 @@ void drawWithoutReplacementSkip(std::vector& result, std::mt19937_64& ra * @param result Vector to add results to. Will not be cleaned before filling. * @param random_number_generator Random number generator * @param range_length Length of range. Interval to draw from: 0..max-1 + * @param num_samples Number of samples to draw + */ +void drawWithoutReplacementSimple(std::vector& result, std::mt19937_64& random_number_generator, size_t max, + size_t num_samples); + +/** + * Simple algorithm for sampling without replacement (skip values), faster for smaller num_samples + * @param result Vector to add results to. Will not be cleaned before filling. + * @param random_number_generator Random number generator + * @param range_length Length of range. Interval to draw from: 0..max-1 * @param skip Values to skip * @param num_samples Number of samples to draw */ @@ -175,6 +195,16 @@ void drawWithoutReplacementSimple(std::vector& result, std::mt19937_64& * @param result Vector to add results to. Will not be cleaned before filling. * @param random_number_generator Random number generator * @param max Length of range. Interval to draw from: 0..max-1 + * @param num_samples Number of samples to draw + */ +void drawWithoutReplacementFisherYates(std::vector& result, std::mt19937_64& random_number_generator, + size_t max, size_t num_samples); + +/** + * Fisher Yates algorithm for sampling without replacement (skip values). + * @param result Vector to add results to. Will not be cleaned before filling. + * @param random_number_generator Random number generator + * @param max Length of range. Interval to draw from: 0..max-1 * @param skip Values to skip * @param num_samples Number of samples to draw */ @@ -273,13 +303,11 @@ double mostFrequentValue(const std::unordered_map& class_count, * Compute concordance index for given data and summed cumulative hazard function/estimate * @param data Reference to Data object * @param sum_chf Summed chf over timepoints for each sample - * @param dependent_varID ID of dependent variable - * @param status_varID ID of status variable * @param sample_IDs IDs of samples, for example OOB samples * @return concordance index */ -double computeConcordanceIndex(const Data& data, const std::vector& sum_chf, size_t dependent_varID, - size_t status_varID, const std::vector& sample_IDs); +double computeConcordanceIndex(const Data& data, const std::vector& sum_chf, + const std::vector& sample_IDs); /** * Convert a unsigned integer to string diff --git a/tests/testthat/test_interface.R b/tests/testthat/test_interface.R index 0df46ba17..940dc4b83 100644 --- a/tests/testthat/test_interface.R +++ b/tests/testthat/test_interface.R @@ -69,6 +69,52 @@ test_that("Working if dependent variable is matrix with one column", { expect_silent(ranger(data = iris2, dependent.variable = "Sepal.Width")) }) +test_that("Same result with x/y interface, classification", { + set.seed(300) + rf_formula <- ranger(Species ~ ., iris, num.trees = 5) + + set.seed(300) + rf_xy <- ranger(y = iris[, 5], x = iris[, -5], num.trees = 5) + + expect_equal(rf_formula$prediction.error, rf_xy$prediction.error) + expect_equal(rf_formula$predictions, rf_xy$predictions) +}) + +test_that("Same result with x/y interface, regression", { + set.seed(300) + rf_formula <- ranger(Sepal.Length ~ ., iris, num.trees = 5) + + set.seed(300) + rf_xy <- ranger(y = iris[, 1], x = iris[, -1], num.trees = 5) + + expect_equal(rf_formula$prediction.error, rf_xy$prediction.error) + expect_equal(rf_formula$predictions, rf_xy$predictions) +}) + +test_that("Same result with x/y interface, survival", { + set.seed(300) + rf_formula <- ranger(Surv(time, status) ~ ., veteran, num.trees = 5) + + set.seed(300) + rf_xy <- ranger(y = veteran[, c(3, 4)], x = veteran[, c(-3, -4)], num.trees = 5) + + expect_equal(rf_formula$prediction.error, rf_xy$prediction.error) + expect_equal(rf_formula$predictions, rf_xy$predictions) +}) + +test_that("Column order does not change prediction", { + dat <- iris[, c(sample(1:4), 5)] + rf <- ranger(dependent.variable.name = "Species", data = iris) + + set.seed(42) + pred1 <- predict(rf, iris)$predictions + + set.seed(42) + pred2 <- predict(rf, dat)$predictions + + expect_equal(pred1, pred2) +}) + # Tibbles # This is failing on Rdevel. Possible without suggesting tibble package? # if (requireNamespace("tibble", quietly = TRUE)) { diff --git a/tests/testthat/test_print.R b/tests/testthat/test_print.R index f41b6114c..fd9f3512b 100644 --- a/tests/testthat/test_print.R +++ b/tests/testthat/test_print.R @@ -19,4 +19,4 @@ expect_that(print(predict(rf, iris)), prints_text("Ranger prediction")) expect_that(str(rf), prints_text("List of 14")) ## Test str forest function -expect_that(str(rf$forest), prints_text("List of 10")) +expect_that(str(rf$forest), prints_text("List of 9")) diff --git a/tests/testthat/test_ranger.R b/tests/testthat/test_ranger.R index 0b4a6136b..80a6fa429 100644 --- a/tests/testthat/test_ranger.R +++ b/tests/testthat/test_ranger.R @@ -157,9 +157,12 @@ test_that("OOB error is correct for 1 tree, regression", { test_that("Missing value columns detected in training", { dat <- iris - dat[4, 5] <- NA dat[25, 1] <- NA - expect_error(ranger(Species ~ ., dat, num.trees = 5), "Missing data in columns: Species, Sepal.Length") + expect_error(ranger(Species ~ ., dat, num.trees = 5), "Missing data in columns: Sepal.Length") + + dat <- iris + dat[4, 5] <- NA + expect_error(ranger(Species ~ ., dat, num.trees = 5), "Missing data in dependent variable.") }) test_that("No error if missing value in irrelevant column, training", { @@ -178,84 +181,70 @@ test_that("No error if missing value in irrelevant column, prediction", { test_that("Split points are at (A+B)/2 for numeric features, regression variance splitting", { dat <- data.frame(y = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10) - split_points <- mapply(function(varID, value) { - value[varID > 0] - }, - rf$forest$split.varIDs, - rf$forest$split.values - ) + split_points <- sapply(1:rf$num.trees, function(i) { + res <- treeInfo(rf, i)$splitval + res[!is.na(res)] + }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, regression maxstat splitting", { dat <- data.frame(y = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10, splitrule = "maxstat", alpha = 1) - split_points <- mapply(function(varID, value) { - value[varID > 0] - }, - rf$forest$split.varIDs, - rf$forest$split.values - ) + split_points <- sapply(1:rf$num.trees, function(i) { + res <- treeInfo(rf, i)$splitval + res[!is.na(res)] + }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, classification", { dat <- data.frame(y = factor(rbinom(100, 1, .5)), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10) - split_points <- mapply(function(varID, value) { - value[varID > 0] - }, - rf$forest$split.varIDs, - rf$forest$split.values - ) + split_points <- sapply(1:rf$num.trees, function(i) { + res <- treeInfo(rf, i)$splitval + res[!is.na(res)] + }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, probability", { dat <- data.frame(y = factor(rbinom(100, 1, .5)), x = rbinom(100, 1, .5)) rf <- ranger(y ~ x, dat, num.trees = 10, probability = TRUE) - split_points <- mapply(function(varID, value) { - value[varID > 0] - }, - rf$forest$split.varIDs, - rf$forest$split.values - ) + split_points <- sapply(1:rf$num.trees, function(i) { + res <- treeInfo(rf, i)$splitval + res[!is.na(res)] + }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, survival logrank splitting", { dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "logrank") - split_points <- mapply(function(varID, value) { - value[varID > 0] - }, - rf$forest$split.varIDs, - rf$forest$split.values - ) + split_points <- sapply(1:rf$num.trees, function(i) { + res <- treeInfo(rf, i)$splitval + res[!is.na(res)] + }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, survival C-index splitting", { dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "C") - split_points <- mapply(function(varID, value) { - value[varID > 0] - }, - rf$forest$split.varIDs, - rf$forest$split.values - ) + split_points <- sapply(1:rf$num.trees, function(i) { + res <- treeInfo(rf, i)$splitval + res[!is.na(res)] + }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) test_that("Split points are at (A+B)/2 for numeric features, survival maxstat splitting", { dat <- data.frame(time = runif(100, 1, 10), status = rbinom(100, 1, .5), x = rbinom(100, 1, .5)) rf <- ranger(Surv(time, status) ~ x, dat, num.trees = 10, splitrule = "maxstat", alpha = 1) - split_points <- mapply(function(varID, value) { - value[varID > 0] - }, - rf$forest$split.varIDs, - rf$forest$split.values - ) + split_points <- sapply(1:rf$num.trees, function(i) { + res <- treeInfo(rf, i)$splitval + res[!is.na(res)] + }) expect_equal(split_points, rep(0.5, rf$num.trees)) }) diff --git a/tests/testthat/test_treeInfo.R b/tests/testthat/test_treeInfo.R index a49b56278..1c8a069c5 100644 --- a/tests/testthat/test_treeInfo.R +++ b/tests/testthat/test_treeInfo.R @@ -234,4 +234,20 @@ test_that("Spitting value is numeric for order splitting", { expect_is(ti.order$splitval[!ti.order$terminal & ti.order$splitvarName == "Species"], "numeric") }) +test_that("treeInfo works for 31 unordered factor levels but not for 32", { + n <- 31 + dt <- data.frame(x = factor(1:n, ordered = FALSE), + y = rbinom(n, 1, 0.5)) + rf <- ranger(y ~ ., data = dt, num.trees = 10, splitrule = "extratrees") + expect_silent(treeInfo(rf)) + + n <- 32 + dt <- data.frame(x = factor(1:n, ordered = FALSE), + y = rbinom(n, 1, 0.5)) + rf <- ranger(y ~ ., data = dt, num.trees = 10, splitrule = "extratrees") + expect_warning(treeInfo(rf), "Unordered splitting levels can only be shown for up to 31 levels.") +}) + + + diff --git a/tests/testthat/test_unordered.R b/tests/testthat/test_unordered.R index e7f17bf54..b7441c2cc 100644 --- a/tests/testthat/test_unordered.R +++ b/tests/testthat/test_unordered.R @@ -337,7 +337,7 @@ test_that("Partition splitting working for large number of levels", { rf <- ranger(y ~ ., data = dt, num.trees = 10, splitrule = "extratrees") max_split <- max(sapply(1:rf$num.trees, function(i) { - max(log2(rf$forest$split.values[[i]][rf$forest$split.varIDs[[i]] > 0])) + max(log2(rf$forest$split.values[[i]]), na.rm = TRUE) })) expect_lte(max_split, n)