new approach to save sampleIDs in nodes
mnwright committed Dec 5, 2018
1 parent 976c058 commit e901ca8
Showing 13 changed files with 224 additions and 141 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: ranger
Type: Package
Title: A Fast Implementation of Random Forests
Version: 0.10.5
Date: 2018-09-13
Version: 0.10.6
Date: 2018-12-5
Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb]
Maintainer: Marvin N. Wright <cran@wrig.de>
Description: A fast implementation of Random Forests, particularly suited for high
3 changes: 3 additions & 0 deletions NEWS
@@ -1,3 +1,6 @@
##### Version 0.10.6
* Internal changes (slightly improved computation speed)

##### Version 0.10.5
* Add support of splitting weights for corrected impurity importance
* Bug fixes
3 changes: 3 additions & 0 deletions NEWS.md
@@ -1,3 +1,6 @@
##### Version 0.10.6
* Internal changes (slightly improved computation speed)

##### Version 0.10.5
* Add support of splitting weights for corrected impurity importance
* Bug fixes
2 changes: 1 addition & 1 deletion cpp_version/src/version.h
@@ -1,3 +1,3 @@
#ifndef RANGER_VERSION
#define RANGER_VERSION "0.10.5"
#define RANGER_VERSION "0.10.6"
#endif
15 changes: 8 additions & 7 deletions src/Data.cpp
@@ -151,14 +151,14 @@ bool Data::loadFromFileOther(std::ifstream& input_file, std::string header_line,
}
// #nocov end

void Data::getAllValues(std::vector<double>& all_values, std::vector<size_t>& sampleIDs, size_t varID) const {
void Data::getAllValues(std::vector<double>& all_values, std::vector<size_t>& sampleIDs, size_t varID, size_t start, size_t end) const {

// All values for varID (no duplicates) for given sampleIDs
if (getUnpermutedVarID(varID) < num_cols_no_snp) {

all_values.reserve(sampleIDs.size());
for (size_t i = 0; i < sampleIDs.size(); ++i) {
all_values.push_back(get(sampleIDs[i], varID));
all_values.reserve(end-start);
for (size_t pos = start; pos < end; ++pos) {
all_values.push_back(get(sampleIDs[pos], varID));
}
std::sort(all_values.begin(), all_values.end());
all_values.erase(std::unique(all_values.begin(), all_values.end()), all_values.end());
@@ -168,13 +168,14 @@ void Data::getAllValues(std::vector<double>& all_values, std::vector<size_t>& sa
}
}

void Data::getMinMaxValues(double& min, double&max, std::vector<size_t>& sampleIDs, size_t varID) const {
// TODO: Better way without two versions?
void Data::getMinMaxValues(double& min, double&max, std::vector<size_t>& sampleIDs, size_t varID, size_t start, size_t end) const {
if (sampleIDs.size() > 0) {
min = get(sampleIDs[0], varID);
max = min;
}
for (size_t i = 1; i < sampleIDs.size(); ++i) {
double value = get(sampleIDs[i], varID);
for (size_t pos = start; pos < end; ++pos) {
double value = get(sampleIDs[pos], varID);
if (value < min) {
min = value;
}
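The new signature takes a node's half-open range [start, end) into the shared sampleIDs vector instead of a per-node copy of the IDs. A self-contained sketch of the same collect, sort and deduplicate step over such a range (the covariate vector, sample ordering and function name below are hypothetical, not part of ranger):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Collect the values of one variable for the samples stored in the half-open
// range [start, end) of a shared sampleIDs vector, then sort and deduplicate.
std::vector<double> uniqueValuesInNode(const std::vector<double>& x,
                                       const std::vector<std::size_t>& sampleIDs,
                                       std::size_t start, std::size_t end) {
  std::vector<double> all_values;
  all_values.reserve(end - start);
  for (std::size_t pos = start; pos < end; ++pos) {
    all_values.push_back(x[sampleIDs[pos]]);
  }
  std::sort(all_values.begin(), all_values.end());
  all_values.erase(std::unique(all_values.begin(), all_values.end()), all_values.end());
  return all_values;
}

int main() {
  std::vector<double> x = {0.3, 0.1, 0.3, 0.2};       // variable values, indexed by sampleID
  std::vector<std::size_t> sampleIDs = {2, 0, 1, 3};  // tree-wide sample ordering
  // A node occupying positions [0, 3) holds samples 2, 0, 1 -> values 0.3, 0.3, 0.1
  for (double v : uniqueValuesInNode(x, sampleIDs, 0, 3)) {
    std::cout << v << '\n';                           // prints 0.1 then 0.3
  }
  return 0;
}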
4 changes: 2 additions & 2 deletions src/Data.h
@@ -44,9 +44,9 @@ class Data {
bool loadFromFileWhitespace(std::ifstream& input_file, std::string header_line);
bool loadFromFileOther(std::ifstream& input_file, std::string header_line, char seperator);

void getAllValues(std::vector<double>& all_values, std::vector<size_t>& sampleIDs, size_t varID) const;
void getAllValues(std::vector<double>& all_values, std::vector<size_t>& sampleIDs, size_t varID, size_t start, size_t end) const;

void getMinMaxValues(double& min, double&max, std::vector<size_t>& sampleIDs, size_t varID) const;
void getMinMaxValues(double& min, double&max, std::vector<size_t>& sampleIDs, size_t varID, size_t start, size_t end) const;

size_t getIndex(size_t row, size_t col) const {
// Use permuted data for corrected impurity importance
66 changes: 43 additions & 23 deletions src/Tree.cpp
@@ -103,6 +103,10 @@ void Tree::grow(std::vector<double>* variable_importance) {
}
}

// Init start and end positions
start_pos[0] = 0;
end_pos[0] = sampleIDs.size();

// While not all nodes terminal, split next node
size_t num_open_nodes = 1;
size_t i = 0;
@@ -117,7 +121,7 @@ void Tree::grow(std::vector<double>* variable_importance) {
if (i >= last_left_nodeID) {
// If new level, increase depth
// (left_node saves left-most node in current level, new level reached if that node is splitted)
last_left_nodeID = sampleIDs.size() - 2;
last_left_nodeID = split_varIDs.size() - 2;
++depth;
}
}
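With sampleIDs flattened, the root node is initialised to cover the whole vector, and the number of nodes created so far is now read from split_varIDs.size() (createEmptyNode appends exactly one entry per node) rather than from sampleIDs.size(), which no longer counts nodes. A minimal illustration of that bookkeeping, simplified from Tree::grow and Tree::createEmptyNode (only the vectors used here are kept):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  // Per-node bookkeeping as in Tree.h below: one entry per node.
  std::vector<std::size_t> split_varIDs, start_pos, end_pos;
  std::vector<std::size_t> sampleIDs = {4, 1, 7, 0, 2};  // flat, tree-wide

  auto createEmptyNode = [&]() {  // simplified Tree::createEmptyNode
    split_varIDs.push_back(0);
    start_pos.push_back(0);
    end_pos.push_back(0);
  };

  createEmptyNode();              // root, nodeID 0
  start_pos[0] = 0;
  end_pos[0] = sampleIDs.size();  // root covers the whole flat vector

  createEmptyNode();              // left child of the root
  createEmptyNode();              // right child of the root
  // The left-most node of the new level is the node count minus 2.
  assert(split_varIDs.size() - 2 == 1);
  return 0;
}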
@@ -253,7 +257,8 @@ void Tree::createPossibleSplitVarSubset(std::vector<size_t>& result) {
drawWithoutReplacementSkip(result, random_number_generator, num_vars, data->getNoSplitVariables(), mtry);
} else {
std::vector<size_t> skip;
std::copy(data->getNoSplitVariables().begin(), data->getNoSplitVariables().end(), std::inserter(skip, skip.end()));
std::copy(data->getNoSplitVariables().begin(), data->getNoSplitVariables().end(),
std::inserter(skip, skip.end()));
std::copy(deterministic_varIDs->begin(), deterministic_varIDs->end(), std::inserter(skip, skip.end()));
std::sort(skip.begin(), skip.end());
drawWithoutReplacementSkip(result, random_number_generator, num_vars, skip, mtry);
@@ -286,41 +291,56 @@ bool Tree::splitNode(size_t nodeID) {
split_varIDs[nodeID] = data->getUnpermutedVarID(split_varID);

// Create child nodes
size_t left_child_nodeID = sampleIDs.size();
size_t left_child_nodeID = split_varIDs.size();
child_nodeIDs[0][nodeID] = left_child_nodeID;
createEmptyNode();
start_pos[left_child_nodeID] = start_pos[nodeID];

size_t right_child_nodeID = sampleIDs.size();
size_t right_child_nodeID = split_varIDs.size();
child_nodeIDs[1][nodeID] = right_child_nodeID;
createEmptyNode();
start_pos[right_child_nodeID] = end_pos[nodeID];

// For each sample in node, assign to left or right child
if (data->isOrderedVariable(split_varID)) {
// Ordered: left is <= splitval and right is > splitval
for (auto& sampleID : sampleIDs[nodeID]) {
size_t pos = start_pos[nodeID];
while (pos < start_pos[right_child_nodeID]) {
size_t sampleID = sampleIDs[pos];
if (data->get(sampleID, split_varID) <= split_value) {
sampleIDs[left_child_nodeID].push_back(sampleID);
// If going to left, do nothing
++pos;
} else {
sampleIDs[right_child_nodeID].push_back(sampleID);
// If going to right, move to right end
--start_pos[right_child_nodeID];
std::swap(sampleIDs[pos], sampleIDs[start_pos[right_child_nodeID]]);
}
}
} else {
// Unordered: If bit at position is 1 -> right, 0 -> left
for (auto& sampleID : sampleIDs[nodeID]) {

size_t pos = start_pos[nodeID];
while (pos < start_pos[right_child_nodeID]) {
size_t sampleID = sampleIDs[pos];
double level = data->get(sampleID, split_varID);
size_t factorID = floor(level) - 1;
size_t splitID = floor(split_value);

// Left if 0 found at position factorID
if (!(splitID & (1 << factorID))) {
sampleIDs[left_child_nodeID].push_back(sampleID);
// If going to left, do nothing
++pos;
} else {
sampleIDs[right_child_nodeID].push_back(sampleID);
// If going to right, move to right end
--start_pos[right_child_nodeID];
std::swap(sampleIDs[pos], sampleIDs[start_pos[right_child_nodeID]]);
}
}
}

// End position of left child is start position of right child
end_pos[left_child_nodeID] = start_pos[right_child_nodeID];
end_pos[right_child_nodeID] = end_pos[nodeID];

// No terminal node
return false;
}
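The per-node child vectors are gone: a node's slice of the shared sampleIDs vector is partitioned in place. Samples going left stay where they are, samples going right are swapped to the shrinking tail of the slice, and the final boundary becomes end_pos of the left child and start_pos of the right child. A self-contained sketch of that partition scheme, with a hypothetical covariate vector and predicate standing in for data->get() and the split test:

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Partition sampleIDs[start, end) in place: IDs satisfying goesLeft stay in the
// front part, the rest are swapped to the back. Returns the boundary position,
// i.e. end_pos of the left child == start_pos of the right child.
template <typename Pred>
std::size_t partitionNode(std::vector<std::size_t>& sampleIDs,
                          std::size_t start, std::size_t end, Pred goesLeft) {
  std::size_t pos = start;
  std::size_t right_start = end;
  while (pos < right_start) {
    if (goesLeft(sampleIDs[pos])) {
      ++pos;                                             // going left: leave in place
    } else {
      --right_start;                                     // going right: move to right end
      std::swap(sampleIDs[pos], sampleIDs[right_start]);
    }
  }
  return right_start;
}

int main() {
  std::vector<double> x = {0.9, 0.1, 0.5, 0.7, 0.2};     // one covariate per sample
  std::vector<std::size_t> sampleIDs = {0, 1, 2, 3, 4};  // root node covers [0, 5)
  double split_value = 0.5;                              // ordered split: x <= 0.5 goes left
  std::size_t boundary = partitionNode(
      sampleIDs, 0, sampleIDs.size(),
      [&](std::size_t id) { return x[id] <= split_value; });
  std::cout << "boundary = " << boundary << '\n';          // 3: left child is [0, 3), right is [3, 5)
  for (std::size_t id : sampleIDs) std::cout << id << ' ';  // prints 4 1 2 3 0
  std::cout << '\n';
  return 0;
}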
@@ -330,7 +350,8 @@ void Tree::createEmptyNode() {
split_values.push_back(0);
child_nodeIDs[0].push_back(0);
child_nodeIDs[1].push_back(0);
sampleIDs.push_back(std::vector<size_t>());
start_pos.push_back(0);
end_pos.push_back(0);

createEmptyNodeInternal();
}
@@ -395,7 +416,7 @@ void Tree::bootstrap() {
size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0];

// Reserve space, reserve a little more to be save)
sampleIDs[0].reserve(num_samples_inbag);
sampleIDs.reserve(num_samples_inbag);
oob_sampleIDs.reserve(num_samples * (exp(-(*sample_fraction)[0]) + 0.1));

std::uniform_int_distribution<size_t> unif_dist(0, num_samples - 1);
@@ -406,7 +427,7 @@
// Draw num_samples samples with replacement (num_samples_inbag out of n) as inbag and mark as not OOB
for (size_t s = 0; s < num_samples_inbag; ++s) {
size_t draw = unif_dist(random_number_generator);
sampleIDs[0].push_back(draw);
sampleIDs.push_back(draw);
++inbag_counts[draw];
}
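The reserve sizes follow standard bootstrap arithmetic: drawing s*n samples with replacement from n observations leaves a given observation out with probability (1 - 1/n)^(s*n), which tends to e^(-s), about 0.3679 for s = 1, so roughly 63.21% of the observations are inbag at least once and about e^(-s)*n end up OOB (the extra 0.1*n reserved above is head-room). A quick stand-alone check of that expectation, not part of the commit:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

int main() {
  const std::size_t num_samples = 100000;
  const double sample_fraction = 1.0;  // default: draw n out of n with replacement
  const std::size_t num_inbag =
      static_cast<std::size_t>(num_samples * sample_fraction);

  std::mt19937_64 rng(42);
  std::uniform_int_distribution<std::size_t> unif_dist(0, num_samples - 1);
  std::vector<unsigned> inbag_counts(num_samples, 0);
  for (std::size_t s = 0; s < num_inbag; ++s) {
    ++inbag_counts[unif_dist(rng)];
  }

  std::size_t num_oob = 0;
  for (unsigned c : inbag_counts) {
    if (c == 0) ++num_oob;
  }
  // The observed OOB fraction is close to exp(-sample_fraction) ~ 0.3679,
  // i.e. about 63.21% of the observations are inbag at least once.
  std::cout << "observed OOB fraction: " << static_cast<double>(num_oob) / num_samples << '\n';
  std::cout << "expected:              " << std::exp(-sample_fraction) << '\n';
  return 0;
}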

@@ -430,7 +451,7 @@ void Tree::bootstrapWeighted() {
size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0];

// Reserve space, reserve a little more to be save)
sampleIDs[0].reserve(num_samples_inbag);
sampleIDs.reserve(num_samples_inbag);
oob_sampleIDs.reserve(num_samples * (exp(-(*sample_fraction)[0]) + 0.1));

std::discrete_distribution<> weighted_dist(case_weights->begin(), case_weights->end());
@@ -441,7 +462,7 @@
// Draw num_samples samples with replacement (n out of n) as inbag and mark as not OOB
for (size_t s = 0; s < num_samples_inbag; ++s) {
size_t draw = weighted_dist(random_number_generator);
sampleIDs[0].push_back(draw);
sampleIDs.push_back(draw);
++inbag_counts[draw];
}

@@ -471,7 +492,7 @@ void Tree::bootstrapWithoutReplacement() {

// Use fraction (default 63.21%) of the samples
size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0];
shuffleAndSplit(sampleIDs[0], oob_sampleIDs, num_samples, num_samples_inbag, random_number_generator);
shuffleAndSplit(sampleIDs, oob_sampleIDs, num_samples, num_samples_inbag, random_number_generator);
num_samples_oob = oob_sampleIDs.size();

if (keep_inbag) {
@@ -487,12 +508,11 @@ void Tree::bootstrapWithoutReplacementWeighted() {

// Use fraction (default 63.21%) of the samples
size_t num_samples_inbag = (size_t) num_samples * (*sample_fraction)[0];
drawWithoutReplacementWeighted(sampleIDs[0], random_number_generator, num_samples - 1, num_samples_inbag,
*case_weights);
drawWithoutReplacementWeighted(sampleIDs, random_number_generator, num_samples - 1, num_samples_inbag, *case_weights);

// All observation are 0 or 1 times inbag
inbag_counts.resize(num_samples, 0);
for (auto& sampleID : sampleIDs[0]) {
for (auto& sampleID : sampleIDs) {
inbag_counts[sampleID] = 1;
}

@@ -528,13 +548,13 @@ void Tree::bootstrapWithoutReplacementClassWise() {

void Tree::setManualInbag() {
// Select observation as specified in manual_inbag vector
sampleIDs[0].reserve(manual_inbag->size());
sampleIDs.reserve(manual_inbag->size());
inbag_counts.resize(num_samples, 0);
for (size_t i = 0; i < manual_inbag->size(); ++i) {
size_t inbag_count = (*manual_inbag)[i];
if ((*manual_inbag)[i] > 0) {
for (size_t j = 0; j < inbag_count; ++j) {
sampleIDs[0].push_back(i);
sampleIDs.push_back(i);
}
inbag_counts[i] = inbag_count;
} else {
@@ -544,7 +564,7 @@ void Tree::setManualInbag() {
num_samples_oob = oob_sampleIDs.size();

// Shuffle samples
std::shuffle(sampleIDs[0].begin(), sampleIDs[0].end(), random_number_generator);
std::shuffle(sampleIDs.begin(), sampleIDs.end(), random_number_generator);

if (!keep_inbag) {
inbag_counts.clear();
8 changes: 6 additions & 2 deletions src/Tree.h
@@ -136,8 +136,12 @@ class Tree {
// Vector of left and right child node IDs, 0 for no child
std::vector<std::vector<size_t>> child_nodeIDs;

// For each node a vector with IDs of samples in node
std::vector<std::vector<size_t>> sampleIDs;
// All sampleIDs in the tree, will be re-ordered while splitting
std::vector<size_t> sampleIDs;

// For each node a vector with start and end positions
std::vector<size_t> start_pos;
std::vector<size_t> end_pos;

// IDs of OOB individuals, sorted
std::vector<size_t> oob_sampleIDs;
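In this layout the samples of node nodeID are the contiguous slice of sampleIDs from start_pos[nodeID] up to (but not including) end_pos[nodeID], so visiting a node needs no per-node allocation. A minimal sketch of reading one node under that convention (the free function and example values are hypothetical):

#include <cstddef>
#include <iostream>
#include <vector>

// Print the sampleIDs belonging to one node in the flat layout: the node owns
// the half-open range [start_pos[nodeID], end_pos[nodeID]) of sampleIDs.
void printNodeSamples(const std::vector<std::size_t>& sampleIDs,
                      const std::vector<std::size_t>& start_pos,
                      const std::vector<std::size_t>& end_pos,
                      std::size_t nodeID) {
  for (std::size_t pos = start_pos[nodeID]; pos < end_pos[nodeID]; ++pos) {
    std::cout << sampleIDs[pos] << ' ';
  }
  std::cout << '\n';
}

int main() {
  // Root (node 0) already split into node 1 (left) and node 2 (right).
  std::vector<std::size_t> sampleIDs = {4, 1, 2, 3, 0};
  std::vector<std::size_t> start_pos = {0, 0, 3};
  std::vector<std::size_t> end_pos   = {5, 3, 5};
  printNodeSamples(sampleIDs, start_pos, end_pos, 1);  // prints 4 1 2
  printNodeSamples(sampleIDs, start_pos, end_pos, 2);  // prints 3 0
  return 0;
}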
