In [1]:
import sklearn.datasets
import sklearn.ensemble
import numpy
import treelite
import treelite_runtime

In [2]:
# Work from the project folder; later cells use paths relative to it (e.g. './bin').
# NOTE(review): the recorded output shows Errno 2 followed by /home/jovyan/treelite-oob,
# so the kernel was presumably already inside that folder when this ran -- confirm
# where the notebook is meant to be launched from.
%cd treelite-oob

[Errno 2] No such file or directory: 'treelite-oob'
/home/jovyan/treelite-oob


In [3]:
# Load the California housing dataset, using ./bin as scikit-learn's data folder.
d = sklearn.datasets.fetch_california_housing(
    data_home = './bin',
)

In [4]:
# Feature matrix and regression target taken from the loaded dataset.
x, y = d['data'], d['target']
x.shape

(20640, 8)

In [5]:
# Shallow random forest. oob_score = True makes scikit-learn expose the
# out-of-bag score and predictions that later cells compare against.
forest = sklearn.ensemble.RandomForestRegressor(
    n_estimators = 100,
    max_depth = 3,
    oob_score = True,
    random_state = 0,
)
forest.fit(x, y)
(forest.oob_score_, len(forest.estimators_))

(0.5539804583170151, 100)

In [6]:
# Sanity check: rebuild the out-of-bag predictions by hand, to confirm that the
# bootstrap indices regenerated from each tree's random_state are the right ones.

n_samples = y.shape[0]
oob_preds = numpy.zeros(n_samples)
n_oob_preds = numpy.zeros(n_samples)

for tree in forest.estimators_:
    # Rows this tree did not draw in its bootstrap sample
    # (obtained from a private scikit-learn helper).
    unsampled_indices = sklearn.ensemble._forest._generate_unsampled_indices(
        tree.random_state, n_samples, n_samples
    )
    oob_pred = tree.predict(x[unsampled_indices, :])
    oob_preds[unsampled_indices] += oob_pred
    n_oob_preds[unsampled_indices] += 1

# Rows that were never out-of-bag keep a sum of 0; give them a count of 1
# so the division below does not produce 0/0.
n_oob_preds[n_oob_preds == 0] = 1
oob_preds /= n_oob_preds

In [7]:
# The hand-rolled OOB predictions must match scikit-learn's element for element.
(oob_preds == forest.oob_prediction_).all()

True

In [8]:
# vanilla treelite: import the fitted forest with treelite's own scikit-learn importer

In [9]:
# Baseline: let treelite import the fitted forest, compile it with gcc into a
# shared library, then load that library for prediction.
lib_sklearn = './bin/model_sklearn.so'
model_sklearn = treelite.sklearn.import_model(forest)
model_sklearn.export_lib(
    toolchain = 'gcc',
    libpath = lib_sklearn,
    params = {'parallel_comp': 32},
)
predictor_sklearn = treelite_runtime.Predictor(lib_sklearn)



[15:37:28] ../src/compiler/ast/split.cc:29: Parallel compilation enabled; member trees will be divided into 32 translation units.


In [10]:
# The compiled model should reproduce scikit-learn's own predictions.
dmat_sklearn = treelite_runtime.DMatrix(x, dtype = 'float32')
pred_sklearn = predictor_sklearn.predict(dmat_sklearn, verbose = True)
pred_truth = forest.predict(x)

numpy.allclose(pred_truth, pred_sklearn)

[15:37:31] ../src/predictor/predictor.cc:464: Treelite: Finished prediction in 0.0034318 sec


True

In [11]:
# hand-constructed treelite: rebuild the same forest node by node with treelite.ModelBuilder

In [12]:
def process_node(treelite_tree, sklearn_tree, node_id, sklearn_model):
    """Route one scikit-learn node to the leaf handler or the test-node handler.

    A node whose `children_left` entry is -1 is treated as a leaf; every other
    node is treated as a split (test) node. All four arguments are forwarded
    unchanged to the chosen handler.
    """
    is_leaf = sklearn_tree.children_left[node_id] == -1
    handler = process_leaf_node if is_leaf else process_test_node
    handler(treelite_tree, sklearn_tree, node_id, sklearn_model)

def process_test_node(treelite_tree, sklearn_tree, node_id, sklearn_model):
    """Copy one scikit-learn split node into the treelite tree under the same node ID.

    The split is always written as a numerical `feature <= threshold` test, on
    the assumption that scikit-learn tree ensembles only use numerical splits.
    Child keys reuse scikit-learn's node IDs, and `default_left` is set to True.

    Parameters
    ----------
    treelite_tree : indexable by node ID; the node must offer set_numerical_test_node.
    sklearn_tree  : object exposing feature, threshold, children_left and
                    children_right, each indexable by node ID.
    node_id       : ID of the node to copy.
    sklearn_model : not used here; kept so all node handlers share one signature.
    """
    split_spec = {
        'feature_id': sklearn_tree.feature[node_id],
        'opname': '<=',
        'threshold': sklearn_tree.threshold[node_id],
        'threshold_type': 'float64',
        'default_left': True,
        'left_child_key': sklearn_tree.children_left[node_id],
        'right_child_key': sklearn_tree.children_right[node_id],
    }
    treelite_tree[node_id].set_numerical_test_node(**split_spec)

def process_leaf_node(treelite_tree, sklearn_tree, node_id, sklearn_model):
    """Copy one scikit-learn leaf node into the treelite tree under the same node ID.

    The leaf output is read from `sklearn_tree.value[node_id]`, squeezed to drop
    its singleton dimensions, and stored as a float64 leaf value.

    Parameters
    ----------
    treelite_tree : indexable by node ID; the node must offer set_leaf_node.
    sklearn_tree  : object whose `value[node_id]` holds the leaf output.
    node_id       : ID of the leaf to copy.
    sklearn_model : not used here; kept so all node handlers share one signature.
    """
    node_output = sklearn_tree.value[node_id].squeeze()
    target_node = treelite_tree[node_id]
    target_node.set_leaf_node(node_output, leaf_value_type = 'float64')


# Rebuild the forest by hand: one treelite tree per fitted estimator, reusing
# scikit-learn's node IDs so the child keys line up without any remapping.
builder_hand = treelite.ModelBuilder(
    num_feature = forest.n_features_in_,
    average_tree_output = True,
    threshold_type = 'float64',
    leaf_output_type = 'float64',
)
for estimator in forest.estimators_:
    sklearn_tree = estimator.tree_
    treelite_tree = treelite.ModelBuilder.Tree(
        threshold_type = 'float64',
        leaf_output_type = 'float64',
    )

    for node_id in range(sklearn_tree.node_count):
        process_node(treelite_tree, sklearn_tree, node_id, sklearn_model = forest)
    # Node 0 is marked as the root of every tree.
    treelite_tree[0].set_root()
    builder_hand.append(treelite_tree)

model_hand = builder_hand.commit()

In [13]:
# Compile the hand-built model with gcc and load the resulting shared library.
lib_hand = './bin/model_hand.so'
model_hand.export_lib(
    toolchain = 'gcc',
    libpath = lib_hand,
    params = {'parallel_comp': 32},
)
predictor_hand = treelite_runtime.Predictor(lib_hand)

[15:37:32] ../src/compiler/ast/split.cc:29: Parallel compilation enabled; member trees will be divided into 32 translation units.


In [14]:
# Show the leaf output type the compiled library reports, then confirm that the
# hand-built model reproduces scikit-learn's predictions.
print(predictor_hand.leaf_output_type)

dmat_hand = treelite_runtime.DMatrix(x, dtype = 'float32')
pred_hand = predictor_hand.predict(dmat_hand, verbose = True)
pred_truth = forest.predict(x)

numpy.allclose(pred_truth, pred_hand)

float64
[15:37:36] ../src/predictor/predictor.cc:464: Treelite: Finished prediction in 0.00633693 sec


True

In [15]:
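# out-of-bag (OOB): give every tree its own output slot so per-tree predictions can be recovered and averaged over OOB trees only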

In [16]:
def process_node_oob(treelite_tree, sklearn_tree, node_id, sklearn_model, tree_index):
    """Route one scikit-learn node to the OOB leaf handler or the OOB test-node handler.

    A node whose `children_left` entry is -1 is treated as a leaf and is handled
    by process_leaf_node_oob, which also receives `tree_index`. Every other node
    is a split node and goes to process_test_node_oob.
    """
    is_split = sklearn_tree.children_left[node_id] != -1
    if is_split:
        process_test_node_oob(treelite_tree, sklearn_tree, node_id, sklearn_model)
        return
    process_leaf_node_oob(treelite_tree, sklearn_tree, node_id, sklearn_model, tree_index)

def process_test_node_oob(treelite_tree, sklearn_tree, node_id, sklearn_model):
    """Copy one scikit-learn split node into the treelite tree (OOB model).

    Split nodes are written exactly as in the hand-constructed model: a numerical
    `feature <= threshold` test with float64 threshold, default_left = True, and
    child keys equal to scikit-learn's node IDs. Only leaves differ in the OOB
    model, so this delegates to process_test_node (defined in the hand-constructed
    section) rather than keeping a second copy of the same body.

    Parameters
    ----------
    treelite_tree : indexable by node ID; the node must offer set_numerical_test_node.
    sklearn_tree  : object exposing feature, threshold, children_left and
                    children_right, each indexable by node ID.
    node_id       : ID of the node to copy.
    sklearn_model : forwarded unchanged; not used by the split handler.
    """
    process_test_node(treelite_tree, sklearn_tree, node_id, sklearn_model)

def process_leaf_node_oob(treelite_tree, sklearn_tree, node_id, sklearn_model, tree_index):
    """Copy one scikit-learn leaf node into the treelite tree as a per-tree vector.

    The leaf becomes a float64 vector of length `sklearn_model.n_estimators` that
    is zero everywhere except at position `tree_index`, which holds the squeezed
    value of `sklearn_tree.value[node_id]`. Each tree therefore writes only to
    its own output slot.

    Parameters
    ----------
    treelite_tree : indexable by node ID; the node must offer set_leaf_node.
    sklearn_tree  : object whose `value[node_id]` holds the leaf output.
    node_id       : ID of the leaf to copy.
    sklearn_model : object exposing `n_estimators` (the vector length).
    tree_index    : position of this tree in the ensemble (the slot to fill).
    """
    node_output = sklearn_tree.value[node_id].squeeze()
    n_slots = sklearn_model.n_estimators
    per_tree_output = numpy.zeros(n_slots)
    per_tree_output[tree_index] = node_output
    treelite_tree[node_id].set_leaf_node(per_tree_output, leaf_value_type = 'float64')


# OOB model: declared with one output slot per tree (num_class = n_estimators),
# so the per-tree leaf vectors keep each tree's prediction in its own slot.
builder_oob = treelite.ModelBuilder(
    num_feature = forest.n_features_in_,
    average_tree_output = True,
    pred_transform = 'identity_multiclass',
    num_class = forest.n_estimators,
    threshold_type = 'float64',
    leaf_output_type = 'float64',
)
for i, estimator in enumerate(forest.estimators_):
    sklearn_tree = estimator.tree_
    treelite_tree = treelite.ModelBuilder.Tree(
        threshold_type = 'float64',
        leaf_output_type = 'float64',
    )

    for node_id in range(sklearn_tree.node_count):
        process_node_oob(treelite_tree, sklearn_tree, node_id, sklearn_model = forest, tree_index = i)
    # Node 0 is marked as the root of every tree.
    treelite_tree[0].set_root()
    builder_oob.append(treelite_tree)

model_oob = builder_oob.commit()

In [17]:
# Compile the OOB model with gcc and load the resulting shared library.
lib_oob = './bin/model_oob.so'
model_oob.export_lib(
    toolchain = 'gcc',
    libpath = lib_oob,
    params = {'parallel_comp': 32},
)
predictor_oob = treelite_runtime.Predictor(lib_oob)

[15:37:38] ../src/compiler/ast/split.cc:29: Parallel compilation enabled; member trees will be divided into 32 translation units.


In [18]:
# Show the leaf output type the compiled library reports, then get the raw
# per-tree output for every observation. Later cells index pred_oob as
# (observation, tree).
# The forest's own prediction is not recomputed here: nothing in this cell or
# the cells after it reads it.
print(predictor_oob.leaf_output_type)

pred_oob = predictor_oob.predict(treelite_runtime.DMatrix(x, dtype = 'float32'), verbose = True)

float64
[15:37:54] ../src/predictor/predictor.cc:464: Treelite: Finished prediction in 0.0161564 sec


In [19]:
# sampled_by_tree[t, r] is True when tree t drew row r in its bootstrap sample.
# Start from all True, then clear the rows each tree left out.
n_samples = y.shape[0]
sampled_by_tree = numpy.ones((forest.n_estimators, n_samples), dtype = 'bool')
for i, tree in enumerate(forest.estimators_):
    unsampled_indices = sklearn.ensemble._forest._generate_unsampled_indices(
        tree.random_state, n_samples, n_samples
    )
    sampled_by_tree[i, unsampled_indices] = False

print(sampled_by_tree.shape)
sampled_by_tree

(100, 20640)


array([[False, False,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True, False,  True],
       [ True, False,  True, ..., False,  True, False],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [ True, False, False, ..., False,  True, False],
       [False, False, False, ...,  True,  True,  True]])

In [20]:
# Which trees drew the first observation in their bootstrap sample
# (True = sampled by that tree, False = out-of-bag for that tree).
sampled_by_tree.T[0]

array([False,  True,  True,  True, False, False,  True,  True, False,
       False,  True,  True,  True,  True, False,  True,  True, False,
        True, False, False,  True,  True,  True, False,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
       False,  True, False, False,  True,  True, False,  True,  True,
       False,  True,  True, False,  True, False,  True, False, False,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True,  True,  True, False, False, False,
        True,  True, False,  True,  True,  True,  True, False,  True,
       False])

In [21]:
# Raw treelite output for the first observation: one slot per tree.
# NOTE(review): the values are presumably each tree's prediction divided by
# n_estimators, since the model was built with average_tree_output = True -- confirm.
pred_oob[0, :]

array([0.04560552, 0.04570032, 0.04586979, 0.04569512, 0.03821047,
       0.04371979, 0.04589362, 0.0402859 , 0.04581989, 0.04577618,
       0.0400397 , 0.04544045, 0.03926476, 0.04001247, 0.04592836,
       0.04122305, 0.0412437 , 0.03999852, 0.04589456, 0.0392282 ,
       0.04590753, 0.04582622, 0.03946538, 0.0397702 , 0.04303944,
       0.04600728, 0.04095347, 0.04560237, 0.04599909, 0.04586869,
       0.03972882, 0.03990556, 0.04738166, 0.04646981, 0.03913978,
       0.04602887, 0.04108574, 0.04012991, 0.04545299, 0.03930916,
       0.04622483, 0.04627873, 0.04550603, 0.04582898, 0.0462196 ,
       0.04567977, 0.04682332, 0.04071481, 0.04121475, 0.04610376,
       0.04565651, 0.04593056, 0.0458457 , 0.04548754, 0.04324523,
       0.04683947, 0.03986283, 0.04567417, 0.04694492, 0.04606315,
       0.04620698, 0.03963573, 0.0394904 , 0.0390622 , 0.03992085,
       0.04229458, 0.04585559, 0.04605572, 0.04589857, 0.03961649,
       0.04561612, 0.04726357, 0.04569323, 0.04021279, 0.04561

In [22]:
# OOB prediction for the first observation: average only the slots of trees that
# did not sample it, then multiply by n_estimators to rescale the treelite output.
first_obs_is_oob = numpy.invert(sampled_by_tree[:, 0])
pred_oob[0][first_obs_is_oob].mean() * forest.n_estimators

4.268926388720941

In [23]:
# OOB prediction for every observation, computed in one vectorised pass instead
# of a Python loop over all rows.
# oob_mask[r, t] is True when tree t did NOT sample row r, i.e. row r is
# out-of-bag for tree t. pred_oob is indexed the same way: (observation, tree).
oob_mask = numpy.invert(sampled_by_tree).T
oob_counts = oob_mask.sum(axis = 1)
# A row sampled by every tree has no OOB slots. Use a count of 1 so it yields 0
# rather than the nan that the mean of an empty selection would give.
oob_counts[oob_counts == 0] = 1
# Sum the OOB slots only, average over the number of OOB trees, and multiply by
# n_estimators to rescale the treelite output.
pred_oob_mean = (pred_oob * oob_mask).sum(axis = 1) / oob_counts * forest.n_estimators
numpy.allclose(forest.oob_prediction_, pred_oob_mean)

True