# Train "large" RF Model
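Fit a large random forest and save it as both a pickle (`.pkl`) and an ONNX (`.onnx`) file, then compare the file size and load time of the two formats.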



In [1]:
import glob
import os
import numpy as np
import pandas as pd
from io import StringIO
import joblib
import time

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from skl2onnx import convert_sklearn, to_onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import update_registered_converter
from skl2onnx.common.data_types import guess_numpy_type
from skl2onnx.algebra.onnx_ops import OnnxAdd, OnnxMul, OnnxReduceSum

import onnxruntime as rt
import onnxmltools

from mlprodict.onnxrt import OnnxInference
from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
import matplotlib.pyplot as plt

In [2]:
# One seed shared by the synthetic-data generator and the random forest so that
# re-running the notebook reproduces the same data and the same fitted model.
RANDOM_SEED = 123

## Make training data

In [3]:
# Synthetic single-target regression problem: 10000 rows by 35 features,
# of which 20 actually carry signal. Seeded for reproducibility.
X, y = make_regression(
    random_state=RANDOM_SEED,
    n_targets=1,
    n_informative=20,
    n_features=35,
    n_samples=10000,
)
# Sanity check: show the shapes of the feature matrix and the target vector.
print(*(arr.shape for arr in (X, y)))

(10000, 35) (10000,)


## Random Forest Model


In [4]:
# A deliberately large forest (1000 trees) so that the serialized artifacts are
# big enough for size and load-time differences between formats to be visible.
rf = RandomForestRegressor(
    random_state=RANDOM_SEED,  # reproducible bootstrap samples and splits
    n_estimators=1000,
    n_jobs=4,                  # fit trees on 4 parallel workers
    verbose=1,                 # emit joblib progress lines while fitting
)


In [5]:
%%time
# Fit the forest on the full synthetic dataset; %%time reports the CPU and
# wall-clock cost, and the fitted estimator is displayed as the cell result.
rf.fit(X, y)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   22.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   38.5s


CPU times: user 3min 11s, sys: 753 ms, total: 3min 12s
Wall time: 48.3 s


[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   48.0s finished


RandomForestRegressor(n_estimators=1000, n_jobs=4, random_state=123, verbose=1)

In [6]:
# Delete any pickle left over from a previous run before a fresh one is written.
try:
    os.unlink('./large_rf_model.pkl')
except FileNotFoundError:
    pass  # nothing to clean up: no earlier artifact exists

In [7]:
%%time
# Persist the fitted forest with joblib; the call returns the list of file
# names it wrote, which is displayed as the cell result.
joblib.dump(rf,'./large_rf_model.pkl')

CPU times: user 780 ms, sys: 475 ms, total: 1.26 s
Wall time: 5.51 s


['./large_rf_model.pkl']

In [8]:
# Delete any ONNX file left over from a previous run before a fresh one is written.
try:
    os.unlink('./large_rf_model.onnx')
except FileNotFoundError:
    pass  # nothing to clean up: no earlier artifact exists

In [9]:
%%time
# save model as onnx
explanatory_var = [('float_input', FloatTensorType([None, 35]))]
onnx_model = convert_sklearn(rf, initial_types=explanatory_var)

with open('./large_rf_model.onnx', 'wb') as f:
    f.write(onnx_model.SerializeToString())

CPU times: user 4min 28s, sys: 21 s, total: 4min 49s
Wall time: 4min 54s


In [10]:
# Drop the in-memory model objects; the metrics section reloads both from disk.
del rf, onnx_model

## Model Metrics

In [11]:
%%time
# collect data on RF tree structure
tree_metrics = []
rf_models = glob.glob('./large_rf_model.pkl')
for model in rf_models:
    # get file sizes
    fp_parts = os.path.splitext(model)
    metrics = {'model': fp_parts[0].split('/')[-1]}
    metrics['sklearn_file_size_mb'] = os.path.getsize(model) / (1024 * 1024)
    metrics['onnx_file_size_mb'] = os.path.getsize(os.path.join(fp_parts[0] + '.onnx')) / (1024 * 1024)
    
    t0 = time.perf_counter()
    sess = rt.InferenceSession(os.path.join(fp_parts[0] + '.onnx'))
    metrics['onnx_load_time_ms'] = 1000 * (time.perf_counter() - t0)
    del sess
    
    # extract tree structure
    t0 = time.perf_counter()
    rf = joblib.load(model)
    metrics['sklearn_load_time_ms'] = 1000 * (time.perf_counter() - t0)
    metrics['number_of_trees'] = len(rf.estimators_)
    tree_depth = [tree.tree_.max_depth for tree in rf.estimators_]
    metrics['tree_min_depth'] = np.min(tree_depth)
    metrics['tree_max_depth'] = np.max(tree_depth)
    metrics['tree_mean_depth'] = np.mean(tree_depth)
    del rf

    # collect metrics
    tree_metrics.append(metrics)

CPU times: user 38.1 s, sys: 2.94 s, total: 41.1 s
Wall time: 48.6 s


In [12]:
# overview of tree structure
tree_metrics_df = pd.DataFrame(tree_metrics)
# reorder the columns
tree_metrics_df = tree_metrics_df[
    [
        'model', 'sklearn_file_size_mb', 'onnx_file_size_mb', 'sklearn_load_time_ms', 'onnx_load_time_ms',
        'number_of_trees', 'tree_min_depth', 'tree_mean_depth', 'tree_max_depth'
    ]
]
tree_metrics_df

Unnamed: 0,model,sklearn_file_size_mb,onnx_file_size_mb,sklearn_load_time_ms,onnx_load_time_ms,number_of_trees,tree_min_depth,tree_mean_depth,tree_max_depth
0,large_rf_model,771.87106,485.680079,6768.917385,40344.782985,1000,22,24.976,32
