In [None]:
# Optional setup: uncomment the last line of this cell if your environment does
# not have all runtime dependencies installed.
# skl2onnx must be installed to support scikit-learn pipelines.
#
# `pip.main` is no longer part of pip's public API (removed in pip 10), so the
# install goes through the `%pip` magic, which installs into the environment
# of the running kernel.

# %pip install skl2onnx pydot onnxruntime

In [None]:
import onnx
import ebm2onnx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from interpret import glassbox
from sklearn import compose, impute, pipeline, preprocessing

from onnx.tools.net_drawer import GetOpNodeProducer, GetPydotGraph
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
)

from skl2onnx.common.data_types import Int64TensorType, FloatTensorType, StringTensorType
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator

import ebm2onnx
import ebm2onnx.operators as ops

import onnxruntime as rt

# Train a model with feature engineering

We use a scikit-learn pipeline for the feature engineering part. The whole pipeline is serialized in the final ONNX graph. So, the ONNX graph contains both the model and the feature-engineering transforms.

We use a fake dataset here as an example.

In [None]:
# Names of the model input columns: feature_a through feature_g.
features = [f"feature_{suffix}" for suffix in "abcdefg"]

# Tiny made-up training set: four rows, the seven input columns above and a
# binary "target" column.
df_train = pd.DataFrame(
    dict(
        feature_a=[0, 0.5, 2, 5],
        feature_b=[0, 0.5, 2, 5],
        feature_c=[0, 0.5, 2, 5],
        feature_d=[0, 0.5, 2, 5],
        feature_e=[0, 1, 0, 1],
        feature_f=[1, 0, 1, 0],
        feature_g=["a", "b", "can_not_determine", "can_not_determine"],
        target=[1, 1, 0, 0],
    )
)

In [None]:
def _impute_then_scale(strategy):
    """Return a Pipeline: SimpleImputer with the given strategy, then StandardScaler."""
    return pipeline.Pipeline(
        steps=[
            ("imputer", impute.SimpleImputer(strategy=strategy)),
            ("scaler", preprocessing.StandardScaler()),
        ]
    )


# The two numeric branches differ only in the imputation statistic.
numeric_mean_transformer = _impute_then_scale("mean")
numeric_median_transformer = _impute_then_scale("median")

# One-hot encoding for the two 0/1 columns and the string column. `drop` names,
# column by column, the category value that is dropped when encoding.
categorical_transformer = pipeline.Pipeline(
    steps=[
        (
            "onehot",
            preprocessing.OneHotEncoder(
                drop=[0, 0, "can_not_determine"],
                handle_unknown="ignore",
                sparse_output=True,
            ),
        )
    ]
)

# Route each group of columns to its own transformer.
preprocessor = compose.ColumnTransformer(
    transformers=[
        ("num_mean", numeric_mean_transformer, ["feature_a", "feature_b"]),
        ("num_median", numeric_median_transformer, ["feature_c", "feature_d"]),
        ("cat", categorical_transformer, ["feature_e", "feature_f", "feature_g"]),
    ]
)

# Full pipeline: feature engineering followed by the EBM classifier.
my_pipeline = pipeline.Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "model",
            glassbox.ExplainableBoostingClassifier(
                random_state=42,
                interactions=0,
                max_bins=8,
                max_leaves=2,
                min_samples_leaf=2,
                learning_rate=0.5,
                validation_size=0.5,
                early_stopping_rounds=5,
            ),
        ),
    ]
)



In [None]:
# Fit the whole pipeline (preprocessing steps and model) on the example data.
my_pipeline.fit(X=df_train.loc[:, features], y=df_train["target"])

# Convert the pipeline to ONNX

We register a converter for the EBM classifier with skl2onnx. This allows the model to be serialized as part of the whole pipeline conversion to ONNX.

In [None]:
# Tell skl2onnx which shape calculator and converter to use for the EBM
# classifier, and which values its "nocl" and "zipmap" options accept.
update_registered_converter(
    glassbox.ExplainableBoostingClassifier,
    "ExplainableBoostingClassifier",
    ebm2onnx.sklearn.ebm_output_shape_calculator,
    ebm2onnx.sklearn.convert_ebm_classifier,
    options={
        "nocl": [True, False],
        "zipmap": [True, False, "columns"],
    },
)

# Convert the fitted pipeline. Every feature is declared as its own
# [None, 1] input: float for a-d, int64 for e-f, string for g.
model_onnx = convert_sklearn(
    my_pipeline,
    name="pipeline_ebm",
    initial_types=[
        *(
            (column, FloatTensorType([None, 1]))
            for column in ("feature_a", "feature_b", "feature_c", "feature_d")
        ),
        *(
            (column, Int64TensorType([None, 1]))
            for column in ("feature_e", "feature_f")
        ),
        ("feature_g", StringTensorType([None, 1])),
    ],
    target_opset={"": 21, "ai.onnx.ml": 3},
    options={id(my_pipeline): {"zipmap": False}},
)

In [None]:
# Build a pydot graph of the converted model and write it to model.png.
pydot_graph = GetPydotGraph(
    model_onnx.graph,
    name=model_onnx.graph.name,
    rankdir="TB",
    node_producer=GetOpNodeProducer("docstring"),
)
pydot_graph.write_png("model.png")

# Read the PNG back and show it in a large, high-resolution figure.
plt.figure(figsize=(80, 30), dpi=300)
plt.imshow(plt.imread("model.png"), interpolation="nearest")

# Execute the pipeline with ONNX-Runtime

In [None]:
# Write the serialized ONNX model to model.onnx.
with open("model.onnx", mode="wb") as onnx_file:
    onnx_file.write(model_onnx.SerializeToString())

In [None]:
# Load the saved model into an ONNX Runtime session on the CPU provider.
sess = rt.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

# Feed each feature as its own column vector. The four numeric features are
# cast to float32; the remaining three are passed with their pandas dtype.
pred_onx = sess.run(
    None,
    {
        **{
            column: df_train[column].values.reshape([-1, 1]).astype(np.float32)
            for column in ("feature_a", "feature_b", "feature_c", "feature_d")
        },
        **{
            column: df_train[column].values.reshape([-1, 1])
            for column in ("feature_e", "feature_f", "feature_g")
        },
    },
)

outputs = sess.get_outputs()

# Print every output of the graph next to its name.
for output_info, output_value in zip(outputs, pred_onx):
    print(f"{output_info.name}: {output_value}")