# Working with ONNX on Databricks

Get Started: https://onnx.ai/get-started.html

Example Notebook: http://onnx.ai/sklearn-onnx/auto_tutorial/plot_gbegin_dataframe.html#sphx-glr-download-auto-tutorial-plot-gbegin-dataframe-py

Convert your scikit-learn model to ONNX: http://onnx.ai/sklearn-onnx/?msclkid=7fbc8e9dc0a811ec915bce257aa9bb30

In [0]:
# Install the packages this notebook relies on. skl2onnx, onnxruntime, mlprodict,
# mlinsights and pyquickhelper are imported directly by the cells below;
# onnxmltools and docutils are not imported here and are presumably needed
# indirectly by those libraries.
!pip install skl2onnx
!pip install onnxmltools
!pip install onnxruntime
!pip install mlprodict
!pip install mlinsights
!pip install pyquickhelper
!pip install docutils

# Dataframe as an input

<!-- index: dataframe -->

A pipeline usually ingests data as a matrix. A dataset can be converted into a
matrix if all of its columns share the same type, but data held in a dataframe
usually has multiple types: float, integer, or string for categories.
ONNX also supports that case.

## A dataset with categories

In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [0]:
from mlinsights.plotting import pipeline2dot
import numpy
import pprint
from mlprodict.onnx_conv import guess_schema_from_data
from onnxruntime import InferenceSession
from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from mlprodict.onnxrt import OnnxInference
from mlprodict.onnx_conv import to_onnx as to_onnx_ext
from skl2onnx import to_onnx
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier


# Toy dataset: two categorical features, two numeric features and a binary target `y`.
data = DataFrame({
    'CAT1': ['a', 'b', 'a', 'a', 'a', 'a'],
    'CAT2': ['c', 'd', 'd', 'd', 'c', 'c'],
    'num1': [0.5, 0.4, 0.5, 0.55, 0.35, 0.5],
    'num2': [0.6, 0.8, 0.56, 0.56, 0.86, 0.68],
    'y': [0, 1, 0, 1, 0, 1],
})

# Feature frame (everything except the target) and the columns to one-hot encode.
train_data = data.drop(columns='y')
cat_cols = ['CAT1', 'CAT2']

# One-hot encode the categorical columns; every other column passes through unchanged.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` — confirm the
# installed version still accepts this keyword.
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))]
)
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, cat_cols)],
    remainder='passthrough',
)

# Full model: preprocessing followed by a random forest with default hyperparameters.
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier()),
])
pipe.fit(train_data, data['y'])

## Track with MLflow

In [0]:
import mlflow
import numpy as np
from sklearn.ensemble import RandomForestClassifier

experiment_name = '/churn-prediction'
run_name = 'RandomForest-model'

# set_experiment() activates the experiment and creates it first when no
# experiment with that name exists, so no separate existence check is needed.
mlflow.set_experiment(experiment_name)

# With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
mlflow.sklearn.autolog()

# Name the run explicitly so it can be identified in the MLflow tracking UI.
with mlflow.start_run(run_name=run_name):
    # Same preprocessing + random forest pipeline as above, fitted inside the
    # active run so that autologging captures it.
    categorical_transformer = Pipeline([
        ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, cat_cols)],
        remainder='passthrough')
    pipe_mlflow = Pipeline([('preprocess', preprocessor),
                            ('rf', RandomForestClassifier())])
    pipe_mlflow.fit(train_data, data['y'])

## Conversion to ONNX

In [0]:
# Convert the MLflow-tracked pipeline to ONNX. The first training row is passed
# as sample input (presumably so the converter can infer the input schema), and
# the ZipMap output is switched off for the random forest.
sample_input = train_data.iloc[:1]
conversion_options = {RandomForestClassifier: {'zipmap': False}}
onx = to_onnx_ext(pipe_mlflow, sample_input, options=conversion_options)

## Prediction with ONNX

In [0]:
# Run the ONNX model on the training frame and print its labels next to the
# labels predicted by the scikit-learn pipeline.
oinf = OnnxInference(onx)
got = oinf.run(train_data)

print('Predictions \n')
sklearn_labels = pipe_mlflow.predict(train_data)
print('Original Model:', sklearn_labels)
onnx_labels = got['label']
print('Onnx Model:', onnx_labels)

And probabilities.

In [0]:
# Print the class probabilities from the scikit-learn pipeline and from the
# ONNX run performed in the previous cell.
print('Predictions \n')
sklearn_proba = pipe_mlflow.predict_proba(train_data)
print('Original Model:', sklearn_proba)
print('\n')
onnx_proba = got['probabilities']
print('Onnx Model:', onnx_proba)

## Persist the model to ADLS (using MLflow)

Remove the folder if it already exists.

In [0]:
import os
import os.path
import re
import shutil

onnx_model_path = "/dbfs/mnt/documents/models/onnx-churn-model"

# Delete every file below the model folder before removing the tree itself.
# NOTE(review): shutil.rmtree normally removes files as well; this explicit pass
# is presumably a workaround for the mounted storage — confirm before dropping it.
for root, _dirs, files in os.walk(onnx_model_path):
    for filename in files:
        os.remove(os.path.join(root, filename))

# Only a missing folder is expected here; any other failure (e.g. a permission
# error) should surface instead of being reported as "not found".
try:
    shutil.rmtree(onnx_model_path)
except FileNotFoundError:
    print('Folder not found')

# Recreate an empty model folder, also creating missing parent folders.
os.makedirs(onnx_model_path, exist_ok=True)

### Persist the model

In [0]:
# Write the serialized ONNX model into the model folder as a binary file.
onnx_file = f"{onnx_model_path}/onnx-churn-model.onnx"
with open(onnx_file, "wb") as model_file:
    serialized_model = onx.SerializeToString()
    model_file.write(serialized_model)

## Load the model and make the predictions

In [0]:
# Load the persisted ONNX file, run it on the training frame and print its
# labels next to the labels predicted by the scikit-learn pipeline.
saved_model_file = f"{onnx_model_path}/onnx-churn-model.onnx"
oinf = OnnxInference(saved_model_file)
got = oinf.run(train_data)

print('Predictions \n')
sklearn_labels = pipe_mlflow.predict(train_data)
print('Original Model:', sklearn_labels)
loaded_labels = got['label']
print('Loaded Onnx Model:', loaded_labels)

We can also deploy this model to an AKS cluster on Azure Machine Learning: https://docs.microsoft.com/en-us/azure/machine-learning/concept-onnx