In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import timeit

#models
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#datasets
from sklearn.datasets import load_digits, load_wine, load_breast_cancer, load_iris

#pmml libs
from sklearn2pmml import PMMLPipeline, sklearn2pmml
from pypmml import Model
from sklearn_pmml_model.linear_model import PMMLLogisticRegression, PMMLRidgeClassifier
from sklearn_pmml_model.naive_bayes import PMMLGaussianNB
from sklearn_pmml_model.tree import PMMLTreeClassifier
from sklearn_pmml_model.ensemble import PMMLForestClassifier, PMMLGradientBoostingClassifier

In [21]:
# Scikit-learn estimators to benchmark, keyed by the display name used as the
# second level of the result-table columns.
models = {
    'Linear model': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'Decision tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient boosting': GradientBoostingClassifier()
}
# sklearn-pmml-model loader classes, keyed identically to `models` so that each
# estimator can be paired with its PMML counterpart by name.
pmmlModels = {
    'Linear model': PMMLLogisticRegression,
    'Naive Bayes': PMMLGaussianNB,
    'Decision tree': PMMLTreeClassifier,
    'Random Forest': PMMLForestClassifier,
    'Gradient boosting': PMMLGradientBoostingClassifier
}
# Datasets to benchmark on, loaded with as_frame=True so `.data` is a DataFrame
# with named columns. Iris and Digits are currently disabled; uncomment to
# include them in the run.
datasets = {
#    'Iris': load_iris(as_frame=True),
#    'Digits': load_digits(as_frame=True),
    'Wine': load_wine(as_frame=True),
    'Breast cancer': load_breast_cancer(as_frame=True)
}
# Libraries under comparison; these are the row labels of the result tables.
algorithms = ['PyPMML', 'sklearn-pmml-model']

# One row per library, one column per (dataset, model) pair.
# dtype=float keeps the tables numeric (NaN until filled); without it an empty
# DataFrame defaults to object dtype and every derived result stays object too.
result_columns = pd.MultiIndex.from_product([datasets, models])
load_times = pd.DataFrame(index=algorithms, columns=result_columns, dtype=float)
pred_times = pd.DataFrame(index=algorithms, columns=result_columns, dtype=float)

In [22]:
repeat = 100  # Note: this will take a long time
pmml_path = "test.pmml"  # scratch file, overwritten for every (dataset, model) pair

for dataset_name, dataset in datasets.items():
    # fit_transform returns a plain array, so rebuild the DataFrame to keep the
    # original feature names on the standardised data.
    X = StandardScaler().fit_transform(dataset.data)
    X = pd.DataFrame(data=X, columns=dataset.data.columns)
    y = dataset.target

    for model_name, model in models.items():
        print(f"{dataset_name} - {model_name}")
        column = (dataset_name, model_name)

        # Logistic regression: one-vs-rest for binary targets, multinomial otherwise.
        if isinstance(model, LogisticRegression):
            model.multi_class = 'ovr' if len(np.unique(y)) == 2 else 'multinomial'

        # Create PMML: fit the estimator inside a PMMLPipeline and export it.
        pipeline = PMMLPipeline([
            ("classifier", model)
        ])
        pipeline.fit(X, y)
        sklearn2pmml(pipeline, pmml_path, with_repr=True)

        # Record load times (mean over `repeat` single-shot runs, in seconds).
        # The loader class is looked up outside the lambda so only the load
        # itself is timed.
        pmml_loader = pmmlModels[model_name]
        pypmml_time_load = np.mean(timeit.repeat(lambda: Model.load(pmml_path), number=1, repeat=repeat))
        spm_time_load = np.mean(timeit.repeat(lambda: pmml_loader(pmml=pmml_path), number=1, repeat=repeat))
        # Single .loc assignment instead of chained indexing (df[col][row] = v):
        # the chained form writes to an intermediate Series and is silently
        # dropped when pandas Copy-on-Write is active.
        load_times.loc[algorithms[0], column] = pypmml_time_load
        load_times.loc[algorithms[1], column] = spm_time_load

        pypmml = Model.load(pmml_path)
        spm = pmml_loader(pmml=pmml_path)

        # Record predict times on the full (standardised) dataset.
        # NOTE(review): PyPMML is timed on predict() and sklearn-pmml-model on
        # predict_proba(); confirm these produce comparable outputs.
        pypmml_time_predict = np.mean(timeit.repeat(lambda: pypmml.predict(X), number=1, repeat=repeat))
        spm_time_predict = np.mean(timeit.repeat(lambda: spm.predict_proba(X), number=1, repeat=repeat))
        pred_times.loc[algorithms[0], column] = pypmml_time_predict
        pred_times.loc[algorithms[1], column] = spm_time_predict

Wine - Linear model
Wine - Naive Bayes
Wine - Decision tree
Wine - Random Forest
Wine - Gradient boosting
Breast cancer - Linear model
Breast cancer - Naive Bayes
Breast cancer - Decision tree
Breast cancer - Random Forest
Breast cancer - Gradient boosting


In [24]:
# Mean time (seconds) to load each exported PMML model; rows are the libraries,
# columns are (dataset, model) pairs.
load_times

Unnamed: 0_level_0,Wine,Wine,Wine,Wine,Wine,Breast cancer,Breast cancer,Breast cancer,Breast cancer,Breast cancer
Unnamed: 0_level_1,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting
PyPMML,0.006819,0.001778,0.001593,0.019264,0.019817,0.000766,0.001296,0.000952,0.027526,0.007866
sklearn-pmml-model,0.002418,0.002494,0.000467,0.027426,0.031997,0.004775,0.005375,0.000717,0.040238,0.011672


In [25]:
# Mean time (seconds) to run prediction over the full dataset with each loaded
# model; rows are the libraries, columns are (dataset, model) pairs.
pred_times

Unnamed: 0_level_0,Wine,Wine,Wine,Wine,Wine,Breast cancer,Breast cancer,Breast cancer,Breast cancer,Breast cancer
Unnamed: 0_level_1,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting
PyPMML,0.006219,0.003896,0.003995,0.01347,0.014832,0.009072,0.010233,0.008416,0.031415,0.02333
sklearn-pmml-model,0.001622,0.001565,0.000497,0.002582,0.000952,0.005974,0.003106,0.000389,0.003783,0.001738


In [10]:
# Combined cost per library: mean load time plus mean predict time, element-wise.
load_times + pred_times

Unnamed: 0_level_0,Wine,Wine,Wine,Wine,Wine,Breast cancer,Breast cancer,Breast cancer,Breast cancer,Breast cancer
Unnamed: 0_level_1,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting
PyPMML,0.013038,0.005674,0.005587,0.032734,0.034649,0.009838,0.01153,0.009367,0.058941,0.031196
sklearn-pmml-model,0.00404,0.004059,0.000964,0.030008,0.032949,0.010749,0.008481,0.001106,0.044021,0.013411


In [12]:
# Total (load + predict) time per library. Cast to float so the ratio below is
# a numeric Series rather than object dtype.
table = (load_times + pred_times).astype(float)
# Ratio of PyPMML time to sklearn-pmml-model time per (dataset, model):
# values above 1 mean sklearn-pmml-model was faster, below 1 that PyPMML was.
table.loc['PyPMML', :] / table.loc['sklearn-pmml-model', :]


Wine           Linear model         3.226932
               Naive Bayes          1.398021
               Decision tree        5.796108
               Random Forest        1.090864
               Gradient boosting    1.051594
Breast cancer  Linear model         0.915217
               Naive Bayes          1.359418
               Decision tree        8.471196
               Random Forest        1.338924
               Gradient boosting    2.326224
dtype: object