In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import timeit

#models
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#datasets
from sklearn.datasets import load_boston, load_digits, load_wine, load_breast_cancer, load_iris

#pmml libs
from sklearn2pmml import PMMLPipeline, sklearn2pmml
from pypmml import Model
from sklearn_pmml_model.linear_model import PMMLLogisticRegression, PMMLRidgeClassifier
from sklearn_pmml_model.naive_bayes import PMMLGaussianNB
from sklearn_pmml_model.tree import PMMLTreeClassifier
from sklearn_pmml_model.ensemble import PMMLForestClassifier, PMMLGradientBoostingClassifier

In [21]:
# scikit-learn estimators to benchmark. The keys are reused as the second
# level of the result-frame columns and as lookup keys into `pmmlModels`.
models = {
    'Linear model': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'Decision tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient boosting': GradientBoostingClassifier()
}

# sklearn-pmml-model counterparts, keyed identically to `models`. These are
# classes, not instances: the benchmark instantiates them from a PMML file.
pmmlModels = {
    'Linear model': PMMLLogisticRegression,
    'Naive Bayes': PMMLGaussianNB,
    'Decision tree': PMMLTreeClassifier,
    'Random Forest': PMMLForestClassifier,
    'Gradient boosting': PMMLGradientBoostingClassifier
}

# Datasets loaded as pandas objects (`as_frame=True`). Iris and Digits are
# currently disabled.
datasets = {
#    'Iris': load_iris(as_frame=True),
#    'Digits': load_digits(as_frame=True),
    'Wine': load_wine(as_frame=True),
    'Breast cancer': load_breast_cancer(as_frame=True)
}

# Row labels of the result frames: one row per PMML library under test.
algorithms = ['PyPMML', 'sklearn-pmml-model']

# One column per (dataset, model) pair. The index is built once from explicit
# key lists and shared by both frames. `dtype=float` keeps the recorded
# timings numeric instead of falling back to object-dtype columns.
result_columns = pd.MultiIndex.from_product([list(datasets), list(models)])
load_times = pd.DataFrame(index=algorithms, columns=result_columns, dtype=float)
pred_times = pd.DataFrame(index=algorithms, columns=result_columns, dtype=float)

In [22]:
repeat = 100  # Note: this will take a long time
pmml_path = "test.pmml"  # scratch file, overwritten for every dataset/model pair


def _mean_runtime(func):
    """Return the mean time in seconds of `repeat` individually timed calls to `func`."""
    return np.mean(timeit.repeat(func, number=1, repeat=repeat))


for dataset_name, dataset in datasets.items():
    # Standardise the features, then restore the column names so the exported
    # PMML and both PMML libraries see named fields.
    X = StandardScaler().fit_transform(dataset.data)
    X = pd.DataFrame(data=X, columns=dataset.data.columns)
    y = dataset.target

    for model_name, model in models.items():
        print(f"{dataset_name} - {model_name}")

        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases - confirm against the version this notebook is pinned to.
        if isinstance(model, LogisticRegression):
            model.multi_class = 'ovr' if len(np.unique(y)) == 2 else 'multinomial'

        # Create PMML
        pipeline = PMMLPipeline([
          ("classifier", model)
        ])
        pipeline.fit(X, y)
        sklearn2pmml(pipeline, pmml_path, with_repr=True)

        column = (dataset_name, model_name)
        pmml_cls = pmmlModels[model_name]  # hoisted so the dict lookup is not timed

        # Record load times.
        # A single `.loc[row, column]` assignment writes into the frame itself.
        # Chained indexing (`frame[column][row] = value`) assigns into an
        # intermediate Series and is silently lost under pandas copy-on-write.
        pypmml_time_load = _mean_runtime(lambda: Model.load(pmml_path))
        spm_time_load = _mean_runtime(lambda: pmml_cls(pmml=pmml_path))
        load_times.loc[algorithms[0], column] = pypmml_time_load
        load_times.loc[algorithms[1], column] = spm_time_load

        pypmml = Model.load(pmml_path)
        spm = pmml_cls(pmml=pmml_path)

        # Record predict times (each timed call scores the full dataset once).
        pypmml_time_predict = _mean_runtime(lambda: pypmml.predict(X))
        spm_time_predict = _mean_runtime(lambda: spm.predict_proba(X))
        pred_times.loc[algorithms[0], column] = pypmml_time_predict
        pred_times.loc[algorithms[1], column] = spm_time_predict

Wine - Linear model
Wine - Naive Bayes
Wine - Decision tree
Wine - Random Forest
Wine - Gradient boosting
Breast cancer - Linear model
Breast cancer - Naive Bayes
Breast cancer - Decision tree
Breast cancer - Random Forest
Breast cancer - Gradient boosting


In [24]:
# Mean time in seconds to load the exported PMML file, per library (rows)
# and per dataset/model pair (columns).
load_times

Unnamed: 0_level_0,Wine,Wine,Wine,Wine,Wine,Breast cancer,Breast cancer,Breast cancer,Breast cancer,Breast cancer
Unnamed: 0_level_1,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting
PyPMML,0.002,0.002453,0.002322,0.042778,0.046875,0.00208,0.003304,0.001954,0.067751,0.018387
sklearn-pmml-model,0.001441,0.00161,0.001309,0.097291,0.120141,0.002286,0.003365,0.001482,0.133827,0.042554


In [25]:
# Mean time in seconds to score the full dataset once, per library (rows)
# and per dataset/model pair (columns).
pred_times

Unnamed: 0_level_0,Wine,Wine,Wine,Wine,Wine,Breast cancer,Breast cancer,Breast cancer,Breast cancer,Breast cancer
Unnamed: 0_level_1,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting
PyPMML,0.771291,0.771387,0.775104,0.852426,0.85548,3.847776,3.875145,3.834276,4.09583,4.119273
sklearn-pmml-model,0.004373,0.004747,0.001384,0.011591,0.001682,0.013437,0.007913,0.001325,0.012407,0.001462


In [26]:
# Combined cost of loading the model once and scoring the full dataset once
# (element-wise sum of the two result frames).
load_times + pred_times

Unnamed: 0_level_0,Wine,Wine,Wine,Wine,Wine,Breast cancer,Breast cancer,Breast cancer,Breast cancer,Breast cancer
Unnamed: 0_level_1,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting,Linear model,Naive Bayes,Decision tree,Random Forest,Gradient boosting
PyPMML,0.773291,0.77384,0.777425,0.895204,0.902355,3.849855,3.878448,3.83623,4.16358,4.13766
sklearn-pmml-model,0.005813,0.006357,0.002693,0.108882,0.121823,0.015723,0.011278,0.002807,0.146234,0.044016
