# Dataset

In [None]:
from docs_jpmml_org import fetch_bank_marketing

# Fetch the bank marketing dataset object provided by docs_jpmml_org.
dataset = fetch_bank_marketing()

# Features and target, as returned by the dataset object.
X, y = dataset.getData()

# Column selectors grouped by kind; the workflow cells below feed each
# group to a different preprocessing step.
categorical_cols, continuous_cols = (
    dataset.getCategoricalColumns(),
    dataset.getContinuousColumns(),
)

# Workflow A: ordinal encoding

Recommended for NumPy data.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier

# Continuous columns are emitted first (unchanged), followed by the
# ordinal-encoded categorical columns. The order matters because
# `feature_types` below is positional.
transformer = ColumnTransformer(
    transformers = [
        ("continuous", "passthrough", continuous_cols),
        ("categorical", OrdinalEncoder(), categorical_cols),
    ]
)

# One XGBoost feature type code per transformer output column:
# "q" for each continuous column, then "c" for each categorical column.
feature_types = ["q" for _ in continuous_cols] + ["c" for _ in categorical_cols]

# Encode the target as integer class indices and flatten to 1-D.
le = LabelEncoder()
y_xgb = le.fit_transform(y)
y_xgb = y_xgb.reshape(-1)

classifier = XGBClassifier(random_state = 42, feature_types = feature_types)
# Keep the fitted label encoder on the classifier.
# NOTE(review): presumably read by the PMML converter to restore the original class labels - confirm.
classifier._le = le

pipeline = Pipeline(
    steps = [
        ("transformer", transformer),
        ("classifier", classifier),
    ]
)
pipeline.fit(X, y_xgb)

# Workflow B: casting

Recommended for Pandas data.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn2pmml.preprocessing import CastTransformer
from xgboost import XGBClassifier

import numpy

# All continuous columns are cast to float32 in a single step; every
# categorical column gets its own step (named after the column) that
# casts it to the "category" dtype.
transformer = ColumnTransformer([
    ("continuous", CastTransformer(dtype = numpy.float32), continuous_cols),
    *(
        (col, CastTransformer(dtype = "category"), [col])
        for col in categorical_cols
    ),
])
# Emit a pandas DataFrame rather than an array from the transformer.
transformer.set_output(transform = "pandas")

# Encode the target as integer class indices and flatten to 1-D.
le = LabelEncoder()
y_xgb = le.fit_transform(y)
y_xgb = y_xgb.reshape(-1)

classifier = XGBClassifier(random_state = 42, enable_categorical = True)
# Keep the fitted label encoder on the classifier.
# NOTE(review): presumably read by the PMML converter to restore the original class labels - confirm.
classifier._le = le

pipeline = Pipeline(
    steps = [
        ("transformer", transformer),
        ("classifier", classifier),
    ]
)
pipeline.fit(X, y_xgb)

# Export to PMML

In [None]:
from sklearn2pmml import sklearn2pmml

# Export the fitted pipeline to the file "XGBoostPipeline.pmml" (relative path).
# `pipeline` is whichever workflow cell (A or B) was executed last, since both
# cells assign that name.
sklearn2pmml(pipeline, "XGBoostPipeline.pmml")