Skip to content

Commit

Permalink
feat: validate input array shape, and attempt to subscript columns for data frames
Browse files Browse the repository at this point in the history

This commit comes with some internal refactoring to show more explicitly which estimators use
one-hot-encoding and which use integer encoding for categorical variable support.
  • Loading branch information
iamDecode committed Jul 9, 2021
1 parent 4b6e11c commit 2555898
Show file tree
Hide file tree
Showing 11 changed files with 299 additions and 519 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -30,7 +30,7 @@ This library is in beta, and currently not all models are supported. The library
| [Ridge](sklearn_pmml_model/linear_model) | ✅<sup>2</sup> || ✅<sup>3</sup> |
| [Lasso](sklearn_pmml_model/linear_model) | ✅<sup>2</sup> || ✅<sup>3</sup> |
| [ElasticNet](sklearn_pmml_model/linear_model) | ✅<sup>2</sup> |||
| [Gaussian Naive Bayes](sklearn_pmml_model/naive_bayes) || | |
| [Gaussian Naive Bayes](sklearn_pmml_model/naive_bayes) || | ✅<sup>3</sup> |

<sup>1</sup> Categorical feature support using slightly modified internals, based on [scikit-learn#12866](https://github.com/scikit-learn/scikit-learn/pull/12866).

Expand Down
455 changes: 85 additions & 370 deletions models/tree-iris.pmml

Large diffs are not rendered by default.

78 changes: 71 additions & 7 deletions sklearn_pmml_model/base.py
@@ -1,11 +1,13 @@
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xml.etree import cElementTree as eTree
from cached_property import cached_property
from sklearn_pmml_model.datatypes import Category
from collections import OrderedDict
import datetime
import numpy as np
import pandas as pd


class PMMLBaseEstimator(BaseEstimator):
Expand Down Expand Up @@ -137,13 +139,18 @@ def fit(self, x, y):
raise Exception('Not supported.')

def _prepare_data(self, X):
X = np.asarray(X)
pmml_features = [f for f,e in self.fields.items() if e is not self.target_field and e.tag == 'DataField']

for column, (index, field_type) in self.field_mapping.items():
if type(field_type) is Category and index is not None and type(X[0,index]) is str:
categories = [str(v) for v in field_type.categories]
categories += [c for c in np.unique(X[:,index]) if c not in categories]
X[:,index] = [categories.index(x) for x in X[:,index]]
if isinstance(X, pd.DataFrame):
X.columns = X.columns.map(str)

try:
X = X[pmml_features]
except KeyError:
raise Exception('The features in the input data do not match features expected by the PMML model.')
elif X.shape[1] != len(pmml_features):
raise Exception('The number of features in provided data does not match expected number of features in the PMML. '
'Provide pandas.Dataframe, or provide data matching the DataFields in the PMML document.')

return X

Expand Down Expand Up @@ -258,3 +265,60 @@ def findall(element, path):
if element is None:
return []
return element.findall(path)


class OneHotEncodingMixin:
    """
    Mixin class to automatically one-hot encode categorical variables.

    Builds a pre-fitted :class:`~sklearn.compose.ColumnTransformer` from the
    PMML DataFields at construction time, then applies it in
    ``_prepare_data`` before the estimator consumes the data.
    """
    def __init__(self):
        # Setup a column transformer to encode categorical variables
        target = self.target_field.get('name')
        # All declared fields except the prediction target.
        fields = [field for name, field in self.fields.items() if name != target]

        def encoder_for(field):
            # Non-categorical fields are forwarded unchanged.
            if field.get('optype') != 'categorical':
                return 'passthrough'

            # NOTE(review): the encoder is made to look "fitted" by assigning
            # sklearn-private attributes (categories_, drop_idx_, _legacy_mode)
            # directly — this is tied to a specific sklearn version; verify on
            # upgrade.
            encoder = OneHotEncoder()
            encoder.categories_ = np.array([self.field_mapping[field.get('name')][1].categories])
            encoder.drop_idx_ = np.array([None for x in encoder.categories_])
            encoder._legacy_mode = False
            return encoder

        # One transformer per DataField, addressed by its column index taken
        # from field_mapping.
        transformer = ColumnTransformer(
            transformers=[
                (field.get('name'), encoder_for(field), [self.field_mapping[field.get('name')][0]])
                for field in fields
                if field.tag == 'DataField'
            ]
        )

        # Dummy single-row input used only to drive sklearn's internal
        # validation of the transformer configuration; no fitting happens.
        X = np.array([[0 for field in fields if field.tag == "DataField"]])
        transformer._validate_transformers()
        transformer._validate_column_callables(X)
        transformer._validate_remainder(X)
        # Mark the transformer as fitted by filling the post-fit attributes
        # sklearn normally sets in fit().
        transformer.transformers_ = transformer.transformers
        transformer.sparse_output_ = False
        transformer._feature_names_in = None

        self.transformer = transformer

    def _prepare_data(self, X):
        """One-hot encode categorical columns after the base preparation."""
        X = super()._prepare_data(X)
        return self.transformer.transform(X)


class IntegerEncodingMixin:
    """
    Mixin class to automatically integer (ordinal) encode categorical
    variables.

    String-valued categorical columns are replaced by the index of each value
    in the PMML-declared category list; values not declared in the PMML are
    appended after the declared ones, in sorted order of first appearance
    (as produced by ``np.unique``).
    """
    def _prepare_data(self, X):
        """Integer-encode categorical string columns after base preparation."""
        X = super()._prepare_data(X)
        X = np.asarray(X)

        for column, (index, field_type) in self.field_mapping.items():
            # Only encode categorical fields that map to a real column and
            # actually hold strings (numeric input is assumed pre-encoded).
            if type(field_type) is Category and index is not None and type(X[0, index]) is str:
                categories = [str(v) for v in field_type.categories]
                # Append unseen values; use a set for O(1) membership tests
                # instead of scanning the list for every unique value.
                known = set(categories)
                for value in np.unique(X[:, index]):
                    if value not in known:
                        known.add(value)
                        categories.append(value)
                # Map each category to its FIRST position, matching the
                # semantics of list.index, but with O(1) lookups instead of
                # an O(n) scan per element.
                code_of = {}
                for code, category in enumerate(categories):
                    code_of.setdefault(category, code)
                X[:, index] = [code_of[value] for value in X[:, index]]

        return X
6 changes: 3 additions & 3 deletions sklearn_pmml_model/ensemble/forest.py
@@ -1,11 +1,11 @@
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn_pmml_model.base import PMMLBaseClassifier, PMMLBaseRegressor
from sklearn_pmml_model.base import PMMLBaseClassifier, PMMLBaseRegressor, IntegerEncodingMixin
from sklearn_pmml_model.tree import get_tree


class PMMLForestClassifier(PMMLBaseClassifier, RandomForestClassifier):
class PMMLForestClassifier(IntegerEncodingMixin, PMMLBaseClassifier, RandomForestClassifier):
"""
A random forest classifier.
Expand Down Expand Up @@ -92,7 +92,7 @@ def _more_tags(self):
return RandomForestClassifier._more_tags(self)


class PMMLForestRegressor(PMMLBaseRegressor, RandomForestRegressor):
class PMMLForestRegressor(IntegerEncodingMixin, PMMLBaseRegressor, RandomForestRegressor):
"""
A random forest regressor.
Expand Down
6 changes: 3 additions & 3 deletions sklearn_pmml_model/ensemble/gb.py
Expand Up @@ -3,13 +3,13 @@
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, _gb_losses
from sklearn_pmml_model.base import PMMLBaseClassifier, PMMLBaseRegressor
from sklearn_pmml_model.base import PMMLBaseClassifier, PMMLBaseRegressor, IntegerEncodingMixin
from sklearn_pmml_model.tree import get_tree
from scipy.special import expit
from ._gradient_boosting import predict_stages


class PMMLGradientBoostingClassifier(PMMLBaseClassifier, GradientBoostingClassifier, ABC):
class PMMLGradientBoostingClassifier(IntegerEncodingMixin, PMMLBaseClassifier, GradientBoostingClassifier, ABC):
"""
Gradient Boosting for classification.
Expand Down Expand Up @@ -135,7 +135,7 @@ def _more_tags(self):
return GradientBoostingClassifier._more_tags(self)


class PMMLGradientBoostingRegressor(PMMLBaseRegressor, GradientBoostingRegressor, ABC):
class PMMLGradientBoostingRegressor(IntegerEncodingMixin, PMMLBaseRegressor, GradientBoostingRegressor, ABC):
"""
Gradient Boosting for regression.
Expand Down
110 changes: 7 additions & 103 deletions sklearn_pmml_model/linear_model/base.py
@@ -1,107 +1,9 @@
from sklearn_pmml_model.base import PMMLBaseRegressor, PMMLBaseClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn_pmml_model.base import PMMLBaseRegressor, PMMLBaseClassifier, OneHotEncodingMixin
import numpy as np
from itertools import chain


class PMMLLinearModel(PMMLBaseRegressor):
    """
    Abstract class for linear models.

    Constructs a pre-fitted :class:`~sklearn.compose.ColumnTransformer` that
    one-hot encodes categorical PMML DataFields, applied in
    ``_prepare_data``.
    """
    def __init__(self, pmml):
        PMMLBaseRegressor.__init__(self, pmml)

        # Setup a column transformer to deal with categorical variables
        target = self.target_field.get('name')
        # All declared fields except the prediction target.
        fields = [field for name, field in self.fields.items() if name != target]

        def encoder_for(field):
            # Non-categorical fields are forwarded unchanged.
            if field.get('optype') != 'categorical':
                return 'passthrough'

            # NOTE(review): makes the encoder look "fitted" by assigning
            # sklearn-private attributes — version-sensitive; verify on
            # sklearn upgrade.
            encoder = OneHotEncoder()
            encoder.categories_ = np.array([self.field_mapping[field.get('name')][1].categories])
            encoder.drop_idx_ = np.array([None for x in encoder.categories_])
            encoder._legacy_mode = False
            return encoder

        # One transformer per DataField, addressed by its column index from
        # field_mapping.
        transformer = ColumnTransformer(
            transformers=[
                (field.get('name'), encoder_for(field), [self.field_mapping[field.get('name')][0]])
                for field in fields
                if field.tag == 'DataField'
            ]
        )

        # Dummy single-row input used only to drive sklearn's internal
        # configuration validation; no fitting happens.
        X = np.array([[0 for field in fields if field.tag == "DataField"]])
        transformer._validate_transformers()
        transformer._validate_column_callables(X)
        transformer._validate_remainder(X)
        # Fill the post-fit attributes sklearn normally sets in fit().
        transformer.transformers_ = transformer.transformers
        transformer.sparse_output_ = False
        transformer._feature_names_in = None

        self.transformer = transformer

    def _prepare_data(self, X):
        """
        Overrides the default data preparation operation by one-hot encoding
        categorical variables.
        """
        return self.transformer.transform(X)


class PMMLLinearClassifier(PMMLBaseClassifier):
    """
    Abstract class for linear models.

    Classifier counterpart of ``PMMLLinearModel``: builds the same pre-fitted
    :class:`~sklearn.compose.ColumnTransformer` for one-hot encoding
    categorical PMML DataFields, applied in ``_prepare_data``.
    """
    def __init__(self, pmml):
        PMMLBaseClassifier.__init__(self, pmml)

        # Setup a column transformer to deal with categorical variables
        target = self.target_field.get('name')
        # All declared fields except the prediction target.
        fields = [field for name, field in self.fields.items() if name != target]

        def encoder_for(field):
            # Non-categorical fields are forwarded unchanged.
            if field.get('optype') != 'categorical':
                return 'passthrough'

            # NOTE(review): makes the encoder look "fitted" by assigning
            # sklearn-private attributes — version-sensitive; verify on
            # sklearn upgrade.
            encoder = OneHotEncoder()
            encoder.categories_ = np.array([self.field_mapping[field.get('name')][1].categories])
            encoder.drop_idx_ = np.array([None for x in encoder.categories_])
            encoder._legacy_mode = False
            return encoder

        # One transformer per DataField, addressed by its column index from
        # field_mapping.
        transformer = ColumnTransformer(
            transformers=[
                (field.get('name'), encoder_for(field), [self.field_mapping[field.get('name')][0]])
                for field in fields
                if field.tag == 'DataField'
            ]
        )

        # Dummy single-row input used only to drive sklearn's internal
        # configuration validation; no fitting happens.
        X = np.array([[0 for field in fields if field.tag == "DataField"]])
        transformer._validate_transformers()
        transformer._validate_column_callables(X)
        transformer._validate_remainder(X)
        # Fill the post-fit attributes sklearn normally sets in fit().
        transformer.transformers_ = transformer.transformers
        transformer.sparse_output_ = False
        transformer._feature_names_in = None

        self.transformer = transformer

    def _prepare_data(self, X):
        """
        Overrides the default data preparation operation by one-hot encoding
        categorical variables.
        """
        return self.transformer.transform(X)


class PMMLGeneralizedLinearRegressor(PMMLLinearModel):
class PMMLGeneralizedLinearRegressor(OneHotEncodingMixin, PMMLBaseRegressor):
"""
Abstract class for Generalized Linear Models (GLMs).
Expand All @@ -122,7 +24,8 @@ class PMMLGeneralizedLinearRegressor(PMMLLinearModel):
"""
def __init__(self, pmml):
PMMLLinearModel.__init__(self, pmml)
PMMLBaseRegressor.__init__(self, pmml)
OneHotEncodingMixin.__init__(self)

# Import coefficients and intercepts
model = self.root.find('GeneralRegressionModel')
Expand All @@ -134,7 +37,7 @@ def __init__(self, pmml):
self.intercept_ = _get_intercept(model)


class PMMLGeneralizedLinearClassifier(PMMLLinearClassifier):
class PMMLGeneralizedLinearClassifier(OneHotEncodingMixin, PMMLBaseClassifier):
"""
Abstract class for Generalized Linear Models (GLMs).
Expand All @@ -155,7 +58,8 @@ class PMMLGeneralizedLinearClassifier(PMMLLinearClassifier):
"""
def __init__(self, pmml):
PMMLLinearClassifier.__init__(self, pmml)
PMMLBaseClassifier.__init__(self, pmml)
OneHotEncodingMixin.__init__(self)

# Import coefficients and intercepts
model = self.root.find('GeneralRegressionModel')
Expand Down
18 changes: 10 additions & 8 deletions sklearn_pmml_model/linear_model/implementations.py
@@ -1,11 +1,11 @@
from sklearn.linear_model import LinearRegression, Ridge, RidgeClassifier, Lasso, ElasticNet, LogisticRegression
from sklearn_pmml_model.linear_model.base import PMMLLinearModel, PMMLLinearClassifier, PMMLGeneralizedLinearRegressor,\
PMMLGeneralizedLinearClassifier
from sklearn_pmml_model.base import PMMLBaseRegressor, PMMLBaseClassifier, OneHotEncodingMixin
from sklearn_pmml_model.linear_model.base import PMMLGeneralizedLinearRegressor, PMMLGeneralizedLinearClassifier
from itertools import chain
import numpy as np


class PMMLLinearRegression(PMMLLinearModel, LinearRegression):
class PMMLLinearRegression(OneHotEncodingMixin, PMMLBaseRegressor, LinearRegression):
"""
Ordinary least squares Linear Regression.
Expand All @@ -25,7 +25,8 @@ class PMMLLinearRegression(PMMLLinearModel, LinearRegression):
"""
def __init__(self, pmml):
PMMLLinearModel.__init__(self, pmml)
PMMLBaseRegressor.__init__(self, pmml)
OneHotEncodingMixin.__init__(self)

# Import coefficients and intercepts
model = self.root.find('RegressionModel')
Expand All @@ -51,13 +52,13 @@ def __init__(self, pmml):
self.intercept_ = self.intercept_[0]

def fit(self, x, y):
return PMMLLinearModel.fit(self, x, y)
return PMMLBaseRegressor.fit(self, x, y)

def _more_tags(self):
return LinearRegression._more_tags(self)


class PMMLLogisticRegression(PMMLLinearClassifier, LogisticRegression):
class PMMLLogisticRegression(OneHotEncodingMixin, PMMLBaseClassifier, LogisticRegression):
"""
Logistic Regression (aka logit, MaxEnt) classifier.
Expand All @@ -77,7 +78,8 @@ class PMMLLogisticRegression(PMMLLinearClassifier, LogisticRegression):
"""
def __init__(self, pmml):
PMMLLinearClassifier.__init__(self, pmml)
PMMLBaseClassifier.__init__(self, pmml)
OneHotEncodingMixin.__init__(self)

# Import coefficients and intercepts
model = self.root.find('RegressionModel')
Expand Down Expand Up @@ -111,7 +113,7 @@ def __init__(self, pmml):
self.solver = 'lbfgs'

def fit(self, x, y):
return PMMLLinearClassifier.fit(self, x, y)
return PMMLBaseClassifier.fit(self, x, y)

def _more_tags(self):
return LogisticRegression._more_tags(self)
Expand Down
5 changes: 3 additions & 2 deletions sklearn_pmml_model/naive_bayes/implementations.py
@@ -1,10 +1,10 @@
from sklearn_pmml_model.base import PMMLBaseClassifier
from sklearn_pmml_model.base import PMMLBaseClassifier, OneHotEncodingMixin
from sklearn.naive_bayes import GaussianNB
import numpy as np
from itertools import chain


class PMMLGaussianNB(PMMLBaseClassifier, GaussianNB):
class PMMLGaussianNB(OneHotEncodingMixin, PMMLBaseClassifier, GaussianNB):
"""
Gaussian Naive Bayes (GaussianNB)
Expand All @@ -26,6 +26,7 @@ class PMMLGaussianNB(PMMLBaseClassifier, GaussianNB):
"""
def __init__(self, pmml):
PMMLBaseClassifier.__init__(self, pmml)
OneHotEncodingMixin.__init__(self)

model = self.root.find('NaiveBayesModel')

Expand Down

0 comments on commit 2555898

Please sign in to comment.