Fix broken warning and reduce complexity of feature validation #35
This overhauls the feature validation to dramatically simplify
changes that were introduced by #22 and #34. I think the easiest way
to explain the changes in this commit is to explain how we ended up
with that complexity.

In #22, we needed to ensure that our estimators got feature names
when fitted with dataframe inputs. This was complicated by the fact
that they received transformed inputs, and our transformers cast
dataframes to arrays, dropping the names. We dealt with this using a
`NamedFeatureArray` class that would retain names after being cast.
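
For illustration, a name-retaining array can be as simple as an
ndarray subclass that carries a `columns` attribute (the attribute
that sklearn's feature name extraction looks for). This is only a
sketch of the idea, not the exact class from #22:

    import numpy as np

    class NamedFeatureArray(np.ndarray):
        """Sketch of a name-retaining array, not the actual #22 code."""

        def __new__(cls, array, columns=None):
            obj = np.asarray(array).view(cls)
            obj.columns = columns
            return obj

        def __array_finalize__(self, obj):
            # Carry the names along when numpy creates views of this array.
            self.columns = getattr(obj, "columns", None)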

In #34, we implemented the `set_output` API and used this instead
to ensure that our transformers returned dataframe outputs. At the
same time, we decided that estimator feature names should match the
original names, not the transformed names, and implemented the
`feature_names_in_` property.
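
For reference, the `set_output` behavior that #34 leaned on looks
like this in plain scikit-learn (the column names here are invented
for the example):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    X = pd.DataFrame({"elev": [100.0, 200.0], "precip": [10.0, 20.0]})

    scaler = StandardScaler().set_output(transform="pandas").fit(X)
    print(list(scaler.transform(X).columns))  # ['elev', 'precip']
    print(scaler.feature_names_in_)  # ['elev' 'precip'], the original names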

What we failed to realize is that the `feature_names_in_` property
removed any need to extract feature names from the transformed inputs
because that extraction is handled entirely by the transformers! This
removed the need to use the `set_output` API.
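
In other words, the estimator can simply defer to the fitted
transformer for its names. Very roughly (a sketch of the idea, not
the actual implementation here):

    from sklearn.utils.validation import check_is_fitted

    class FeatureNameDelegationSketch:
        """Rough sketch only, not the actual sknnr code."""

        @property
        def feature_names_in_(self):
            # The transformer was fit on the raw input, so its stored names
            # are the original column names; nothing needs to be recovered
            # from the transformed X that the estimator sees.
            check_is_fitted(self, "transform_")
            return self.transform_.feature_names_in_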

The other major change is that the `_check_feature_names` method that
we overrode in #34 does not need to be implemented directly because
we can instead use that method from the transformer (via `_validate_data`)
without having to worry about mismatched feature names during fitting.
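
The check we get for free can be seen with plain scikit-learn: the
warning below comes from the transformer's own feature name check,
which `_validate_data` also runs (illustrative example, not code from
this repo):

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    X_df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
    scaler = StandardScaler().fit(X_df)

    # Transforming an unnamed array after fitting on a dataframe warns:
    # "X does not have valid feature names, but StandardScaler was fitted
    # with feature names"
    scaler.transform(np.asarray(X_df))
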
aazuspan committed Jun 8, 2023
1 parent 58396ff commit b213566
Showing 4 changed files with 23 additions and 92 deletions.
49 changes: 7 additions & 42 deletions src/sknnr/_base.py
@@ -1,10 +1,6 @@
-import warnings
-
 import numpy as np
 from sklearn.neighbors import KNeighborsRegressor
-from sklearn.utils.validation import _get_feature_names, check_is_fitted
-
-from .transformers._base import set_temp_output
+from sklearn.utils.validation import check_is_fitted
 
 
 class IDNeighborsRegressor(KNeighborsRegressor):
@@ -25,48 +21,17 @@ def feature_names_in_(self):
     def _check_feature_names(self, X, *, reset):
         """Override BaseEstimator._check_feature_names to prevent errors.
-        REFERENCE: https://github.com/scikit-learn/scikit-learn/blob/849c2f10f56b908abd9abbbffca8941494cc0bb0/sklearn/base.py#L409 # noqa: E501
-        This is identical to the sklearn implementation except that:
-        1. The reset option is ignored to avoid setting feature names.
-        2. The second half of the method that validates X feature names against the
-        fitted feature names is removed. This is because we want estimators to
-        return the feature names that were used to fit the transformer, not the
-        feature names that were used to fit the estimator.
+        This check would fail during fitting because `feature_names_in_` stores original
+        names while X contains transformed names. We instead rely on the transformer to
+        check feature names and warn or raise for mismatches.
         """
-        fitted_feature_names = getattr(self, "feature_names_in_", None)
-        X_feature_names = _get_feature_names(X)
-
-        if fitted_feature_names is None and X_feature_names is None:
-            return
-
-        if X_feature_names is not None and fitted_feature_names is None:
-            warnings.warn(
-                f"X has feature names, but {self.__class__.__name__} was fitted without"
-                " feature names",
-                stacklevel=2,
-            )
-            return
-
-        if X_feature_names is None and fitted_feature_names is not None:
-            warnings.warn(
-                "X does not have valid feature names, but"
-                f" {self.__class__.__name__} was fitted with feature names",
-                stacklevel=2,
-            )
-            return
+        return
 
     def _apply_transform(self, X) -> np.ndarray:
         """Apply the stored transform to the input data."""
         check_is_fitted(self, "transform_")
-
-        # Temporarily run the transformer in pandas mode for dataframe inputs to ensure
-        # that features are passed through to subsequent steps.
-        output_mode = "pandas" if hasattr(X, "iloc") else "default"
-        with set_temp_output(self.transform_, temp_mode=output_mode):  # type: ignore
-            X_transformed = self.transform_.transform(X)
-
-        return X_transformed
+        self.transform_._validate_data(X, reset=False)
+        return self.transform_.transform(X)
 
     def fit(self, X, y):
         """Fit using transformed feature data."""
18 changes: 0 additions & 18 deletions src/sknnr/transformers/_base.py
@@ -1,8 +1,4 @@
-from contextlib import contextmanager
-from typing import Literal
-
 import numpy as np
-from sklearn.base import TransformerMixin
 from sklearn.preprocessing import StandardScaler
 
 
@@ -15,17 +11,3 @@ def fit(self, X, y=None, sample_weight=None):
         scaler = super().fit(X, y, sample_weight)
         scaler.scale_ = np.std(np.asarray(X), axis=0, ddof=self.ddof)
         return scaler
-
-
-@contextmanager
-def set_temp_output(
-    transformer: TransformerMixin, temp_mode: Literal["default", "pandas"]
-):
-    """Temporarily set the output mode of a transformer."""
-    previous_config = getattr(transformer, "_sklearn_output_config", {}).copy()
-
-    transformer.set_output(transform=temp_mode)
-    try:
-        yield
-    finally:
-        transformer._sklearn_output_config = previous_config
28 changes: 16 additions & 12 deletions tests/test_estimators.py
@@ -69,23 +69,26 @@ def test_estimators_support_continuous_multioutput(estimator, moscow_euclidean):
     estimator.predict(moscow_euclidean.X)
 
 
+@pytest.mark.parametrize("with_names", [True, False])
 @pytest.mark.parametrize("estimator", get_kneighbor_estimator_classes())
-def test_estimators_support_dataframes(estimator, moscow_euclidean):
+def test_estimators_support_dataframes(estimator, with_names, moscow_euclidean):
     """All estimators should fit and predict data stored as dataframes."""
     estimator = estimator()
     num_features = moscow_euclidean.X.shape[1]
-    feature_names = [f"col_{i}" for i in range(num_features)]
+    feature_names = [f"col_{i}" for i in range(num_features)] if with_names else None
 
     X_df = pd.DataFrame(moscow_euclidean.X, columns=feature_names)
     y_df = pd.DataFrame(moscow_euclidean.y)
 
     estimator.fit(X_df, y_df)
     estimator.predict(X_df)
-    assert_array_equal(estimator.feature_names_in_, feature_names)
+
+    assert_array_equal(getattr(estimator, "feature_names_in_", None), feature_names)
 
 
+@pytest.mark.parametrize("fit_names", [True, False])
 @pytest.mark.parametrize("estimator", get_kneighbor_estimator_classes())
-def test_estimators_warn_for_missing_features(estimator, moscow_euclidean):
+def test_estimators_warn_for_missing_features(estimator, fit_names, moscow_euclidean):
     """All estimators should warn when fitting and predicting feature names mismatch."""
     estimator = estimator()
     num_features = moscow_euclidean.X.shape[1]
@@ -95,15 +98,16 @@ def test_estimators_warn_for_missing_features(estimator, moscow_euclidean):
     y = moscow_euclidean.y
     X_df = pd.DataFrame(X, columns=feature_names)
 
-    # Fit without feature names, predict with feature names
-    with pytest.warns(UserWarning, match="fitted without feature names"):
-        estimator.fit(X, y)
-        estimator.predict(X_df)
+    if fit_names:
+        msg = "fitted with feature names"
+        fit_X, predict_X = X_df, X
+    else:
+        msg = "fitted without feature names"
+        fit_X, predict_X = X, X_df
 
-    # Fit with feature names, predict without feature names
-    with pytest.warns(UserWarning, match="fitted with feature names"):
-        estimator.fit(X_df, y)
-        estimator.predict(X)
+    with pytest.warns(UserWarning, match=msg):
+        estimator.fit(fit_X, y)
+        estimator.predict(predict_X)
 
 
 @pytest.mark.parametrize("output_mode", ["default", "pandas"])
20 changes: 0 additions & 20 deletions tests/test_transformers.py
@@ -1,6 +1,5 @@
 from typing import List, Type
 
-import numpy as np
 import pandas as pd
 import pytest
 from numpy.testing import assert_array_equal
@@ -9,7 +8,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import NotFittedError
 
-from sknnr._base import set_temp_output
 from sknnr.transformers import (
     CCATransformer,
     CCorATransformer,
@@ -106,21 +104,3 @@ def test_transformers_raise_notfitted_transform(transformer, moscow_euclidean):
     """Attempting to call transform on an unfitted transformer should raise."""
     with pytest.raises(NotFittedError):
         transformer().transform(moscow_euclidean.X)
-
-
-@pytest.mark.parametrize("config_type", ["global", "transformer"])
-def test_set_temp_output(moscow_euclidean, config_type):
-    """Test that set_temp_output works as expected."""
-    transformer = StandardScaler().fit(moscow_euclidean.X, moscow_euclidean.y)
-
-    if config_type == "global":
-        set_config(transform_output="pandas")
-    else:
-        transformer.set_output(transform="pandas")
-
-    # Temp output mode should override previously set config
-    with set_temp_output(transformer=transformer, temp_mode="default"):
-        assert isinstance(transformer.transform(moscow_euclidean.X), np.ndarray)
-
-    # Previous config should be restored
-    assert isinstance(transformer.transform(moscow_euclidean.X), pd.DataFrame)
