Fix broken warning and reduce complexity of feature validation #35
This overhauls the feature validation to dramatically simplify
changes that were introduced by #22 and #34. I think the easiest way
to explain the changes in this commit is to explain how we ended up
with that complexity.

In #22, we needed to ensure that our estimators got feature names
when fitted with dataframe inputs. This was complicated by the fact
that they received transformed inputs, and our transformers cast
dataframes to arrays, dropping the names. We dealt with this using a
`NamedFeatureArray` class that would retain names after being cast.
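
For illustration, a name-retaining array can be as simple as an
ndarray subclass that carries a `columns` attribute (the attribute
that sklearn's feature name extraction looks for). This is only a
sketch of the idea, not the exact class from #22:

    import numpy as np

    class NamedFeatureArray(np.ndarray):
        """Sketch of a name-retaining array, not the actual #22 code."""

        def __new__(cls, array, columns=None):
            obj = np.asarray(array).view(cls)
            obj.columns = columns
            return obj

        def __array_finalize__(self, obj):
            # Carry the names along when numpy creates views of this array.
            self.columns = getattr(obj, "columns", None)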

In #34, we implemented the `set_output` API and used this instead
to ensure that our transformers returned dataframe outputs. At the
same time, we decided that estimator feature names should match the
original names, not the transformed names, and implemented the
`feature_names_in_` property.
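
For reference, the `set_output` behavior that #34 leaned on looks
like this in plain scikit-learn (the column names here are invented
for the example):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    X = pd.DataFrame({"elev": [100.0, 200.0], "precip": [10.0, 20.0]})

    scaler = StandardScaler().set_output(transform="pandas").fit(X)
    print(list(scaler.transform(X).columns))  # ['elev', 'precip']
    print(scaler.feature_names_in_)  # ['elev' 'precip'], the original names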

What we failed to realize is that the `feature_names_in_` property
removed any need to extract feature names from the transformed inputs
because that extraction is handled entirely by the transformers! This
removed the need to use the `set_output` API.
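
In other words, the estimator can simply defer to the fitted
transformer for its names. Very roughly (a sketch of the idea, not
the actual implementation here):

    from sklearn.utils.validation import check_is_fitted

    class FeatureNameDelegationSketch:
        """Rough sketch only, not the actual sknnr code."""

        @property
        def feature_names_in_(self):
            # The transformer was fit on the raw input, so its stored names
            # are the original column names; nothing needs to be recovered
            # from the transformed X that the estimator sees.
            check_is_fitted(self, "transform_")
            return self.transform_.feature_names_in_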

The other major change is that the `_check_feature_names` method that
we overrode in #34 does not need to be implemented directly because
we can instead use that method from the transformer (via `_validate_data`)
without having to worry about mismatched feature names during fitting.
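
The check we get for free can be seen with plain scikit-learn: the
warning below comes from the transformer's own feature name check,
which `_validate_data` also runs (illustrative example, not code from
this repo):

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    X_df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
    scaler = StandardScaler().fit(X_df)

    # Transforming an unnamed array after fitting on a dataframe warns:
    # "X does not have valid feature names, but StandardScaler was fitted
    # with feature names"
    scaler.transform(np.asarray(X_df))
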
aazuspan committed Jun 8, 2023
1 parent 58396ff commit b213566
Showing 4 changed files with 23 additions and 92 deletions.
49 changes: 7 additions & 42 deletions src/sknnr/_base.py
@@ -1,10 +1,6 @@
-import warnings
-
 import numpy as np
 from sklearn.neighbors import KNeighborsRegressor
-from sklearn.utils.validation import _get_feature_names, check_is_fitted
-
-from .transformers._base import set_temp_output
+from sklearn.utils.validation import check_is_fitted
 
 
 class IDNeighborsRegressor(KNeighborsRegressor):
@@ -25,48 +21,17 @@ def feature_names_in_(self):
     def _check_feature_names(self, X, *, reset):
         """Override BaseEstimator._check_feature_names to prevent errors.
-        REFERENCE: https://github.com/scikit-learn/scikit-learn/blob/849c2f10f56b908abd9abbbffca8941494cc0bb0/sklearn/base.py#L409 # noqa: E501
-        This is identical to the sklearn implementation except that:
-        1. The reset option is ignored to avoid setting feature names.
-        2. The second half of the method that validates X feature names against the
-        fitted feature names is removed. This is because we want estimators to
-        return the feature names that were used to fit the transformer, not the
-        feature names that were used to fit the estimator.
+        This check would fail during fitting because `feature_names_in_` stores original
+        names while X contains transformed names. We instead rely on the transformer to
+        check feature names and warn or raise for mismatches.
         """
-        fitted_feature_names = getattr(self, "feature_names_in_", None)
-        X_feature_names = _get_feature_names(X)
-
-        if fitted_feature_names is None and X_feature_names is None:
-            return
-
-        if X_feature_names is not None and fitted_feature_names is None:
-            warnings.warn(
-                f"X has feature names, but {self.__class__.__name__} was fitted without"
-                " feature names",
-                stacklevel=2,
-            )
-            return
-
-        if X_feature_names is None and fitted_feature_names is not None:
-            warnings.warn(
-                "X does not have valid feature names, but"
-                f" {self.__class__.__name__} was fitted with feature names",
-                stacklevel=2,
-            )
-            return
+        return
 
     def _apply_transform(self, X) -> np.ndarray:
         """Apply the stored transform to the input data."""
         check_is_fitted(self, "transform_")
-
-        # Temporarily run the transformer in pandas mode for dataframe inputs to ensure
-        # that features are passed through to subsequent steps.
-        output_mode = "pandas" if hasattr(X, "iloc") else "default"
-        with set_temp_output(self.transform_, temp_mode=output_mode):  # type: ignore
-            X_transformed = self.transform_.transform(X)
-
-        return X_transformed
+        self.transform_._validate_data(X, reset=False)
+        return self.transform_.transform(X)
 
     def fit(self, X, y):
         """Fit using transformed feature data."""
18 changes: 0 additions & 18 deletions src/sknnr/transformers/_base.py
@@ -1,8 +1,4 @@
-from contextlib import contextmanager
-from typing import Literal
-
 import numpy as np
-from sklearn.base import TransformerMixin
 from sklearn.preprocessing import StandardScaler
 
 
@@ -15,17 +11,3 @@ def fit(self, X, y=None, sample_weight=None):
         scaler = super().fit(X, y, sample_weight)
         scaler.scale_ = np.std(np.asarray(X), axis=0, ddof=self.ddof)
         return scaler
-
-
-@contextmanager
-def set_temp_output(
-    transformer: TransformerMixin, temp_mode: Literal["default", "pandas"]
-):
-    """Temporarily set the output mode of a transformer."""
-    previous_config = getattr(transformer, "_sklearn_output_config", {}).copy()
-
-    transformer.set_output(transform=temp_mode)
-    try:
-        yield
-    finally:
-        transformer._sklearn_output_config = previous_config
28 changes: 16 additions & 12 deletions tests/test_estimators.py
@@ -69,23 +69,26 @@ def test_estimators_support_continuous_multioutput(estimator, moscow_euclidean):
     estimator.predict(moscow_euclidean.X)
 
 
+@pytest.mark.parametrize("with_names", [True, False])
 @pytest.mark.parametrize("estimator", get_kneighbor_estimator_classes())
-def test_estimators_support_dataframes(estimator, moscow_euclidean):
+def test_estimators_support_dataframes(estimator, with_names, moscow_euclidean):
     """All estimators should fit and predict data stored as dataframes."""
     estimator = estimator()
     num_features = moscow_euclidean.X.shape[1]
-    feature_names = [f"col_{i}" for i in range(num_features)]
+    feature_names = [f"col_{i}" for i in range(num_features)] if with_names else None
 
     X_df = pd.DataFrame(moscow_euclidean.X, columns=feature_names)
     y_df = pd.DataFrame(moscow_euclidean.y)
 
     estimator.fit(X_df, y_df)
     estimator.predict(X_df)
-    assert_array_equal(estimator.feature_names_in_, feature_names)
+
+    assert_array_equal(getattr(estimator, "feature_names_in_", None), feature_names)
 
 
+@pytest.mark.parametrize("fit_names", [True, False])
 @pytest.mark.parametrize("estimator", get_kneighbor_estimator_classes())
-def test_estimators_warn_for_missing_features(estimator, moscow_euclidean):
+def test_estimators_warn_for_missing_features(estimator, fit_names, moscow_euclidean):
     """All estimators should warn when fitting and predicting feature names mismatch."""
     estimator = estimator()
     num_features = moscow_euclidean.X.shape[1]
@@ -95,15 +98,16 @@ def test_estimators_warn_for_missing_features(estimator, moscow_euclidean):
     y = moscow_euclidean.y
     X_df = pd.DataFrame(X, columns=feature_names)
 
-    # Fit without feature names, predict with feature names
-    with pytest.warns(UserWarning, match="fitted without feature names"):
-        estimator.fit(X, y)
-        estimator.predict(X_df)
+    if fit_names:
+        msg = "fitted with feature names"
+        fit_X, predict_X = X_df, X
+    else:
+        msg = "fitted without feature names"
+        fit_X, predict_X = X, X_df
 
-    # Fit with feature names, predict without feature names
-    with pytest.warns(UserWarning, match="fitted with feature names"):
-        estimator.fit(X_df, y)
-        estimator.predict(X)
+    with pytest.warns(UserWarning, match=msg):
+        estimator.fit(fit_X, y)
+        estimator.predict(predict_X)
 
 
 @pytest.mark.parametrize("output_mode", ["default", "pandas"])
20 changes: 0 additions & 20 deletions tests/test_transformers.py
@@ -1,6 +1,5 @@
 from typing import List, Type
 
-import numpy as np
 import pandas as pd
 import pytest
 from numpy.testing import assert_array_equal
@@ -9,7 +8,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import NotFittedError
 
-from sknnr._base import set_temp_output
 from sknnr.transformers import (
     CCATransformer,
     CCorATransformer,
@@ -106,21 +104,3 @@ def test_transformers_raise_notfitted_transform(transformer, moscow_euclidean):
     """Attempting to call transform on an unfitted transformer should raise."""
     with pytest.raises(NotFittedError):
         transformer().transform(moscow_euclidean.X)
-
-
-@pytest.mark.parametrize("config_type", ["global", "transformer"])
-def test_set_temp_output(moscow_euclidean, config_type):
-    """Test that set_temp_output works as expected."""
-    transformer = StandardScaler().fit(moscow_euclidean.X, moscow_euclidean.y)
-
-    if config_type == "global":
-        set_config(transform_output="pandas")
-    else:
-        transformer.set_output(transform="pandas")
-
-    # Temp output mode should override previously set config
-    with set_temp_output(transformer=transformer, temp_mode="default"):
-        assert isinstance(transformer.transform(moscow_euclidean.X), np.ndarray)
-
-    # Previous config should be restored
-    assert isinstance(transformer.transform(moscow_euclidean.X), pd.DataFrame)
