lemma-osu · aazuspan · May 15, 2023 · May 15, 2023 · May 15, 2023
diff --git a/src/sknnr/_base.py b/src/sknnr/_base.py
@@ -1,7 +1,26 @@
+import numpy as np
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.utils.validation import check_is_fitted
 
 
+class NamedFeatureArray(np.ndarray):
+    """An array with a columns attribute indicating feature names.
+
+    Storing a `columns` attribute allows this array to act like a dataframe for the
+    purpose of extracting feature names when passed to sklearn estimators.
+    """
+
+    def __new__(cls, array, columns=None):
+        obj = np.asarray(array).view(cls)
+        obj.columns = columns
+        return obj
+
+    def __array_finalize__(self, obj):
+        if obj is None:
+            return
+        self.columns = getattr(obj, "columns", None)
+
+
 class IDNeighborsRegressor(KNeighborsRegressor):
     """
     Placeholder class for implementing plot ID access.
@@ -10,14 +29,39 @@ class IDNeighborsRegressor(KNeighborsRegressor):
 
 class TransformedKNeighborsMixin(KNeighborsRegressor):
     """
-    Mixin for KNeighbors regressors that store a `transform_` during fitting (e.g.
-    GNN).
+    Mixin for KNeighbors regressors that apply transformations to the feature data.
     """
 
+    def _apply_transform(self, X) -> NamedFeatureArray:
+        """Apply the stored transform to the input data.
+
+        Note
+        ----
+        Transforming will cast input data to numpy arrays. To preserve feature names
+        in the case of dataframe inputs, this method will wrap the transformed array
+        in a `NamedFeatureArray` with a `columns` attribute, allowing `sklearn` to
+        parse and store feature names.
+        """
+        check_is_fitted(self, "transform_")
+        X_transformed = self.transform_.transform(X)
+        if hasattr(X, "columns"):
+            X_transformed = NamedFeatureArray(X_transformed, columns=X.columns)
+
+        return X_transformed
+
+    def fit(self, X, y):
+        """Fit using transformed feature data."""
+        X_transformed = self._apply_transform(X)
+        return super().fit(X_transformed, y)
+
+    def predict(self, X):
+        """Predict using transformed feature data."""
+        X_transformed = self._apply_transform(X)
+        return super().predict(X_transformed)
+
     def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
-        if X is not None:
-            check_is_fitted(self, "transform_")
-            X = self.transform_.transform(X)
+        """Return neighbor indices and distances using transformed feature data."""
+        X_transformed = self._apply_transform(X) if X is not None else X
         return super().kneighbors(
-            X=X, n_neighbors=n_neighbors, return_distance=return_distance
+            X=X_transformed, n_neighbors=n_neighbors, return_distance=return_distance
         )
diff --git a/src/sknnr/_euclidean.py b/src/sknnr/_euclidean.py
@@ -1,16 +1,8 @@
-from sklearn.utils.validation import check_is_fitted
-
 from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin
 from .transformers import StandardScalerWithDOF
 
 
 class EuclideanKNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin):
     def fit(self, X, y):
         self.transform_ = StandardScalerWithDOF(ddof=1).fit(X)
-        X = self.transform_.transform(X)
         return super().fit(X, y)
-
-    def predict(self, X):
-        check_is_fitted(self)
-        X = self.transform_.transform(X)
-        return super().predict(X)
diff --git a/src/sknnr/_gnn.py b/src/sknnr/_gnn.py
@@ -1,16 +1,8 @@
-from sklearn.utils.validation import check_is_fitted
-
 from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin
 from .transformers import CCATransformer
 
 
 class GNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin):
     def fit(self, X, y, spp=None):
         self.transform_ = CCATransformer().fit(X, y=y, spp=spp)
-        X = self.transform_.transform(X)
         return super().fit(X, y)
-
-    def predict(self, X):
-        check_is_fitted(self)
-        X = self.transform_.transform(X)
-        return super().predict(X)
diff --git a/src/sknnr/_mahalanobis.py b/src/sknnr/_mahalanobis.py
@@ -1,16 +1,8 @@
-from sklearn.utils.validation import check_is_fitted
-
 from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin
 from .transformers import MahalanobisTransformer
 
 
 class MahalanobisKNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin):
     def fit(self, X, y):
         self.transform_ = MahalanobisTransformer().fit(X)
-        X = self.transform_.transform(X)
         return super().fit(X, y)
-
-    def predict(self, X):
-        check_is_fitted(self)
-        X = self.transform_.transform(X)
-        return super().predict(X)
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
@@ -2,6 +2,7 @@
 
 import pandas as pd
 import pytest
+from numpy.testing import assert_array_equal
 
 # from sklearn.utils.estimator_checks import parametrize_with_checks
 from sklearn.utils.validation import NotFittedError
@@ -59,6 +60,12 @@ def test_estimators_support_continuous_multioutput(estimator, moscow_euclidean):
 @pytest.mark.parametrize("estimator", get_kneighbor_estimator_instances())
 def test_estimators_support_dataframes(estimator, moscow_euclidean):
     """All estimators should fit and predict data stored as dataframes."""
-    X_df, y_df = pd.DataFrame(moscow_euclidean.X), pd.DataFrame(moscow_euclidean.y)
+    num_features = moscow_euclidean.X.shape[1]
+    feature_names = [f"col_{i}" for i in range(num_features)]
+
+    X_df = pd.DataFrame(moscow_euclidean.X, columns=feature_names)
+    y_df = pd.DataFrame(moscow_euclidean.y)
+
     estimator.fit(X_df, y_df)
     estimator.predict(X_df)
+    assert_array_equal(estimator.feature_names_in_, feature_names)