diff --git a/src/sknnr/_base.py b/src/sknnr/_base.py index 066e098..d240f39 100644 --- a/src/sknnr/_base.py +++ b/src/sknnr/_base.py @@ -1,7 +1,26 @@ +import numpy as np from sklearn.neighbors import KNeighborsRegressor from sklearn.utils.validation import check_is_fitted +class NamedFeatureArray(np.ndarray): + """An array with a columns attribute indicating feature names. + + Storing a `columns` attribute allows this array to act like a dataframe for the + purpose of extracting feature names when passed to sklearn estimators. + """ + + def __new__(cls, array, columns=None): + obj = np.asarray(array).view(cls) + obj.columns = columns + return obj + + def __array_finalize__(self, obj): + if obj is None: + return + self.columns = getattr(obj, "columns", None) + + class IDNeighborsRegressor(KNeighborsRegressor): """ Placeholder class for implementing plot ID access. @@ -10,14 +29,39 @@ class IDNeighborsRegressor(KNeighborsRegressor): class TransformedKNeighborsMixin(KNeighborsRegressor): """ - Mixin for KNeighbors regressors that store a `transform_` during fitting (e.g. - GNN). + Mixin for KNeighbors regressors that apply transformations to the feature data. """ + def _apply_transform(self, X) -> NamedFeatureArray: + """Apply the stored transform to the input data. + + Note + ---- + Transforming will cast input data to numpy arrays. To preserve feature names + in the case of dataframe inputs, this method will wrap the transformed array + in a `NamedFeatureArray` with a `columns` attribute, allowing `sklearn` to + parse and store feature names. + """ + check_is_fitted(self, "transform_") + X_transformed = self.transform_.transform(X) + if hasattr(X, "columns"): + X_transformed = NamedFeatureArray(X_transformed, columns=X.columns) + + return X_transformed + + def fit(self, X, y): + """Fit using transformed feature data.""" + X_transformed = self._apply_transform(X) + return super().fit(X_transformed, y) + + def predict(self, X): + """Predict using transformed feature data.""" + X_transformed = self._apply_transform(X) + return super().predict(X_transformed) + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - if X is not None: - check_is_fitted(self, "transform_") - X = self.transform_.transform(X) + """Return neighbor indices and distances using transformed feature data.""" + X_transformed = self._apply_transform(X) if X is not None else X return super().kneighbors( - X=X, n_neighbors=n_neighbors, return_distance=return_distance + X=X_transformed, n_neighbors=n_neighbors, return_distance=return_distance ) diff --git a/src/sknnr/_euclidean.py b/src/sknnr/_euclidean.py index 468d86e..5144b59 100644 --- a/src/sknnr/_euclidean.py +++ b/src/sknnr/_euclidean.py @@ -1,5 +1,3 @@ -from sklearn.utils.validation import check_is_fitted - from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin from .transformers import StandardScalerWithDOF @@ -7,10 +5,4 @@ class EuclideanKNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin): def fit(self, X, y): self.transform_ = StandardScalerWithDOF(ddof=1).fit(X) - X = self.transform_.transform(X) return super().fit(X, y) - - def predict(self, X): - check_is_fitted(self) - X = self.transform_.transform(X) - return super().predict(X) diff --git a/src/sknnr/_gnn.py b/src/sknnr/_gnn.py index 138341b..4822ceb 100644 --- a/src/sknnr/_gnn.py +++ b/src/sknnr/_gnn.py @@ -1,5 +1,3 @@ -from sklearn.utils.validation import check_is_fitted - from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin from .transformers import CCATransformer @@ -7,10 +5,4 @@ class GNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin): def fit(self, X, y, spp=None): self.transform_ = CCATransformer().fit(X, y=y, spp=spp) - X = self.transform_.transform(X) return super().fit(X, y) - - def predict(self, X): - check_is_fitted(self) - X = self.transform_.transform(X) - return super().predict(X) diff --git a/src/sknnr/_mahalanobis.py b/src/sknnr/_mahalanobis.py index dcf2530..410444b 100644 --- a/src/sknnr/_mahalanobis.py +++ b/src/sknnr/_mahalanobis.py @@ -1,5 +1,3 @@ -from sklearn.utils.validation import check_is_fitted - from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin from .transformers import MahalanobisTransformer @@ -7,10 +5,4 @@ class MahalanobisKNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin): def fit(self, X, y): self.transform_ = MahalanobisTransformer().fit(X) - X = self.transform_.transform(X) return super().fit(X, y) - - def predict(self, X): - check_is_fitted(self) - X = self.transform_.transform(X) - return super().predict(X) diff --git a/tests/test_estimators.py b/tests/test_estimators.py index e5fd82a..38101e8 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -2,6 +2,7 @@ import pandas as pd import pytest +from numpy.testing import assert_array_equal # from sklearn.utils.estimator_checks import parametrize_with_checks from sklearn.utils.validation import NotFittedError @@ -59,6 +60,12 @@ def test_estimators_support_continuous_multioutput(estimator, moscow_euclidean): @pytest.mark.parametrize("estimator", get_kneighbor_estimator_instances()) def test_estimators_support_dataframes(estimator, moscow_euclidean): """All estimators should fit and predict data stored as dataframes.""" - X_df, y_df = pd.DataFrame(moscow_euclidean.X), pd.DataFrame(moscow_euclidean.y) + num_features = moscow_euclidean.X.shape[1] + feature_names = [f"col_{i}" for i in range(num_features)] + + X_df = pd.DataFrame(moscow_euclidean.X, columns=feature_names) + y_df = pd.DataFrame(moscow_euclidean.y) + estimator.fit(X_df, y_df) estimator.predict(X_df) + assert_array_equal(estimator.feature_names_in_, feature_names)