Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store feature names for transformed estimators #22

Merged
merged 2 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 50 additions & 6 deletions src/sknnr/_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,26 @@
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_is_fitted


class NamedFeatureArray(np.ndarray):
    """An array with a columns attribute indicating feature names.

    Storing a `columns` attribute allows this array to act like a dataframe for the
    purpose of extracting feature names when passed to sklearn estimators.

    Parameters
    ----------
    array : array-like
        Data to wrap. It is converted with `np.asarray` and then viewed as this
        class.
    columns : sequence, optional
        Feature names to attach to the array as `columns`. Defaults to None.
    """

    def __new__(cls, array, columns=None):
        # `view` runs `__array_finalize__` first, so the explicit assignment below
        # always overrides whatever was inherited from `array`.
        obj = np.asarray(array).view(cls)
        obj.columns = columns
        return obj

    def __array_finalize__(self, obj):
        # `obj` is None when the array is built by an explicit ndarray constructor
        # call; there is no parent to inherit names from in that case.
        if obj is None:
            return
        # Views and slices copy the parent's names unchanged (even when a slice
        # selects only a subset of the columns).
        self.columns = getattr(obj, "columns", None)


class IDNeighborsRegressor(KNeighborsRegressor):
"""
Placeholder class for implementing plot ID access.
Expand All @@ -10,14 +29,39 @@ class IDNeighborsRegressor(KNeighborsRegressor):

class TransformedKNeighborsMixin(KNeighborsRegressor):
    """
    Mixin for KNeighbors regressors that apply transformations to the feature data.

    The transformer is read from `self.transform_`; it must be set (already fitted)
    before `fit`, `predict` or `kneighbors` is called, otherwise
    `check_is_fitted` raises.
    """

    def _apply_transform(self, X) -> NamedFeatureArray:
        """Apply the stored transform to the input data.

        Note
        ----
        Transforming will cast input data to numpy arrays. To preserve feature names
        in the case of dataframe inputs, this method will wrap the transformed array
        in a `NamedFeatureArray` with a `columns` attribute, allowing `sklearn` to
        parse and store feature names.

        When `X` has no `columns` attribute, the output of
        `self.transform_.transform` is returned as-is.
        """
        check_is_fitted(self, "transform_")
        X_transformed = self.transform_.transform(X)
        if hasattr(X, "columns"):
            X_transformed = NamedFeatureArray(X_transformed, columns=X.columns)

        return X_transformed

    def fit(self, X, y):
        """Fit using transformed feature data."""
        X_transformed = self._apply_transform(X)
        return super().fit(X_transformed, y)

    def predict(self, X):
        """Predict using transformed feature data."""
        # Do NOT transform here: `KNeighborsRegressor.predict` looks up neighbors
        # through `self.kneighbors`, which already applies the transform.
        # Transforming in both places would transform the query data twice while
        # the training data was only transformed once.
        return super().predict(X)

    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Return neighbor indices and distances using transformed feature data."""
        # X=None means "query the training samples", which are stored already
        # transformed, so it is passed through untouched.
        X_transformed = self._apply_transform(X) if X is not None else X
        return super().kneighbors(
            X=X_transformed, n_neighbors=n_neighbors, return_distance=return_distance
        )
8 changes: 0 additions & 8 deletions src/sknnr/_euclidean.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
from sklearn.utils.validation import check_is_fitted

from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin
from .transformers import StandardScalerWithDOF


class EuclideanKNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin):
    """KNeighbors regressor that searches neighbors in scaled feature space.

    The scaling is done by a `StandardScalerWithDOF(ddof=1)` stored as
    `transform_`.
    """

    def fit(self, X, y):
        """Fit the scaler on `X`, then fit the regressor.

        Only the transformer is fitted here. `TransformedKNeighborsMixin` applies
        `transform_` to the data itself, so transforming `X` in this method as well
        would scale the training data twice.
        """
        self.transform_ = StandardScalerWithDOF(ddof=1).fit(X)
        return super().fit(X, y)
8 changes: 0 additions & 8 deletions src/sknnr/_gnn.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
from sklearn.utils.validation import check_is_fitted

from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin
from .transformers import CCATransformer


class GNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin):
    """KNeighbors regressor that searches neighbors in `CCATransformer` space.

    The fitted transformer is stored as `transform_`.
    """

    def fit(self, X, y, spp=None):
        """Fit the CCA transformer on `X`, `y` and `spp`, then fit the regressor.

        `spp` is forwarded unchanged to `CCATransformer.fit`. Only the transformer
        is fitted here. `TransformedKNeighborsMixin` applies `transform_` to the
        data itself, so transforming `X` in this method as well would transform the
        training data twice.
        """
        self.transform_ = CCATransformer().fit(X, y=y, spp=spp)
        return super().fit(X, y)
8 changes: 0 additions & 8 deletions src/sknnr/_mahalanobis.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
from sklearn.utils.validation import check_is_fitted

from ._base import IDNeighborsRegressor, TransformedKNeighborsMixin
from .transformers import MahalanobisTransformer


class MahalanobisKNNRegressor(IDNeighborsRegressor, TransformedKNeighborsMixin):
    """KNeighbors regressor that searches neighbors in `MahalanobisTransformer` space.

    The fitted transformer is stored as `transform_`.
    """

    def fit(self, X, y):
        """Fit the Mahalanobis transformer on `X`, then fit the regressor.

        Only the transformer is fitted here. `TransformedKNeighborsMixin` applies
        `transform_` to the data itself, so transforming `X` in this method as well
        would transform the training data twice.
        """
        self.transform_ = MahalanobisTransformer().fit(X)
        return super().fit(X, y)
9 changes: 8 additions & 1 deletion tests/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd
import pytest
from numpy.testing import assert_array_equal

# from sklearn.utils.estimator_checks import parametrize_with_checks
from sklearn.utils.validation import NotFittedError
Expand Down Expand Up @@ -59,6 +60,12 @@ def test_estimators_support_continuous_multioutput(estimator, moscow_euclidean):
@pytest.mark.parametrize("estimator", get_kneighbor_estimator_instances())
def test_estimators_support_dataframes(estimator, moscow_euclidean):
    """All estimators should fit and predict data stored as dataframes.

    Also checks that the dataframe's column names survive the estimator's
    internal transformation and end up in `feature_names_in_`.
    """
    num_features = moscow_euclidean.X.shape[1]
    feature_names = [f"col_{i}" for i in range(num_features)]

    X_df = pd.DataFrame(moscow_euclidean.X, columns=feature_names)
    y_df = pd.DataFrame(moscow_euclidean.y)

    estimator.fit(X_df, y_df)
    estimator.predict(X_df)
    assert_array_equal(estimator.feature_names_in_, feature_names)
Loading