-
Notifications
You must be signed in to change notification settings - Fork 114
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bugfix for #158 #159
Bugfix for #158 #159
Changes from 6 commits
51241c6
afffefb
4766672
17f6b70
7103101
ca9e5fe
8f0a918
0f6dcb5
e099b67
54abb09
7a82b98
a027ce6
49d99b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,6 +54,53 @@ def __init__(self, estimator, groups, use_fallback=True): | |
self.groups = groups | ||
self.use_fallback = use_fallback | ||
|
||
def __check_group_cols_exist(self, X): | ||
try: | ||
x_cols = set(X.columns) | ||
except AttributeError: | ||
try: | ||
ncols = X.shape[1] | ||
except IndexError: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it could be. groupby-mean as a model? |
||
ncols = 1 | ||
x_cols = set(range(ncols)) | ||
|
||
# Check whether grouping columns exist | ||
diff = set(as_list(self.groups)) - x_cols | ||
|
||
if len(diff) > 0: | ||
raise KeyError(f'{diff} not in columns of X ({x_cols})') | ||
|
||
@staticmethod | ||
def __check_missing_and_inf(X): | ||
"""Check that all elements of X are non-missing and finite""" | ||
koaning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if np.any(pd.isnull(X)): | ||
raise ValueError("X has NaN values") | ||
try: | ||
koaning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if np.any(np.isinf(X)): | ||
raise ValueError("X has infinite values") | ||
except TypeError: # X cannot be converted to numeric, so checking infinites does not make sense | ||
pass | ||
|
||
def __validate(self, X, y=None): | ||
try: | ||
X_data = X.drop(columns=self.groups, inplace=False) | ||
except AttributeError: # np.array | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it feels cleaner to test this using |
||
X_data = np.delete(X, self.groups, axis=1) | ||
|
||
if y is not None: | ||
koaning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
check_X_y(X_data, y) | ||
else: | ||
check_array(X_data) | ||
|
||
self.__check_missing_and_inf(X) | ||
self.__check_group_cols_exist(X) | ||
|
||
def __remove_groups_from_x(self, X): | ||
try: | ||
return X.drop(columns=self.groups, inplace=False) | ||
except AttributeError: # np.array | ||
return np.delete(X, self.groups, axis=1) | ||
|
||
def fit(self, X, y): | ||
""" | ||
Fit the model using X, y as training data. Will also learn the groups | ||
|
@@ -63,15 +110,17 @@ def fit(self, X, y): | |
:param y: array-like, shape=(n_samples,) training data. | ||
:return: Returns an instance of self. | ||
""" | ||
check_X_y(X, y) | ||
self.__validate(X, y) | ||
|
||
pred_col = 'the-column-that-i-want-to-predict-but-dont-have-the-name-for' | ||
if isinstance(X, np.ndarray): | ||
X = pd.DataFrame(X, columns=[str(_) for _ in range(X.shape[1])]) | ||
X = X.assign(**{pred_col: y}) | ||
|
||
self.group_colnames_ = [str(_) for _ in as_list(self.groups)] | ||
if any([c not in X.columns for c in self.group_colnames_]): | ||
raise ValueError(f"{self.group_colnames_} not in {X.columns}") | ||
raise KeyError(f"{self.group_colnames_} not in {X.columns}") | ||
|
||
self.X_colnames_ = [_ for _ in X.columns if _ not in self.group_colnames_ and _ is not pred_col] | ||
self.fallback_ = None | ||
if self.use_fallback: | ||
|
@@ -93,7 +142,8 @@ def predict(self, X): | |
:param X: array-like, shape=(n_columns, n_samples,) training data. | ||
:return: array, shape=(n_samples,) the predicted data | ||
""" | ||
check_array(X) | ||
self.__validate(X) | ||
|
||
check_is_fitted(self, ['estimators_', 'groups_', 'group_colnames_', | ||
'X_colnames_', 'fallback_']) | ||
if isinstance(X, np.ndarray): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
checking the type of
X
is probably better to do viaisinstance
instead waiting for anAttributeError
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree that in this case the intent is clearer with an isinstance check.