forked from deepchecks/deepchecks
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
simplify weak segments (deepchecks#2485)
- Loading branch information
1 parent
2dc9314
commit ed8c53f
Showing
9 changed files
with
243 additions
and
202 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) | ||
# | ||
# This file is part of Deepchecks. | ||
# Deepchecks is distributed under the terms of the GNU Affero General | ||
# Public License (version 3 or later). | ||
# You should have received a copy of the GNU Affero General Public License | ||
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>. | ||
# ---------------------------------------------------------------------------- | ||
# | ||
"""Utils module for auto-detecting interesting segments in text.""" | ||
|
||
import warnings | ||
from typing import Hashable, List, Optional, Union | ||
|
||
import pandas as pd | ||
|
||
from deepchecks.core.errors import DeepchecksProcessError | ||
from deepchecks.nlp import TextData | ||
from deepchecks.utils.dataframes import select_from_dataframe | ||
|
||
|
||
def get_relevant_data_table(text_data: TextData, data_type: str, columns: Union[Hashable, List[Hashable], None],
                            ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int],
                            add_label: bool = True):
    """Build the table of segmenting features (metadata or properties) for a TextData object.

    Parameters
    ----------
    text_data : TextData
        The text dataset whose metadata or properties are used as features.
    data_type : str
        Which table to use: 'metadata' or 'properties'.
    columns : Union[Hashable, List[Hashable], None]
        Columns to keep, forwarded to ``select_from_dataframe``.
    ignore_columns : Union[Hashable, List[Hashable], None]
        Columns to drop, forwarded to ``select_from_dataframe``.
    n_top_features : Optional[int]
        If not None and smaller than the number of selected columns, a UserWarning is emitted.
        The table itself is NOT truncated by this function.
    add_label : bool, default True
        Whether to append a 'label' column built from ``text_data.label``.

    Returns
    -------
    Tuple[pd.DataFrame, list]
        The features table and the list of its columns that are categorical.

    Raises
    ------
    DeepchecksProcessError
        If ``data_type`` is neither 'metadata' nor 'properties'.
    """
    if data_type == 'metadata':
        features = select_from_dataframe(text_data.metadata, columns, ignore_columns)
        cat_features = [col for col in features.columns if col in text_data.categorical_metadata_columns]

    elif data_type == 'properties':
        features = select_from_dataframe(text_data.properties, columns, ignore_columns)
        cat_features = [col for col in features.columns if col in text_data.categorical_properties]
    else:
        raise DeepchecksProcessError(f'Unknown segment_by value: {data_type}')

    if n_top_features is not None and n_top_features < features.shape[1]:
        _warn_n_top_columns(data_type, n_top_features)

    if add_label:  # most commonly used for target encoding
        # NOTE(review): select_from_dataframe may hand back the frame owned by text_data (or a slice of
        # it) - confirm against the helper. `assign` returns a new frame, so the label column is never
        # written into the caller's metadata/properties table.
        features = features.assign(label=pd.Series(text_data.label, index=features.index))

    return features, cat_features
|
||
|
||
def _warn_n_top_columns(data_type: str, n_top_features: int):
    """Emit a UserWarning telling the user that only the first ``n_top_features`` features will be used.

    The function always warns; deciding whether the limit is actually exceeded is left to the caller.

    Parameters
    ----------
    data_type : str
        'metadata' selects the metadata wording; any other value selects the properties wording.
    n_top_features : int
        The limit value reported in the message.
    """
    # Pick the user-facing names (feature kind, limit parameter, selection parameter) for the message.
    if data_type == 'metadata':
        kind, limit_param, select_param = 'metadata columns', 'n_top_columns', 'columns'
    else:
        kind, limit_param, select_param = 'properties', 'n_top_properties', 'properties'

    message = (
        f'Parameter {limit_param} is set to {n_top_features} to avoid long computation time. '
        f'This means that the check will run on the first {n_top_features} {kind}. '
        f'If you want to run on all {kind}, set {limit_param} to None. '
        f'Alternatively, you can set parameter {select_param} to a list of the specific {kind} '
        f'you want to run on.'
    )
    warnings.warn(message, UserWarning)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.