diff --git a/dice_ml/constants.py b/dice_ml/constants.py index bd7fe1bf..2a1f515d 100644 --- a/dice_ml/constants.py +++ b/dice_ml/constants.py @@ -14,6 +14,7 @@ class SamplingStrategy: Random = 'random' Genetic = 'genetic' KdTree = 'kdtree' + Gradient = 'gradient' class ModelTypes: diff --git a/dice_ml/data_interfaces/base_data_interface.py b/dice_ml/data_interfaces/base_data_interface.py index f0abfa54..9143c7e6 100644 --- a/dice_ml/data_interfaces/base_data_interface.py +++ b/dice_ml/data_interfaces/base_data_interface.py @@ -2,6 +2,9 @@ from abc import ABC, abstractmethod +from dice_ml.utils.exception import (SystemException, + UserConfigValidationException) + class _BaseData(ABC): @@ -27,6 +30,46 @@ def set_continuous_feature_indexes(self, query_instance): self.continuous_feature_indexes = [query_instance.columns.get_loc(name) for name in self.continuous_feature_names] + def check_features_to_vary(self, features_to_vary): + if features_to_vary is not None and features_to_vary != 'all': + not_training_features = set(features_to_vary) - set(self.feature_names) + if len(not_training_features) > 0: + raise UserConfigValidationException("Got features {0} which are not present in training data".format( + not_training_features)) + + def check_permitted_range(self, permitted_range): + if permitted_range is not None: + permitted_range_features = list(permitted_range) + not_training_features = set(permitted_range_features) - set(self.feature_names) + if len(not_training_features) > 0: + raise UserConfigValidationException("Got features {0} which are not present in training data".format( + not_training_features)) + + for feature in permitted_range_features: + if feature in self.categorical_feature_names: + train_categories = self.permitted_range[feature] + for test_category in permitted_range[feature]: + if test_category not in train_categories: + raise UserConfigValidationException( + 'The category {0} does not occur in the training data for feature {1}.' + ' Allowed categories are {2}'.format(test_category, feature, train_categories)) + + def _validate_and_set_permitted_range(self, params, features_dict=None): + """Validate and set the dictionary of permitted ranges for continuous features.""" + input_permitted_range = None + if 'permitted_range' in params: + input_permitted_range = params['permitted_range'] + + if not hasattr(self, 'feature_names'): + raise SystemException('Feature names not correctly set in public data interface') + + for input_permitted_range_feature_name in input_permitted_range: + if input_permitted_range_feature_name not in self.feature_names: + raise UserConfigValidationException( + "permitted_range contains some feature names which are not part of columns in dataframe" + ) + self.permitted_range, _ = self.get_features_range(input_permitted_range, features_dict) + @abstractmethod def __init__(self, params): """The init method needs to be implemented by the inherting classes.""" diff --git a/dice_ml/data_interfaces/private_data_interface.py b/dice_ml/data_interfaces/private_data_interface.py index 2960c17b..b3ce377f 100644 --- a/dice_ml/data_interfaces/private_data_interface.py +++ b/dice_ml/data_interfaces/private_data_interface.py @@ -3,6 +3,7 @@ import collections import logging import sys +from collections import defaultdict import numpy as np import pandas as pd @@ -46,21 +47,18 @@ def __init__(self, params): self._validate_and_set_type_and_precision(params=params) self.continuous_feature_names = [] - self.permitted_range = {} self.categorical_feature_names = [] self.categorical_levels = {} for feature in features_dict: if type(features_dict[feature][0]) is int: # continuous feature self.continuous_feature_names.append(feature) - self.permitted_range[feature] = features_dict[feature] else: self.categorical_feature_names.append(feature) self.categorical_levels[feature] = features_dict[feature] self._validate_and_set_mad(params=params) - - # self.continuous_feature_names + self.categorical_feature_names + self._validate_and_set_permitted_range(params=params, features_dict=features_dict) self.feature_names = list(features_dict.keys()) self.continuous_feature_indexes = [list(features_dict.keys()).index( @@ -73,20 +71,6 @@ def __init__(self, params): if feature_name not in self.type_and_precision: self.type_and_precision[feature_name] = 'int' - # # Initializing a label encoder to obtain label-encoded values for categorical variables - # self.labelencoder = {} - # - # self.label_encoded_data = {} - # - # for column in self.categorical_feature_names: - # self.labelencoder[column] = LabelEncoder() - # self.label_encoded_data[column] = \ - # self.labelencoder[column].fit_transform(self.categorical_levels[column]) - - # self.max_range = -np.inf - # for feature in self.continuous_feature_names: - # self.max_range = max(self.max_range, self.permitted_range[feature][1]) - self._validate_and_set_data_name(params=params) def _validate_and_set_type_and_precision(self, params): @@ -176,7 +160,22 @@ def get_valid_mads(self, normalized=False, display_warnings=False, return_mads=T if return_mads: return mads - def create_ohe_params(self): + def get_features_range(self, permitted_range_input=None, features_dict=None): + ranges = {} + # Getting default ranges based on the dataset + for feature in features_dict: + if type(features_dict[feature][0]) is int: # continuous feature + ranges[feature] = features_dict[feature] + else: + ranges[feature] = features_dict[feature] + feature_ranges_orig = ranges.copy() + # Overwriting the ranges for a feature if input provided + if permitted_range_input is not None: + for feature_name, feature_range in permitted_range_input.items(): + ranges[feature_name] = feature_range + return ranges, feature_ranges_orig + + def create_ohe_params(self, one_hot_encoded_data=None): if len(self.categorical_feature_names) > 0: # simulating sklearn's one-hot-encoding # continuous features on the left @@ -265,16 +264,22 @@ def from_dummies(self, data, prefix_sep='_'): out.drop(cols, axis=1, inplace=True) return out - def get_decimal_precisions(self): + def get_decimal_precisions(self, output_type="list"): """"Gets the precision of continuous features in the data.""" + precisions_dict = defaultdict(int) precisions = [0]*len(self.continuous_feature_names) for ix, feature_name in enumerate(self.continuous_feature_names): type_prec = self.type_and_precision[feature_name] if type_prec == 'int': - precisions[ix] = 0 + prec = 0 else: - precisions[ix] = self.type_and_precision[feature_name][1] - return precisions + prec = self.type_and_precision[feature_name][1] + precisions[ix] = prec + precisions_dict[feature_name] = prec + if output_type == "list": + return precisions + elif output_type == "dict": + return precisions_dict def get_decoded_data(self, data, encoding='one-hot'): """Gets the original data from encoded data.""" @@ -284,11 +289,11 @@ def get_decoded_data(self, data, encoding='one-hot'): index = [i for i in range(0, len(data))] if encoding == 'one-hot': if isinstance(data, pd.DataFrame): - return self.from_dummies(data) + return data elif isinstance(data, np.ndarray): data = pd.DataFrame(data=data, index=index, columns=self.ohe_encoded_feature_names) - return self.from_dummies(data) + return data else: raise ValueError("data should be a pandas dataframe or a numpy array") @@ -347,7 +352,8 @@ def get_ohe_min_max_normalized_data(self, query_instance): """Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts""" query_instance = self.prepare_query_instance(query_instance) - temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False) + ohe_base_df = self.prepare_df_for_ohe_encoding() + temp = ohe_base_df.append(query_instance, ignore_index=True, sort=False) temp = self.one_hot_encode_data(temp) temp = temp.tail(query_instance.shape[0]).reset_index(drop=True) # returns a pandas dataframe @@ -356,7 +362,7 @@ def get_ohe_min_max_normalized_data(self, query_instance): def get_inverse_ohe_min_max_normalized_data(self, transformed_data): """Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data should be a dataframe or an array""" - raw_data = self.get_decoded_data(transformed_data, encoding='one-hot') + raw_data = self.from_dummies(transformed_data) raw_data = self.de_normalize_data(raw_data) precisions = self.get_decimal_precisions() for ix, feature in enumerate(self.continuous_feature_names): diff --git a/dice_ml/data_interfaces/public_data_interface.py b/dice_ml/data_interfaces/public_data_interface.py index 7d0e3297..b7cc0e42 100644 --- a/dice_ml/data_interfaces/public_data_interface.py +++ b/dice_ml/data_interfaces/public_data_interface.py @@ -54,28 +54,7 @@ def __init__(self, params): self.categorical_feature_names, self.continuous_feature_names) - # should move the below snippet to gradient based dice interfaces - # self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df) - # self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist( - # ) if x not in np.array([self.outcome_name])] - - # should move the below snippet to model agnostic dice interfaces - # # Initializing a label encoder to obtain label-encoded values for categorical variables - # self.labelencoder = {} - # - # self.label_encoded_data = self.data_df.copy() - # - # for column in self.categorical_feature_names: - # self.labelencoder[column] = LabelEncoder() - # self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column]) - self._validate_and_set_permitted_range(params=params) - - # should move the below snippet to model agnostic dice interfaces - # self.max_range = -np.inf - # for feature in self.continuous_feature_names: - # self.max_range = max(self.max_range, self.permitted_range[feature][1]) - self._validate_and_set_data_name(params=params) def _validate_and_set_dataframe(self, params): @@ -122,22 +101,6 @@ def _validate_and_set_continuous_features_precision(self, params): else: self.continuous_features_precision = None - def _validate_and_set_permitted_range(self, params): - """Validate and set the dictionary of permitted ranges for continuous features.""" - input_permitted_range = None - if 'permitted_range' in params: - input_permitted_range = params['permitted_range'] - - if not hasattr(self, 'feature_names'): - raise SystemException('Feature names not correctly set in public data interface') - - for input_permitted_range_feature_name in input_permitted_range: - if input_permitted_range_feature_name not in self.feature_names: - raise UserConfigValidationException( - "permitted_range contains some feature names which are not part of columns in dataframe" - ) - self.permitted_range, _ = self.get_features_range(input_permitted_range) - def _set_feature_dtypes(self, data_df, categorical_feature_names, continuous_feature_names): """Set the correct type of each feature column.""" @@ -157,38 +120,7 @@ def _set_feature_dtypes(self, data_df, categorical_feature_names, np.int32) return data_df - def check_features_to_vary(self, features_to_vary): - if features_to_vary is not None and features_to_vary != 'all': - not_training_features = set(features_to_vary) - set(self.feature_names) - if len(not_training_features) > 0: - raise UserConfigValidationException("Got features {0} which are not present in training data".format( - not_training_features)) - - def check_permitted_range(self, permitted_range): - if permitted_range is not None: - permitted_range_features = list(permitted_range) - not_training_features = set(permitted_range_features) - set(self.feature_names) - if len(not_training_features) > 0: - raise UserConfigValidationException("Got features {0} which are not present in training data".format( - not_training_features)) - - for feature in permitted_range_features: - if feature in self.categorical_feature_names: - train_categories = self.permitted_range[feature] - for test_category in permitted_range[feature]: - if test_category not in train_categories: - raise UserConfigValidationException( - 'The category {0} does not occur in the training data for feature {1}.' - ' Allowed categories are {2}'.format(test_category, feature, train_categories)) - - def check_mad_validity(self, feature_weights): - """checks feature MAD validity and throw warnings. - TODO: add comments as to where this is used if this function is necessary, else remove. - """ - if feature_weights == "inverse_mad": - self.get_valid_mads(display_warnings=True, return_mads=False) - - def get_features_range(self, permitted_range_input=None): + def get_features_range(self, permitted_range_input=None, features_dict=None): ranges = {} # Getting default ranges based on the dataset for feature_name in self.continuous_feature_names: @@ -307,25 +239,6 @@ def get_minx_maxx(self, normalized=True): minx[0][idx] = self.permitted_range[feature_name][0] maxx[0][idx] = self.permitted_range[feature_name][1] return minx, maxx - # if encoding=='one-hot': - # minx = np.array([[0.0] * len(self.ohe_encoded_feature_names)]) - # maxx = np.array([[1.0] * len(self.ohe_encoded_feature_names)]) - - # for idx, feature_name in enumerate(self.continuous_feature_names): - # max_value = self.train_df[feature_name].max() - # min_value = self.train_df[feature_name].min() - - # if normalized: - # minx[0][idx] = (self.permitted_range[feature_name] - # [0] - min_value) / (max_value - min_value) - # maxx[0][idx] = (self.permitted_range[feature_name] - # [1] - min_value) / (max_value - min_value) - # else: - # minx[0][idx] = self.permitted_range[feature_name][0] - # maxx[0][idx] = self.permitted_range[feature_name][1] - # else: - # minx = np.array([[0.0] * len(self.feature_names)]) - # maxx = np.array([[1.0] * len(self.feature_names)]) def get_mads(self, normalized=False): """Computes Median Absolute Deviation of features.""" @@ -370,24 +283,17 @@ def get_quantiles_from_training_data(self, quantile=0.05, normalized=False): list(set(normalized_train_df[feature].tolist())))), quantile) return quantiles - def create_ohe_params(self): + def create_ohe_params(self, one_hot_encoded_data): if len(self.categorical_feature_names) > 0: - one_hot_encoded_data = self.one_hot_encode_data(self.data_df) self.ohe_encoded_feature_names = [x for x in one_hot_encoded_data.columns.tolist( ) if x not in np.array([self.outcome_name])] else: # one-hot-encoded data is same as original data if there is no categorical features. self.ohe_encoded_feature_names = [feat for feat in self.feature_names] - # base dataframe for doing one-hot-encoding - # ohe_encoded_feature_names and ohe_base_df are created (and stored as data class's parameters) - # when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers - self.ohe_base_df = self.prepare_df_for_ohe_encoding() - def get_data_params_for_gradient_dice(self): """Gets all data related params for DiCE.""" - self.create_ohe_params() minx, maxx = self.get_minx_maxx(normalized=True) # get the column indexes of categorical and continuous features after one-hot-encoding @@ -497,11 +403,11 @@ def get_decoded_data(self, data, encoding='one-hot'): index = [i for i in range(0, len(data))] if encoding == 'one-hot': if isinstance(data, pd.DataFrame): - return self.from_dummies(data) + return data elif isinstance(data, np.ndarray): data = pd.DataFrame(data=data, index=index, columns=self.ohe_encoded_feature_names) - return self.from_dummies(data) + return data else: raise ValueError("data should be a pandas dataframe or a numpy array") @@ -560,35 +466,21 @@ def prepare_query_instance(self, query_instance): self.continuous_feature_names) return test - # TODO: create a new method, get_LE_min_max_normalized_data() to get label-encoded and normalized data. Keep this - # method only for converting query_instance to pd.DataFrame - # if encoding == 'label': - # for column in self.categorical_feature_names: - # test[column] = self.labelencoder[column].transform(test[column]) - # return self.normalize_data(test, encoding) - # - # elif encoding == 'one-hot': - # temp = self.prepare_df_for_encoding() - # temp = temp.append(test, ignore_index=True, sort=False) - # temp = self.one_hot_encode_data(temp) - # temp = self.normalize_data(temp) - # - # return temp.tail(test.shape[0]).reset_index(drop=True) - def get_ohe_min_max_normalized_data(self, query_instance): """Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts""" query_instance = self.prepare_query_instance(query_instance) - temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False) + ohe_base_df = self.prepare_df_for_ohe_encoding() + temp = ohe_base_df.append(query_instance, ignore_index=True, sort=False) temp = self.one_hot_encode_data(temp) temp = temp.tail(query_instance.shape[0]).reset_index(drop=True) - # returns a pandas dataframe - return self.normalize_data(temp) + # returns a pandas dataframe with all numeric values + return self.normalize_data(temp).apply(pd.to_numeric) def get_inverse_ohe_min_max_normalized_data(self, transformed_data): """Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data should be a dataframe or an array""" - raw_data = self.get_decoded_data(transformed_data, encoding='one-hot') + raw_data = self.from_dummies(transformed_data) raw_data = self.de_normalize_data(raw_data) precisions = self.get_decimal_precisions() for ix, feature in enumerate(self.continuous_feature_names): diff --git a/dice_ml/dice.py b/dice_ml/dice.py index d1c78172..a3ad9fa6 100644 --- a/dice_ml/dice.py +++ b/dice_ml/dice.py @@ -46,45 +46,51 @@ def decide(model_interface, method): subpackage and import-and-return the class in an elif loop as shown in the below method. """ - if model_interface.backend == BackEndTypes.Sklearn: - if method == SamplingStrategy.Random: - # random sampling of CFs - from dice_ml.explainer_interfaces.dice_random import DiceRandom - return DiceRandom - elif method == SamplingStrategy.Genetic: - from dice_ml.explainer_interfaces.dice_genetic import DiceGenetic - return DiceGenetic - elif method == SamplingStrategy.KdTree: - from dice_ml.explainer_interfaces.dice_KD import DiceKD - return DiceKD - else: - raise UserConfigValidationException("Unsupported sample strategy {0} provided. " - "Please choose one of {1}, {2} or {3}".format( - method, SamplingStrategy.Random, - SamplingStrategy.Genetic, - SamplingStrategy.KdTree - )) - - elif model_interface.backend == BackEndTypes.Tensorflow1: - # pretrained Keras Sequential model with Tensorflow 1.x backend - from dice_ml.explainer_interfaces.dice_tensorflow1 import \ - DiceTensorFlow1 - return DiceTensorFlow1 - - elif model_interface.backend == BackEndTypes.Tensorflow2: - # pretrained Keras Sequential model with Tensorflow 2.x backend - from dice_ml.explainer_interfaces.dice_tensorflow2 import \ - DiceTensorFlow2 - return DiceTensorFlow2 + if method == SamplingStrategy.Random: + # random sampling of CFs + from dice_ml.explainer_interfaces.dice_random import DiceRandom + return DiceRandom + elif method == SamplingStrategy.Genetic: + from dice_ml.explainer_interfaces.dice_genetic import DiceGenetic + return DiceGenetic + elif method == SamplingStrategy.KdTree: + from dice_ml.explainer_interfaces.dice_KD import DiceKD + return DiceKD + elif method == SamplingStrategy.Gradient: + if model_interface.backend == BackEndTypes.Tensorflow1: + # pretrained Keras Sequential model with Tensorflow 1.x backend + from dice_ml.explainer_interfaces.dice_tensorflow1 import \ + DiceTensorFlow1 + return DiceTensorFlow1 - elif model_interface.backend == BackEndTypes.Pytorch: - # PyTorch backend - from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch - return DicePyTorch + elif model_interface.backend == BackEndTypes.Tensorflow2: + # pretrained Keras Sequential model with Tensorflow 2.x backend + from dice_ml.explainer_interfaces.dice_tensorflow2 import \ + DiceTensorFlow2 + return DiceTensorFlow2 - else: + elif model_interface.backend == BackEndTypes.Pytorch: + # PyTorch backend + from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch + return DicePyTorch + else: + raise UserConfigValidationException( + "{0} is only supported for differentiable neural network models. " + "Please choose one of {1}, {2} or {3}".format( + method, SamplingStrategy.Random, + SamplingStrategy.Genetic, + SamplingStrategy.KdTree + )) + elif method is None: # all other backends backend_dice = model_interface.backend['explainer'] module_name, class_name = backend_dice.split('.') module = __import__("dice_ml.explainer_interfaces." + module_name, fromlist=[class_name]) return getattr(module, class_name) + else: + raise UserConfigValidationException("Unsupported sample strategy {0} provided. " + "Please choose one of {1}, {2} or {3}".format( + method, SamplingStrategy.Random, + SamplingStrategy.Genetic, + SamplingStrategy.KdTree + )) diff --git a/dice_ml/diverse_counterfactuals.py b/dice_ml/diverse_counterfactuals.py index 16a7020d..072cfa61 100644 --- a/dice_ml/diverse_counterfactuals.py +++ b/dice_ml/diverse_counterfactuals.py @@ -104,7 +104,7 @@ def _visualize_internal(self, display_sparse_df=True, show_only_changes=False, is_notebook_console=is_notebook_console) elif not hasattr(self.data_interface, 'data_df'): # for private data print('\nDiverse Counterfactual set without sparsity correction since only metadata about each', - ' feature is available (new outcome: ', self.new_outcome) + ' feature is available (new outcome: %i)' % (self.new_outcome)) self._dump_output(content=self.final_cfs_df, show_only_changes=show_only_changes, is_notebook_console=is_notebook_console) else: diff --git a/dice_ml/explainer_interfaces/dice_KD.py b/dice_ml/explainer_interfaces/dice_KD.py index 51bb24c8..deed36f8 100644 --- a/dice_ml/explainer_interfaces/dice_KD.py +++ b/dice_ml/explainer_interfaces/dice_KD.py @@ -25,15 +25,17 @@ def __init__(self, data_interface, model_interface): self.total_random_inits = 0 super().__init__(data_interface) # initiating data related parameters - # As DiCE KD uses one-hot-encoding - self.data_interface.create_ohe_params() - # initializing model variables self.model = model_interface self.model.load_model() # loading pickled trained model if applicable self.model.transformer.feed_data_params(data_interface) self.model.transformer.initialize_transform_func() + # As DiCE KD uses one-hot-encoding + # temp data to create some attributes like encoded feature names + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + self.data_interface.create_ohe_params(temp_ohe_data) + # loading trained model self.model.load_model() diff --git a/dice_ml/explainer_interfaces/dice_genetic.py b/dice_ml/explainer_interfaces/dice_genetic.py index efc15195..dde842c4 100644 --- a/dice_ml/explainer_interfaces/dice_genetic.py +++ b/dice_ml/explainer_interfaces/dice_genetic.py @@ -25,11 +25,6 @@ def __init__(self, data_interface, model_interface): """ super().__init__(data_interface, model_interface) # initiating data related parameters - # number of output nodes of ML model - if self.model.model_type == ModelTypes.Classifier: - self.num_output_nodes = self.model.get_num_output_nodes2( - self.data_interface.data_df[0:1][self.data_interface.feature_names]) - # variables required to generate CFs - see generate_counterfactuals() for more info self.cfs = [] self.features_to_vary = [] @@ -272,12 +267,18 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k query_instance=query_instance_orig) query_instance = self.data_interface.prepare_query_instance( query_instance=query_instance) + # number of output nodes of ML model + self.num_output_nodes = None + if self.model.model_type == ModelTypes.Classifier: + self.num_output_nodes = self.model.get_num_output_nodes2(query_instance) + query_instance = self.label_encode(query_instance) query_instance = np.array(query_instance.values[0]) self.x1 = query_instance # find the predicted value of query_instance test_pred = self.predict_fn(query_instance) + self.test_pred = test_pred desired_class = self.misc_init(stopping_threshold, desired_class, desired_range, test_pred) @@ -307,12 +308,17 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k def predict_fn_scores(self, input_instance): """Returns prediction scores.""" input_instance = self.label_decode(input_instance) - return self.model.get_output(input_instance) + out = self.model.get_output(input_instance) + if self.model.model_type == ModelTypes.Classifier and out.shape[1] == 1: + # DL models return only 1 for binary classification + out = np.hstack((1-out, out)) + return out def predict_fn(self, input_instance): """Returns actual prediction.""" input_instance = self.label_decode(input_instance) - return self.model.get_output(input_instance, model_score=False) + preds = self.model.get_output(input_instance, model_score=False) + return preds def _predict_fn_custom(self, input_instance, desired_class): """Checks that the maximum predicted score lies in the desired class.""" @@ -324,6 +330,9 @@ def _predict_fn_custom(self, input_instance, desired_class): input_instance = self.label_decode(input_instance) output = self.model.get_output(input_instance, model_score=True) + if self.model.model_type == ModelTypes.Classifier and np.array(output).shape[1] == 1: + # DL models return only 1 for binary classification + output = np.hstack((1-output, output)) desired_class = int(desired_class) maxvalues = np.max(output, 1) predicted_values = np.argmax(output, 1) diff --git a/dice_ml/explainer_interfaces/dice_pytorch.py b/dice_ml/explainer_interfaces/dice_pytorch.py index 51f9e7c8..12412fda 100644 --- a/dice_ml/explainer_interfaces/dice_pytorch.py +++ b/dice_ml/explainer_interfaces/dice_pytorch.py @@ -9,7 +9,6 @@ import torch from dice_ml import diverse_counterfactuals as exp -from dice_ml.counterfactual_explanations import CounterfactualExplanations from dice_ml.explainer_interfaces.explainer_base import ExplainerBase @@ -23,20 +22,17 @@ def __init__(self, data_interface, model_interface): """ # initiating data related parameters super().__init__(data_interface) - self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ - self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() - # initializing model related variables self.model = model_interface self.model.load_model() # loading trained model - if self.model.transformer.func is not None: # TODO: this error is probably too big - need to change it. - raise ValueError("Gradient-based DiCE currently " - "(1) accepts the data only in raw categorical and continuous formats, " - "(2) does one-hot-encoding and min-max-normalization internally, " - "(3) expects the ML model the accept the data in this same format. " - "If your problem supports this, please initialize model class again " - "with no custom transformation function.") - # number of output nodes of ML model + self.model.transformer.feed_data_params(data_interface) + self.model.transformer.initialize_transform_func() + # temp data to create some attributes like encoded feature names + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + self.data_interface.create_ohe_params(temp_ohe_data) + self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ + self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() + self.num_output_nodes = self.model.get_num_output_nodes(len(self.data_interface.ohe_encoded_feature_names)).shape[1] # variables required to generate CFs - see generate_counterfactuals() for more info @@ -48,19 +44,22 @@ def __init__(self, data_interface, model_interface): self.hyperparameters = [1, 1, 1] # proximity_weight, diversity_weight, categorical_penalty self.optimizer_weights = [] # optimizer, learning_rate - def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opposite", proximity_weight=0.5, - diversity_weight=1.0, categorical_penalty=0.1, algorithm="DiverseCF", features_to_vary="all", - permitted_range=None, yloss_type="hinge_loss", diversity_loss_type="dpp_style:inverse_dist", - feature_weights="inverse_mad", optimizer="pytorch:adam", learning_rate=0.05, min_iter=500, - max_iter=5000, project_iter=0, loss_diff_thres=1e-5, loss_converge_maxiter=1, verbose=False, - init_near_query_instance=True, tie_random=False, stopping_threshold=0.5, - posthoc_sparsity_param=0.1, posthoc_sparsity_algorithm="linear", limit_steps_ls=10000): + def _generate_counterfactuals(self, query_instance, total_CFs, + desired_class="opposite", desired_range=None, + proximity_weight=0.5, + diversity_weight=1.0, categorical_penalty=0.1, algorithm="DiverseCF", features_to_vary="all", + permitted_range=None, yloss_type="hinge_loss", diversity_loss_type="dpp_style:inverse_dist", + feature_weights="inverse_mad", optimizer="pytorch:adam", learning_rate=0.05, min_iter=500, + max_iter=5000, project_iter=0, loss_diff_thres=1e-5, loss_converge_maxiter=1, verbose=False, + init_near_query_instance=True, tie_random=False, stopping_threshold=0.5, + posthoc_sparsity_param=0.1, posthoc_sparsity_algorithm="linear", limit_steps_ls=10000): """Generates diverse counterfactual explanations. :param query_instance: Test point of interest. A dictionary of feature names and values or a single row dataframe :param total_CFs: Total number of counterfactuals required. :param desired_class: Desired counterfactual class - can take 0 or 1. Default value is "opposite" to the outcome class of query_instance for binary classification. + :param desired_range: Not supported currently. :param proximity_weight: A positive float. Larger this weight, more close the counterfactuals are to the query_instance. :param diversity_weight: A positive float. Larger this weight, more diverse the counterfactuals are. @@ -105,10 +104,6 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp # check permitted range for continuous features if permitted_range is not None: - # if not self.data_interface.check_features_range(permitted_range): - # raise ValueError( - # "permitted range of features should be within their original range") - # else: self.data_interface.permitted_range = permitted_range self.minx, self.maxx = self.data_interface.get_minx_maxx(normalized=True) self.cont_minx = [] @@ -130,7 +125,7 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp project_iter, loss_diff_thres, loss_converge_maxiter, verbose, init_near_query_instance, tie_random, stopping_threshold, posthoc_sparsity_param, posthoc_sparsity_algorithm, limit_steps_ls) - counterfactual_explanations = exp.CounterfactualExamples( + return exp.CounterfactualExamples( data_interface=self.data_interface, final_cfs_df=final_cfs_df, test_instance_df=test_instance_df, @@ -138,21 +133,24 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp posthoc_sparsity_param=posthoc_sparsity_param, desired_class=desired_class) - return CounterfactualExplanations(cf_examples_list=[counterfactual_explanations]) - - def get_model_output(self, input_instance): + def get_model_output(self, input_instance, + transform_data=False, out_tensor=True): """get output probability of ML model""" - return self.model.get_output(input_instance)[(self.num_output_nodes-1):] + return self.model.get_output( + input_instance, + transform_data=transform_data, + out_tensor=out_tensor)[(self.num_output_nodes-1):] def predict_fn(self, input_instance): """prediction function""" if not torch.is_tensor(input_instance): input_instance = torch.tensor(input_instance).float() - return self.get_model_output(input_instance).data.numpy() + return self.get_model_output( + input_instance, transform_data=False, out_tensor=False) def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" - input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance).iloc[0].values + input_instance = self.model.transformer.transform(input_instance).to_numpy()[0] return self.predict_fn(torch.tensor(input_instance).float()) def do_cf_initializations(self, total_CFs, algorithm, features_to_vary): @@ -420,11 +418,7 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin init_near_query_instance, tie_random, stopping_threshold, posthoc_sparsity_param, posthoc_sparsity_algorithm, limit_steps_ls): """Finds counterfactuals by gradient-descent.""" - - # Prepares user defined query_instance for DiCE. - # query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance, encoding='one-hot') - # query_instance = query_instance.iloc[0].values - query_instance = self.data_interface.get_ohe_min_max_normalized_data(query_instance).iloc[0].values + query_instance = self.model.transformer.transform(query_instance).to_numpy()[0] self.x1 = torch.tensor(query_instance) # find the predicted value of query_instance @@ -563,12 +557,15 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin # do inverse transform of CFs to original user-fed format cfs = np.array([self.final_cfs[i][0] for i in range(len(self.final_cfs))]) - final_cfs_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(cfs) + final_cfs_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(cfs)) + # rounding off to 3 decimal places cfs_preds = [np.round(preds.flatten().tolist(), 3) for preds in self.cfs_preds] cfs_preds = [item for sublist in cfs_preds for item in sublist] final_cfs_df[self.data_interface.outcome_name] = np.array(cfs_preds) - test_instance_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(query_instance) + test_instance_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(query_instance)) test_instance_df[self.data_interface.outcome_name] = np.array(np.round(test_pred, 3)) # post-hoc operation on continuous features to enhance sparsity - only for public data diff --git a/dice_ml/explainer_interfaces/dice_random.py b/dice_ml/explainer_interfaces/dice_random.py index d1e801c5..43316333 100644 --- a/dice_ml/explainer_interfaces/dice_random.py +++ b/dice_ml/explainer_interfaces/dice_random.py @@ -24,7 +24,6 @@ def __init__(self, data_interface, model_interface): """ super().__init__(data_interface) # initiating data related parameters - self.data_interface.create_ohe_params() self.model = model_interface self.model.load_model() # loading pickled trained model if applicable self.model.transformer.feed_data_params(data_interface) @@ -77,7 +76,6 @@ class of query_instance for binary classification. # Do predictions once on the query_instance and reuse across to reduce the number # inferences. model_predictions = self.predict_fn(query_instance) - # number of output nodes of ML model self.num_output_nodes = None if self.model.model_type == ModelTypes.Classifier: diff --git a/dice_ml/explainer_interfaces/dice_tensorflow1.py b/dice_ml/explainer_interfaces/dice_tensorflow1.py index 30f69530..e4e882ea 100644 --- a/dice_ml/explainer_interfaces/dice_tensorflow1.py +++ b/dice_ml/explainer_interfaces/dice_tensorflow1.py @@ -25,6 +25,14 @@ def __init__(self, data_interface, model_interface): """ # initiating data related parameters super().__init__(data_interface) + # initializing model related variables + self.model = model_interface + self.model.load_model() # loading trained model + self.model.transformer.feed_data_params(data_interface) + self.model.transformer.initialize_transform_func() + # temp data to create some attributes like encoded feature names + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + self.data_interface.create_ohe_params(temp_ohe_data) self.minx, self.maxx, self.encoded_categorical_feature_indexes, \ self.encoded_continuous_feature_indexes, self.cont_minx, \ self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() @@ -35,18 +43,8 @@ def __init__(self, data_interface, model_interface): else: self.dice_sess = tf.InteractiveSession() - # initializing model related variables - self.model = model_interface - self.model.load_model() # loading trained model self.input_tensor = tf.Variable(self.minx, dtype=tf.float32) # placeholder variables for model predictions self.output_tensor = self.model.get_output(self.input_tensor) - if self.model.transformer.func is not None: # TODO: this error is probably too big - need to change it. - raise ValueError("Gradient-based DiCE currently " - "(1) accepts the data only in raw categorical and continuous formats, " - "(2) does one-hot-encoding and min-max-normalization internally, " - "(3) expects the ML model the accept the data in this same format. " - "If your problem supports this, please initialize model class again " - "with no custom transformation function.") # number of output nodes of ML model self.num_output_nodes = self.dice_sess.run( self.model.get_num_output_nodes(len(self.data_interface.ohe_encoded_feature_names))).shape[1] @@ -127,10 +125,6 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp # check permitted range for continuous features if permitted_range is not None: - # if not self.data_interface.check_features_range(permitted_range): - # raise ValueError( - # "permitted range of features should be within their original range") - # else: self.data_interface.permitted_range = permitted_range self.minx, self.maxx = self.data_interface.get_minx_maxx(normalized=True) self.cont_minx = [] @@ -231,7 +225,7 @@ def predict_fn(self, input_instance): def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" - input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance).values + input_instance = self.model.transformer.transform(input_instance).to_numpy() return self.predict_fn(input_instance) def compute_yloss(self, method): @@ -528,13 +522,10 @@ def find_counterfactuals(self, query_instance, limit_steps_ls, desired_class="op stopping_threshold=0.5, posthoc_sparsity_param=0.1, posthoc_sparsity_algorithm="linear"): """Finds counterfactuals by gradient-descent.""" - # Prepares user defined query_instance for DiCE. - # query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance, encoding='one-hot') - # query_instance = np.array([query_instance.iloc[0].values], dtype=np.float32) - query_instance = self.data_interface.get_ohe_min_max_normalized_data(query_instance).values + query_instance = self.model.transformer.transform(query_instance).to_numpy() # find the predicted value of query_instance - test_pred = self.predict_fn(query_instance)[0][0] + test_pred = self.predict_fn(tf.constant(query_instance, dtype=tf.float32))[0][0] if desired_class == "opposite": desired_class = 1.0 - round(test_pred) self.target_cf_class = np.array([[desired_class]]) @@ -653,12 +644,14 @@ def find_counterfactuals(self, query_instance, limit_steps_ls, desired_class="op # do inverse transform of CFs to original user-fed format cfs = np.array([self.final_cfs[i][0] for i in range(len(self.final_cfs))]) - final_cfs_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(cfs) + final_cfs_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(cfs)) cfs_preds = [np.round(preds.flatten().tolist(), 3) for preds in self.cfs_preds] cfs_preds = [item for sublist in cfs_preds for item in sublist] final_cfs_df[self.data_interface.outcome_name] = np.array(cfs_preds) - test_instance_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(query_instance) + test_instance_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(query_instance)) test_instance_df[self.data_interface.outcome_name] = np.array(np.round(test_pred, 3)) # post-hoc operation on continuous features to enhance sparsity - only for public data diff --git a/dice_ml/explainer_interfaces/dice_tensorflow2.py b/dice_ml/explainer_interfaces/dice_tensorflow2.py index 5a24e6a4..8004a341 100644 --- a/dice_ml/explainer_interfaces/dice_tensorflow2.py +++ b/dice_ml/explainer_interfaces/dice_tensorflow2.py @@ -23,20 +23,20 @@ def __init__(self, data_interface, model_interface): """ # initiating data related parameters super().__init__(data_interface) - self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ - self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() - # initializing model related variables self.model = model_interface self.model.load_model() # loading trained model - # TODO: this error is probably too big - need to change it. - if self.model.transformer.func is not None: - raise ValueError("Gradient-based DiCE currently " - "(1) accepts the data only in raw categorical and continuous formats, " - "(2) does one-hot-encoding and min-max-normalization internally, " - "(3) expects the ML model the accept the data in this same format. " - "If your problem supports this, please initialize model class again " - "with no custom transformation function.") + self.model.transformer.feed_data_params(data_interface) + self.model.transformer.initialize_transform_func() + # temp data to create some attributes like encoded feature names + if hasattr(self.data_interface, "data_df"): + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + else: + temp_ohe_data = None + self.data_interface.create_ohe_params(temp_ohe_data) + self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ + self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() + # number of output nodes of ML model self.num_output_nodes = self.model.get_num_output_nodes(len(self.data_interface.ohe_encoded_feature_names)).shape[1] @@ -153,7 +153,7 @@ def predict_fn(self, input_instance): def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" - input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance).values + input_instance = self.model.transformer.transform(input_instance).to_numpy() return self.predict_fn(tf.constant(input_instance, dtype=tf.float32)) def do_cf_initializations(self, total_CFs, algorithm, features_to_vary): @@ -424,10 +424,7 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin posthoc_sparsity_algorithm, limit_steps_ls): """Finds counterfactuals by gradient-descent.""" - # Prepares user defined query_instance for DiCE. - # query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance, encoding='one-hot') - # query_instance = np.array([query_instance.iloc[0].values]) - query_instance = self.data_interface.get_ohe_min_max_normalized_data(query_instance).values + query_instance = self.model.transformer.transform(query_instance).to_numpy() self.x1 = tf.constant(query_instance, dtype=tf.float32) # find the predicted value of query_instance @@ -539,8 +536,7 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin self.max_iterations_run = iterations self.elapsed = timeit.default_timer() - start_time - - self.cfs_preds = [self.predict_fn(cfs) for cfs in self.final_cfs] + self.cfs_preds = [self.predict_fn(tf.constant(cfs, dtype=tf.float32)) for cfs in self.final_cfs] # update final_cfs from backed up CFs if valid CFs are not found if((self.target_cf_class == 0 and any(i[0] > self.stopping_threshold for i in self.cfs_preds)) or @@ -553,12 +549,14 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin # do inverse transform of CFs to original user-fed format cfs = np.array([self.final_cfs[i][0] for i in range(len(self.final_cfs))]) - final_cfs_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(cfs) + final_cfs_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(cfs)) cfs_preds = [np.round(preds.flatten().tolist(), 3) for preds in self.cfs_preds] cfs_preds = [item for sublist in cfs_preds for item in sublist] final_cfs_df[self.data_interface.outcome_name] = np.array(cfs_preds) - test_instance_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(query_instance) + test_instance_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(query_instance)) test_instance_df[self.data_interface.outcome_name] = np.array(np.round(test_pred, 3)) # post-hoc operation on continuous features to enhance sparsity - only for public data diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index 633ef880..50ae8e3f 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -156,7 +156,6 @@ def generate_counterfactuals(self, query_instances, total_CFs, query_instances_list.append(query_instances[ix:(ix+1)]) elif isinstance(query_instances, Iterable): query_instances_list = query_instances - for query_instance in tqdm(query_instances_list): self.data_interface.set_continuous_feature_indexes(query_instance) res = self._generate_counterfactuals( @@ -171,7 +170,6 @@ def generate_counterfactuals(self, query_instances, total_CFs, verbose=verbose, **kwargs) cf_examples_arr.append(res) - self._check_any_counterfactuals_computed(cf_examples_arr=cf_examples_arr) return CounterfactualExplanations(cf_examples_list=cf_examples_arr) @@ -227,9 +225,6 @@ def setup(self, features_to_vary, permitted_range, query_instance, feature_weigh self.check_query_instance_validity(features_to_vary, permitted_range, query_instance, feature_ranges_orig) - # check feature MAD validity and throw warnings - self.data_interface.check_mad_validity(feature_weights) - return features_to_vary def check_query_instance_validity(self, features_to_vary, permitted_range, query_instance, feature_ranges_orig): @@ -239,7 +234,6 @@ def check_query_instance_validity(self, features_to_vary, permitted_range, query if feature not in self.data_interface.feature_names: raise ValueError("Feature", feature, "not present in training data!") - for feature in self.data_interface.categorical_feature_names: if query_instance[feature].values[0] not in feature_ranges_orig[feature] and \ str(query_instance[feature].values[0]) not in feature_ranges_orig[feature]: @@ -486,7 +480,12 @@ def feature_importance(self, query_instances, cf_examples_list=None, def predict_fn(self, input_instance): """prediction function""" - return self.model.get_output(input_instance) + + preds = self.model.get_output(input_instance) + if self.model.model_type == ModelTypes.Classifier and \ + len(preds.shape) == 1: # from deep learning predictors + preds = np.column_stack([1 - preds, preds]) + return preds def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" @@ -544,8 +543,7 @@ def do_posthoc_sparsity_enhancement(self, final_cfs_sparse, query_instance, post diff, decimal_prec, query_instance, cf_ix, feature, final_cfs_sparse, current_pred) temp_preds = self.predict_fn_for_sparsity(final_cfs_sparse.loc[[cf_ix]][self.data_interface.feature_names]) - cfs_preds_sparse.append(temp_preds) - + cfs_preds_sparse.append(temp_preds[0]) final_cfs_sparse[self.data_interface.outcome_name] = self.get_model_output_from_scores(cfs_preds_sparse) # final_cfs_sparse[self.data_interface.outcome_name] = np.round(final_cfs_sparse[self.data_interface.outcome_name], 3) return final_cfs_sparse @@ -668,11 +666,21 @@ def infer_target_cfs_class(self, desired_class_input, original_pred, num_output_ original_pred_1 = original_pred target_class = int(1 - original_pred_1) return target_class + elif num_output_nodes == 1: # only for pytorch DL model + original_pred_1 = np.round(original_pred) + target_class = int(1-original_pred_1) + return target_class elif num_output_nodes > 2: raise UserConfigValidationException( "Desired class cannot be opposite if the number of classes is more than 2.") elif isinstance(desired_class_input, int): - if desired_class_input >= 0 and desired_class_input < num_output_nodes: + if num_output_nodes == 1: # for DL models + if desired_class_input in (0, 1): + target_class = desired_class_input + return target_class + else: + raise UserConfigValidationException("Only 0, 1 are supported as desired class for binary classification!") + elif desired_class_input >= 0 and desired_class_input < num_output_nodes: target_class = desired_class_input return target_class else: @@ -697,11 +705,15 @@ def decide_cf_validity(self, model_outputs): for i in range(len(model_outputs)): pred = model_outputs[i] if self.model.model_type == ModelTypes.Classifier: - if self.num_output_nodes == 2: # binary - pred_1 = pred[self.num_output_nodes-1] + if self.num_output_nodes in (1, 2): # binary + if self.num_output_nodes == 2: + pred_1 = pred[self.num_output_nodes-1] + else: + pred_1 = pred[0] validity[i] = 1 if \ ((self.target_cf_class == 0 and pred_1 <= self.stopping_threshold) or (self.target_cf_class == 1 and pred_1 >= self.stopping_threshold)) else 0 + else: # multiclass if np.argmax(pred) == self.target_cf_class: validity[i] = 1 @@ -728,14 +740,14 @@ def is_cf_valid(self, model_score): target_cf_class = self.target_cf_class[0][0] target_cf_class = int(target_cf_class) - if self.num_output_nodes == 1: # for tensorflow/pytorch models + if len(model_score) == 1: # for tensorflow/pytorch models pred_1 = model_score[0] validity = True if \ ((target_cf_class == 0 and pred_1 <= self.stopping_threshold) or (target_cf_class == 1 and pred_1 >= self.stopping_threshold)) else False return validity - if self.num_output_nodes == 2: # binary - pred_1 = model_score[self.num_output_nodes-1] + elif len(model_score) == 2: # binary + pred_1 = model_score[1] validity = True if \ ((target_cf_class == 0 and pred_1 <= self.stopping_threshold) or (target_cf_class == 1 and pred_1 >= self.stopping_threshold)) else False @@ -753,7 +765,13 @@ def get_model_output_from_scores(self, model_scores): model_output = np.zeros(len(model_scores), dtype=output_type) for i in range(len(model_scores)): if self.model.model_type == ModelTypes.Classifier: - model_output[i] = np.argmax(model_scores[i]) + if hasattr(model_scores[i], "shape") and len(model_scores[i].shape) > 0: + if model_scores[i].shape[0] > 1: + model_output[i] = np.argmax(model_scores[i]) + else: + model_output[i] = np.round(model_scores[i])[0] + else: # 1-D input + model_output[i] = np.round(model_scores[i]) elif self.model.model_type == ModelTypes.Regressor: model_output[i] = model_scores[i] return model_output @@ -784,7 +802,7 @@ def build_KD_tree(self, data_df_copy, desired_range, desired_class, predicted_ou dataset_instance = self.data_interface.prepare_query_instance( query_instance=data_df_copy[self.data_interface.feature_names]) - predictions = self.model.model.predict(dataset_instance) + predictions = self.model.get_output(dataset_instance, model_score=False).flatten() # TODO: Is it okay to insert a column in the original dataframe with the predicted outcome? This is memory-efficient data_df_copy[predicted_outcome_name] = predictions diff --git a/dice_ml/model_interfaces/keras_tensorflow_model.py b/dice_ml/model_interfaces/keras_tensorflow_model.py index df150850..00074823 100644 --- a/dice_ml/model_interfaces/keras_tensorflow_model.py +++ b/dice_ml/model_interfaces/keras_tensorflow_model.py @@ -19,7 +19,7 @@ def __init__(self, model=None, model_path='', backend='TF1', func=None, kw_args= dictionary of kw_args, by default. """ - super().__init__(model, model_path, backend) + super().__init__(model, model_path, backend, func, kw_args) def load_model(self): if self.model_path != '': @@ -32,9 +32,8 @@ def get_output(self, input_tensor, training=False, transform_data=False): :param training: to determine training mode in TF2. :param transform_data: boolean to indicate if data transformation is required. """ - if transform_data: - input_tensor = tf.constant(self.transformer.transform(input_tensor), dtype=tf.float32) - + if transform_data or not tf.is_tensor(input_tensor): + input_tensor = tf.constant(self.transformer.transform(input_tensor).to_numpy(), dtype=tf.float32) if self.backend == 'TF2': return self.model(input_tensor, training=training) else: diff --git a/dice_ml/model_interfaces/pytorch_model.py b/dice_ml/model_interfaces/pytorch_model.py index 9cf577cc..4c8e2068 100644 --- a/dice_ml/model_interfaces/pytorch_model.py +++ b/dice_ml/model_interfaces/pytorch_model.py @@ -1,47 +1,57 @@ -"""Module containing an interface to trained PyTorch model.""" - -import torch - -from dice_ml.model_interfaces.base_model import BaseModel - - -class PyTorchModel(BaseModel): - - def __init__(self, model=None, model_path='', backend='PYT', func=None, kw_args=None): - """Init method - - :param model: trained PyTorch Model. - :param model_path: path to trained model. - :param backend: "PYT" for PyTorch framework. - :param func: function transformation required for ML model. If func is None, then func will be the identity function. - :param kw_args: Dictionary of additional keyword arguments to pass to func. DiCE's data_interface is appended to the - dictionary of kw_args, by default. - """ - - super().__init__(model, model_path, backend) - - def load_model(self): - if self.model_path != '': - self.model = torch.load(self.model_path) - - def get_output(self, input_tensor, transform_data=False): - """returns prediction probabilities - - :param input_tensor: test input. - :param transform_data: boolean to indicate if data transformation is required. - """ - if transform_data: - input_tensor = torch.tensor(self.transformer.transform(input_tensor)).float() - - return self.model(input_tensor).float() - - def set_eval_mode(self): - self.model.eval() - - def get_gradient(self, input_instance): - # Future Support - raise NotImplementedError("Future Support") - - def get_num_output_nodes(self, inp_size): - temp_input = torch.rand(1, inp_size).float() - return self.get_output(temp_input).data +"""Module containing an interface to trained PyTorch model.""" + +import numpy as np +import torch + +from dice_ml.constants import ModelTypes +from dice_ml.model_interfaces.base_model import BaseModel + + +class PyTorchModel(BaseModel): + + def __init__(self, model=None, model_path='', backend='PYT', func=None, kw_args=None): + """Init method + + :param model: trained PyTorch Model. + :param model_path: path to trained model. + :param backend: "PYT" for PyTorch framework. + :param func: function transformation required for ML model. If func is None, then func will be the identity function. + :param kw_args: Dictionary of additional keyword arguments to pass to func. DiCE's data_interface is appended to the + dictionary of kw_args, by default. + """ + + super().__init__(model, model_path, backend, func, kw_args) + + def load_model(self): + if self.model_path != '': + self.model = torch.load(self.model_path) + + def get_output(self, input_instance, model_score=True, + transform_data=False, out_tensor=False): + """returns prediction probabilities + + :param input_tensor: test input. + :param transform_data: boolean to indicate if data transformation is required. + """ + input_tensor = input_instance + if transform_data: + input_tensor = torch.tensor(self.transformer.transform(input_instance).to_numpy()).float() + if not torch.is_tensor(input_instance): + input_tensor = torch.tensor(self.transformer.transform(input_instance).to_numpy()).float() + out = self.model(input_tensor).float() + if not out_tensor: + out = out.data.numpy() + if model_score is False and self.model_type == ModelTypes.Classifier: + out = np.round(out) # TODO need to generalize for n-class classifier + return out + + def set_eval_mode(self): + self.model.eval() + + def get_gradient(self, input_instance): + # Future Support + raise NotImplementedError("Future Support") + + def get_num_output_nodes(self, inp_size): + temp_input = torch.rand(1, inp_size).float() + return self.get_output(temp_input).data diff --git a/dice_ml/utils/helpers.py b/dice_ml/utils/helpers.py index d37300e2..aa63e611 100644 --- a/dice_ml/utils/helpers.py +++ b/dice_ml/utils/helpers.py @@ -219,7 +219,11 @@ def get_base_gen_cf_initialization(data_interface, encoded_size, cont_minx, cont def ohe_min_max_transformation(data, data_interface): """the data is one-hot-encoded and min-max normalized and fed to the ML model""" - return data_interface.get_ohe_min_max_normalized_data(data).values + return data_interface.get_ohe_min_max_normalized_data(data) + + +def inverse_ohe_min_max_transformation(data, data_interface): + return data_interface.get_inverse_ohe_min_max_normalized_data(data) class DataTransfomer: @@ -239,7 +243,13 @@ def feed_data_params(self, data_interface): def initialize_transform_func(self): if self.func == 'ohe-min-max': - self.data_transformer = FunctionTransformer(func=ohe_min_max_transformation, kw_args=self.kw_args, validate=False) + self.data_transformer = FunctionTransformer( + func=ohe_min_max_transformation, + inverse_func=inverse_ohe_min_max_transformation, + check_inverse=False, + validate=False, + kw_args=self.kw_args, + inv_kw_args=self.kw_args) elif self.func is None: # identity transformation # add more ready-to-use transformers (such as label-encoding) in elif loops. diff --git a/dice_ml/utils/neuralnetworks.py b/dice_ml/utils/neuralnetworks.py new file mode 100644 index 00000000..07c18d27 --- /dev/null +++ b/dice_ml/utils/neuralnetworks.py @@ -0,0 +1,21 @@ +from torch import nn, sigmoid + + +class FFNetwork(nn.Module): + def __init__(self, input_size, is_classifier=True): + super(FFNetwork, self).__init__() + self.is_classifier = is_classifier + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(input_size, 16), + nn.ReLU(), + nn.Linear(16, 1), + ) + + def forward(self, x): + x = self.flatten(x) + out = self.linear_relu_stack(x) + out = sigmoid(out) + if not self.is_classifier: + out = 3 * out # output between 0 and 3 + return out diff --git a/docs/source/notebooks/DiCE_getting_started.ipynb b/docs/source/notebooks/DiCE_getting_started.ipynb index 47ba396d..f4cb675c 100644 --- a/docs/source/notebooks/DiCE_getting_started.ipynb +++ b/docs/source/notebooks/DiCE_getting_started.ipynb @@ -23,20 +23,10 @@ " * Randomized Search\n", " * Genetic Search\n", " * KD Tree Search (for counterfactuals from a given training dataset)\n", - "* **Gradient-Based**: These methods apply to differentiable models, such as those returned by deep learning libraries like tensorflow and pytorch. They are based on an explicit loss minimization based on proximity, diversity and feasibility. The method is described in this [paper](https://arxiv.org/abs/1905.07697)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![DiCE API](images/dice_getting_started_api.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "* **Gradient-Based**: These methods apply to differentiable models, such as those returned by deep learning libraries like tensorflow and pytorch. They are based on an explicit loss minimization based on proximity, diversity and feasibility. The method is described in this [paper](https://arxiv.org/abs/1905.07697).\n", + "\n", + "![DiCE API](images/dice_getting_started_api.png)\n", + "\n", "DiCE requires two inputs: a training dataset and a pre-trained ML model. When the training dataset is unknown (e.g., for privacy reasons), it can also work without access to the full dataset (see this [notebook](DiCE_with_private_data.ipynb) for an example). Below we show a simple example. " ] }, @@ -53,10 +43,6 @@ "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", - "# Tensorflow import\n", - "import tensorflow as tf\n", - "\n", - "\n", "# DiCE imports\n", "import dice_ml\n", "from dice_ml.utils import helpers # helper functions" @@ -391,13 +377,10 @@ "metadata": {}, "outputs": [], "source": [ - "# supress deprecation warnings from TF\n", - "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", - "\n", - "backend = 'TF'+tf.__version__[0] # TF1\n", + "backend = 'TF2' # needs tensorflow installed\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", "# Step 2: dice_ml.Model\n", - "m = dice_ml.Model(model_path=ML_modelpath, backend=backend)" + "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")" ] }, { @@ -416,41 +399,14 @@ "outputs": [], "source": [ "# Step 3: initiate DiCE\n", - "exp = dice_ml.Dice(d, m)" + "exp = dice_ml.Dice(d, m, method=\"gradient\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Below we provide query instance as a dict." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# query instance in the form of a dictionary or a dataframe; keys: feature name, values: feature value\n", - "query_instance = {'age': 22,\n", - " 'workclass': 'Private',\n", - " 'education': 'HS-grad',\n", - " 'marital_status': 'Single',\n", - " 'occupation': 'Service',\n", - " 'race': 'White',\n", - " 'gender': 'Female',\n", - " 'hours_per_week': 45}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# generate counterfactuals\n", - "dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=4, desired_class=\"opposite\")" + "Below we provide query instances from `x_test`." ] }, { @@ -461,6 +417,8 @@ }, "outputs": [], "source": [ + "# generate counterfactuals\n", + "dice_exp = exp.generate_counterfactuals(x_test[1:2], total_CFs=4, desired_class=\"opposite\")\n", "# visualize the result, highlight only the changes\n", "dice_exp.visualize_as_dataframe(show_only_changes=True)" ] @@ -483,7 +441,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Just change the backend variable to 'PYT' to use DiCE with PyTorch. Below, we use a pre-trained ML model in PyTorch which produces high accuracy comparable to other baselines. For convenience, we include the sample trained model with the DiCE package." + "Just change the backend variable to 'PYT' to use DiCE with PyTorch. Below, we use a pre-trained ML model in PyTorch which produces high accuracy comparable to other baselines. For convenience, we include the sample trained model with the DiCE package. Additionally, we need to provide a data transformer function that converts input dataframe into one-hot encoded/numeric format. " ] }, { @@ -492,9 +450,9 @@ "metadata": {}, "outputs": [], "source": [ - "backend = 'PYT'\n", + "backend = 'PYT' # needs pytorch installed\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", - "m = dice_ml.Model(model_path=ML_modelpath, backend=backend)" + "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")" ] }, { @@ -510,24 +468,28 @@ "metadata": {}, "outputs": [], "source": [ - "exp = dice_ml.Dice(d, m)" + "exp = dice_ml.Dice(d, m, method=\"gradient\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "# query instance in the form of a dictionary; keys: feature name, values: feature value\n", - "query_instance = {'age': 22,\n", - " 'workclass': 'Private',\n", - " 'education': 'HS-grad',\n", - " 'marital_status': 'Single',\n", - " 'occupation': 'Service',\n", - " 'race': 'White',\n", - " 'gender': 'Female',\n", - " 'hours_per_week': 45}" + "# generate counterfactuals\n", + "dice_exp = exp.generate_counterfactuals(x_test[1:3], total_CFs=4, desired_class=\"opposite\")\n", + "# highlight only the changes\n", + "dice_exp.visualize_as_dataframe(show_only_changes=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also use method-agnostic explainers like \"random\" or \"genetic\". " ] }, { @@ -536,16 +498,20 @@ "metadata": {}, "outputs": [], "source": [ - "# generate counterfactuals\n", - "dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=4, desired_class=\"opposite\")" + "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")\n", + "exp = dice_ml.Dice(d, m, method=\"random\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ + "# generate counterfactuals\n", + "dice_exp = exp.generate_counterfactuals(x_test[1:3], total_CFs=4, desired_class=\"opposite\")\n", "# highlight only the changes\n", "dice_exp.visualize_as_dataframe(show_only_changes=True)" ] @@ -569,7 +535,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -583,7 +549,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.8.12" }, "toc": { "base_numbering": 1, diff --git a/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb b/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb index 0f9da6c6..2074a05d 100644 --- a/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb +++ b/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb @@ -302,7 +302,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -316,7 +316,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/docs/source/notebooks/DiCE_with_private_data.ipynb b/docs/source/notebooks/DiCE_with_private_data.ipynb index ebde6c15..be7a3b91 100644 --- a/docs/source/notebooks/DiCE_with_private_data.ipynb +++ b/docs/source/notebooks/DiCE_with_private_data.ipynb @@ -15,6 +15,7 @@ "outputs": [], "source": [ "# import DiCE\n", + "import pandas as pd\n", "import dice_ml\n", "from dice_ml.utils import helpers # helper functions\n", "\n", @@ -53,15 +54,15 @@ "metadata": {}, "outputs": [], "source": [ - "d = dice_ml.Data(features={\n", - " 'age': [17, 90],\n", - " 'workclass': ['Government', 'Other/Unknown', 'Private', 'Self-Employed'],\n", - " 'education': ['Assoc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Prof-school', 'School', 'Some-college'],\n", - " 'marital_status': ['Divorced', 'Married', 'Separated', 'Single', 'Widowed'],\n", - " 'occupation': ['Blue-Collar', 'Other/Unknown', 'Professional', 'Sales', 'Service', 'White-Collar'],\n", - " 'race': ['Other', 'White'],\n", - " 'gender': ['Female', 'Male'],\n", - " 'hours_per_week': [1, 99]},\n", + "d = dice_ml.Data(features={'age': [17, 90],\n", + " 'workclass': ['Government', 'Other/Unknown', 'Private', 'Self-Employed'],\n", + " 'education': ['Assoc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters',\n", + " 'Prof-school', 'School', 'Some-college'],\n", + " 'marital_status': ['Divorced', 'Married', 'Separated', 'Single', 'Widowed'],\n", + " 'occupation': ['Blue-Collar', 'Other/Unknown', 'Professional', 'Sales', 'Service', 'White-Collar'],\n", + " 'race': ['Other', 'White'],\n", + " 'gender': ['Female', 'Male'],\n", + " 'hours_per_week': [1, 99]},\n", " outcome_name='income')" ] }, @@ -94,7 +95,7 @@ "source": [ "backend = 'TF'+tf.__version__[0] # TF1\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", - "m = dice_ml.Model(model_path=ML_modelpath, backend=backend)" + "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")" ] }, { @@ -113,7 +114,7 @@ "outputs": [], "source": [ "# initiate DiCE\n", - "exp = dice_ml.Dice(d, m)" + "exp = dice_ml.Dice(d, m, method=\"gradient\")" ] }, { @@ -123,14 +124,14 @@ "outputs": [], "source": [ "# query instance in the form of a dictionary; keys: feature name, values: feature value\n", - "query_instance = {'age': 22,\n", - " 'workclass': 'Private',\n", - " 'education': 'HS-grad',\n", - " 'marital_status': 'Single',\n", - " 'occupation': 'Service',\n", - " 'race': 'White',\n", - " 'gender': 'Female',\n", - " 'hours_per_week': 45}" + "query_instance = pd.DataFrame({'age': 22,\n", + " 'workclass': 'Private',\n", + " 'education': 'HS-grad',\n", + " 'marital_status': 'Single',\n", + " 'occupation': 'Service',\n", + " 'race': 'White',\n", + " 'gender': 'Female',\n", + " 'hours_per_week': 45}, index=[0])" ] }, { @@ -165,7 +166,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -179,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.8.12" }, "toc": { "base_numbering": 1, diff --git a/requirements-test.txt b/requirements-test.txt index c44d78f9..35273e31 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,6 +5,6 @@ pytest pytest-cov twine pytest-mock - +torch # Pin scikit-learn scikit-learn<1.1.2 diff --git a/tests/conftest.py b/tests/conftest.py index 1050b0fe..8d5dd1fe 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -227,8 +227,11 @@ def sample_adultincome_query(): """ Returns a sample query instance for adult income dataset """ - return {'age': 22, 'workclass': 'Private', 'education': 'HS-grad', 'marital_status': 'Single', 'occupation': 'Service', - 'race': 'White', 'gender': 'Female', 'hours_per_week': 45} + return pd.DataFrame({ + 'age': 22, 'workclass': 'Private', 'education': 'HS-grad', + 'marital_status': 'Single', 'occupation': 'Service', + 'race': 'White', 'gender': 'Female', 'hours_per_week': 45}, + index=[0]) @pytest.fixture() diff --git a/tests/test_counterfactual_explanations.py b/tests/test_counterfactual_explanations.py index 4843fc14..187bbf7e 100644 --- a/tests/test_counterfactual_explanations.py +++ b/tests/test_counterfactual_explanations.py @@ -112,9 +112,9 @@ def test_sorted_local_importance_counterfactual_explanations(self): @pytest.fixture() def random_binary_classification_exp_object(): backend = 'sklearn' - dataset = helpers.load_custom_testing_dataset() + dataset = helpers.load_custom_testing_dataset_binary() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline() + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='random') return exp diff --git a/tests/test_data.py b/tests/test_data.py index 04cf17d6..d562a5aa 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -71,7 +71,6 @@ def test_ohe_min_max_transformed_query_instance(self, sample_adultincome_query): output_query = [0.068, 0.449, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0] d = self.d[0] - d.create_ohe_params() prepared_query = d.get_ohe_min_max_normalized_data(query_instance=sample_adultincome_query).iloc[0].tolist() assert output_query == pytest.approx(prepared_query, abs=1e-3) @@ -88,7 +87,8 @@ def test_encoded_categorical_features(self): # d.continuous_feature_names = ['cat2_cont1', 'cont2'] # d.encoded_feature_names = ['cat2_cont1', 'cont2', 'cat1_val1', 'cat1_val2', 'cat2_val1', 'cat2_val2'] print(d.data_df) - d.create_ohe_params() + temp_ohe_data = d.get_ohe_min_max_normalized_data(d.data_df.iloc[[0]]) + d.create_ohe_params(temp_ohe_data) res.append(d.get_encoded_categorical_feature_indexes()) assert [2, 3, 4, 5] == res[0][0] # there are 4 types of workclass assert len(res[0][1]) == 8 # eight types of education @@ -104,7 +104,9 @@ def test_features_to_vary(self): """ res = [] d = self.d[0] - d.create_ohe_params() + temp_ohe_data = d.get_ohe_min_max_normalized_data(d.data_df.iloc[[0]]) + d.create_ohe_params(temp_ohe_data) + # d.create_ohe_params() # d.categorical_feature_names = ['cat1', 'cat2'] # d.encoded_feature_names = ['cat2_cont1', 'cont2', 'cat1_val1', 'cat1_val2', 'cat2_val1', 'cat2_val2'] # d.continuous_feature_names = ['cat2_cont1', 'cont2'] diff --git a/tests/test_data_interface/test_public_data_interface.py b/tests/test_data_interface/test_public_data_interface.py index 4ced33b9..b713e0f6 100644 --- a/tests/test_data_interface/test_public_data_interface.py +++ b/tests/test_data_interface/test_public_data_interface.py @@ -24,7 +24,8 @@ def _get_data_object(self, data_object): self.d = data_object def test_permitted_range(self): - self.d.create_ohe_params() + temp_ohe_data = self.d.get_ohe_min_max_normalized_data(self.d.data_df.iloc[[0]]) + self.d.create_ohe_params(temp_ohe_data) minx, maxx = self.d.get_minx_maxx(normalized=False) assert [minx[0][0], maxx[0][0]] == [45, 60] minx, maxx = self.d.get_minx_maxx(normalized=True) diff --git a/tests/test_dice.py b/tests/test_dice.py index 7b49ea92..40e37630 100644 --- a/tests/test_dice.py +++ b/tests/test_dice.py @@ -23,14 +23,14 @@ def _get_exp(self, backend, method="random", is_public_data_interface=True): 'hours_per_week': [1, 99]}, outcome_name='income') ML_modelpath = helpers.get_adult_income_modelpath(backend=backend) - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func="ohe-min-max") exp = dice_ml.Dice(d, m, method=method) return exp def test_tf(self): tf = pytest.importorskip("tensorflow") backend = 'TF'+tf.__version__[0] - exp = self._get_exp(backend) + exp = self._get_exp(backend, method="gradient") assert issubclass(type(exp), dice_ml.explainer_interfaces.explainer_base.ExplainerBase) assert isinstance(exp, dice_ml.explainer_interfaces.dice_tensorflow2.DiceTensorFlow2) or \ isinstance(exp, dice_ml.explainer_interfaces.dice_tensorflow1.DiceTensorFlow1) @@ -38,7 +38,7 @@ def test_tf(self): def test_pyt(self): pytest.importorskip("torch") backend = 'PYT' - exp = self._get_exp(backend) + exp = self._get_exp(backend, method="gradient") assert issubclass(type(exp), dice_ml.explainer_interfaces.explainer_base.ExplainerBase) assert isinstance(exp, dice_ml.explainer_interfaces.dice_pytorch.DicePyTorch) diff --git a/tests/test_dice_interface/test_dice_genetic.py b/tests/test_dice_interface/test_dice_genetic.py index e2c1aaec..02a8f0cf 100644 --- a/tests/test_dice_interface/test_dice_genetic.py +++ b/tests/test_dice_interface/test_dice_genetic.py @@ -3,22 +3,29 @@ import dice_ml from dice_ml.utils import helpers from dice_ml.utils.exception import UserConfigValidationException +from dice_ml.utils.neuralnetworks import FFNetwork +BACKENDS = ['sklearn', 'PYT'] -@pytest.fixture() -def genetic_binary_classification_exp_object(): - backend = 'sklearn' + +@pytest.fixture(scope="module", params=['sklearn']) +def genetic_binary_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_binary() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + if backend == "PYT": + net = FFNetwork(4) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max") + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='genetic') return exp -@pytest.fixture() -def genetic_multi_classification_exp_object(): - backend = 'sklearn' +@pytest.fixture(scope="module", params=['sklearn']) +def genetic_multi_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_multiclass() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_multiclass() @@ -27,13 +34,17 @@ def genetic_multi_classification_exp_object(): return exp -@pytest.fixture() -def genetic_regression_exp_object(): - backend = 'sklearn' +@pytest.fixture(scope="module", params=BACKENDS) +def genetic_regression_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_regression() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') + if backend == "PYT": + net = FFNetwork(4, is_classifier=False) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max", model_type='regressor') + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') exp = dice_ml.Dice(d, m, method='genetic') return exp @@ -136,6 +147,7 @@ def test_predict_custom(self, desired_class, sample_custom_query_2, mocker): self.exp.yloss_type = 'hinge_loss' mocker.patch('dice_ml.explainer_interfaces.dice_genetic.DiceGenetic.label_decode', return_value=None) mocker.patch('dice_ml.model_interfaces.base_model.BaseModel.get_output', return_value=[[0, 0.5, 0.5]]) + mocker.patch('dice_ml.model_interfaces.pytorch_model.PyTorchModel.get_output', return_value=[[0, 0.5, 0.5]]) custom_preds = self.exp._predict_fn_custom(sample_custom_query_2, desired_class) assert custom_preds[0] == desired_class diff --git a/tests/test_dice_interface/test_dice_pytorch.py b/tests/test_dice_interface/test_dice_pytorch.py index 745d53e0..60d46b48 100644 --- a/tests/test_dice_interface/test_dice_pytorch.py +++ b/tests/test_dice_interface/test_dice_pytorch.py @@ -14,8 +14,8 @@ def pyt_exp_object(): dataset = helpers.load_adult_income_dataset() d = dice_ml.Data(dataframe=dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income') ML_modelpath = helpers.get_adult_income_modelpath(backend=backend) - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) - exp = dice_ml.Dice(d, m) + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func="ohe-min-max") + exp = dice_ml.Dice(d, m, method="gradient") return exp diff --git a/tests/test_dice_interface/test_dice_random.py b/tests/test_dice_interface/test_dice_random.py index 57507040..b773ec35 100644 --- a/tests/test_dice_interface/test_dice_random.py +++ b/tests/test_dice_interface/test_dice_random.py @@ -1,41 +1,59 @@ import pytest +import torch import dice_ml from dice_ml.counterfactual_explanations import CounterfactualExplanations from dice_ml.diverse_counterfactuals import CounterfactualExamples from dice_ml.utils import helpers from dice_ml.utils.exception import UserConfigValidationException +from dice_ml.utils.neuralnetworks import FFNetwork +BACKENDS = ['sklearn', 'PYT'] -@pytest.fixture() -def random_binary_classification_exp_object(): - backend = 'sklearn' + +@pytest.fixture(scope="module", params=BACKENDS) +def random_binary_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_binary() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + if backend == "PYT": + torch.manual_seed(1) + net = FFNetwork(4) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max") + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='random') return exp -@pytest.fixture() -def random_multi_classification_exp_object(): - backend = 'sklearn' +# TODO multiclass is not currently supported for neural networks +@pytest.fixture(scope="module", params=['sklearn']) +def random_multi_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_multiclass() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_multiclass() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + if backend == "PYT": + net = FFNetwork(4) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max") + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_multiclass() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='random') return exp -@pytest.fixture() -def random_regression_exp_object(): - backend = 'sklearn' +@pytest.fixture(scope="module", params=BACKENDS) +def random_regression_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_regression() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') + if backend == "PYT": + net = FFNetwork(4, is_classifier=False) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max", model_type='regressor') + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') exp = dice_ml.Dice(d, m, method='random') return exp @@ -46,7 +64,9 @@ def _initiate_exp_object(self, random_binary_classification_exp_object): self.exp = random_binary_classification_exp_object # explainer object @pytest.mark.parametrize(("desired_class", "total_CFs"), [(0, 1)]) - def test_random_counterfactual_explanations_output(self, desired_class, sample_custom_query_1, total_CFs): + def test_random_counterfactual_explanations_output( + self, + desired_class, sample_custom_query_1, total_CFs): counterfactual_explanations = self.exp.generate_counterfactuals( query_instances=sample_custom_query_1, desired_class=desired_class, total_CFs=total_CFs) diff --git a/tests/test_dice_interface/test_dice_tensorflow.py b/tests/test_dice_interface/test_dice_tensorflow.py index 21969349..26329304 100644 --- a/tests/test_dice_interface/test_dice_tensorflow.py +++ b/tests/test_dice_interface/test_dice_tensorflow.py @@ -14,8 +14,8 @@ def tf_exp_object(): dataset = helpers.load_adult_income_dataset() d = dice_ml.Data(dataframe=dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income') ML_modelpath = helpers.get_adult_income_modelpath(backend=backend) - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) - exp = dice_ml.Dice(d, m) + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func="ohe-min-max") + exp = dice_ml.Dice(d, m, method="gradient") return exp diff --git a/tests/test_model_interface/test_keras_tensorflow_model.py b/tests/test_model_interface/test_keras_tensorflow_model.py index ec2e530b..840a22b1 100644 --- a/tests/test_model_interface/test_keras_tensorflow_model.py +++ b/tests/test_model_interface/test_keras_tensorflow_model.py @@ -62,9 +62,8 @@ def test_load_model(self): @pytest.mark.parametrize("prediction", [0.747]) def test_model_output(self, sample_adultincome_query, public_data_object, prediction): # Initializing data and model objects - public_data_object.create_ohe_params() self.m.load_model() - # initializing data transormation required for ML model + # initializing data transformation required for ML model self.m.transformer = DataTransfomer(func='ohe-min-max', kw_args=None) self.m.transformer.feed_data_params(public_data_object) self.m.transformer.initialize_transform_func() diff --git a/tests/test_model_interface/test_pytorch_model.py b/tests/test_model_interface/test_pytorch_model.py index e7ee4ac3..beecf4ba 100644 --- a/tests/test_model_interface/test_pytorch_model.py +++ b/tests/test_model_interface/test_pytorch_model.py @@ -50,13 +50,12 @@ def test_load_model(self): @pytest.mark.parametrize("prediction", [0.0957]) def test_model_output(self, sample_adultincome_query, public_data_object, prediction): # initializing data transormation required for ML model - public_data_object.create_ohe_params() self.m.load_model() self.m.transformer = DataTransfomer(func='ohe-min-max', kw_args=None) self.m.transformer.feed_data_params(public_data_object) self.m.transformer.initialize_transform_func() output_instance = self.m.get_output(sample_adultincome_query, transform_data=True) - predictval = output_instance.detach().numpy()[0][0] + predictval = output_instance[0][0] assert predictval is not None # TODO: The assert below fails. # assert pytest.approx(predictval, abs=1e-3) == prediction