From 0b05465ee3d19efe9e9159f48cf1d9ff85596660 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sun, 16 Jan 2022 10:19:33 +0530 Subject: [PATCH 01/15] refactoring pytorch model to work with any other method --- .../data_interfaces/public_data_interface.py | 4 +- dice_ml/dice.py | 75 +++++++------ dice_ml/explainer_interfaces/dice_genetic.py | 9 +- dice_ml/explainer_interfaces/dice_pytorch.py | 18 ++-- .../explainer_interfaces/explainer_base.py | 14 ++- dice_ml/model_interfaces/base_model.py | 1 + dice_ml/model_interfaces/pytorch_model.py | 102 ++++++++++-------- .../notebooks/DiCE_getting_started.ipynb | 39 +++---- 8 files changed, 147 insertions(+), 115 deletions(-) diff --git a/dice_ml/data_interfaces/public_data_interface.py b/dice_ml/data_interfaces/public_data_interface.py index ab3e5ed1..8cf4a9db 100644 --- a/dice_ml/data_interfaces/public_data_interface.py +++ b/dice_ml/data_interfaces/public_data_interface.py @@ -570,8 +570,8 @@ def get_ohe_min_max_normalized_data(self, query_instance): temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False) temp = self.one_hot_encode_data(temp) temp = temp.tail(query_instance.shape[0]).reset_index(drop=True) - # returns a pandas dataframe - return self.normalize_data(temp) + # returns a pandas dataframe with all numeric values + return self.normalize_data(temp).apply(pd.to_numeric) def get_inverse_ohe_min_max_normalized_data(self, transformed_data): """Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data diff --git a/dice_ml/dice.py b/dice_ml/dice.py index d1c78172..2124c3af 100644 --- a/dice_ml/dice.py +++ b/dice_ml/dice.py @@ -46,45 +46,44 @@ def decide(model_interface, method): subpackage and import-and-return the class in an elif loop as shown in the below method. 
""" - if model_interface.backend == BackEndTypes.Sklearn: - if method == SamplingStrategy.Random: - # random sampling of CFs - from dice_ml.explainer_interfaces.dice_random import DiceRandom - return DiceRandom - elif method == SamplingStrategy.Genetic: - from dice_ml.explainer_interfaces.dice_genetic import DiceGenetic - return DiceGenetic - elif method == SamplingStrategy.KdTree: - from dice_ml.explainer_interfaces.dice_KD import DiceKD - return DiceKD - else: - raise UserConfigValidationException("Unsupported sample strategy {0} provided. " - "Please choose one of {1}, {2} or {3}".format( - method, SamplingStrategy.Random, - SamplingStrategy.Genetic, - SamplingStrategy.KdTree - )) - - elif model_interface.backend == BackEndTypes.Tensorflow1: - # pretrained Keras Sequential model with Tensorflow 1.x backend - from dice_ml.explainer_interfaces.dice_tensorflow1 import \ - DiceTensorFlow1 - return DiceTensorFlow1 + if method == SamplingStrategy.Random: + # random sampling of CFs + from dice_ml.explainer_interfaces.dice_random import DiceRandom + return DiceRandom + elif method == SamplingStrategy.Genetic: + from dice_ml.explainer_interfaces.dice_genetic import DiceGenetic + return DiceGenetic + elif method == SamplingStrategy.KdTree: + from dice_ml.explainer_interfaces.dice_KD import DiceKD + return DiceKD + elif method is None: + if model_interface.backend == BackEndTypes.Tensorflow1: + # pretrained Keras Sequential model with Tensorflow 1.x backend + from dice_ml.explainer_interfaces.dice_tensorflow1 import \ + DiceTensorFlow1 + return DiceTensorFlow1 - elif model_interface.backend == BackEndTypes.Tensorflow2: - # pretrained Keras Sequential model with Tensorflow 2.x backend - from dice_ml.explainer_interfaces.dice_tensorflow2 import \ - DiceTensorFlow2 - return DiceTensorFlow2 + elif model_interface.backend == BackEndTypes.Tensorflow2: + # pretrained Keras Sequential model with Tensorflow 2.x backend + from dice_ml.explainer_interfaces.dice_tensorflow2 import \ + 
DiceTensorFlow2 + return DiceTensorFlow2 - elif model_interface.backend == BackEndTypes.Pytorch: - # PyTorch backend - from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch - return DicePyTorch + elif model_interface.backend == BackEndTypes.Pytorch: + # PyTorch backend + from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch + return DicePyTorch + else: + # all other backends + backend_dice = model_interface.backend['explainer'] + module_name, class_name = backend_dice.split('.') + module = __import__("dice_ml.explainer_interfaces." + module_name, fromlist=[class_name]) + return getattr(module, class_name) else: - # all other backends - backend_dice = model_interface.backend['explainer'] - module_name, class_name = backend_dice.split('.') - module = __import__("dice_ml.explainer_interfaces." + module_name, fromlist=[class_name]) - return getattr(module, class_name) + raise UserConfigValidationException("Unsupported sample strategy {0} provided. " + "Please choose one of {1}, {2} or {3}".format( + method, SamplingStrategy.Random, + SamplingStrategy.Genetic, + SamplingStrategy.KdTree + )) diff --git a/dice_ml/explainer_interfaces/dice_genetic.py b/dice_ml/explainer_interfaces/dice_genetic.py index ed35ee49..9342a0a0 100644 --- a/dice_ml/explainer_interfaces/dice_genetic.py +++ b/dice_ml/explainer_interfaces/dice_genetic.py @@ -304,7 +304,11 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k def predict_fn_scores(self, input_instance): """Returns prediction scores.""" input_instance = self.label_decode(input_instance) - return self.model.get_output(input_instance) + out = self.model.get_output(input_instance) + if out.shape[1] == 1 and self.model.model_type == ModelTypes.Classifier: + # DL models return only 1 for binary classification + out = np.hstack((1-out, out)) + return out def predict_fn(self, input_instance): """Returns actual prediction.""" @@ -321,6 +325,9 @@ def _predict_fn_custom(self, input_instance, 
desired_class): input_instance = self.label_decode(input_instance) output = self.model.get_output(input_instance, model_score=True) + if output.shape[1] == 1 and self.model.model_type == ModelTypes.Classifier: + # DL models return only 1 for binary classification + output = np.hstack((1-output, output)) desired_class = int(desired_class) maxvalues = np.max(output, 1) predicted_values = np.argmax(output, 1) diff --git a/dice_ml/explainer_interfaces/dice_pytorch.py b/dice_ml/explainer_interfaces/dice_pytorch.py index 09d257cc..270ac2ad 100644 --- a/dice_ml/explainer_interfaces/dice_pytorch.py +++ b/dice_ml/explainer_interfaces/dice_pytorch.py @@ -48,7 +48,9 @@ def __init__(self, data_interface, model_interface): self.hyperparameters = [1, 1, 1] # proximity_weight, diversity_weight, categorical_penalty self.optimizer_weights = [] # optimizer, learning_rate - def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opposite", proximity_weight=0.5, + def _generate_counterfactuals(self, query_instance, total_CFs, + desired_class="opposite", desired_range=None, + proximity_weight=0.5, diversity_weight=1.0, categorical_penalty=0.1, algorithm="DiverseCF", features_to_vary="all", permitted_range=None, yloss_type="hinge_loss", diversity_loss_type="dpp_style:inverse_dist", feature_weights="inverse_mad", optimizer="pytorch:adam", learning_rate=0.05, min_iter=500, @@ -61,6 +63,7 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp :param total_CFs: Total number of counterfactuals required. :param desired_class: Desired counterfactual class - can take 0 or 1. Default value is "opposite" to the outcome class of query_instance for binary classification. + :param desired_range: Not supported currently. :param proximity_weight: A positive float. Larger this weight, more close the counterfactuals are to the query_instance. :param diversity_weight: A positive float. Larger this weight, more diverse the counterfactuals are. 
@@ -129,7 +132,7 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp project_iter, loss_diff_thres, loss_converge_maxiter, verbose, init_near_query_instance, tie_random, stopping_threshold, posthoc_sparsity_param, posthoc_sparsity_algorithm) - counterfactual_explanations = exp.CounterfactualExamples( + return exp.CounterfactualExamples( data_interface=self.data_interface, final_cfs_df=final_cfs_df, test_instance_df=test_instance_df, @@ -137,17 +140,15 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp posthoc_sparsity_param=posthoc_sparsity_param, desired_class=desired_class) - return CounterfactualExplanations(cf_examples_list=[counterfactual_explanations]) - - def get_model_output(self, input_instance): + def get_model_output(self, input_instance, out_tensor=True): """get output probability of ML model""" - return self.model.get_output(input_instance)[(self.num_output_nodes-1):] + return self.model.get_output(input_instance, out_tensor=out_tensor)[(self.num_output_nodes-1):] def predict_fn(self, input_instance): """prediction function""" if not torch.is_tensor(input_instance): input_instance = torch.tensor(input_instance).float() - return self.get_model_output(input_instance).data.numpy() + return self.get_model_output(input_instance, out_tensor=False)#.data.numpy() def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" @@ -563,9 +564,11 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin # do inverse transform of CFs to original user-fed format cfs = np.array([self.final_cfs[i][0] for i in range(len(self.final_cfs))]) final_cfs_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(cfs) + # rounding off to 3 decimal places cfs_preds = [np.round(preds.flatten().tolist(), 3) for preds in self.cfs_preds] cfs_preds = [item for sublist in cfs_preds for item in sublist] final_cfs_df[self.data_interface.outcome_name] = 
np.array(cfs_preds) + print(final_cfs_df) test_instance_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(query_instance) test_instance_df[self.data_interface.outcome_name] = np.array(np.round(test_pred, 3)) @@ -606,5 +609,6 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin if final_cfs_df_sparse is not None: final_cfs_df_sparse = final_cfs_df_sparse.iloc[valid_ix].reset_index(drop=True) + print(final_cfs_df_sparse) # returning only valid CFs return final_cfs_df.iloc[valid_ix].reset_index(drop=True), test_instance_df, final_cfs_df_sparse diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index 5f25d1cc..8d9ffb56 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -388,6 +388,9 @@ def feature_importance(self, query_instances, cf_examples_list=None, def predict_fn(self, input_instance): """prediction function""" + + #input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance) + #input_instance = input_instance.astype('float64') return self.model.get_output(input_instance) def predict_fn_for_sparsity(self, input_instance): @@ -556,6 +559,10 @@ def infer_target_cfs_class(self, desired_class_input, original_pred, num_output_ original_pred_1 = np.argmax(original_pred) target_class = int(1 - original_pred_1) return target_class + elif num_output_nodes == 1: # only for pytorch DL model + original_pred_1 = np.round(original_pred) + target_class = int(1-original_pred_1) + return target_class elif num_output_nodes > 2: raise UserConfigValidationException( "Desired class cannot be opposite if the number of classes is more than 2.") @@ -641,7 +648,10 @@ def get_model_output_from_scores(self, model_scores): model_output = np.zeros(len(model_scores), dtype=output_type) for i in range(len(model_scores)): if self.model.model_type == ModelTypes.Classifier: - model_output[i] = 
np.argmax(model_scores[i]) + if model_scores[i].shape[0] > 1: + model_output[i] = np.argmax(model_scores[i]) + else: + model_output[i] = np.round(model_scores[i])[0] elif self.model.model_type == ModelTypes.Regressor: model_output[i] = model_scores[i] return model_output @@ -672,7 +682,7 @@ def build_KD_tree(self, data_df_copy, desired_range, desired_class, predicted_ou dataset_instance = self.data_interface.prepare_query_instance( query_instance=data_df_copy[self.data_interface.feature_names]) - predictions = self.model.model.predict(dataset_instance) + predictions = self.model.get_output(dataset_instance, model_score=False).flatten() # TODO: Is it okay to insert a column in the original dataframe with the predicted outcome? This is memory-efficient data_df_copy[predicted_outcome_name] = predictions diff --git a/dice_ml/model_interfaces/base_model.py b/dice_ml/model_interfaces/base_model.py index 3a25b5cf..5cfe2b2d 100644 --- a/dice_ml/model_interfaces/base_model.py +++ b/dice_ml/model_interfaces/base_model.py @@ -35,6 +35,7 @@ def __init__(self, model=None, model_path='', backend='', func=None, kw_args=Non self.backend = backend # calls FunctionTransformer of scikit-learn internally # (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html) + print(func) self.transformer = DataTransfomer(func, kw_args) def load_model(self): diff --git a/dice_ml/model_interfaces/pytorch_model.py b/dice_ml/model_interfaces/pytorch_model.py index 9cf577cc..3596570a 100644 --- a/dice_ml/model_interfaces/pytorch_model.py +++ b/dice_ml/model_interfaces/pytorch_model.py @@ -1,47 +1,55 @@ -"""Module containing an interface to trained PyTorch model.""" - -import torch - -from dice_ml.model_interfaces.base_model import BaseModel - - -class PyTorchModel(BaseModel): - - def __init__(self, model=None, model_path='', backend='PYT', func=None, kw_args=None): - """Init method - - :param model: trained PyTorch Model. 
- :param model_path: path to trained model. - :param backend: "PYT" for PyTorch framework. - :param func: function transformation required for ML model. If func is None, then func will be the identity function. - :param kw_args: Dictionary of additional keyword arguments to pass to func. DiCE's data_interface is appended to the - dictionary of kw_args, by default. - """ - - super().__init__(model, model_path, backend) - - def load_model(self): - if self.model_path != '': - self.model = torch.load(self.model_path) - - def get_output(self, input_tensor, transform_data=False): - """returns prediction probabilities - - :param input_tensor: test input. - :param transform_data: boolean to indicate if data transformation is required. - """ - if transform_data: - input_tensor = torch.tensor(self.transformer.transform(input_tensor)).float() - - return self.model(input_tensor).float() - - def set_eval_mode(self): - self.model.eval() - - def get_gradient(self, input_instance): - # Future Support - raise NotImplementedError("Future Support") - - def get_num_output_nodes(self, inp_size): - temp_input = torch.rand(1, inp_size).float() - return self.get_output(temp_input).data +"""Module containing an interface to trained PyTorch model.""" + +import torch +import numpy as np +from dice_ml.model_interfaces.base_model import BaseModel +from dice_ml.constants import ModelTypes + +class PyTorchModel(BaseModel): + + def __init__(self, model=None, model_path='', backend='PYT', func=None, kw_args=None): + """Init method + + :param model: trained PyTorch Model. + :param model_path: path to trained model. + :param backend: "PYT" for PyTorch framework. + :param func: function transformation required for ML model. If func is None, then func will be the identity function. + :param kw_args: Dictionary of additional keyword arguments to pass to func. DiCE's data_interface is appended to the + dictionary of kw_args, by default. 
+ """ + + super().__init__(model, model_path, backend, func, kw_args) + + def load_model(self): + if self.model_path != '': + self.model = torch.load(self.model_path) + + def get_output(self, input_instance, model_score=True, + transform_data=False, out_tensor=False): + """returns prediction probabilities + + :param input_tensor: test input. + :param transform_data: boolean to indicate if data transformation is required. + """ + input_tensor = input_instance + if transform_data: + input_tensor = torch.tensor(self.transformer.transform(input_instance)).float() + if not torch.is_tensor(input_instance): + input_tensor = torch.tensor(self.transformer.transform(input_instance)).float() + out = self.model(input_tensor).float() + if not out_tensor: + out = out.data.numpy() + if model_score is False and self.model_type == ModelTypes.Classifier: + out = np.round(out) # TODO need to generalize for n-class classifier + return out + + def set_eval_mode(self): + self.model.eval() + + def get_gradient(self, input_instance): + # Future Support + raise NotImplementedError("Future Support") + + def get_num_output_nodes(self, inp_size): + temp_input = torch.rand(1, inp_size).float() + return self.get_output(temp_input).data diff --git a/docs/source/notebooks/DiCE_getting_started.ipynb b/docs/source/notebooks/DiCE_getting_started.ipynb index 47ba396d..a53fd125 100644 --- a/docs/source/notebooks/DiCE_getting_started.ipynb +++ b/docs/source/notebooks/DiCE_getting_started.ipynb @@ -53,10 +53,6 @@ "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", - "# Tensorflow import\n", - "import tensorflow as tf\n", - "\n", - "\n", "# DiCE imports\n", "import dice_ml\n", "from dice_ml.utils import helpers # helper functions" @@ -391,13 +387,16 @@ "metadata": {}, "outputs": [], "source": [ + "# Tensorflow import\n", + "import tensorflow as tf\n", + "\n", "# supress deprecation warnings from TF\n", 
"tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", "\n", "backend = 'TF'+tf.__version__[0] # TF1\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", "# Step 2: dice_ml.Model\n", - "m = dice_ml.Model(model_path=ML_modelpath, backend=backend)" + "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")" ] }, { @@ -416,7 +415,7 @@ "outputs": [], "source": [ "# Step 3: initiate DiCE\n", - "exp = dice_ml.Dice(d, m)" + "exp = dice_ml.Dice(d, m, method=\"random\")" ] }, { @@ -519,15 +518,17 @@ "metadata": {}, "outputs": [], "source": [ - "# query instance in the form of a dictionary; keys: feature name, values: feature value\n", - "query_instance = {'age': 22,\n", - " 'workclass': 'Private',\n", - " 'education': 'HS-grad',\n", - " 'marital_status': 'Single',\n", - " 'occupation': 'Service',\n", - " 'race': 'White',\n", - " 'gender': 'Female',\n", - " 'hours_per_week': 45}" + "# generate counterfactuals\n", + "dice_exp = exp.generate_counterfactuals(x_test[1:3], total_CFs=4, desired_class=\"opposite\")\n", + "# highlight only the changes\n", + "dice_exp.visualize_as_dataframe(show_only_changes=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also use method-agnostic explainers like \"random\" or \"genetic\". Need to provide a transformer function that converts input data into one-hot encoded format. 
" ] }, { @@ -536,8 +537,8 @@ "metadata": {}, "outputs": [], "source": [ - "# generate counterfactuals\n", - "dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=4, desired_class=\"opposite\")" + "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")\n", + "exp = dice_ml.Dice(d, m, method=\"random\")" ] }, { @@ -546,6 +547,8 @@ "metadata": {}, "outputs": [], "source": [ + "# generate counterfactuals\n", + "dice_exp = exp.generate_counterfactuals(x_test[1:3], total_CFs=4, desired_class=\"opposite\")\n", "# highlight only the changes\n", "dice_exp.visualize_as_dataframe(show_only_changes=True)" ] @@ -583,7 +586,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.8.5" }, "toc": { "base_numbering": 1, From 0f2f2dd00bac13e26b10115514482839d3d3b7e4 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Tue, 5 Jul 2022 23:04:44 +0530 Subject: [PATCH 02/15] updated pytorch explainer and model to work with all methods Signed-off-by: Amit Sharma --- dice_ml/constants.py | 1 + .../data_interfaces/public_data_interface.py | 71 ++----------------- dice_ml/dice.py | 10 ++- dice_ml/explainer_interfaces/dice_pytorch.py | 49 ++++++------- dice_ml/explainer_interfaces/dice_random.py | 2 +- .../explainer_interfaces/explainer_base.py | 51 ++++++------- dice_ml/model_interfaces/base_model.py | 1 - .../keras_tensorflow_model.py | 7 +- dice_ml/model_interfaces/pytorch_model.py | 4 +- dice_ml/utils/helpers.py | 12 +++- .../notebooks/DiCE_getting_started.ipynb | 51 ++++--------- 11 files changed, 88 insertions(+), 171 deletions(-) diff --git a/dice_ml/constants.py b/dice_ml/constants.py index 1f178f8d..2071e3ea 100644 --- a/dice_ml/constants.py +++ b/dice_ml/constants.py @@ -14,6 +14,7 @@ class SamplingStrategy: Random = 'random' Genetic = 'genetic' KdTree = 'kdtree' + Gradient = 'gradient' class ModelTypes: diff --git a/dice_ml/data_interfaces/public_data_interface.py 
b/dice_ml/data_interfaces/public_data_interface.py index 8cf4a9db..8cb019a5 100644 --- a/dice_ml/data_interfaces/public_data_interface.py +++ b/dice_ml/data_interfaces/public_data_interface.py @@ -65,28 +65,7 @@ def __init__(self, params): self.data_df[feature] = self.data_df[feature].astype( np.int32) - # should move the below snippet to gradient based dice interfaces - # self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df) - # self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist( - # ) if x not in np.array([self.outcome_name])] - - # should move the below snippet to model agnostic dice interfaces - # # Initializing a label encoder to obtain label-encoded values for categorical variables - # self.labelencoder = {} - # - # self.label_encoded_data = self.data_df.copy() - # - # for column in self.categorical_feature_names: - # self.labelencoder[column] = LabelEncoder() - # self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column]) - self._validate_and_set_permitted_range(params=params) - - # should move the below snippet to model agnostic dice interfaces - # self.max_range = -np.inf - # for feature in self.continuous_feature_names: - # self.max_range = max(self.max_range, self.permitted_range[feature][1]) - self._validate_and_set_data_name(params=params) def _validate_and_set_dataframe(self, params): @@ -299,25 +278,6 @@ def get_minx_maxx(self, normalized=True): minx[0][idx] = self.permitted_range[feature_name][0] maxx[0][idx] = self.permitted_range[feature_name][1] return minx, maxx - # if encoding=='one-hot': - # minx = np.array([[0.0] * len(self.ohe_encoded_feature_names)]) - # maxx = np.array([[1.0] * len(self.ohe_encoded_feature_names)]) - - # for idx, feature_name in enumerate(self.continuous_feature_names): - # max_value = self.train_df[feature_name].max() - # min_value = self.train_df[feature_name].min() - - # if normalized: - # minx[0][idx] = 
(self.permitted_range[feature_name] - # [0] - min_value) / (max_value - min_value) - # maxx[0][idx] = (self.permitted_range[feature_name] - # [1] - min_value) / (max_value - min_value) - # else: - # minx[0][idx] = self.permitted_range[feature_name][0] - # maxx[0][idx] = self.permitted_range[feature_name][1] - # else: - # minx = np.array([[0.0] * len(self.feature_names)]) - # maxx = np.array([[1.0] * len(self.feature_names)]) def get_mads(self, normalized=False): """Computes Median Absolute Deviation of features.""" @@ -362,24 +322,18 @@ def get_quantiles_from_training_data(self, quantile=0.05, normalized=False): list(set(normalized_train_df[feature].tolist())))), quantile) return quantiles - def create_ohe_params(self): + def create_ohe_params(self, one_hot_encoded_data): if len(self.categorical_feature_names) > 0: - one_hot_encoded_data = self.one_hot_encode_data(self.data_df) self.ohe_encoded_feature_names = [x for x in one_hot_encoded_data.columns.tolist( ) if x not in np.array([self.outcome_name])] else: # one-hot-encoded data is same as original data if there is no categorical features. 
self.ohe_encoded_feature_names = [feat for feat in self.feature_names] - # base dataframe for doing one-hot-encoding - # ohe_encoded_feature_names and ohe_base_df are created (and stored as data class's parameters) - # when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers - self.ohe_base_df = self.prepare_df_for_ohe_encoding() def get_data_params_for_gradient_dice(self): """Gets all data related params for DiCE.""" - self.create_ohe_params() minx, maxx = self.get_minx_maxx(normalized=True) # get the column indexes of categorical and continuous features after one-hot-encoding @@ -489,11 +443,11 @@ def get_decoded_data(self, data, encoding='one-hot'): index = [i for i in range(0, len(data))] if encoding == 'one-hot': if isinstance(data, pd.DataFrame): - return self.from_dummies(data) + return data elif isinstance(data, np.ndarray): data = pd.DataFrame(data=data, index=index, columns=self.ohe_encoded_feature_names) - return self.from_dummies(data) + return data else: raise ValueError("data should be a pandas dataframe or a numpy array") @@ -548,26 +502,13 @@ def prepare_query_instance(self, query_instance): test = test.reset_index(drop=True) return test - # TODO: create a new method, get_LE_min_max_normalized_data() to get label-encoded and normalized data. Keep this - # method only for converting query_instance to pd.DataFrame - # if encoding == 'label': - # for column in self.categorical_feature_names: - # test[column] = self.labelencoder[column].transform(test[column]) - # return self.normalize_data(test, encoding) - # - # elif encoding == 'one-hot': - # temp = self.prepare_df_for_encoding() - # temp = temp.append(test, ignore_index=True, sort=False) - # temp = self.one_hot_encode_data(temp) - # temp = self.normalize_data(temp) - # - # return temp.tail(test.shape[0]).reset_index(drop=True) def get_ohe_min_max_normalized_data(self, query_instance): """Transforms query_instance into one-hot-encoded and min-max normalized data. 
query_instance should be a dict, a dataframe, a list, or a list of dicts""" query_instance = self.prepare_query_instance(query_instance) - temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False) + ohe_base_df = self.prepare_df_for_ohe_encoding() + temp = ohe_base_df.append(query_instance, ignore_index=True, sort=False) temp = self.one_hot_encode_data(temp) temp = temp.tail(query_instance.shape[0]).reset_index(drop=True) # returns a pandas dataframe with all numeric values @@ -576,7 +517,7 @@ def get_ohe_min_max_normalized_data(self, query_instance): def get_inverse_ohe_min_max_normalized_data(self, transformed_data): """Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data should be a dataframe or an array""" - raw_data = self.get_decoded_data(transformed_data, encoding='one-hot') + raw_data = self.from_dummies(transformed_data) raw_data = self.de_normalize_data(raw_data) precisions = self.get_decimal_precisions() for ix, feature in enumerate(self.continuous_feature_names): diff --git a/dice_ml/dice.py b/dice_ml/dice.py index 2124c3af..d4b550ca 100644 --- a/dice_ml/dice.py +++ b/dice_ml/dice.py @@ -56,7 +56,7 @@ def decide(model_interface, method): elif method == SamplingStrategy.KdTree: from dice_ml.explainer_interfaces.dice_KD import DiceKD return DiceKD - elif method is None: + elif method == SamplingStrategy.Gradient: if model_interface.backend == BackEndTypes.Tensorflow1: # pretrained Keras Sequential model with Tensorflow 1.x backend from dice_ml.explainer_interfaces.dice_tensorflow1 import \ @@ -73,8 +73,14 @@ def decide(model_interface, method): # PyTorch backend from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch return DicePyTorch - else: + raise UserConfigValidationException("{0} is only supported for differentiable neural network models. 
" + "Please choose one of {1}, {2} or {3}".format( + method, SamplingStrategy.Random, + SamplingStrategy.Genetic, + SamplingStrategy.KdTree + )) + elif method is None: # all other backends backend_dice = model_interface.backend['explainer'] module_name, class_name = backend_dice.split('.') diff --git a/dice_ml/explainer_interfaces/dice_pytorch.py b/dice_ml/explainer_interfaces/dice_pytorch.py index 270ac2ad..491dda2e 100644 --- a/dice_ml/explainer_interfaces/dice_pytorch.py +++ b/dice_ml/explainer_interfaces/dice_pytorch.py @@ -23,20 +23,17 @@ def __init__(self, data_interface, model_interface): """ # initiating data related parameters super().__init__(data_interface) - self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ - self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() - # initializing model related variables self.model = model_interface self.model.load_model() # loading trained model - if self.model.transformer.func is not None: # TODO: this error is probably too big - need to change it. - raise ValueError("Gradient-based DiCE currently " - "(1) accepts the data only in raw categorical and continuous formats, " - "(2) does one-hot-encoding and min-max-normalization internally, " - "(3) expects the ML model the accept the data in this same format. 
" - "If your problem supports this, please initialize model class again " - "with no custom transformation function.") - # number of output nodes of ML model + self.model.transformer.feed_data_params(data_interface) + self.model.transformer.initialize_transform_func() + # temp data to create some attributes like encoded feature names + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + self.data_interface.create_ohe_params(temp_ohe_data) + self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ + self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() + self.num_output_nodes = self.model.get_num_output_nodes(len(self.data_interface.ohe_encoded_feature_names)).shape[1] # variables required to generate CFs - see generate_counterfactuals() for more info @@ -107,10 +104,6 @@ def _generate_counterfactuals(self, query_instance, total_CFs, # check permitted range for continuous features if permitted_range is not None: - # if not self.data_interface.check_features_range(permitted_range): - # raise ValueError( - # "permitted range of features should be within their original range") - # else: self.data_interface.permitted_range = permitted_range self.minx, self.maxx = self.data_interface.get_minx_maxx(normalized=True) self.cont_minx = [] @@ -140,19 +133,23 @@ def _generate_counterfactuals(self, query_instance, total_CFs, posthoc_sparsity_param=posthoc_sparsity_param, desired_class=desired_class) - def get_model_output(self, input_instance, out_tensor=True): + def get_model_output(self, input_instance, + transform_data=False, out_tensor=True): """get output probability of ML model""" - return self.model.get_output(input_instance, out_tensor=out_tensor)[(self.num_output_nodes-1):] + return self.model.get_output(input_instance, + transform_data=transform_data, + out_tensor=out_tensor)[(self.num_output_nodes-1):] def predict_fn(self, 
input_instance): """prediction function""" if not torch.is_tensor(input_instance): input_instance = torch.tensor(input_instance).float() - return self.get_model_output(input_instance, out_tensor=False)#.data.numpy() + return self.get_model_output(input_instance, + transform_data=False, out_tensor=False) def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" - input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance).iloc[0].values + input_instance = self.model.transformer.transform(input_instance).to_numpy()[0] return self.predict_fn(torch.tensor(input_instance).float()) def do_cf_initializations(self, total_CFs, algorithm, features_to_vary): @@ -420,11 +417,7 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin init_near_query_instance, tie_random, stopping_threshold, posthoc_sparsity_param, posthoc_sparsity_algorithm): """Finds counterfactuals by gradient-descent.""" - - # Prepares user defined query_instance for DiCE. 
- # query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance, encoding='one-hot') - # query_instance = query_instance.iloc[0].values - query_instance = self.data_interface.get_ohe_min_max_normalized_data(query_instance).iloc[0].values + query_instance = self.model.transformer.transform(query_instance).to_numpy()[0] self.x1 = torch.tensor(query_instance) # find the predicted value of query_instance @@ -563,14 +556,15 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin # do inverse transform of CFs to original user-fed format cfs = np.array([self.final_cfs[i][0] for i in range(len(self.final_cfs))]) - final_cfs_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(cfs) + final_cfs_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(cfs)) # rounding off to 3 decimal places cfs_preds = [np.round(preds.flatten().tolist(), 3) for preds in self.cfs_preds] cfs_preds = [item for sublist in cfs_preds for item in sublist] final_cfs_df[self.data_interface.outcome_name] = np.array(cfs_preds) - print(final_cfs_df) - test_instance_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(query_instance) + test_instance_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(query_instance)) test_instance_df[self.data_interface.outcome_name] = np.array(np.round(test_pred, 3)) # post-hoc operation on continuous features to enhance sparsity - only for public data @@ -609,6 +603,5 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin if final_cfs_df_sparse is not None: final_cfs_df_sparse = final_cfs_df_sparse.iloc[valid_ix].reset_index(drop=True) - print(final_cfs_df_sparse) # returning only valid CFs return final_cfs_df.iloc[valid_ix].reset_index(drop=True), test_instance_df, final_cfs_df_sparse diff --git a/dice_ml/explainer_interfaces/dice_random.py b/dice_ml/explainer_interfaces/dice_random.py index 
2995a398..769abc6b 100644 --- a/dice_ml/explainer_interfaces/dice_random.py +++ b/dice_ml/explainer_interfaces/dice_random.py @@ -24,12 +24,12 @@ def __init__(self, data_interface, model_interface): """ super().__init__(data_interface) # initiating data related parameters - self.data_interface.create_ohe_params() self.model = model_interface self.model.load_model() # loading pickled trained model if applicable self.model.transformer.feed_data_params(data_interface) self.model.transformer.initialize_transform_func() + self.precisions = self.data_interface.get_decimal_precisions(output_type="dict") if self.data_interface.outcome_name in self.precisions: self.outcome_precision = [self.precisions[self.data_interface.outcome_name]] diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index 8d9ffb56..23794bd3 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -32,20 +32,6 @@ def __init__(self, data_interface, model_interface=None): self.model.transformer.feed_data_params(data_interface) self.model.transformer.initialize_transform_func() - # moved the following snippet to a method in public_data_interface - # self.minx, self.maxx, self.encoded_categorical_feature_indexes = self.data_interface.get_data_params() - # - # # min and max for continuous features in original scale - # flattened_indexes = [item for sublist in self.encoded_categorical_feature_indexes for item in sublist] - # self.encoded_continuous_feature_indexes = [ix for ix in range(len(self.minx[0])) if ix not in flattened_indexes] - # org_minx, org_maxx = self.data_interface.get_minx_maxx(normalized=False) - # self.cont_minx = list(org_minx[0][self.encoded_continuous_feature_indexes]) - # self.cont_maxx = list(org_maxx[0][self.encoded_continuous_feature_indexes]) - # - # # decimal precisions for continuous features - # self.cont_precisions = \ - # 
[self.data_interface.get_decimal_precisions()[ix] for ix in self.encoded_continuous_feature_indexes] - def generate_counterfactuals(self, query_instances, total_CFs, desired_class="opposite", desired_range=None, permitted_range=None, features_to_vary="all", @@ -88,7 +74,6 @@ def generate_counterfactuals(self, query_instances, total_CFs, query_instances_list.append(query_instances[ix:(ix+1)]) elif isinstance(query_instances, Iterable): query_instances_list = query_instances - for query_instance in tqdm(query_instances_list): self.data_interface.set_continuous_feature_indexes(query_instance) res = self._generate_counterfactuals( @@ -103,7 +88,6 @@ def generate_counterfactuals(self, query_instances, total_CFs, verbose=verbose, **kwargs) cf_examples_arr.append(res) - self._check_any_counterfactuals_computed(cf_examples_arr=cf_examples_arr) return CounterfactualExplanations(cf_examples_list=cf_examples_arr) @@ -389,9 +373,10 @@ def feature_importance(self, query_instances, cf_examples_list=None, def predict_fn(self, input_instance): """prediction function""" - #input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance) - #input_instance = input_instance.astype('float64') - return self.model.get_output(input_instance) + preds = self.model.get_output(input_instance) + if len(preds.shape) == 1: # from deep learning predictors + preds = np.column_stack([preds, 1-preds]) + return preds def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" @@ -447,8 +432,7 @@ def do_posthoc_sparsity_enhancement(self, final_cfs_sparse, query_instance, post diff, decimal_prec, query_instance, cf_ix, feature, final_cfs_sparse, current_pred) temp_preds = self.predict_fn_for_sparsity(final_cfs_sparse.loc[[cf_ix]][self.data_interface.feature_names]) - cfs_preds_sparse.append(temp_preds) - + cfs_preds_sparse.append(temp_preds[0]) final_cfs_sparse[self.data_interface.outcome_name] = 
self.get_model_output_from_scores(cfs_preds_sparse) # final_cfs_sparse[self.data_interface.outcome_name] = np.round(final_cfs_sparse[self.data_interface.outcome_name], 3) return final_cfs_sparse @@ -592,11 +576,15 @@ def decide_cf_validity(self, model_outputs): for i in range(len(model_outputs)): pred = model_outputs[i] if self.model.model_type == ModelTypes.Classifier: - if self.num_output_nodes == 2: # binary - pred_1 = pred[self.num_output_nodes-1] + if self.num_output_nodes in (1,2): # binary + if self.num_output_nodes == 2: + pred_1 = pred[self.num_output_nodes-1] + else: + pred_1 = pred[0] validity[i] = 1 if \ ((self.target_cf_class == 0 and pred_1 <= self.stopping_threshold) or (self.target_cf_class == 1 and pred_1 >= self.stopping_threshold)) else 0 + else: # multiclass if np.argmax(pred) == self.target_cf_class: validity[i] = 1 @@ -623,14 +611,14 @@ def is_cf_valid(self, model_score): target_cf_class = self.target_cf_class[0][0] target_cf_class = int(target_cf_class) - if self.num_output_nodes == 1: # for tensorflow/pytorch models + if len(model_score) == 1: # for tensorflow/pytorch models pred_1 = model_score[0] validity = True if \ ((target_cf_class == 0 and pred_1 <= self.stopping_threshold) or (target_cf_class == 1 and pred_1 >= self.stopping_threshold)) else False return validity - if self.num_output_nodes == 2: # binary - pred_1 = model_score[self.num_output_nodes-1] + elif len(model_score) == 2: # binary + pred_1 = model_score[1] validity = True if \ ((target_cf_class == 0 and pred_1 <= self.stopping_threshold) or (target_cf_class == 1 and pred_1 >= self.stopping_threshold)) else False @@ -648,10 +636,13 @@ def get_model_output_from_scores(self, model_scores): model_output = np.zeros(len(model_scores), dtype=output_type) for i in range(len(model_scores)): if self.model.model_type == ModelTypes.Classifier: - if model_scores[i].shape[0] > 1: - model_output[i] = np.argmax(model_scores[i]) - else: - model_output[i] = np.round(model_scores[i])[0] + if 
hasattr(model_scores[i], "shape") and len(model_scores[i].shape) > 0: + if model_scores[i].shape[0] > 1: + model_output[i] = np.argmax(model_scores[i]) + else: + model_output[i] = np.round(model_scores[i])[0] + else: # 1-D input + model_output[i] = np.round(model_scores[i]) elif self.model.model_type == ModelTypes.Regressor: model_output[i] = model_scores[i] return model_output diff --git a/dice_ml/model_interfaces/base_model.py b/dice_ml/model_interfaces/base_model.py index 5cfe2b2d..3a25b5cf 100644 --- a/dice_ml/model_interfaces/base_model.py +++ b/dice_ml/model_interfaces/base_model.py @@ -35,7 +35,6 @@ def __init__(self, model=None, model_path='', backend='', func=None, kw_args=Non self.backend = backend # calls FunctionTransformer of scikit-learn internally # (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html) - print(func) self.transformer = DataTransfomer(func, kw_args) def load_model(self): diff --git a/dice_ml/model_interfaces/keras_tensorflow_model.py b/dice_ml/model_interfaces/keras_tensorflow_model.py index df150850..00074823 100644 --- a/dice_ml/model_interfaces/keras_tensorflow_model.py +++ b/dice_ml/model_interfaces/keras_tensorflow_model.py @@ -19,7 +19,7 @@ def __init__(self, model=None, model_path='', backend='TF1', func=None, kw_args= dictionary of kw_args, by default. """ - super().__init__(model, model_path, backend) + super().__init__(model, model_path, backend, func, kw_args) def load_model(self): if self.model_path != '': @@ -32,9 +32,8 @@ def get_output(self, input_tensor, training=False, transform_data=False): :param training: to determine training mode in TF2. :param transform_data: boolean to indicate if data transformation is required. 
""" - if transform_data: - input_tensor = tf.constant(self.transformer.transform(input_tensor), dtype=tf.float32) - + if transform_data or not tf.is_tensor(input_tensor): + input_tensor = tf.constant(self.transformer.transform(input_tensor).to_numpy(), dtype=tf.float32) if self.backend == 'TF2': return self.model(input_tensor, training=training) else: diff --git a/dice_ml/model_interfaces/pytorch_model.py b/dice_ml/model_interfaces/pytorch_model.py index 3596570a..c10ef2b4 100644 --- a/dice_ml/model_interfaces/pytorch_model.py +++ b/dice_ml/model_interfaces/pytorch_model.py @@ -33,9 +33,9 @@ def get_output(self, input_instance, model_score=True, """ input_tensor = input_instance if transform_data: - input_tensor = torch.tensor(self.transformer.transform(input_instance)).float() + input_tensor = torch.tensor(self.transformer.transform(input_instance).to_numpy()).float() if not torch.is_tensor(input_instance): - input_tensor = torch.tensor(self.transformer.transform(input_instance)).float() + input_tensor = torch.tensor(self.transformer.transform(input_instance).to_numpy()).float() out = self.model(input_tensor).float() if not out_tensor: out = out.data.numpy() diff --git a/dice_ml/utils/helpers.py b/dice_ml/utils/helpers.py index e3ea688f..20ef1c9b 100644 --- a/dice_ml/utils/helpers.py +++ b/dice_ml/utils/helpers.py @@ -214,8 +214,10 @@ def get_base_gen_cf_initialization(data_interface, encoded_size, cont_minx, cont def ohe_min_max_transformation(data, data_interface): """the data is one-hot-encoded and min-max normalized and fed to the ML model""" - return data_interface.get_ohe_min_max_normalized_data(data).values + return data_interface.get_ohe_min_max_normalized_data(data) +def inverse_ohe_min_max_transformation(data, data_interface): + return data_interface.get_inverse_ohe_min_max_normalized_data(data) class DataTransfomer: """A class to transform data based on user-defined function to get predicted outcomes. 
@@ -234,7 +236,13 @@ def feed_data_params(self, data_interface): def initialize_transform_func(self): if self.func == 'ohe-min-max': - self.data_transformer = FunctionTransformer(func=ohe_min_max_transformation, kw_args=self.kw_args, validate=False) + self.data_transformer = FunctionTransformer( + func=ohe_min_max_transformation, + inverse_func=inverse_ohe_min_max_transformation, + check_inverse=False, + validate=False, + kw_args=self.kw_args, + inv_kw_args=self.kw_args) elif self.func is None: # identity transformation # add more ready-to-use transformers (such as label-encoding) in elif loops. diff --git a/docs/source/notebooks/DiCE_getting_started.ipynb b/docs/source/notebooks/DiCE_getting_started.ipynb index a53fd125..fd6b1b3e 100644 --- a/docs/source/notebooks/DiCE_getting_started.ipynb +++ b/docs/source/notebooks/DiCE_getting_started.ipynb @@ -422,34 +422,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Below we provide query instance as a dict." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# query instance in the form of a dictionary or a dataframe; keys: feature name, values: feature value\n", - "query_instance = {'age': 22,\n", - " 'workclass': 'Private',\n", - " 'education': 'HS-grad',\n", - " 'marital_status': 'Single',\n", - " 'occupation': 'Service',\n", - " 'race': 'White',\n", - " 'gender': 'Female',\n", - " 'hours_per_week': 45}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# generate counterfactuals\n", - "dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=4, desired_class=\"opposite\")" + "Below we provide query instances from `x_test`." 
] }, { @@ -460,6 +433,8 @@ }, "outputs": [], "source": [ + "# generate counterfactuals\n", + "dice_exp = exp.generate_counterfactuals(x_test[1:2], total_CFs=4, desired_class=\"opposite\")\n", "# visualize the result, highlight only the changes\n", "dice_exp.visualize_as_dataframe(show_only_changes=True)" ] @@ -482,7 +457,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Just change the backend variable to 'PYT' to use DiCE with PyTorch. Below, we use a pre-trained ML model in PyTorch which produces high accuracy comparable to other baselines. For convenience, we include the sample trained model with the DiCE package." + "Just change the backend variable to 'PYT' to use DiCE with PyTorch. Below, we use a pre-trained ML model in PyTorch which produces high accuracy comparable to other baselines. For convenience, we include the sample trained model with the DiCE package. Additionally, we need to provide a data transformer function that converts input dataframe into one-hot encoded/numeric format. " ] }, { @@ -493,7 +468,7 @@ "source": [ "backend = 'PYT'\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", - "m = dice_ml.Model(model_path=ML_modelpath, backend=backend)" + "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")" ] }, { @@ -509,13 +484,15 @@ "metadata": {}, "outputs": [], "source": [ - "exp = dice_ml.Dice(d, m)" + "exp = dice_ml.Dice(d, m, method=\"gradient\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "# generate counterfactuals\n", @@ -528,7 +505,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also use method-agnostic explainers like \"random\" or \"genetic\". Need to provide a transformer function that converts input data into one-hot encoded format. " + "We can also use method-agnostic explainers like \"random\" or \"genetic\". 
" ] }, { @@ -544,7 +521,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# generate counterfactuals\n", @@ -572,7 +551,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -586,7 +565,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.12" }, "toc": { "base_numbering": 1, From b01bab0de6fabff153380d336e5def69cfb50121 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Wed, 6 Jul 2022 22:14:54 +0530 Subject: [PATCH 03/15] fixed failing tests --- dice_ml/explainer_interfaces/dice_KD.py | 7 +++++-- dice_ml/explainer_interfaces/dice_genetic.py | 18 ++++++++++-------- dice_ml/explainer_interfaces/dice_random.py | 1 - ...ticlass_classification_and_regression.ipynb | 4 ++-- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/dice_ml/explainer_interfaces/dice_KD.py b/dice_ml/explainer_interfaces/dice_KD.py index a8e951ea..d31c9cc1 100644 --- a/dice_ml/explainer_interfaces/dice_KD.py +++ b/dice_ml/explainer_interfaces/dice_KD.py @@ -25,8 +25,6 @@ def __init__(self, data_interface, model_interface): self.total_random_inits = 0 super().__init__(data_interface) # initiating data related parameters - # As DiCE KD uses one-hot-encoding - self.data_interface.create_ohe_params() # initializing model variables self.model = model_interface @@ -34,6 +32,11 @@ def __init__(self, data_interface, model_interface): self.model.transformer.feed_data_params(data_interface) self.model.transformer.initialize_transform_func() + # As DiCE KD uses one-hot-encoding + # temp data to create some attributes like encoded feature names + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + self.data_interface.create_ohe_params(temp_ohe_data) + # loading trained model self.model.load_model() diff 
--git a/dice_ml/explainer_interfaces/dice_genetic.py b/dice_ml/explainer_interfaces/dice_genetic.py index 9342a0a0..b1fd3bc9 100644 --- a/dice_ml/explainer_interfaces/dice_genetic.py +++ b/dice_ml/explainer_interfaces/dice_genetic.py @@ -25,11 +25,6 @@ def __init__(self, data_interface, model_interface): """ super().__init__(data_interface, model_interface) # initiating data related parameters - # number of output nodes of ML model - if self.model.model_type == ModelTypes.Classifier: - self.num_output_nodes = self.model.get_num_output_nodes2( - self.data_interface.data_df[0:1][self.data_interface.feature_names]) - # variables required to generate CFs - see generate_counterfactuals() for more info self.cfs = [] self.features_to_vary = [] @@ -269,12 +264,18 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k # Prepares user defined query_instance for DiCE. query_instance_orig = query_instance query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance) + # number of output nodes of ML model + self.num_output_nodes = None + if self.model.model_type == ModelTypes.Classifier: + self.num_output_nodes = self.model.get_num_output_nodes2(query_instance) + query_instance = self.label_encode(query_instance) query_instance = np.array(query_instance.values[0]) self.x1 = query_instance # find the predicted value of query_instance test_pred = self.predict_fn(query_instance) + self.test_pred = test_pred desired_class = self.misc_init(stopping_threshold, desired_class, desired_range, test_pred) @@ -305,7 +306,7 @@ def predict_fn_scores(self, input_instance): """Returns prediction scores.""" input_instance = self.label_decode(input_instance) out = self.model.get_output(input_instance) - if out.shape[1] == 1 and self.model.model_type == ModelTypes.Classifier: + if self.model.model_type == ModelTypes.Classifier and out.shape[1] == 1: # DL models return only 1 for binary classification out = np.hstack((1-out, out)) return 
out @@ -313,7 +314,8 @@ def predict_fn_scores(self, input_instance): def predict_fn(self, input_instance): """Returns actual prediction.""" input_instance = self.label_decode(input_instance) - return self.model.get_output(input_instance, model_score=False) + preds = self.model.get_output(input_instance, model_score=False) + return preds def _predict_fn_custom(self, input_instance, desired_class): """Checks that the maximum predicted score lies in the desired class.""" @@ -325,7 +327,7 @@ def _predict_fn_custom(self, input_instance, desired_class): input_instance = self.label_decode(input_instance) output = self.model.get_output(input_instance, model_score=True) - if output.shape[1] == 1 and self.model.model_type == ModelTypes.Classifier: + if self.model.model_type == ModelTypes.Classifier and np.array(output).shape[1] == 1: # DL models return only 1 for binary classification output = np.hstack((1-output, output)) desired_class = int(desired_class) diff --git a/dice_ml/explainer_interfaces/dice_random.py b/dice_ml/explainer_interfaces/dice_random.py index 769abc6b..5df536a5 100644 --- a/dice_ml/explainer_interfaces/dice_random.py +++ b/dice_ml/explainer_interfaces/dice_random.py @@ -75,7 +75,6 @@ class of query_instance for binary classification. # Do predictions once on the query_instance and reuse across to reduce the number # inferences. 
model_predictions = self.predict_fn(query_instance) - # number of output nodes of ML model self.num_output_nodes = None if self.model.model_type == ModelTypes.Classifier: diff --git a/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb b/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb index 247ed1d7..f008d3b4 100644 --- a/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb +++ b/docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb @@ -314,7 +314,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -328,7 +328,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.8.12" } }, "nbformat": 4, From 579994c84cc4378803056880daa947162723d62e Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Thu, 7 Jul 2022 12:28:34 +0530 Subject: [PATCH 04/15] fixed failing tests --- .../explainer_interfaces/dice_tensorflow2.py | 19 ++++++++----------- .../explainer_interfaces/explainer_base.py | 5 +++-- tests/conftest.py | 7 +++++-- tests/test_data.py | 8 +++++--- .../test_public_data_interface.py | 3 ++- tests/test_dice.py | 6 +++--- .../test_dice_interface/test_dice_pytorch.py | 4 ++-- .../test_dice_tensorflow.py | 4 ++-- .../test_pytorch_model.py | 4 ++-- 9 files changed, 32 insertions(+), 28 deletions(-) diff --git a/dice_ml/explainer_interfaces/dice_tensorflow2.py b/dice_ml/explainer_interfaces/dice_tensorflow2.py index 58445929..14292261 100644 --- a/dice_ml/explainer_interfaces/dice_tensorflow2.py +++ b/dice_ml/explainer_interfaces/dice_tensorflow2.py @@ -23,20 +23,17 @@ def __init__(self, data_interface, model_interface): """ # initiating data related parameters super().__init__(data_interface) - self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ - self.cont_minx, 
self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() - # initializing model related variables self.model = model_interface self.model.load_model() # loading trained model - # TODO: this error is probably too big - need to change it. - if self.model.transformer.func is not None: - raise ValueError("Gradient-based DiCE currently " - "(1) accepts the data only in raw categorical and continuous formats, " - "(2) does one-hot-encoding and min-max-normalization internally, " - "(3) expects the ML model the accept the data in this same format. " - "If your problem supports this, please initialize model class again " - "with no custom transformation function.") + self.model.transformer.feed_data_params(data_interface) + self.model.transformer.initialize_transform_func() + # temp data to create some attributes like encoded feature names + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + self.data_interface.create_ohe_params(temp_ohe_data) + self.minx, self.maxx, self.encoded_categorical_feature_indexes, self.encoded_continuous_feature_indexes, \ + self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() + # number of output nodes of ML model self.num_output_nodes = self.model.get_num_output_nodes(len(self.data_interface.ohe_encoded_feature_names)).shape[1] diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index 23794bd3..2571d26f 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -374,8 +374,9 @@ def predict_fn(self, input_instance): """prediction function""" preds = self.model.get_output(input_instance) - if len(preds.shape) == 1: # from deep learning predictors - preds = np.column_stack([preds, 1-preds]) + if self.model.model_type == ModelTypes.Classifier and \ + len(preds.shape) == 1: # from deep learning 
predictors + preds = np.column_stack([1 - preds, preds]) return preds def predict_fn_for_sparsity(self, input_instance): diff --git a/tests/conftest.py b/tests/conftest.py index c96a0089..915c27ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -115,8 +115,11 @@ def sample_adultincome_query(): """ Returns a sample query instance for adult income dataset """ - return {'age': 22, 'workclass': 'Private', 'education': 'HS-grad', 'marital_status': 'Single', 'occupation': 'Service', - 'race': 'White', 'gender': 'Female', 'hours_per_week': 45} + return pd.DataFrame({ + 'age': 22, 'workclass': 'Private', 'education': 'HS-grad', + 'marital_status': 'Single', 'occupation': 'Service', + 'race': 'White', 'gender': 'Female', 'hours_per_week': 45}, + index=[0]) @pytest.fixture diff --git a/tests/test_data.py b/tests/test_data.py index 04cf17d6..69cea0c8 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -71,7 +71,6 @@ def test_ohe_min_max_transformed_query_instance(self, sample_adultincome_query): output_query = [0.068, 0.449, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0] d = self.d[0] - d.create_ohe_params() prepared_query = d.get_ohe_min_max_normalized_data(query_instance=sample_adultincome_query).iloc[0].tolist() assert output_query == pytest.approx(prepared_query, abs=1e-3) @@ -88,7 +87,8 @@ def test_encoded_categorical_features(self): # d.continuous_feature_names = ['cat2_cont1', 'cont2'] # d.encoded_feature_names = ['cat2_cont1', 'cont2', 'cat1_val1', 'cat1_val2', 'cat2_val1', 'cat2_val2'] print(d.data_df) - d.create_ohe_params() + temp_ohe_data = d.get_ohe_min_max_normalized_data(d.data_df.iloc[[0]]) + d.create_ohe_params(temp_ohe_data) res.append(d.get_encoded_categorical_feature_indexes()) assert [2, 3, 4, 5] == res[0][0] # there are 4 types of workclass assert len(res[0][1]) == 8 # eight types of education @@ -104,7 +104,9 @@ def test_features_to_vary(self): """ 
res = [] d = self.d[0] - d.create_ohe_params() + temp_ohe_data = d.get_ohe_min_max_normalized_data(d.data_df.iloc[[0]]) + d.create_ohe_params(temp_ohe_data) + #d.create_ohe_params() # d.categorical_feature_names = ['cat1', 'cat2'] # d.encoded_feature_names = ['cat2_cont1', 'cont2', 'cat1_val1', 'cat1_val2', 'cat2_val1', 'cat2_val2'] # d.continuous_feature_names = ['cat2_cont1', 'cont2'] diff --git a/tests/test_data_interface/test_public_data_interface.py b/tests/test_data_interface/test_public_data_interface.py index 7ef66d87..c14f0a10 100644 --- a/tests/test_data_interface/test_public_data_interface.py +++ b/tests/test_data_interface/test_public_data_interface.py @@ -24,7 +24,8 @@ def _get_data_object(self, data_object): self.d = data_object def test_permitted_range(self): - self.d.create_ohe_params() + temp_ohe_data = self.d.get_ohe_min_max_normalized_data(self.d.data_df.iloc[[0]]) + self.d.create_ohe_params(temp_ohe_data) minx, maxx = self.d.get_minx_maxx(normalized=False) assert [minx[0][0], maxx[0][0]] == [45, 60] minx, maxx = self.d.get_minx_maxx(normalized=True) diff --git a/tests/test_dice.py b/tests/test_dice.py index a148cf61..95353cb7 100644 --- a/tests/test_dice.py +++ b/tests/test_dice.py @@ -23,14 +23,14 @@ def _get_exp(self, backend, method="random", is_public_data_interface=True): 'hours_per_week': [1, 99]}, outcome_name='income') ML_modelpath = helpers.get_adult_income_modelpath(backend=backend) - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func="ohe-min-max") exp = dice_ml.Dice(d, m, method=method) return exp def test_tf(self): tf = pytest.importorskip("tensorflow") backend = 'TF'+tf.__version__[0] - exp = self._get_exp(backend) + exp = self._get_exp(backend, method="gradient") assert issubclass(type(exp), dice_ml.explainer_interfaces.explainer_base.ExplainerBase) assert isinstance(exp, dice_ml.explainer_interfaces.dice_tensorflow2.DiceTensorFlow2) or \ isinstance(exp, 
dice_ml.explainer_interfaces.dice_tensorflow1.DiceTensorFlow1) @@ -38,7 +38,7 @@ def test_tf(self): def test_pyt(self): pytest.importorskip("torch") backend = 'PYT' - exp = self._get_exp(backend) + exp = self._get_exp(backend, method="gradient") assert issubclass(type(exp), dice_ml.explainer_interfaces.explainer_base.ExplainerBase) assert isinstance(exp, dice_ml.explainer_interfaces.dice_pytorch.DicePyTorch) diff --git a/tests/test_dice_interface/test_dice_pytorch.py b/tests/test_dice_interface/test_dice_pytorch.py index f43667d9..681107e2 100644 --- a/tests/test_dice_interface/test_dice_pytorch.py +++ b/tests/test_dice_interface/test_dice_pytorch.py @@ -14,8 +14,8 @@ def pyt_exp_object(): dataset = helpers.load_adult_income_dataset() d = dice_ml.Data(dataframe=dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income') ML_modelpath = helpers.get_adult_income_modelpath(backend=backend) - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) - exp = dice_ml.Dice(d, m) + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func="ohe-min-max") + exp = dice_ml.Dice(d, m, method="gradient") return exp diff --git a/tests/test_dice_interface/test_dice_tensorflow.py b/tests/test_dice_interface/test_dice_tensorflow.py index ebe93279..b917309c 100644 --- a/tests/test_dice_interface/test_dice_tensorflow.py +++ b/tests/test_dice_interface/test_dice_tensorflow.py @@ -14,8 +14,8 @@ def tf_exp_object(): dataset = helpers.load_adult_income_dataset() d = dice_ml.Data(dataframe=dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income') ML_modelpath = helpers.get_adult_income_modelpath(backend=backend) - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) - exp = dice_ml.Dice(d, m) + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func="ohe-min-max") + exp = dice_ml.Dice(d, m, method="gradient") return exp diff --git a/tests/test_model_interface/test_pytorch_model.py 
b/tests/test_model_interface/test_pytorch_model.py index 71816c60..76e08e8e 100644 --- a/tests/test_model_interface/test_pytorch_model.py +++ b/tests/test_model_interface/test_pytorch_model.py @@ -50,13 +50,13 @@ def test_load_model(self): @pytest.mark.parametrize("prediction", [0.0957]) def test_model_output(self, sample_adultincome_query, public_data_object, prediction): # initializing data transormation required for ML model - public_data_object.create_ohe_params() self.m.load_model() self.m.transformer = DataTransfomer(func='ohe-min-max', kw_args=None) self.m.transformer.feed_data_params(public_data_object) self.m.transformer.initialize_transform_func() output_instance = self.m.get_output(sample_adultincome_query, transform_data=True) - predictval = output_instance.detach().numpy()[0][0] + predictval = output_instance[0][0] + print(predictval) assert predictval is not None # TODO: The assert below fails. # assert pytest.approx(predictval, abs=1e-3) == prediction From 57ed9456ea6bc3e98a542dd5530f746d3f30ccc4 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Thu, 7 Jul 2022 13:46:00 +0530 Subject: [PATCH 05/15] updated tf methods to work with transformers --- .../explainer_interfaces/dice_tensorflow1.py | 37 ++++++++----------- .../explainer_interfaces/dice_tensorflow2.py | 16 ++++---- dice_ml/model_interfaces/pytorch_model.py | 6 ++- .../notebooks/DiCE_getting_started.ipynb | 2 +- .../test_keras_tensorflow_model.py | 3 +- .../test_pytorch_model.py | 1 - 6 files changed, 28 insertions(+), 37 deletions(-) diff --git a/dice_ml/explainer_interfaces/dice_tensorflow1.py b/dice_ml/explainer_interfaces/dice_tensorflow1.py index 69ee8298..ada79a4f 100644 --- a/dice_ml/explainer_interfaces/dice_tensorflow1.py +++ b/dice_ml/explainer_interfaces/dice_tensorflow1.py @@ -25,6 +25,14 @@ def __init__(self, data_interface, model_interface): """ # initiating data related parameters super().__init__(data_interface) + # initializing model related variables + self.model = 
model_interface + self.model.load_model() # loading trained model + self.model.transformer.feed_data_params(data_interface) + self.model.transformer.initialize_transform_func() + # temp data to create some attributes like encoded feature names + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + self.data_interface.create_ohe_params(temp_ohe_data) self.minx, self.maxx, self.encoded_categorical_feature_indexes, \ self.encoded_continuous_feature_indexes, self.cont_minx, \ self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() @@ -35,18 +43,8 @@ def __init__(self, data_interface, model_interface): else: self.dice_sess = tf.InteractiveSession() - # initializing model related variables - self.model = model_interface - self.model.load_model() # loading trained model self.input_tensor = tf.Variable(self.minx, dtype=tf.float32) # placeholder variables for model predictions self.output_tensor = self.model.get_output(self.input_tensor) - if self.model.transformer.func is not None: # TODO: this error is probably too big - need to change it. - raise ValueError("Gradient-based DiCE currently " - "(1) accepts the data only in raw categorical and continuous formats, " - "(2) does one-hot-encoding and min-max-normalization internally, " - "(3) expects the ML model the accept the data in this same format. 
" - "If your problem supports this, please initialize model class again " - "with no custom transformation function.") # number of output nodes of ML model self.num_output_nodes = self.dice_sess.run( self.model.get_num_output_nodes(len(self.data_interface.ohe_encoded_feature_names))).shape[1] @@ -125,10 +123,6 @@ def generate_counterfactuals(self, query_instance, total_CFs, desired_class="opp # check permitted range for continuous features if permitted_range is not None: - # if not self.data_interface.check_features_range(permitted_range): - # raise ValueError( - # "permitted range of features should be within their original range") - # else: self.data_interface.permitted_range = permitted_range self.minx, self.maxx = self.data_interface.get_minx_maxx(normalized=True) self.cont_minx = [] @@ -229,7 +223,7 @@ def predict_fn(self, input_instance): def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" - input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance).values + input_instance = self.model.transformer.transform(input_instance).to_numpy() return self.predict_fn(input_instance) def compute_yloss(self, method): @@ -526,13 +520,10 @@ def find_counterfactuals(self, query_instance, desired_class="opposite", learnin stopping_threshold=0.5, posthoc_sparsity_param=0.1, posthoc_sparsity_algorithm="linear"): """Finds counterfactuals by gradient-descent.""" - # Prepares user defined query_instance for DiCE. 
- # query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance, encoding='one-hot') - # query_instance = np.array([query_instance.iloc[0].values], dtype=np.float32) - query_instance = self.data_interface.get_ohe_min_max_normalized_data(query_instance).values + query_instance = self.model.transformer.transform(query_instance).to_numpy() # find the predicted value of query_instance - test_pred = self.predict_fn(query_instance)[0][0] + test_pred = self.predict_fn(tf.constant(query_instance, dtype=tf.float32))[0][0] if desired_class == "opposite": desired_class = 1.0 - round(test_pred) self.target_cf_class = np.array([[desired_class]]) @@ -651,12 +642,14 @@ def find_counterfactuals(self, query_instance, desired_class="opposite", learnin # do inverse transform of CFs to original user-fed format cfs = np.array([self.final_cfs[i][0] for i in range(len(self.final_cfs))]) - final_cfs_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(cfs) + final_cfs_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(cfs)) cfs_preds = [np.round(preds.flatten().tolist(), 3) for preds in self.cfs_preds] cfs_preds = [item for sublist in cfs_preds for item in sublist] final_cfs_df[self.data_interface.outcome_name] = np.array(cfs_preds) - test_instance_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(query_instance) + test_instance_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(query_instance)) test_instance_df[self.data_interface.outcome_name] = np.array(np.round(test_pred, 3)) # post-hoc operation on continuous features to enhance sparsity - only for public data diff --git a/dice_ml/explainer_interfaces/dice_tensorflow2.py b/dice_ml/explainer_interfaces/dice_tensorflow2.py index 14292261..78f0d63e 100644 --- a/dice_ml/explainer_interfaces/dice_tensorflow2.py +++ b/dice_ml/explainer_interfaces/dice_tensorflow2.py @@ -149,7 +149,7 @@ def predict_fn(self, 
input_instance): def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" - input_instance = self.data_interface.get_ohe_min_max_normalized_data(input_instance).values + input_instance = self.model.transformer.transform(input_instance).to_numpy() return self.predict_fn(tf.constant(input_instance, dtype=tf.float32)) def do_cf_initializations(self, total_CFs, algorithm, features_to_vary): @@ -420,10 +420,7 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin posthoc_sparsity_algorithm): """Finds counterfactuals by gradient-descent.""" - # Prepares user defined query_instance for DiCE. - # query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance, encoding='one-hot') - # query_instance = np.array([query_instance.iloc[0].values]) - query_instance = self.data_interface.get_ohe_min_max_normalized_data(query_instance).values + query_instance = self.model.transformer.transform(query_instance).to_numpy() self.x1 = tf.constant(query_instance, dtype=tf.float32) # find the predicted value of query_instance @@ -535,8 +532,7 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin self.max_iterations_run = iterations self.elapsed = timeit.default_timer() - start_time - - self.cfs_preds = [self.predict_fn(cfs) for cfs in self.final_cfs] + self.cfs_preds = [self.predict_fn(tf.constant(cfs, dtype=tf.float32)) for cfs in self.final_cfs] # update final_cfs from backed up CFs if valid CFs are not found if((self.target_cf_class == 0 and any(i[0] > self.stopping_threshold for i in self.cfs_preds)) or @@ -549,12 +545,14 @@ def find_counterfactuals(self, query_instance, desired_class, optimizer, learnin # do inverse transform of CFs to original user-fed format cfs = np.array([self.final_cfs[i][0] for i in range(len(self.final_cfs))]) - final_cfs_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(cfs) + final_cfs_df = 
self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(cfs)) cfs_preds = [np.round(preds.flatten().tolist(), 3) for preds in self.cfs_preds] cfs_preds = [item for sublist in cfs_preds for item in sublist] final_cfs_df[self.data_interface.outcome_name] = np.array(cfs_preds) - test_instance_df = self.data_interface.get_inverse_ohe_min_max_normalized_data(query_instance) + test_instance_df = self.model.transformer.inverse_transform( + self.data_interface.get_decoded_data(query_instance)) test_instance_df[self.data_interface.outcome_name] = np.array(np.round(test_pred, 3)) # post-hoc operation on continuous features to enhance sparsity - only for public data diff --git a/dice_ml/model_interfaces/pytorch_model.py b/dice_ml/model_interfaces/pytorch_model.py index c10ef2b4..6e7833e9 100644 --- a/dice_ml/model_interfaces/pytorch_model.py +++ b/dice_ml/model_interfaces/pytorch_model.py @@ -1,9 +1,11 @@ """Module containing an interface to trained PyTorch model.""" -import torch import numpy as np -from dice_ml.model_interfaces.base_model import BaseModel +import torch + from dice_ml.constants import ModelTypes +from dice_ml.model_interfaces.base_model import BaseModel + class PyTorchModel(BaseModel): diff --git a/docs/source/notebooks/DiCE_getting_started.ipynb b/docs/source/notebooks/DiCE_getting_started.ipynb index fd6b1b3e..83cee0fe 100644 --- a/docs/source/notebooks/DiCE_getting_started.ipynb +++ b/docs/source/notebooks/DiCE_getting_started.ipynb @@ -415,7 +415,7 @@ "outputs": [], "source": [ "# Step 3: initiate DiCE\n", - "exp = dice_ml.Dice(d, m, method=\"random\")" + "exp = dice_ml.Dice(d, m, method=\"gradient\")" ] }, { diff --git a/tests/test_model_interface/test_keras_tensorflow_model.py b/tests/test_model_interface/test_keras_tensorflow_model.py index 1bf6e84d..e1d2476c 100644 --- a/tests/test_model_interface/test_keras_tensorflow_model.py +++ b/tests/test_model_interface/test_keras_tensorflow_model.py @@ -62,9 +62,8 @@ def 
test_load_model(self): @pytest.mark.parametrize("prediction", [0.747]) def test_model_output(self, sample_adultincome_query, public_data_object, prediction): # Initializing data and model objects - public_data_object.create_ohe_params() self.m.load_model() - # initializing data transormation required for ML model + # initializing data transformation required for ML model self.m.transformer = DataTransfomer(func='ohe-min-max', kw_args=None) self.m.transformer.feed_data_params(public_data_object) self.m.transformer.initialize_transform_func() diff --git a/tests/test_model_interface/test_pytorch_model.py b/tests/test_model_interface/test_pytorch_model.py index 76e08e8e..f7a8bef9 100644 --- a/tests/test_model_interface/test_pytorch_model.py +++ b/tests/test_model_interface/test_pytorch_model.py @@ -56,7 +56,6 @@ def test_model_output(self, sample_adultincome_query, public_data_object, predic self.m.transformer.initialize_transform_func() output_instance = self.m.get_output(sample_adultincome_query, transform_data=True) predictval = output_instance[0][0] - print(predictval) assert predictval is not None # TODO: The assert below fails. 
# assert pytest.approx(predictval, abs=1e-3) == prediction From ee4ee16a8f40039e12cb9d1b11be1ee4336d5d71 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Thu, 7 Jul 2022 14:06:09 +0530 Subject: [PATCH 06/15] fixed linting bugs --- .../data_interfaces/public_data_interface.py | 2 -- dice_ml/dice.py | 23 ++++++++-------- dice_ml/explainer_interfaces/dice_KD.py | 1 - dice_ml/explainer_interfaces/dice_pytorch.py | 26 +++++++++---------- dice_ml/explainer_interfaces/dice_random.py | 1 - .../explainer_interfaces/explainer_base.py | 9 +++---- dice_ml/model_interfaces/pytorch_model.py | 4 +-- dice_ml/utils/helpers.py | 2 ++ tests/test_data.py | 2 +- 9 files changed, 34 insertions(+), 36 deletions(-) diff --git a/dice_ml/data_interfaces/public_data_interface.py b/dice_ml/data_interfaces/public_data_interface.py index a0b5c663..829c209f 100644 --- a/dice_ml/data_interfaces/public_data_interface.py +++ b/dice_ml/data_interfaces/public_data_interface.py @@ -338,7 +338,6 @@ def create_ohe_params(self, one_hot_encoded_data): # one-hot-encoded data is same as original data if there is no categorical features. self.ohe_encoded_feature_names = [feat for feat in self.feature_names] - def get_data_params_for_gradient_dice(self): """Gets all data related params for DiCE.""" @@ -514,7 +513,6 @@ def prepare_query_instance(self, query_instance): self.continuous_feature_names) return test - def get_ohe_min_max_normalized_data(self, query_instance): """Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts""" diff --git a/dice_ml/dice.py b/dice_ml/dice.py index d4b550ca..a3ad9fa6 100644 --- a/dice_ml/dice.py +++ b/dice_ml/dice.py @@ -74,18 +74,19 @@ def decide(model_interface, method): from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch return DicePyTorch else: - raise UserConfigValidationException("{0} is only supported for differentiable neural network models. 
" - "Please choose one of {1}, {2} or {3}".format( - method, SamplingStrategy.Random, - SamplingStrategy.Genetic, - SamplingStrategy.KdTree - )) + raise UserConfigValidationException( + "{0} is only supported for differentiable neural network models. " + "Please choose one of {1}, {2} or {3}".format( + method, SamplingStrategy.Random, + SamplingStrategy.Genetic, + SamplingStrategy.KdTree + )) elif method is None: - # all other backends - backend_dice = model_interface.backend['explainer'] - module_name, class_name = backend_dice.split('.') - module = __import__("dice_ml.explainer_interfaces." + module_name, fromlist=[class_name]) - return getattr(module, class_name) + # all other backends + backend_dice = model_interface.backend['explainer'] + module_name, class_name = backend_dice.split('.') + module = __import__("dice_ml.explainer_interfaces." + module_name, fromlist=[class_name]) + return getattr(module, class_name) else: raise UserConfigValidationException("Unsupported sample strategy {0} provided. 
" "Please choose one of {1}, {2} or {3}".format( diff --git a/dice_ml/explainer_interfaces/dice_KD.py b/dice_ml/explainer_interfaces/dice_KD.py index 085df155..deed36f8 100644 --- a/dice_ml/explainer_interfaces/dice_KD.py +++ b/dice_ml/explainer_interfaces/dice_KD.py @@ -25,7 +25,6 @@ def __init__(self, data_interface, model_interface): self.total_random_inits = 0 super().__init__(data_interface) # initiating data related parameters - # initializing model variables self.model = model_interface self.model.load_model() # loading pickled trained model if applicable diff --git a/dice_ml/explainer_interfaces/dice_pytorch.py b/dice_ml/explainer_interfaces/dice_pytorch.py index c2bc0fb2..12412fda 100644 --- a/dice_ml/explainer_interfaces/dice_pytorch.py +++ b/dice_ml/explainer_interfaces/dice_pytorch.py @@ -9,7 +9,6 @@ import torch from dice_ml import diverse_counterfactuals as exp -from dice_ml.counterfactual_explanations import CounterfactualExplanations from dice_ml.explainer_interfaces.explainer_base import ExplainerBase @@ -46,14 +45,14 @@ def __init__(self, data_interface, model_interface): self.optimizer_weights = [] # optimizer, learning_rate def _generate_counterfactuals(self, query_instance, total_CFs, - desired_class="opposite", desired_range=None, - proximity_weight=0.5, - diversity_weight=1.0, categorical_penalty=0.1, algorithm="DiverseCF", features_to_vary="all", - permitted_range=None, yloss_type="hinge_loss", diversity_loss_type="dpp_style:inverse_dist", - feature_weights="inverse_mad", optimizer="pytorch:adam", learning_rate=0.05, min_iter=500, - max_iter=5000, project_iter=0, loss_diff_thres=1e-5, loss_converge_maxiter=1, verbose=False, - init_near_query_instance=True, tie_random=False, stopping_threshold=0.5, - posthoc_sparsity_param=0.1, posthoc_sparsity_algorithm="linear", limit_steps_ls=10000): + desired_class="opposite", desired_range=None, + proximity_weight=0.5, + diversity_weight=1.0, categorical_penalty=0.1, algorithm="DiverseCF", 
features_to_vary="all", + permitted_range=None, yloss_type="hinge_loss", diversity_loss_type="dpp_style:inverse_dist", + feature_weights="inverse_mad", optimizer="pytorch:adam", learning_rate=0.05, min_iter=500, + max_iter=5000, project_iter=0, loss_diff_thres=1e-5, loss_converge_maxiter=1, verbose=False, + init_near_query_instance=True, tie_random=False, stopping_threshold=0.5, + posthoc_sparsity_param=0.1, posthoc_sparsity_algorithm="linear", limit_steps_ls=10000): """Generates diverse counterfactual explanations. :param query_instance: Test point of interest. A dictionary of feature names and values or a single row dataframe @@ -135,9 +134,10 @@ def _generate_counterfactuals(self, query_instance, total_CFs, desired_class=desired_class) def get_model_output(self, input_instance, - transform_data=False, out_tensor=True): + transform_data=False, out_tensor=True): """get output probability of ML model""" - return self.model.get_output(input_instance, + return self.model.get_output( + input_instance, transform_data=transform_data, out_tensor=out_tensor)[(self.num_output_nodes-1):] @@ -145,8 +145,8 @@ def predict_fn(self, input_instance): """prediction function""" if not torch.is_tensor(input_instance): input_instance = torch.tensor(input_instance).float() - return self.get_model_output(input_instance, - transform_data=False, out_tensor=False) + return self.get_model_output( + input_instance, transform_data=False, out_tensor=False) def predict_fn_for_sparsity(self, input_instance): """prediction function for sparsity correction""" diff --git a/dice_ml/explainer_interfaces/dice_random.py b/dice_ml/explainer_interfaces/dice_random.py index 4b21b68a..43316333 100644 --- a/dice_ml/explainer_interfaces/dice_random.py +++ b/dice_ml/explainer_interfaces/dice_random.py @@ -29,7 +29,6 @@ def __init__(self, data_interface, model_interface): self.model.transformer.feed_data_params(data_interface) self.model.transformer.initialize_transform_func() - self.precisions = 
self.data_interface.get_decimal_precisions(output_type="dict") if self.data_interface.outcome_name in self.precisions: self.outcome_precision = [self.precisions[self.data_interface.outcome_name]] diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index 208ac45c..7228df86 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -94,7 +94,6 @@ def _validate_counterfactual_configuration( raise UserConfigValidationException( "The range provided in desired_range should be in ascending order.") - def generate_counterfactuals(self, query_instances, total_CFs, desired_class="opposite", desired_range=None, permitted_range=None, features_to_vary="all", @@ -488,7 +487,7 @@ def predict_fn(self, input_instance): preds = self.model.get_output(input_instance) if self.model.model_type == ModelTypes.Classifier and \ - len(preds.shape) == 1: # from deep learning predictors + len(preds.shape) == 1: # from deep learning predictors preds = np.column_stack([1 - preds, preds]) return preds @@ -671,7 +670,7 @@ def infer_target_cfs_class(self, desired_class_input, original_pred, num_output_ original_pred_1 = original_pred target_class = int(1 - original_pred_1) return target_class - elif num_output_nodes == 1: # only for pytorch DL model + elif num_output_nodes == 1: # only for pytorch DL model original_pred_1 = np.round(original_pred) target_class = int(1-original_pred_1) return target_class @@ -704,7 +703,7 @@ def decide_cf_validity(self, model_outputs): for i in range(len(model_outputs)): pred = model_outputs[i] if self.model.model_type == ModelTypes.Classifier: - if self.num_output_nodes in (1,2): # binary + if self.num_output_nodes in (1, 2): # binary if self.num_output_nodes == 2: pred_1 = pred[self.num_output_nodes-1] else: @@ -769,7 +768,7 @@ def get_model_output_from_scores(self, model_scores): model_output[i] = np.argmax(model_scores[i]) else: model_output[i] = 
np.round(model_scores[i])[0] - else: # 1-D input + else: # 1-D input model_output[i] = np.round(model_scores[i]) elif self.model.model_type == ModelTypes.Regressor: model_output[i] = model_scores[i] diff --git a/dice_ml/model_interfaces/pytorch_model.py b/dice_ml/model_interfaces/pytorch_model.py index 6e7833e9..4c8e2068 100644 --- a/dice_ml/model_interfaces/pytorch_model.py +++ b/dice_ml/model_interfaces/pytorch_model.py @@ -27,7 +27,7 @@ def load_model(self): self.model = torch.load(self.model_path) def get_output(self, input_instance, model_score=True, - transform_data=False, out_tensor=False): + transform_data=False, out_tensor=False): """returns prediction probabilities :param input_tensor: test input. @@ -42,7 +42,7 @@ def get_output(self, input_instance, model_score=True, if not out_tensor: out = out.data.numpy() if model_score is False and self.model_type == ModelTypes.Classifier: - out = np.round(out) # TODO need to generalize for n-class classifier + out = np.round(out) # TODO need to generalize for n-class classifier return out def set_eval_mode(self): diff --git a/dice_ml/utils/helpers.py b/dice_ml/utils/helpers.py index bd8e6800..aa63e611 100644 --- a/dice_ml/utils/helpers.py +++ b/dice_ml/utils/helpers.py @@ -221,9 +221,11 @@ def ohe_min_max_transformation(data, data_interface): """the data is one-hot-encoded and min-max normalized and fed to the ML model""" return data_interface.get_ohe_min_max_normalized_data(data) + def inverse_ohe_min_max_transformation(data, data_interface): return data_interface.get_inverse_ohe_min_max_normalized_data(data) + class DataTransfomer: """A class to transform data based on user-defined function to get predicted outcomes. 
This class calls FunctionTransformer of scikit-learn internally diff --git a/tests/test_data.py b/tests/test_data.py index 69cea0c8..d562a5aa 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -106,7 +106,7 @@ def test_features_to_vary(self): d = self.d[0] temp_ohe_data = d.get_ohe_min_max_normalized_data(d.data_df.iloc[[0]]) d.create_ohe_params(temp_ohe_data) - #d.create_ohe_params() + # d.create_ohe_params() # d.categorical_feature_names = ['cat1', 'cat2'] # d.encoded_feature_names = ['cat2_cont1', 'cont2', 'cat1_val1', 'cat1_val2', 'cat2_val1', 'cat2_val2'] # d.continuous_feature_names = ['cat2_cont1', 'cont2'] From 45fc06ae541ea87590d654ad8abdc713261d0ce4 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Thu, 7 Jul 2022 17:02:00 +0530 Subject: [PATCH 07/15] fixed minor bugs --- .../data_interfaces/private_data_interface.py | 15 +++++++--- .../notebooks/DiCE_getting_started.ipynb | 28 ++++--------------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/dice_ml/data_interfaces/private_data_interface.py b/dice_ml/data_interfaces/private_data_interface.py index 2960c17b..6f125053 100644 --- a/dice_ml/data_interfaces/private_data_interface.py +++ b/dice_ml/data_interfaces/private_data_interface.py @@ -1,6 +1,7 @@ """Module containing meta data information about private data.""" import collections +from collections import defaultdict import logging import sys @@ -265,16 +266,22 @@ def from_dummies(self, data, prefix_sep='_'): out.drop(cols, axis=1, inplace=True) return out - def get_decimal_precisions(self): + def get_decimal_precisions(self, output_type="list"): """"Gets the precision of continuous features in the data.""" + precisions_dict = defaultdict(int) precisions = [0]*len(self.continuous_feature_names) for ix, feature_name in enumerate(self.continuous_feature_names): type_prec = self.type_and_precision[feature_name] if type_prec == 'int': - precisions[ix] = 0 + prec = 0 else: - precisions[ix] = self.type_and_precision[feature_name][1] 
- return precisions + prec = self.type_and_precision[feature_name][1] + precisions[ix] = prec + precisions_dict[feature_name] = prec + if output_type == "list": + return precisions + elif output_type == "dict": + return precisions_dict def get_decoded_data(self, data, encoding='one-hot'): """Gets the original data from encoded data.""" diff --git a/docs/source/notebooks/DiCE_getting_started.ipynb b/docs/source/notebooks/DiCE_getting_started.ipynb index 83cee0fe..f4cb675c 100644 --- a/docs/source/notebooks/DiCE_getting_started.ipynb +++ b/docs/source/notebooks/DiCE_getting_started.ipynb @@ -23,20 +23,10 @@ " * Randomized Search\n", " * Genetic Search\n", " * KD Tree Search (for counterfactuals from a given training dataset)\n", - "* **Gradient-Based**: These methods apply to differentiable models, such as those returned by deep learning libraries like tensorflow and pytorch. They are based on an explicit loss minimization based on proximity, diversity and feasibility. The method is described in this [paper](https://arxiv.org/abs/1905.07697)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![DiCE API](images/dice_getting_started_api.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "* **Gradient-Based**: These methods apply to differentiable models, such as those returned by deep learning libraries like tensorflow and pytorch. They are based on an explicit loss minimization based on proximity, diversity and feasibility. The method is described in this [paper](https://arxiv.org/abs/1905.07697).\n", + "\n", + "![DiCE API](images/dice_getting_started_api.png)\n", + "\n", "DiCE requires two inputs: a training dataset and a pre-trained ML model. When the training dataset is unknown (e.g., for privacy reasons), it can also work without access to the full dataset (see this [notebook](DiCE_with_private_data.ipynb) for an example). Below we show a simple example. 
" ] }, @@ -387,13 +377,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Tensorflow import\n", - "import tensorflow as tf\n", - "\n", - "# supress deprecation warnings from TF\n", - "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", - "\n", - "backend = 'TF'+tf.__version__[0] # TF1\n", + "backend = 'TF2' # needs tensorflow installed\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", "# Step 2: dice_ml.Model\n", "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")" @@ -466,7 +450,7 @@ "metadata": {}, "outputs": [], "source": [ - "backend = 'PYT'\n", + "backend = 'PYT' # needs pytorch installed\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", "m = dice_ml.Model(model_path=ML_modelpath, backend=backend, func=\"ohe-min-max\")" ] From 67148486a7870e057fc7c4b5eb844e35f4f152cf Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sat, 9 Jul 2022 16:45:53 +0530 Subject: [PATCH 08/15] updated failing tets --- .../explainer_interfaces/explainer_base.py | 8 ++- .../test_dice_interface/test_dice_genetic.py | 38 +++++++++----- tests/test_dice_interface/test_dice_random.py | 50 +++++++++++++------ 3 files changed, 66 insertions(+), 30 deletions(-) diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index 7228df86..fa56d8b0 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -678,7 +678,13 @@ def infer_target_cfs_class(self, desired_class_input, original_pred, num_output_ raise UserConfigValidationException( "Desired class cannot be opposite if the number of classes is more than 2.") elif isinstance(desired_class_input, int): - if desired_class_input >= 0 and desired_class_input < num_output_nodes: + if num_output_nodes == 1: # for DL models + if desired_class_input in (0, 1): + target_class = desired_class_input + return target_class + else: + raise 
UserConfigValidationException("Only 0 and 1 are supported as desired class for binary classification!") + elif desired_class_input >= 0 and desired_class_input < num_output_nodes: target_class = desired_class_input return target_class else: diff --git a/tests/test_dice_interface/test_dice_genetic.py b/tests/test_dice_interface/test_dice_genetic.py index e2c1aaec..02a8f0cf 100644 --- a/tests/test_dice_interface/test_dice_genetic.py +++ b/tests/test_dice_interface/test_dice_genetic.py @@ -3,22 +3,29 @@ import dice_ml from dice_ml.utils import helpers from dice_ml.utils.exception import UserConfigValidationException +from dice_ml.utils.neuralnetworks import FFNetwork +BACKENDS = ['sklearn', 'PYT'] -@pytest.fixture() -def genetic_binary_classification_exp_object(): - backend = 'sklearn' + +@pytest.fixture(scope="module", params=['sklearn']) +def genetic_binary_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_binary() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + if backend == "PYT": + net = FFNetwork(4) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max") + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='genetic') return exp -@pytest.fixture() -def genetic_multi_classification_exp_object(): - backend = 'sklearn' +@pytest.fixture(scope="module", params=['sklearn']) +def genetic_multi_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_multiclass() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_multiclass() @@ -27,13 +34,17 @@ def 
genetic_multi_classification_exp_object(): return exp -@pytest.fixture() -def genetic_regression_exp_object(): - backend = 'sklearn' +@pytest.fixture(scope="module", params=BACKENDS) +def genetic_regression_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_regression() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') + if backend == "PYT": + net = FFNetwork(4, is_classifier=False) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max", model_type='regressor') + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') exp = dice_ml.Dice(d, m, method='genetic') return exp @@ -136,6 +147,7 @@ def test_predict_custom(self, desired_class, sample_custom_query_2, mocker): self.exp.yloss_type = 'hinge_loss' mocker.patch('dice_ml.explainer_interfaces.dice_genetic.DiceGenetic.label_decode', return_value=None) mocker.patch('dice_ml.model_interfaces.base_model.BaseModel.get_output', return_value=[[0, 0.5, 0.5]]) + mocker.patch('dice_ml.model_interfaces.pytorch_model.PyTorchModel.get_output', return_value=[[0, 0.5, 0.5]]) custom_preds = self.exp._predict_fn_custom(sample_custom_query_2, desired_class) assert custom_preds[0] == desired_class diff --git a/tests/test_dice_interface/test_dice_random.py b/tests/test_dice_interface/test_dice_random.py index 57507040..579d0c14 100644 --- a/tests/test_dice_interface/test_dice_random.py +++ b/tests/test_dice_interface/test_dice_random.py @@ -5,37 +5,53 @@ from dice_ml.diverse_counterfactuals import CounterfactualExamples from dice_ml.utils import helpers from dice_ml.utils.exception import UserConfigValidationException +from dice_ml.utils.neuralnetworks import FFNetwork 
+BACKENDS = ['sklearn', 'PYT'] -@pytest.fixture() -def random_binary_classification_exp_object(): - backend = 'sklearn' + +@pytest.fixture(scope="module", params=BACKENDS) +def random_binary_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_binary() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + if backend == "PYT": + net = FFNetwork(4) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max") + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='random') return exp -@pytest.fixture() -def random_multi_classification_exp_object(): - backend = 'sklearn' +#TODO multiclass is not currently supported for neural networks +@pytest.fixture(scope="module", params=['sklearn']) +def random_multi_classification_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_multiclass() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_multiclass() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend) + if backend == "PYT": + net = FFNetwork(4) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max") + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_multiclass() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='random') return exp -@pytest.fixture() -def random_regression_exp_object(): - backend = 'sklearn' +@pytest.fixture(scope="module", params=BACKENDS) +def random_regression_exp_object(request): + backend = request.param dataset = helpers.load_custom_testing_dataset_regression() d = 
dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() - m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') + if backend == "PYT": + net = FFNetwork(4, is_classifier=False) + m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max", model_type='regressor') + else: + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_regression() + m = dice_ml.Model(model_path=ML_modelpath, backend=backend, model_type='regressor') exp = dice_ml.Dice(d, m, method='random') return exp @@ -46,7 +62,9 @@ def _initiate_exp_object(self, random_binary_classification_exp_object): self.exp = random_binary_classification_exp_object # explainer object @pytest.mark.parametrize(("desired_class", "total_CFs"), [(0, 1)]) - def test_random_counterfactual_explanations_output(self, desired_class, sample_custom_query_1, total_CFs): + def test_random_counterfactual_explanations_output( + self, + desired_class, sample_custom_query_1, total_CFs): counterfactual_explanations = self.exp.generate_counterfactuals( query_instances=sample_custom_query_1, desired_class=desired_class, total_CFs=total_CFs) From 4a91f59c71291216b703aebccadbbfe2683b9b8c Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sat, 9 Jul 2022 16:50:57 +0530 Subject: [PATCH 09/15] fixing flake errors --- dice_ml/data_interfaces/private_data_interface.py | 2 +- dice_ml/explainer_interfaces/explainer_base.py | 2 +- tests/test_dice_interface/test_dice_random.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dice_ml/data_interfaces/private_data_interface.py b/dice_ml/data_interfaces/private_data_interface.py index 6f125053..9725a288 100644 --- a/dice_ml/data_interfaces/private_data_interface.py +++ b/dice_ml/data_interfaces/private_data_interface.py @@ -1,9 +1,9 @@ """Module containing meta data information about private data.""" import collections -from 
collections import defaultdict import logging import sys +from collections import defaultdict import numpy as np import pandas as pd diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index fa56d8b0..0fbf9ca3 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -683,7 +683,7 @@ def infer_target_cfs_class(self, desired_class_input, original_pred, num_output_ target_class = desired_class_input return target_class else: - raise UserConfigValidationException("Only 0 and 1 are supported as desired class for binary classification!") + raise UserConfigValidationException("Only 0, 1 are supported as desired class for binary classification!") elif desired_class_input >= 0 and desired_class_input < num_output_nodes: target_class = desired_class_input return target_class diff --git a/tests/test_dice_interface/test_dice_random.py b/tests/test_dice_interface/test_dice_random.py index 579d0c14..102f3efd 100644 --- a/tests/test_dice_interface/test_dice_random.py +++ b/tests/test_dice_interface/test_dice_random.py @@ -25,7 +25,7 @@ def random_binary_classification_exp_object(request): return exp -#TODO multiclass is not currently supported for neural networks +# TODO multiclass is not currently supported for neural networks @pytest.fixture(scope="module", params=['sklearn']) def random_multi_classification_exp_object(request): backend = request.param From 3d19ce20adfc8444f3944ee0ce15a169b4563409 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sat, 9 Jul 2022 17:41:15 +0530 Subject: [PATCH 10/15] added neural network file Signed-off-by: Amit Sharma --- dice_ml/utils/neuralnetworks.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 dice_ml/utils/neuralnetworks.py diff --git a/dice_ml/utils/neuralnetworks.py b/dice_ml/utils/neuralnetworks.py new file mode 100644 index 00000000..07c18d27 --- /dev/null +++ 
b/dice_ml/utils/neuralnetworks.py @@ -0,0 +1,21 @@ +from torch import nn, sigmoid + + +class FFNetwork(nn.Module): + def __init__(self, input_size, is_classifier=True): + super(FFNetwork, self).__init__() + self.is_classifier = is_classifier + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(input_size, 16), + nn.ReLU(), + nn.Linear(16, 1), + ) + + def forward(self, x): + x = self.flatten(x) + out = self.linear_relu_stack(x) + out = sigmoid(out) + if not self.is_classifier: + out = 3 * out # output between 0 and 3 + return out From ca94404c1506af41750c3610e0869ce1996d4b4c Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sat, 9 Jul 2022 17:56:09 +0530 Subject: [PATCH 11/15] added torch as a test dependency Signed-off-by: Amit Sharma --- requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-test.txt b/requirements-test.txt index 47041027..104907c7 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,3 +5,4 @@ pytest pytest-cov twine pytest-mock +torch From 83ca596c0b176a9b65a4870e7dcd779864e323ee Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sun, 10 Jul 2022 18:39:38 +0530 Subject: [PATCH 12/15] fixed some bugs with private data --- .../data_interfaces/base_data_interface.py | 41 ++++++++++++++++ .../data_interfaces/private_data_interface.py | 44 ++++++++--------- .../data_interfaces/public_data_interface.py | 49 +------------------ dice_ml/diverse_counterfactuals.py | 2 +- .../explainer_interfaces/dice_tensorflow2.py | 5 +- .../explainer_interfaces/explainer_base.py | 4 -- .../notebooks/DiCE_with_private_data.ipynb | 13 ++--- 7 files changed, 76 insertions(+), 82 deletions(-) diff --git a/dice_ml/data_interfaces/base_data_interface.py b/dice_ml/data_interfaces/base_data_interface.py index f0abfa54..e9fbece2 100644 --- a/dice_ml/data_interfaces/base_data_interface.py +++ b/dice_ml/data_interfaces/base_data_interface.py @@ -1,6 +1,7 @@ """Module containing base class for data interfaces 
for dice-ml.""" from abc import ABC, abstractmethod +from dice_ml.utils.exception import SystemException, UserConfigValidationException class _BaseData(ABC): @@ -27,6 +28,46 @@ def set_continuous_feature_indexes(self, query_instance): self.continuous_feature_indexes = [query_instance.columns.get_loc(name) for name in self.continuous_feature_names] + def check_features_to_vary(self, features_to_vary): + if features_to_vary is not None and features_to_vary != 'all': + not_training_features = set(features_to_vary) - set(self.feature_names) + if len(not_training_features) > 0: + raise UserConfigValidationException("Got features {0} which are not present in training data".format( + not_training_features)) + + def check_permitted_range(self, permitted_range): + if permitted_range is not None: + permitted_range_features = list(permitted_range) + not_training_features = set(permitted_range_features) - set(self.feature_names) + if len(not_training_features) > 0: + raise UserConfigValidationException("Got features {0} which are not present in training data".format( + not_training_features)) + + for feature in permitted_range_features: + if feature in self.categorical_feature_names: + train_categories = self.permitted_range[feature] + for test_category in permitted_range[feature]: + if test_category not in train_categories: + raise UserConfigValidationException( + 'The category {0} does not occur in the training data for feature {1}.' 
+ ' Allowed categories are {2}'.format(test_category, feature, train_categories)) + + def _validate_and_set_permitted_range(self, params, features_dict=None): + """Validate and set the dictionary of permitted ranges for continuous features.""" + input_permitted_range = None + if 'permitted_range' in params: + input_permitted_range = params['permitted_range'] + + if not hasattr(self, 'feature_names'): + raise SystemException('Feature names not correctly set in public data interface') + + for input_permitted_range_feature_name in input_permitted_range: + if input_permitted_range_feature_name not in self.feature_names: + raise UserConfigValidationException( + "permitted_range contains some feature names which are not part of columns in dataframe" + ) + self.permitted_range, _ = self.get_features_range(input_permitted_range, features_dict) + @abstractmethod def __init__(self, params): """The init method needs to be implemented by the inherting classes.""" diff --git a/dice_ml/data_interfaces/private_data_interface.py b/dice_ml/data_interfaces/private_data_interface.py index 9725a288..00b3b995 100644 --- a/dice_ml/data_interfaces/private_data_interface.py +++ b/dice_ml/data_interfaces/private_data_interface.py @@ -47,21 +47,18 @@ def __init__(self, params): self._validate_and_set_type_and_precision(params=params) self.continuous_feature_names = [] - self.permitted_range = {} self.categorical_feature_names = [] self.categorical_levels = {} for feature in features_dict: if type(features_dict[feature][0]) is int: # continuous feature self.continuous_feature_names.append(feature) - self.permitted_range[feature] = features_dict[feature] else: self.categorical_feature_names.append(feature) self.categorical_levels[feature] = features_dict[feature] self._validate_and_set_mad(params=params) - - # self.continuous_feature_names + self.categorical_feature_names + self._validate_and_set_permitted_range(params=params, features_dict=features_dict) self.feature_names = 
list(features_dict.keys()) self.continuous_feature_indexes = [list(features_dict.keys()).index( @@ -74,19 +71,6 @@ def __init__(self, params): if feature_name not in self.type_and_precision: self.type_and_precision[feature_name] = 'int' - # # Initializing a label encoder to obtain label-encoded values for categorical variables - # self.labelencoder = {} - # - # self.label_encoded_data = {} - # - # for column in self.categorical_feature_names: - # self.labelencoder[column] = LabelEncoder() - # self.label_encoded_data[column] = \ - # self.labelencoder[column].fit_transform(self.categorical_levels[column]) - - # self.max_range = -np.inf - # for feature in self.continuous_feature_names: - # self.max_range = max(self.max_range, self.permitted_range[feature][1]) self._validate_and_set_data_name(params=params) @@ -177,7 +161,22 @@ def get_valid_mads(self, normalized=False, display_warnings=False, return_mads=T if return_mads: return mads - def create_ohe_params(self): + def get_features_range(self, permitted_range_input=None, features_dict=None): + ranges = {} + # Getting default ranges based on the dataset + for feature in features_dict: + if type(features_dict[feature][0]) is int: # continuous feature + ranges[feature] = features_dict[feature] + else: + ranges[feature] = features_dict[feature] + feature_ranges_orig = ranges.copy() + # Overwriting the ranges for a feature if input provided + if permitted_range_input is not None: + for feature_name, feature_range in permitted_range_input.items(): + ranges[feature_name] = feature_range + return ranges, feature_ranges_orig + + def create_ohe_params(self, one_hot_encoded_data=None): if len(self.categorical_feature_names) > 0: # simulating sklearn's one-hot-encoding # continuous features on the left @@ -291,11 +290,11 @@ def get_decoded_data(self, data, encoding='one-hot'): index = [i for i in range(0, len(data))] if encoding == 'one-hot': if isinstance(data, pd.DataFrame): - return self.from_dummies(data) + return data elif 
isinstance(data, np.ndarray): data = pd.DataFrame(data=data, index=index, columns=self.ohe_encoded_feature_names) - return self.from_dummies(data) + return data else: raise ValueError("data should be a pandas dataframe or a numpy array") @@ -354,7 +353,8 @@ def get_ohe_min_max_normalized_data(self, query_instance): """Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts""" query_instance = self.prepare_query_instance(query_instance) - temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False) + ohe_base_df = self.prepare_df_for_ohe_encoding() + temp = ohe_base_df.append(query_instance, ignore_index=True, sort=False) temp = self.one_hot_encode_data(temp) temp = temp.tail(query_instance.shape[0]).reset_index(drop=True) # returns a pandas dataframe @@ -363,7 +363,7 @@ def get_ohe_min_max_normalized_data(self, query_instance): def get_inverse_ohe_min_max_normalized_data(self, transformed_data): """Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. 
transformed_data should be a dataframe or an array""" - raw_data = self.get_decoded_data(transformed_data, encoding='one-hot') + raw_data = self.from_dummies(transformed_data) raw_data = self.de_normalize_data(raw_data) precisions = self.get_decimal_precisions() for ix, feature in enumerate(self.continuous_feature_names): diff --git a/dice_ml/data_interfaces/public_data_interface.py b/dice_ml/data_interfaces/public_data_interface.py index 829c209f..b7cc0e42 100644 --- a/dice_ml/data_interfaces/public_data_interface.py +++ b/dice_ml/data_interfaces/public_data_interface.py @@ -101,22 +101,6 @@ def _validate_and_set_continuous_features_precision(self, params): else: self.continuous_features_precision = None - def _validate_and_set_permitted_range(self, params): - """Validate and set the dictionary of permitted ranges for continuous features.""" - input_permitted_range = None - if 'permitted_range' in params: - input_permitted_range = params['permitted_range'] - - if not hasattr(self, 'feature_names'): - raise SystemException('Feature names not correctly set in public data interface') - - for input_permitted_range_feature_name in input_permitted_range: - if input_permitted_range_feature_name not in self.feature_names: - raise UserConfigValidationException( - "permitted_range contains some feature names which are not part of columns in dataframe" - ) - self.permitted_range, _ = self.get_features_range(input_permitted_range) - def _set_feature_dtypes(self, data_df, categorical_feature_names, continuous_feature_names): """Set the correct type of each feature column.""" @@ -136,38 +120,7 @@ def _set_feature_dtypes(self, data_df, categorical_feature_names, np.int32) return data_df - def check_features_to_vary(self, features_to_vary): - if features_to_vary is not None and features_to_vary != 'all': - not_training_features = set(features_to_vary) - set(self.feature_names) - if len(not_training_features) > 0: - raise UserConfigValidationException("Got features {0} which are 
not present in training data".format( - not_training_features)) - - def check_permitted_range(self, permitted_range): - if permitted_range is not None: - permitted_range_features = list(permitted_range) - not_training_features = set(permitted_range_features) - set(self.feature_names) - if len(not_training_features) > 0: - raise UserConfigValidationException("Got features {0} which are not present in training data".format( - not_training_features)) - - for feature in permitted_range_features: - if feature in self.categorical_feature_names: - train_categories = self.permitted_range[feature] - for test_category in permitted_range[feature]: - if test_category not in train_categories: - raise UserConfigValidationException( - 'The category {0} does not occur in the training data for feature {1}.' - ' Allowed categories are {2}'.format(test_category, feature, train_categories)) - - def check_mad_validity(self, feature_weights): - """checks feature MAD validity and throw warnings. - TODO: add comments as to where this is used if this function is necessary, else remove. 
- """ - if feature_weights == "inverse_mad": - self.get_valid_mads(display_warnings=True, return_mads=False) - - def get_features_range(self, permitted_range_input=None): + def get_features_range(self, permitted_range_input=None, features_dict=None): ranges = {} # Getting default ranges based on the dataset for feature_name in self.continuous_feature_names: diff --git a/dice_ml/diverse_counterfactuals.py b/dice_ml/diverse_counterfactuals.py index 2dc5c044..ebc5f7ba 100644 --- a/dice_ml/diverse_counterfactuals.py +++ b/dice_ml/diverse_counterfactuals.py @@ -104,7 +104,7 @@ def _visualize_internal(self, display_sparse_df=True, show_only_changes=False, is_notebook_console=is_notebook_console) elif not hasattr(self.data_interface, 'data_df'): # for private data print('\nDiverse Counterfactual set without sparsity correction since only metadata about each', - ' feature is available (new outcome: ', self.new_outcome) + ' feature is available (new outcome: %i)' % (self.new_outcome)) self._dump_output(content=self.final_cfs_df, show_only_changes=show_only_changes, is_notebook_console=is_notebook_console) else: diff --git a/dice_ml/explainer_interfaces/dice_tensorflow2.py b/dice_ml/explainer_interfaces/dice_tensorflow2.py index 5c29c9e5..8004a341 100644 --- a/dice_ml/explainer_interfaces/dice_tensorflow2.py +++ b/dice_ml/explainer_interfaces/dice_tensorflow2.py @@ -29,7 +29,10 @@ def __init__(self, data_interface, model_interface): self.model.transformer.feed_data_params(data_interface) self.model.transformer.initialize_transform_func() # temp data to create some attributes like encoded feature names - temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + if hasattr(self.data_interface, "data_df"): + temp_ohe_data = self.model.transformer.transform(self.data_interface.data_df.iloc[[0]]) + else: + temp_ohe_data = None self.data_interface.create_ohe_params(temp_ohe_data) self.minx, self.maxx, self.encoded_categorical_feature_indexes, 
self.encoded_continuous_feature_indexes, \ self.cont_minx, self.cont_maxx, self.cont_precisions = self.data_interface.get_data_params_for_gradient_dice() diff --git a/dice_ml/explainer_interfaces/explainer_base.py b/dice_ml/explainer_interfaces/explainer_base.py index 0fbf9ca3..50ae8e3f 100644 --- a/dice_ml/explainer_interfaces/explainer_base.py +++ b/dice_ml/explainer_interfaces/explainer_base.py @@ -225,9 +225,6 @@ def setup(self, features_to_vary, permitted_range, query_instance, feature_weigh self.check_query_instance_validity(features_to_vary, permitted_range, query_instance, feature_ranges_orig) - # check feature MAD validity and throw warnings - self.data_interface.check_mad_validity(feature_weights) - return features_to_vary def check_query_instance_validity(self, features_to_vary, permitted_range, query_instance, feature_ranges_orig): @@ -237,7 +234,6 @@ def check_query_instance_validity(self, features_to_vary, permitted_range, query if feature not in self.data_interface.feature_names: raise ValueError("Feature", feature, "not present in training data!") - for feature in self.data_interface.categorical_feature_names: if query_instance[feature].values[0] not in feature_ranges_orig[feature] and \ str(query_instance[feature].values[0]) not in feature_ranges_orig[feature]: diff --git a/docs/source/notebooks/DiCE_with_private_data.ipynb b/docs/source/notebooks/DiCE_with_private_data.ipynb index ebde6c15..b6cf8ab8 100644 --- a/docs/source/notebooks/DiCE_with_private_data.ipynb +++ b/docs/source/notebooks/DiCE_with_private_data.ipynb @@ -15,6 +15,7 @@ "outputs": [], "source": [ "# import DiCE\n", + "import pandas as pd\n", "import dice_ml\n", "from dice_ml.utils import helpers # helper functions\n", "\n", @@ -94,7 +95,7 @@ "source": [ "backend = 'TF'+tf.__version__[0] # TF1\n", "ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)\n", - "m = dice_ml.Model(model_path=ML_modelpath, backend=backend)" + "m = dice_ml.Model(model_path=ML_modelpath, 
backend=backend, func=\"ohe-min-max\")" ] }, { @@ -113,7 +114,7 @@ "outputs": [], "source": [ "# initiate DiCE\n", - "exp = dice_ml.Dice(d, m)" + "exp = dice_ml.Dice(d, m, method=\"gradient\")" ] }, { @@ -123,14 +124,14 @@ "outputs": [], "source": [ "# query instance in the form of a dictionary; keys: feature name, values: feature value\n", - "query_instance = {'age': 22,\n", + "query_instance = pd.DataFrame({'age': 22,\n", " 'workclass': 'Private',\n", " 'education': 'HS-grad',\n", " 'marital_status': 'Single',\n", " 'occupation': 'Service',\n", " 'race': 'White',\n", " 'gender': 'Female',\n", - " 'hours_per_week': 45}" + " 'hours_per_week': 45}, index=[0])" ] }, { @@ -165,7 +166,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -179,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.8.12" }, "toc": { "base_numbering": 1, From 7672de7a8fc621ea4ed4191b9ca759796838a340 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sun, 10 Jul 2022 19:21:21 +0530 Subject: [PATCH 13/15] fixed random seed to have a fixed pytorch model Signed-off-by: Amit Sharma --- dice_ml/data_interfaces/base_data_interface.py | 4 +++- tests/test_dice_interface/test_dice_random.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dice_ml/data_interfaces/base_data_interface.py b/dice_ml/data_interfaces/base_data_interface.py index e9fbece2..9143c7e6 100644 --- a/dice_ml/data_interfaces/base_data_interface.py +++ b/dice_ml/data_interfaces/base_data_interface.py @@ -1,7 +1,9 @@ """Module containing base class for data interfaces for dice-ml.""" from abc import ABC, abstractmethod -from dice_ml.utils.exception import SystemException, UserConfigValidationException + +from dice_ml.utils.exception import (SystemException, + UserConfigValidationException) class _BaseData(ABC): diff --git 
a/tests/test_dice_interface/test_dice_random.py b/tests/test_dice_interface/test_dice_random.py index 102f3efd..b773ec35 100644 --- a/tests/test_dice_interface/test_dice_random.py +++ b/tests/test_dice_interface/test_dice_random.py @@ -1,4 +1,5 @@ import pytest +import torch import dice_ml from dice_ml.counterfactual_explanations import CounterfactualExplanations @@ -16,6 +17,7 @@ def random_binary_classification_exp_object(request): dataset = helpers.load_custom_testing_dataset_binary() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') if backend == "PYT": + torch.manual_seed(1) net = FFNetwork(4) m = dice_ml.Model(model=net, backend=backend, func="ohe-min-max") else: From aeda846763df73b3a566a67b215eb1ff794f4076 Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Sun, 10 Jul 2022 22:14:22 +0530 Subject: [PATCH 14/15] fixed cf exp tests --- dice_ml/data_interfaces/private_data_interface.py | 1 - tests/test_counterfactual_explanations.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dice_ml/data_interfaces/private_data_interface.py b/dice_ml/data_interfaces/private_data_interface.py index 00b3b995..b3ce377f 100644 --- a/dice_ml/data_interfaces/private_data_interface.py +++ b/dice_ml/data_interfaces/private_data_interface.py @@ -71,7 +71,6 @@ def __init__(self, params): if feature_name not in self.type_and_precision: self.type_and_precision[feature_name] = 'int' - self._validate_and_set_data_name(params=params) def _validate_and_set_type_and_precision(self, params): diff --git a/tests/test_counterfactual_explanations.py b/tests/test_counterfactual_explanations.py index 4843fc14..187bbf7e 100644 --- a/tests/test_counterfactual_explanations.py +++ b/tests/test_counterfactual_explanations.py @@ -112,9 +112,9 @@ def test_sorted_local_importance_counterfactual_explanations(self): @pytest.fixture() def random_binary_classification_exp_object(): backend = 'sklearn' - dataset =
helpers.load_custom_testing_dataset() + dataset = helpers.load_custom_testing_dataset_binary() d = dice_ml.Data(dataframe=dataset, continuous_features=['Numerical'], outcome_name='Outcome') - ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline() + ML_modelpath = helpers.get_custom_dataset_modelpath_pipeline_binary() m = dice_ml.Model(model_path=ML_modelpath, backend=backend) exp = dice_ml.Dice(d, m, method='random') return exp From 28dcb9e77d6b3d42d2d9405d62ade98841ac837a Mon Sep 17 00:00:00 2001 From: Amit Sharma Date: Mon, 11 Jul 2022 09:06:24 +0530 Subject: [PATCH 15/15] fixed notebook flake error Signed-off-by: Amit Sharma --- .../notebooks/DiCE_with_private_data.ipynb | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/notebooks/DiCE_with_private_data.ipynb b/docs/source/notebooks/DiCE_with_private_data.ipynb index b6cf8ab8..be7a3b91 100644 --- a/docs/source/notebooks/DiCE_with_private_data.ipynb +++ b/docs/source/notebooks/DiCE_with_private_data.ipynb @@ -54,15 +54,15 @@ "metadata": {}, "outputs": [], "source": [ - "d = dice_ml.Data(features={\n", - " 'age': [17, 90],\n", - " 'workclass': ['Government', 'Other/Unknown', 'Private', 'Self-Employed'],\n", - " 'education': ['Assoc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Prof-school', 'School', 'Some-college'],\n", - " 'marital_status': ['Divorced', 'Married', 'Separated', 'Single', 'Widowed'],\n", - " 'occupation': ['Blue-Collar', 'Other/Unknown', 'Professional', 'Sales', 'Service', 'White-Collar'],\n", - " 'race': ['Other', 'White'],\n", - " 'gender': ['Female', 'Male'],\n", - " 'hours_per_week': [1, 99]},\n", + "d = dice_ml.Data(features={'age': [17, 90],\n", + " 'workclass': ['Government', 'Other/Unknown', 'Private', 'Self-Employed'],\n", + " 'education': ['Assoc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters',\n", + " 'Prof-school', 'School', 'Some-college'],\n", + " 'marital_status': ['Divorced', 'Married', 'Separated', 'Single', 
'Widowed'],\n", + " 'occupation': ['Blue-Collar', 'Other/Unknown', 'Professional', 'Sales', 'Service', 'White-Collar'],\n", + " 'race': ['Other', 'White'],\n", + " 'gender': ['Female', 'Male'],\n", + " 'hours_per_week': [1, 99]},\n", " outcome_name='income')" ] }, @@ -125,13 +125,13 @@ "source": [ "# query instance in the form of a dictionary; keys: feature name, values: feature value\n", "query_instance = pd.DataFrame({'age': 22,\n", - " 'workclass': 'Private',\n", - " 'education': 'HS-grad',\n", - " 'marital_status': 'Single',\n", - " 'occupation': 'Service',\n", - " 'race': 'White',\n", - " 'gender': 'Female',\n", - " 'hours_per_week': 45}, index=[0])" + " 'workclass': 'Private',\n", + " 'education': 'HS-grad',\n", + " 'marital_status': 'Single',\n", + " 'occupation': 'Service',\n", + " 'race': 'White',\n", + " 'gender': 'Female',\n", + " 'hours_per_week': 45}, index=[0])" ] }, {