Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dice_ml/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class SamplingStrategy:
Random = 'random'
Genetic = 'genetic'
KdTree = 'kdtree'
Gradient = 'gradient'


class ModelTypes:
Expand Down
43 changes: 43 additions & 0 deletions dice_ml/data_interfaces/base_data_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

from abc import ABC, abstractmethod

from dice_ml.utils.exception import (SystemException,
UserConfigValidationException)


class _BaseData(ABC):

Expand All @@ -27,6 +30,46 @@ def set_continuous_feature_indexes(self, query_instance):
self.continuous_feature_indexes = [query_instance.columns.get_loc(name) for name in
self.continuous_feature_names]

def check_features_to_vary(self, features_to_vary):
    """Validate the features the caller wants to vary.

    ``None`` and the literal string ``'all'`` are accepted without any check.
    Otherwise every entry must be one of ``self.feature_names``.

    :param features_to_vary: iterable of feature names, ``'all'``, or ``None``.
    :raises UserConfigValidationException: if at least one name is not a
        training feature; the message lists the offending names.
    """
    if features_to_vary is None or features_to_vary == 'all':
        return

    unknown_features = set(features_to_vary).difference(self.feature_names)
    if unknown_features:
        raise UserConfigValidationException(
            "Got features {0} which are not present in training data".format(unknown_features))

def check_permitted_range(self, permitted_range):
    """Validate a user-supplied ``permitted_range`` mapping.

    Two checks are performed when ``permitted_range`` is not ``None``:

    1. every key must be one of ``self.feature_names``;
    2. for keys listed in ``self.categorical_feature_names``, every requested
       category must already be present in ``self.permitted_range[feature]``.

    Values given for non-categorical features are not inspected here.

    :param permitted_range: mapping of feature name to permitted values, or ``None``.
    :raises UserConfigValidationException: on an unknown feature name or an
        unknown category.
    """
    if permitted_range is None:
        return

    requested_features = list(permitted_range)
    unknown_features = set(requested_features).difference(self.feature_names)
    if unknown_features:
        raise UserConfigValidationException(
            "Got features {0} which are not present in training data".format(unknown_features))

    for feature in requested_features:
        if feature not in self.categorical_feature_names:
            continue
        train_categories = self.permitted_range[feature]
        for test_category in permitted_range[feature]:
            if test_category in train_categories:
                continue
            raise UserConfigValidationException(
                'The category {0} does not occur in the training data for feature {1}.'
                ' Allowed categories are {2}'.format(test_category, feature, train_categories))

def _validate_and_set_permitted_range(self, params, features_dict=None):
"""Validate and set the dictionary of permitted ranges for continuous features."""
input_permitted_range = None
if 'permitted_range' in params:
input_permitted_range = params['permitted_range']

if not hasattr(self, 'feature_names'):
raise SystemException('Feature names not correctly set in public data interface')

for input_permitted_range_feature_name in input_permitted_range:
if input_permitted_range_feature_name not in self.feature_names:
raise UserConfigValidationException(
"permitted_range contains some feature names which are not part of columns in dataframe"
)
self.permitted_range, _ = self.get_features_range(input_permitted_range, features_dict)

@abstractmethod
def __init__(self, params):
"""The init method needs to be implemented by the inherting classes."""
Expand Down
60 changes: 33 additions & 27 deletions dice_ml/data_interfaces/private_data_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import collections
import logging
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -46,21 +47,18 @@ def __init__(self, params):
self._validate_and_set_type_and_precision(params=params)

self.continuous_feature_names = []
self.permitted_range = {}
self.categorical_feature_names = []
self.categorical_levels = {}

for feature in features_dict:
if type(features_dict[feature][0]) is int: # continuous feature
self.continuous_feature_names.append(feature)
self.permitted_range[feature] = features_dict[feature]
else:
self.categorical_feature_names.append(feature)
self.categorical_levels[feature] = features_dict[feature]

self._validate_and_set_mad(params=params)

# self.continuous_feature_names + self.categorical_feature_names
self._validate_and_set_permitted_range(params=params, features_dict=features_dict)
self.feature_names = list(features_dict.keys())

self.continuous_feature_indexes = [list(features_dict.keys()).index(
Expand All @@ -73,20 +71,6 @@ def __init__(self, params):
if feature_name not in self.type_and_precision:
self.type_and_precision[feature_name] = 'int'

# # Initializing a label encoder to obtain label-encoded values for categorical variables
# self.labelencoder = {}
#
# self.label_encoded_data = {}
#
# for column in self.categorical_feature_names:
# self.labelencoder[column] = LabelEncoder()
# self.label_encoded_data[column] = \
# self.labelencoder[column].fit_transform(self.categorical_levels[column])

# self.max_range = -np.inf
# for feature in self.continuous_feature_names:
# self.max_range = max(self.max_range, self.permitted_range[feature][1])

self._validate_and_set_data_name(params=params)

def _validate_and_set_type_and_precision(self, params):
Expand Down Expand Up @@ -176,7 +160,22 @@ def get_valid_mads(self, normalized=False, display_warnings=False, return_mads=T
if return_mads:
return mads

def create_ohe_params(self):
def get_features_range(self, permitted_range_input=None, features_dict=None):
    """Build the per-feature ranges, optionally overridden by user input.

    The default range of every feature is the value stored for it in
    ``features_dict``; the same value is used for every feature, so no
    per-type branching is needed.

    :param permitted_range_input: optional mapping of feature name to range
        that replaces the default for those features.
    :param features_dict: mapping of feature name to its default range;
        ``None`` is treated as an empty mapping.
    :return: tuple ``(ranges, feature_ranges_orig)`` where ``ranges`` includes
        the overrides and ``feature_ranges_orig`` holds only the defaults.
        Both are new dicts, but the range values themselves are shared with
        the inputs (shallow copy).
    """
    # Default ranges come straight from the features dictionary.
    ranges = dict(features_dict) if features_dict is not None else {}
    feature_ranges_orig = ranges.copy()

    # Overwrite the range of a feature if the caller provided one.
    if permitted_range_input is not None:
        ranges.update(permitted_range_input)
    return ranges, feature_ranges_orig

def create_ohe_params(self, one_hot_encoded_data=None):
if len(self.categorical_feature_names) > 0:
# simulating sklearn's one-hot-encoding
# continuous features on the left
Expand Down Expand Up @@ -265,16 +264,22 @@ def from_dummies(self, data, prefix_sep='_'):
out.drop(cols, axis=1, inplace=True)
return out

def get_decimal_precisions(self):
def get_decimal_precisions(self, output_type="list"):
""""Gets the precision of continuous features in the data."""
precisions_dict = defaultdict(int)
precisions = [0]*len(self.continuous_feature_names)
for ix, feature_name in enumerate(self.continuous_feature_names):
type_prec = self.type_and_precision[feature_name]
if type_prec == 'int':
precisions[ix] = 0
prec = 0
else:
precisions[ix] = self.type_and_precision[feature_name][1]
return precisions
prec = self.type_and_precision[feature_name][1]
precisions[ix] = prec
precisions_dict[feature_name] = prec
if output_type == "list":
return precisions
elif output_type == "dict":
return precisions_dict

def get_decoded_data(self, data, encoding='one-hot'):
"""Gets the original data from encoded data."""
Expand All @@ -284,11 +289,11 @@ def get_decoded_data(self, data, encoding='one-hot'):
index = [i for i in range(0, len(data))]
if encoding == 'one-hot':
if isinstance(data, pd.DataFrame):
return self.from_dummies(data)
return data
elif isinstance(data, np.ndarray):
data = pd.DataFrame(data=data, index=index,
columns=self.ohe_encoded_feature_names)
return self.from_dummies(data)
return data
else:
raise ValueError("data should be a pandas dataframe or a numpy array")

Expand Down Expand Up @@ -347,7 +352,8 @@ def get_ohe_min_max_normalized_data(self, query_instance):
"""Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict,
a dataframe, a list, or a list of dicts"""
query_instance = self.prepare_query_instance(query_instance)
temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False)
ohe_base_df = self.prepare_df_for_ohe_encoding()
temp = ohe_base_df.append(query_instance, ignore_index=True, sort=False)
temp = self.one_hot_encode_data(temp)
temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
# returns a pandas dataframe
Expand All @@ -356,7 +362,7 @@ def get_ohe_min_max_normalized_data(self, query_instance):
def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data
should be a dataframe or an array"""
raw_data = self.get_decoded_data(transformed_data, encoding='one-hot')
raw_data = self.from_dummies(transformed_data)
raw_data = self.de_normalize_data(raw_data)
precisions = self.get_decimal_precisions()
for ix, feature in enumerate(self.continuous_feature_names):
Expand Down
126 changes: 9 additions & 117 deletions dice_ml/data_interfaces/public_data_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,28 +54,7 @@ def __init__(self, params):
self.categorical_feature_names,
self.continuous_feature_names)

# should move the below snippet to gradient based dice interfaces
# self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
# self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
# ) if x not in np.array([self.outcome_name])]

# should move the below snippet to model agnostic dice interfaces
# # Initializing a label encoder to obtain label-encoded values for categorical variables
# self.labelencoder = {}
#
# self.label_encoded_data = self.data_df.copy()
#
# for column in self.categorical_feature_names:
# self.labelencoder[column] = LabelEncoder()
# self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])

self._validate_and_set_permitted_range(params=params)

# should move the below snippet to model agnostic dice interfaces
# self.max_range = -np.inf
# for feature in self.continuous_feature_names:
# self.max_range = max(self.max_range, self.permitted_range[feature][1])

self._validate_and_set_data_name(params=params)

def _validate_and_set_dataframe(self, params):
Expand Down Expand Up @@ -122,22 +101,6 @@ def _validate_and_set_continuous_features_precision(self, params):
else:
self.continuous_features_precision = None

def _validate_and_set_permitted_range(self, params):
"""Validate and set the dictionary of permitted ranges for continuous features."""
input_permitted_range = None
if 'permitted_range' in params:
input_permitted_range = params['permitted_range']

if not hasattr(self, 'feature_names'):
raise SystemException('Feature names not correctly set in public data interface')

for input_permitted_range_feature_name in input_permitted_range:
if input_permitted_range_feature_name not in self.feature_names:
raise UserConfigValidationException(
"permitted_range contains some feature names which are not part of columns in dataframe"
)
self.permitted_range, _ = self.get_features_range(input_permitted_range)

def _set_feature_dtypes(self, data_df, categorical_feature_names,
continuous_feature_names):
"""Set the correct type of each feature column."""
Expand All @@ -157,38 +120,7 @@ def _set_feature_dtypes(self, data_df, categorical_feature_names,
np.int32)
return data_df

def check_features_to_vary(self, features_to_vary):
    """Ensure each feature requested for variation exists in the training data.

    Nothing is checked when ``features_to_vary`` is ``None`` or ``'all'``.

    :param features_to_vary: iterable of feature names, ``'all'``, or ``None``.
    :raises UserConfigValidationException: if any name is missing from
        ``self.feature_names``.
    """
    if features_to_vary is None or features_to_vary == 'all':
        return

    missing = set(features_to_vary).difference(self.feature_names)
    if missing:
        raise UserConfigValidationException(
            "Got features {0} which are not present in training data".format(missing))

def check_permitted_range(self, permitted_range):
    """Check a user-supplied ``permitted_range`` against the training data.

    Every key must be in ``self.feature_names``. For keys that are in
    ``self.categorical_feature_names``, each requested category must be
    contained in ``self.permitted_range[feature]``. Values of other features
    are not inspected. ``None`` skips all checks.

    :param permitted_range: mapping of feature name to permitted values, or ``None``.
    :raises UserConfigValidationException: on an unknown feature name or an
        unknown category.
    """
    if permitted_range is None:
        return

    requested = list(permitted_range)
    missing = set(requested).difference(self.feature_names)
    if missing:
        raise UserConfigValidationException(
            "Got features {0} which are not present in training data".format(missing))

    categorical_requested = (name for name in requested if name in self.categorical_feature_names)
    for feature in categorical_requested:
        train_categories = self.permitted_range[feature]
        for test_category in permitted_range[feature]:
            if test_category not in train_categories:
                raise UserConfigValidationException(
                    'The category {0} does not occur in the training data for feature {1}.'
                    ' Allowed categories are {2}'.format(test_category, feature, train_categories))

def check_mad_validity(self, feature_weights):
    """Trigger the MAD validity check when inverse-MAD weighting is requested.

    If ``feature_weights`` equals the string ``"inverse_mad"``, calls
    ``self.get_valid_mads(display_warnings=True, return_mads=False)``;
    any other value is a no-op. Nothing is returned.

    TODO: add comments as to where this is used if this function is necessary, else remove.
    """
    if feature_weights != "inverse_mad":
        return
    self.get_valid_mads(display_warnings=True, return_mads=False)

def get_features_range(self, permitted_range_input=None):
def get_features_range(self, permitted_range_input=None, features_dict=None):
ranges = {}
# Getting default ranges based on the dataset
for feature_name in self.continuous_feature_names:
Expand Down Expand Up @@ -307,25 +239,6 @@ def get_minx_maxx(self, normalized=True):
minx[0][idx] = self.permitted_range[feature_name][0]
maxx[0][idx] = self.permitted_range[feature_name][1]
return minx, maxx
# if encoding=='one-hot':
# minx = np.array([[0.0] * len(self.ohe_encoded_feature_names)])
# maxx = np.array([[1.0] * len(self.ohe_encoded_feature_names)])

# for idx, feature_name in enumerate(self.continuous_feature_names):
# max_value = self.train_df[feature_name].max()
# min_value = self.train_df[feature_name].min()

# if normalized:
# minx[0][idx] = (self.permitted_range[feature_name]
# [0] - min_value) / (max_value - min_value)
# maxx[0][idx] = (self.permitted_range[feature_name]
# [1] - min_value) / (max_value - min_value)
# else:
# minx[0][idx] = self.permitted_range[feature_name][0]
# maxx[0][idx] = self.permitted_range[feature_name][1]
# else:
# minx = np.array([[0.0] * len(self.feature_names)])
# maxx = np.array([[1.0] * len(self.feature_names)])

def get_mads(self, normalized=False):
"""Computes Median Absolute Deviation of features."""
Expand Down Expand Up @@ -370,24 +283,17 @@ def get_quantiles_from_training_data(self, quantile=0.05, normalized=False):
list(set(normalized_train_df[feature].tolist())))), quantile)
return quantiles

def create_ohe_params(self):
def create_ohe_params(self, one_hot_encoded_data):
if len(self.categorical_feature_names) > 0:
one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
self.ohe_encoded_feature_names = [x for x in one_hot_encoded_data.columns.tolist(
) if x not in np.array([self.outcome_name])]
else:
# one-hot-encoded data is same as original data if there is no categorical features.
self.ohe_encoded_feature_names = [feat for feat in self.feature_names]

# base dataframe for doing one-hot-encoding
# ohe_encoded_feature_names and ohe_base_df are created (and stored as data class's parameters)
# when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers
self.ohe_base_df = self.prepare_df_for_ohe_encoding()

def get_data_params_for_gradient_dice(self):
"""Gets all data related params for DiCE."""

self.create_ohe_params()
minx, maxx = self.get_minx_maxx(normalized=True)

# get the column indexes of categorical and continuous features after one-hot-encoding
Expand Down Expand Up @@ -497,11 +403,11 @@ def get_decoded_data(self, data, encoding='one-hot'):
index = [i for i in range(0, len(data))]
if encoding == 'one-hot':
if isinstance(data, pd.DataFrame):
return self.from_dummies(data)
return data
elif isinstance(data, np.ndarray):
data = pd.DataFrame(data=data, index=index,
columns=self.ohe_encoded_feature_names)
return self.from_dummies(data)
return data
else:
raise ValueError("data should be a pandas dataframe or a numpy array")

Expand Down Expand Up @@ -560,35 +466,21 @@ def prepare_query_instance(self, query_instance):
self.continuous_feature_names)
return test

# TODO: create a new method, get_LE_min_max_normalized_data() to get label-encoded and normalized data. Keep this
# method only for converting query_instance to pd.DataFrame
# if encoding == 'label':
# for column in self.categorical_feature_names:
# test[column] = self.labelencoder[column].transform(test[column])
# return self.normalize_data(test, encoding)
#
# elif encoding == 'one-hot':
# temp = self.prepare_df_for_encoding()
# temp = temp.append(test, ignore_index=True, sort=False)
# temp = self.one_hot_encode_data(temp)
# temp = self.normalize_data(temp)
#
# return temp.tail(test.shape[0]).reset_index(drop=True)

def get_ohe_min_max_normalized_data(self, query_instance):
"""Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict,
a dataframe, a list, or a list of dicts"""
query_instance = self.prepare_query_instance(query_instance)
temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False)
ohe_base_df = self.prepare_df_for_ohe_encoding()
temp = ohe_base_df.append(query_instance, ignore_index=True, sort=False)
temp = self.one_hot_encode_data(temp)
temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
# returns a pandas dataframe
return self.normalize_data(temp)
# returns a pandas dataframe with all numeric values
return self.normalize_data(temp).apply(pd.to_numeric)

def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data
should be a dataframe or an array"""
raw_data = self.get_decoded_data(transformed_data, encoding='one-hot')
raw_data = self.from_dummies(transformed_data)
raw_data = self.de_normalize_data(raw_data)
precisions = self.get_decimal_precisions()
for ix, feature in enumerate(self.continuous_feature_names):
Expand Down
Loading