In [9]:
# Selects which pre-trained model this notebook explains; the backend-specific
# branches in the cells below key off this value.
explained_model_backend = 'tensorflow' # 'sklearn' or 'tensorflow'

# WARNING: the CFEC cell hard-codes its own model paths and load ids, so
# remember to update it manually whenever the model setup changes.

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import json
from utils.transformations import min_max_normalization, inverse_min_max_normalization, transform_to_sparse, inverse_transform_to_sparse
import warnings
import tensorflow as tf
import pickle

# Silence UserWarnings (e.g. sklearn's "RF fitted with FeatureNames" message).
warnings.filterwarnings('ignore', category=UserWarning)

dataset_name = 'adult'
instance_to_explain_index = 890

# Constraints dictionary: supplies the feature ordering and feature groups
# that the rest of the notebook reads from `constr`.
with open('../data/adult_constraints.json', 'r') as constraints_file:
    constr = json.load(constraints_file)

# Read the data and immediately put the columns into the constraints' order.
train_dataset = pd.read_csv("../data/adult.csv")[constr['features_order_nonsplit']]

# Load the model to explain; anything other than 'sklearn' means the Keras model.
# NOTE: pickle.load runs arbitrary code from the file - only load trusted models.
if explained_model_backend != 'sklearn':
    explained_model = tf.keras.models.load_model('../models/adult_NN/')
else:
    with open('../models/adult_RF.pkl', 'rb') as model_file:
        explained_model = pickle.load(model_file)

train_dataset.columns

Index(['hours.per.week', 'age', 'capital.loss', 'education.num',
       'capital.gain', 'workclass', 'marital.status', 'occupation', 'race',
       'sex', 'native.country', 'income'],
      dtype='object')

In [11]:
# One flag per post-split column: 1 when the column name contains the name of
# an actionable feature (substring match), 0 otherwise.
actionable_mask_indices_sparse = [
    int(any(actionable in column for actionable in constr['actionable_features']))
    for column in constr['features_order_after_split']
]

In [12]:
# Accumulator for the counterfactuals of every explainer: the training columns
# plus an 'explainer' tag column.
all_counterfactuals = pd.DataFrame(columns=[*train_dataset.columns, 'explainer'])

# Single-row frame holding the instance to explain, with the target removed.
query_rows = slice(instance_to_explain_index, instance_to_explain_index + 1)
query_instance = train_dataset.drop(columns="income").iloc[query_rows]

In [13]:
query_instance

Unnamed: 0,hours.per.week,age,capital.loss,education.num,capital.gain,workclass,marital.status,occupation,race,sex,native.country
890,40,35,1887,14,0,Local-gov,Married-civ-spouse,Prof-specialty,White,Male,United-States


In [14]:
# Feature groups shared by both transformation helpers.
categorical_cols = constr['categorical_features_nonsplit']
continuous_cols = constr['continuous_features_nonsplit']

# Step 1: sparse transform of the feature columns (target dropped).
train_dataset_sparse = transform_to_sparse(
    _df=train_dataset.drop(columns="income"),
    original_df=train_dataset.drop(columns="income"),
    categorical_features=categorical_cols,
    continuous_features=continuous_cols,
)

# Step 2: min-max normalization of the continuous features.
train_dataset_sparse_normalized = min_max_normalization(
    _df=train_dataset_sparse,
    original_df=train_dataset.drop(columns="income"),
    continuous_features=continuous_cols,
)

# Same row as `query_instance`, taken from the transformed data (one-row frame).
query_rows = slice(instance_to_explain_index, instance_to_explain_index + 1)
query_instance_sparse_normalized = train_dataset_sparse_normalized.iloc[query_rows]

In [15]:
explained_model.predict(query_instance_sparse_normalized)



array([[2.2027573e-04, 9.9977976e-01]], dtype=float32)

DICE

In [16]:
from dice import DiceModel

# The two setups differ only in the backend string handed to DiceModel:
# 'sklearn' stays 'sklearn', every other backend is passed as 'TF2'.
dice_backend = 'sklearn' if explained_model_backend == 'sklearn' else 'TF2'

dice_model = DiceModel(
    train_dataset=train_dataset,
    continuous_features=constr['continuous_features_nonsplit'],
    categorical_features=constr['categorical_features_nonsplit'],
    target=constr['target_feature'],
    backend=dice_backend,
    model=explained_model,
)

# Ask for 50 counterfactuals of the opposite class, varying only the
# actionable features within the permitted ranges.
dice_generation_settings = dict(
    total_CFs=50,
    desired_class='opposite',
    features_to_vary=constr['actionable_features'],
    permitted_range=constr['feature_ranges'],
)
dice_counterfactuals_df = dice_model.generate_counterfactuals(
    query_instance=query_instance,
    **dice_generation_settings,
)

# Tag the rows and append them to the shared accumulator.
dice_counterfactuals_df['explainer'] = 'dice'
all_counterfactuals = pd.concat(
    [all_counterfactuals, dice_counterfactuals_df],
    ignore_index=True,
)

100%|██████████| 1/1 [00:07<00:00,  7.87s/it]


In [None]:
all_counterfactuals.head(8)

In [19]:
print(actionable_mask_indices_sparse)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [17]:
print(np.where(actionable_mask_indices_sparse)[0].tolist())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]


CFEC

In [None]:
from cfec_ece import CfecEceModel

# frac=1.0 keeps every row; the sample is a full-size random shuffle.
train_dataset_sparse_normalized_subsample = train_dataset_sparse_normalized.sample(frac=1.0)

# Only the model location, backend name and load id depend on the backend.
# Any value other than 'sklearn' selects the TensorFlow settings.
if explained_model_backend == 'sklearn':
    cfec_backend_settings = dict(
        model_path='../models/adult_RF.pkl',
        model_backend='sklearn',
        fimap_load_s_g_full_id='adult_sklearn|2023-01-17',
        # fimap_save_s_q_prefix='adult_sklearn',
    )
else:
    cfec_backend_settings = dict(
        model_path='../models/adult_NN/',
        model_backend='tensorflow',
        fimap_load_s_g_full_id='adult_tensorflow|2023-01-17',
        # fimap_save_s_q_prefix='adult_tensorflow',
    )

# Positions of the columns flagged as actionable in the sparse layout.
actionable_column_positions = np.where(actionable_mask_indices_sparse)[0].tolist()

cfec_model = CfecEceModel(
    train_data_normalized=train_dataset_sparse_normalized_subsample,
    constraints_dictionary=constr,
    columns_to_change=actionable_column_positions,
    **cfec_backend_settings,
)

In [None]:
cfec_counterfactuals_raw, list_cfs_explainers = cfec_model.generate_counterfactuals(query_instance=query_instance_sparse_normalized.iloc[0])
cfec_counterfactuals_raw

In [None]:
# Negative values are not allowed: overwrite them with 0 (modifies the raw frame in place).
cfec_counterfactuals_raw[cfec_counterfactuals_raw < 0] = 0

# Inverse min-max normalization first, then the inverse sparse transform on its result.
cfec_counterfactuals = inverse_transform_to_sparse(
    sparse_df=inverse_min_max_normalization(
        _df=cfec_counterfactuals_raw,
        original_df=train_dataset.drop(columns="income"),
        continuous_features=constr['continuous_features_nonsplit'],
    ),
    original_df=train_dataset.drop(columns="income"),
    categorical_features=constr['categorical_features_nonsplit'],
    continuous_features=constr['continuous_features_nonsplit'],
)

In [None]:
# Reduce each reported explainer name to one of two labels: 'cadex' when the
# name mentions cadex (case-insensitive), 'fimap' for everything else.
list_cfs_explainers = [
    'cadex' if 'cadex' in explainer_name.lower() else 'fimap'
    for explainer_name in list_cfs_explainers
]

# Tag the CFEC counterfactuals and append them to the shared accumulator.
cfec_counterfactuals['explainer'] = list_cfs_explainers
all_counterfactuals = pd.concat(
    [all_counterfactuals, cfec_counterfactuals],
    ignore_index=True,
)
cfec_counterfactuals

In [None]:
query_instance_sparse_normalized[cfec_counterfactuals_raw.columns[39:]]

In [None]:
cfec_counterfactuals_raw[cfec_counterfactuals_raw.columns[39:]]

WACHTER

In [None]:
# Column-wise minimum and maximum over the normalized training data.
train_matrix = train_dataset_sparse_normalized.to_numpy()
lower_bounds = train_matrix.min(axis=0)
upper_bounds = train_matrix.max(axis=0)

# Non-actionable columns get a zero-width range fixed at the query instance's
# value, so both bounds are overwritten with that value.
query_row = query_instance_sparse_normalized.to_numpy()[0]
non_actionable_indices = ~np.array(actionable_mask_indices_sparse, dtype='bool')
lower_bounds[non_actionable_indices] = query_row[non_actionable_indices]
upper_bounds[non_actionable_indices] = query_row[non_actionable_indices]

feature_ranges = (lower_bounds, upper_bounds)
feature_ranges

In [None]:
from alibi_impl import AlibiWachter

# Number of continuous features; the active code in this cell does not read it.
continous = len(constr['continuous_features_nonsplit'])

if explained_model_backend != 'sklearn':
    # Any non-sklearn backend uses the saved TensorFlow model with only
    # target_proba and the feature ranges set explicitly.
    wachter_model = AlibiWachter(
        '../models/adult_NN/',
        'tensorflow',
        query_instance_sparse_normalized.shape,
        target_proba=1.0,
        feature_ranges=feature_ranges,
    )
else:
    # The sklearn model gets an explicit set of search hyper-parameters.
    sklearn_search_settings = dict(
        max_iter=100,
        max_lam_steps=10,
        lam_init=0.001,
        learning_rate_init=0.1,
        early_stop=50,
        tolerance=0.4,
        target_proba=1.0,
    )
    wachter_model = AlibiWachter(
        '../models/adult_RF.pkl',
        'sklearn',
        query_instance_sparse_normalized.shape,
        feature_ranges=feature_ranges,
        **sklearn_search_settings,
    )

explanation = wachter_model.generate_counterfactuals(query_instance_sparse_normalized)

In [None]:
explanation.cf['X']

In [None]:
explanation.cf['class']

In [None]:
# Start with the returned counterfactual, then add every entry stored under
# explanation['data']['all'] (empty lists are skipped).
wachter_counterfactuals = [explanation.cf['X']]
wachter_counterfactuals.extend(
    candidate['X']
    for candidates in explanation['data']['all'].values()
    if candidates
    for candidate in candidates
)

# One row per counterfactual, as wide as the sparse query instance.
n_sparse_features = query_instance_sparse_normalized.shape[1]
wachter_counterfactuals = np.array(wachter_counterfactuals).reshape(-1, n_sparse_features)

# Inverse min-max normalization first, then the inverse sparse transform on its result.
wachter_counterfactuals_df = inverse_transform_to_sparse(
    sparse_df=inverse_min_max_normalization(
        _df=pd.DataFrame(wachter_counterfactuals, columns=constr['features_order_after_split']),
        original_df=train_dataset.drop(columns="income"),
        continuous_features=constr['continuous_features_nonsplit'],
    ),
    original_df=train_dataset.drop(columns="income"),
    categorical_features=constr['categorical_features_nonsplit'],
    continuous_features=constr['continuous_features_nonsplit'],
)
wachter_counterfactuals_df['explainer'] = 'wachter'

# The Wachter counterfactuals are almost the same, so only a random sample of
# at most 10 is kept; the first row is always appended as well.
sample_size = min(len(wachter_counterfactuals_df), 10)
sampled_wachter_cfs = wachter_counterfactuals_df.sample(sample_size)

all_counterfactuals = pd.concat(
    [all_counterfactuals, wachter_counterfactuals_df.iloc[0:1], sampled_wachter_cfs],
    ignore_index=True,
)

wachter_counterfactuals_df.head(11)

In [None]:
wachter_counterfactuals_df.iloc[0:1]

In [None]:
wachter_counterfactuals_df.shape

CEM

In [None]:
from alibi.explainers import CEM

# Clear the Keras session and turn eager execution off before the model is
# reloaded for CEM.
# NOTE(review): disable_eager_execution() is a global TensorFlow switch, so it
# presumably also affects every cell run after this one - confirm.
tf.keras.backend.clear_session()
tf.compat.v1.disable_eager_execution()

# Reload the explained model after the session reset.
# NOTE: pickle.load runs arbitrary code from the file - only load trusted models.
if explained_model_backend == 'sklearn':
    # SKLEARN
    with open('../models/adult_RF.pkl', 'rb') as f:
        explained_model = pickle.load(f)
else: 
    # TENSORFLOW
    explained_model = tf.keras.models.load_model('../models/adult_NN/')

shape = query_instance_sparse_normalized.shape  # instance shape
continous = len(constr['continuous_features_nonsplit'])
clip = (-1000.,1000.)  # passed to CEM as `clip` (gradient clipping)
# `eps` argument, only passed to CEM in the sklearn branch below: a scalar plus
# a 1 x n_columns array holding 0.05 for the first `continous` columns and 1.0
# for the remaining ones.
# NOTE(review): this assumes the continuous columns come first in the sparse
# layout - confirm against constr['features_order_after_split'].
eps_cem = (
        0.05,
        np.array([[0.05] * continous + [1.0] * (len(train_dataset_sparse_normalized.columns) - continous)]) # disabled idea: scale by (actionable mask + 0.001) so non-actionable features barely change
        )
# eps_cem = (0.1, 0.1)

if explained_model_backend == 'sklearn':
    mode = 'PN'
    # feature_range is taken from `feature_ranges` (built in the WACHTER section)
    # instead of the global min/max of the training data.
    update_num_grad = 2
    c_init = 15.  # initial weight c of the loss term encouraging to predict a different class (PN) or
                # the same class (PP) for the perturbed instance compared to the original instance to be explained
    # Prediction function handed to CEM: first element of predict_proba(x) as an array.
    # NOTE(review): indexing [0] suggests predict_proba returns a list/sequence of
    # per-output arrays - confirm against the pickled model.
    cem_pred_fn = lambda x: np.array(explained_model.predict_proba(x)[0])

    cem = CEM(cem_pred_fn, mode, shape, kappa=0.0, beta=0.1, feature_range=feature_ranges, 
            update_num_grad=update_num_grad, clip=clip, no_info_val=-0.0, c_init=c_init,
            c_steps=10, learning_rate_init=.1, max_iterations=10, eps=eps_cem
            )
else:
    mode = 'PN'  # 'PN' (pertinent negative) or 'PP' (pertinent positive)
    kappa = .3 # minimum difference needed between the prediction probability for the perturbed instance on the
                # class predicted by the original instance and the max probability on the other classes
                # in order for the first loss term to be minimized
    beta = .1  # weight of the L1 loss term
    c_init = 10  # initial weight c of the loss term encouraging to predict a different class (PN) or
                # the same class (PP) for the perturbed instance compared to the original instance to be explained
   
    c_steps = 10  # nb of updates for c
    max_iterations = 1000  # nb of iterations per value of c
    # feature_range is taken from `feature_ranges` (built in the WACHTER section);
    # the per-column and global min/max variants were tried and dropped.
    lr_init = 1e-2  # initial learning rate

    # initialize the CEM explainer (the instance is explained in a later cell)
    cem = CEM(explained_model, mode, shape, kappa=kappa, beta=beta, feature_range=feature_ranges,
            max_iterations=max_iterations, c_init=c_init, c_steps=c_steps,
            learning_rate_init=lr_init, clip=clip, no_info_val=0.0
            )

In [None]:

cem.fit(train_dataset_sparse_normalized.to_numpy(), no_info_type='median')  # we need to define what feature values contain the least
                                                                    # info wrt predictions
                                                                    # here we will naively assume that the feature-wise median
                                                                    # contains no info; domain knowledge helps!

In [None]:
# Explain the query instance; the pertinent negative (PN) is the counterfactual.
cem_explanation = cem.explain(query_instance_sparse_normalized.to_numpy(), verbose=True)
pertinent_negative = cem_explanation.PN

# Inverse min-max normalization first, then the inverse sparse transform on its result.
cem_cf_df = inverse_transform_to_sparse(
    sparse_df=inverse_min_max_normalization(
        _df=pd.DataFrame(pertinent_negative, columns=constr['features_order_after_split']),
        original_df=train_dataset.drop(columns="income"),
        continuous_features=constr['continuous_features_nonsplit'],
    ),
    original_df=train_dataset.drop(columns="income"),
    categorical_features=constr['categorical_features_nonsplit'],
    continuous_features=constr['continuous_features_nonsplit'],
)
cem_cf_df['explainer'] = 'Cem'

# Append to the accumulator, printing its shape before and after.
print(all_counterfactuals.shape)
all_counterfactuals = pd.concat([all_counterfactuals, cem_cf_df], ignore_index=True)
print(all_counterfactuals.shape)

cem_cf_df

CFPROTO

In [None]:
# from alibi.explainers import CounterfactualProto

# import pickle
# with open('../models/adult_RF.pkl', 'rb') as f:
#     cfprot_model = pickle.load(f)
# predict_fnct = lambda x: cfprot_model.predict(x)


In [None]:
# cat_vars_ord = {}
# for i, cat in enumerate(constr['categorical_features_nonsplit']):
#     start_index = np.argwhere(cat == train_dataset.columns.to_numpy())[0][0]
#     unique = len(np.unique(train_dataset[cat]))
#     cat_vars_ord[start_index] = unique
# print(cat_vars_ord)

In [None]:
# cat_vars_ohe = {}
# for f in constr['categorical_features_nonsplit']:
#     indx = constr['feature_first_occurrence_after_split'][f]
#     cnt = constr['features_count_nonsplit'][f] 
#     cat_vars_ohe[indx] = cnt
# cat_vars_ohe

In [None]:
# cfProto = CounterfactualProto(predict_fnct,
#                          query_instance_sparse_normalized.shape,
#                          cat_vars=cat_vars_ohe,
#                          ohe=True,  # OHE flag
#                          max_iterations=500,
#                          beta=0.01,
#                          feature_range=(0.0, 1.0),
#                         #  use_kdtree=True,
#                          theta= 10.,
#                          c_init=1.0,
#                          c_steps=5,
#                         )

In [None]:

# cfProto.fit(train_dataset_sparse_normalized.to_numpy().astype('float64'), d_type='abdm', trustscore_kwargs=None)

In [None]:
# explanation = cfProto.explain(query_instance_sparse_normalized.to_numpy())

In [None]:
# all_counterfactuals

VISUALIZATION

In [None]:
from visualization_helpers import get_scores, remove_duplicates

all_counterfactuals = remove_duplicates(all_counterfactuals)
print('Counterfactuals: ', all_counterfactuals.shape)

# Feature groups shared by the transformation helpers below.
feature_groups = dict(
    categorical_features=constr['categorical_features_nonsplit'],
    continuous_features=constr['continuous_features_nonsplit'],
)

# Counterfactuals: sparse transform, then min-max normalization.
counterfactuals_sparse = transform_to_sparse(
    _df=all_counterfactuals,
    original_df=train_dataset.drop(columns="income"),
    **feature_groups,
)
counterfactuals_sparse_normalized = min_max_normalization(
    _df=counterfactuals_sparse,
    original_df=train_dataset.drop(columns="income"),
    continuous_features=feature_groups['continuous_features'],
)

# Query instance: same two steps. This rebinds query_instance_sparse_normalized.
query_instance_sparse = transform_to_sparse(
    _df=query_instance,
    original_df=train_dataset.drop(columns="income"),
    **feature_groups,
)
query_instance_sparse_normalized = min_max_normalization(
    _df=query_instance_sparse,
    original_df=train_dataset.drop(columns="income"),
    continuous_features=feature_groups['continuous_features'],
)

# Mask of actionable columns in the sparse layout: 1 when the column name
# contains an actionable feature name, 0 otherwise.
mask_indices = [
    int(any(actionable in column for actionable in constr['actionable_features']))
    for column in constr['features_order_after_split']
]

In [None]:
all_counterfactuals.tail(10)

In [None]:
counterfactuals_sparse_normalized.tail(10)

In [None]:
# Sanity check: print the explained model's output for the query instance and
# for the counterfactuals produced by each explainer.
#
# The labels must match what was written into the 'explainer' column when the
# counterfactuals were collected: 'Cem', 'wachter', 'cadex' and 'fimap'.
# (Filtering on 'Cadex'/'Fimap' never matched, so those rows were silently skipped.)
def _rows_of(explainer_label):
    """Return the index values of all counterfactuals tagged with `explainer_label`."""
    return all_counterfactuals[all_counterfactuals['explainer'] == explainer_label].index.tolist()


cems = _rows_of('Cem')
wachters = _rows_of('wachter')
cadexes = _rows_of('cadex')
fimaps = _rows_of('fimap')

# Printed name -> rows, in the order the results are reported.
rows_by_explainer = {
    'cem': cems,
    'wachters': wachters,
    'cadexes': cadexes,
    'fimaps': fimaps,
}

if explained_model_backend == 'sklearn':

    print('Orginal x: ',explained_model.predict_proba(query_instance_sparse_normalized)[0] )

    for printed_name, rows in rows_by_explainer.items():
        if len(rows) > 0:
            # to_numpy() on the selected rows is already (n_rows, n_columns),
            # so no hard-coded column count is needed.
            batch = counterfactuals_sparse_normalized.iloc[rows].to_numpy()
            print(f'{printed_name}: ', explained_model.predict_proba(batch)[0])

elif explained_model_backend == 'tensorflow':

    print(cems)

    print('Orginal x: ',explained_model.predict(query_instance_sparse_normalized) )

    for printed_name, rows in rows_by_explainer.items():
        if len(rows) > 0:
            batch = counterfactuals_sparse_normalized.iloc[rows].to_numpy()
            print(f'{printed_name}: {np.round(explained_model.predict(batch), 3)}')

In [None]:
from visualization_helpers import filter_non_valid, filter_non_actionable_features

# Reload the explained model; anything other than 'sklearn' means the Keras model.
# NOTE: pickle.load runs arbitrary code from the file - only load trusted models.
if explained_model_backend != 'sklearn':
    explained_model = tf.keras.models.load_model('../models/adult_NN/')
else:
    with open('../models/adult_RF.pkl', 'rb') as model_file:
        explained_model = pickle.load(model_file)


def predict_fn(x):
    """Forward `x` to the currently loaded explained model's predict()."""
    return explained_model.predict(x)


valid_counterfactuals_sparse_normalized = filter_non_valid(
    predict_fn,
    query_instance_sparse_normalized,
    counterfactuals_sparse_normalized,
)
valid_counterfactuals_sparse_normalized.index

In [None]:
# Feasibility filter (data is min-max normalized, so values shouldn't be below
# zero). The check is currently disabled, so nothing is marked as infeasible:
# not_feasible = np.where(np.sum(counterfactuals_sparse_normalized < 0, axis=1) > 0)[0]
not_feasible = []

# Keep every valid counterfactual that is not listed as infeasible.
indices_to_keep = [
    row
    for row in valid_counterfactuals_sparse_normalized.index.tolist()
    if row not in not_feasible
]
valid_counterfactuals_sparse_normalized = counterfactuals_sparse_normalized.iloc[indices_to_keep]
print(indices_to_keep)

In [None]:
valid_counterfactuals_sparse_normalized.head(5)

In [None]:
all_counterfactuals.iloc[indices_to_keep]

In [None]:
# Take the kept rows (renumbered from 0) and filter them against the
# non-actionable features of the query instance.
valid_counterfactuals = filter_non_actionable_features(
    all_counterfactuals.iloc[indices_to_keep].reset_index(drop=True),
    query_instance,
    constr['non_actionable_features'],
    constr['categorical_features_nonsplit'],
    constr['continuous_features_nonsplit'],
)

# Keep the same surviving rows in the sparse-normalized frame, then renumber
# both frames so their rows line up again.
surviving_rows = valid_counterfactuals.index.tolist()
valid_counterfactuals_sparse_normalized = (
    valid_counterfactuals_sparse_normalized.iloc[surviving_rows].reset_index(drop=True)
)
valid_counterfactuals = valid_counterfactuals.reset_index(drop=True)
valid_counterfactuals.shape

In [None]:
# Add the income column: the class the model predicts for EACH counterfactual
# (row-wise argmax over the model output, same convention as the training-data
# predictions computed further down). Predicting only the first row and
# broadcasting it would mislabel rows whose class differs from the first one.
valid_counterfactuals['income'] = np.argmax(
    predict_fn(valid_counterfactuals_sparse_normalized.to_numpy()),
    axis=1,
)
valid_counterfactuals

In [None]:
#scores_df = get_scores(valid_counterfactuals_sparse_normalized.to_numpy(), query_instance_sparse_normalized, train_dataset_sparse_normalized, train_dataset['income'], mask_indices)

In [None]:
# Model output for the whole training set -> one predicted class per row.
train_data_model_output = predict_fn(train_dataset_sparse_normalized)
train_data_predicted_classes = np.argmax(train_data_model_output, axis=1)

# Model output for the query instance -> a single predicted class
# (argmax over the flattened output).
query_model_output = predict_fn(query_instance_sparse_normalized)
x_predicted_class = np.argmax(query_model_output)

In [None]:
# Feature columns (target removed) in their current order.
cols = train_dataset.drop(columns='income').columns.tolist()

# Position of every continuous / categorical feature within `cols`.
continous_indices = [cols.index(name) for name in constr['continuous_features_nonsplit']]
categorical_indices = [cols.index(name) for name in constr['categorical_features_nonsplit']]

print(continous_indices)
print(categorical_indices)

# Both groups together should account for every feature column.
n_indexed = len(categorical_indices) + len(continous_indices)
print('Proper indices extracted: ', n_indexed == len(cols))

In [None]:
preferences = [0, 4, 2, 3, 5, 1]

print(f'Preferences: {np.array(cols)[preferences]}')

In [None]:
# NOTE: this rebinds `get_scores`, replacing the function of the same name
# imported from visualization_helpers earlier in the notebook.
from utils.scores import get_scores


# Score every valid counterfactual against the query instance and the training data.
# All feature matrices are passed as fixed-width unicode arrays ('<U11').
# NOTE(review): '<U11' keeps at most 11 characters per value, so longer category
# names (e.g. 'Married-civ-spouse') and long numeric strings are truncated. The
# same cast is applied to all three inputs - confirm get_scores does not need
# the full strings.
scores_df = get_scores(
    cfs=valid_counterfactuals.drop(['income', 'explainer'], axis=1).to_numpy().astype('<U11'),
    cf_predicted_classes=valid_counterfactuals['income'].to_numpy(),
    x=query_instance.to_numpy()[0].astype('<U11'),
    x_predicted_class=x_predicted_class,
    training_data=train_dataset.drop(['income'], axis=1).to_numpy().astype('<U11'),
    training_data_predicted_classes=train_data_predicted_classes,
    continous_indices=continous_indices,
    categorical_indices=categorical_indices,
    preferences_ranking=preferences,
)

In [None]:
scores_df['explainer'] = valid_counterfactuals['explainer']
scores_df.head(3)

In [None]:
scores_to_plot = scores_df.copy()

In [None]:
scores_to_plot.explainer.value_counts()

In [None]:
# Short model tag used in the plot title and output file names: 'RF' when the
# loaded model's type mentions sklearn, 'NN' otherwise.
explained_model_name = 'RF' if 'sklearn' in str(type(explained_model)) else 'NN'

In [None]:
def get_optimization_direction(metric_name: str) -> str:
    """Return the optimization direction for a score column.

    A metric is treated as a cost (lower is better) when its name contains one
    of the cost keywords, compared case-insensitively; every other metric is
    treated as a gain (higher is better).

    Args:
        metric_name: Name of the metric / score column.

    Returns:
        'min' for cost metrics, 'max' otherwise.
    """
    cost_criteria = ('feasibility', 'proximity', 'features')

    lowered_name = metric_name.lower()
    if any(criterion in lowered_name for criterion in cost_criteria):
        return 'min'
    return 'max'
        
get_optimization_direction('Feasibility')

In [None]:
import os

import matplotlib.pyplot as plt
from visualization_helpers import get_pareto_frontier_mask
from pareto import get_pareto_optimal_mask

# Pair plot of every score against every other score. Off-diagonal panels show
# the counterfactuals as scatter points (Pareto-optimal ones in orange, the
# rest in steel blue, one marker per explainer); diagonal panels show one
# histogram per explainer.

#metrics_to_plot = ['proximity', 'features_changed', 'feasibility', 'dispreference_dcg', 'non_discriminative_power']
metrics_to_plot = scores_to_plot.drop(['explainer'], axis=1).columns.tolist()

n = len(metrics_to_plot)

# squeeze=False keeps `ax` a 2-D array even when there is a single metric,
# so flatten() works for every n >= 1.
fig, ax = plt.subplots(n, n, figsize=(3.5*n, 3*n), squeeze=False)
ax = ax.flatten()

colors = ['r', 'g', 'b', 'y', 'c', 'm', 'k', 'w']
markers = ['s', 'o', 'v', '+', '*', 'p', 'P', 'X', 'D', '>']

# Explainers ordered from the fewest to the most counterfactuals.
explainers = scores_to_plot['explainer'].value_counts().sort_values(ascending=True).index.tolist()

for i, other_metric in enumerate(metrics_to_plot):
    for j, metric in enumerate(metrics_to_plot):
        panel = ax[i*n+j]

        all_x = scores_to_plot[metric].to_numpy()
        all_y = scores_to_plot[other_metric].to_numpy()
        to_check = np.array([all_x, all_y], dtype=np.float64).T

        # Pareto mask over ALL counterfactuals for this metric pair; computed
        # once per panel and shared by both drawing rounds.
        metric_direction = get_optimization_direction(metric)
        other_metric_direction = get_optimization_direction(other_metric)
        optimization_directions = [metric_direction, other_metric_direction]
        all_pareto = get_pareto_optimal_mask(data=to_check, optimization_direction=optimization_directions).astype('bool')

        # grid() without arguments toggles the grid, so calling it once per
        # drawing round switched it back off; request it explicitly instead.
        panel.grid(True)

        # Two rounds so the Pareto-optimal points are drawn on top of the others.
        for plot_round in ['nonpareto', 'pareto']:
            for k, explainer in enumerate(explainers):

                mask = (scores_to_plot['explainer'] == explainer).to_numpy()
                pareto = all_pareto[mask]

                x = scores_to_plot.loc[mask, metric].to_numpy()
                y = scores_to_plot.loc[mask, other_metric].to_numpy()

                # Cycle through the style lists so extra explainers cannot
                # run past their end.
                color = colors[k % len(colors)]
                marker = markers[k % len(markers)]

                if plot_round == 'nonpareto':
                    if i == j:
                        panel.hist(x, color=color, label=explainer, alpha=0.5)
                        panel.legend()
                    else:
                        panel.scatter(x[~pareto], y[~pareto], color='steelblue', marker=marker, label=explainer)
                elif i != j:
                    panel.scatter(x[pareto], y[pareto], color='orange', marker=marker)

                    # Report each metric pair only once (upper triangle).
                    if i < j:
                        print(f'For explainer: {explainer} and metrics {metric}, {other_metric}, paretos: {pareto.sum()} out of {len(pareto)}')

        panel.set_xlabel(f'({metric_direction}) {metric}')
        panel.set_ylabel(f'({other_metric_direction}) {other_metric}')

# Figure legend built from the second panel (first scatter panel); fall back to
# the only panel when a single metric is plotted.
handles, labels = ax[min(1, len(ax) - 1)].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')

counts = scores_df['explainer'].value_counts()

plt.suptitle(f'Pareto frontiers of the counterfactuals (lower is better)\nExplained model: {explained_model_name}\nDataset: {dataset_name}\nCounterfactuals by method {counts.to_dict()}\n')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_dir = f'../images/{dataset_name}/{explained_model_name}'
os.makedirs(output_dir, exist_ok=True)
plt.savefig(f'{output_dir}/{dataset_name}_{explained_model_name}_{instance_to_explain_index}_pairplot_with_frontiers.png')
plt.show()

In [None]:
scores_to_plot.to_csv(f'cf_scores_tmp-{explained_model_name}-{dataset_name}-{instance_to_explain_index}.csv', index=False)

In [None]:
# scores_df[['feasibility', 'features_changed']]

In [None]:
# metric = 'feasibility'
# other_metric = 'features_changed'
# all_x = scores_df[metric].to_numpy()
# all_y = scores_df[other_metric].to_numpy()
# to_check = np.array([all_x, all_y], dtype=np.float64).T
# all_pareto = get_pareto_frontier_mask(to_check)
# scores_df[[metric, other_metric, 'explainer']][all_pareto]

In [None]:
# scores_df['explainer']

3D plot

In [None]:
# fig = plt.figure()
# ax = fig.add_subplot(projection='3d')


# metric = 'Proximity'
# other_metric = 'Features Changed (normalized)'
# other_other_metric = 'Feasibility'

# all_x = scores_to_plot[metric].to_numpy()
# all_y = scores_to_plot[other_metric].to_numpy()
# all_z = scores_to_plot[other_other_metric].to_numpy()
# to_check = np.array([all_x, all_y, all_z], dtype=np.float64).T

# # Get pareto frontiers mask
# metric_direction = get_optimization_direction(metric)
# other_metric_direction = get_optimization_direction(other_metric)
# other_other_metric_direction = get_optimization_direction(other_other_metric)
# optimization_directions = [metric_direction, other_metric_direction, other_other_metric_direction]
# all_pareto = get_pareto_optimal_mask(data=to_check, optimization_direction=optimization_directions).astype('bool')

# for plot_round in ['nonpareto', 'pareto']:
#     for k, explainer in enumerate(scores_to_plot['explainer'].value_counts().sort_values(ascending=True).index.tolist()):

#         mask = scores_to_plot['explainer'] == explainer
#         pareto = all_pareto[mask]

#         x = scores_to_plot[mask][metric].to_numpy()
#         y = scores_to_plot[mask][other_metric].to_numpy()
#         z = scores_to_plot[mask][other_other_metric].to_numpy()

#         if plot_round == 'nonpareto':
#             ax.scatter(x[~pareto], y[~pareto], z[~pareto], color='steelblue', marker=markers[k], label=explainer)
#         elif plot_round == 'pareto':
#             ax.scatter(x[pareto], y[pareto], z[pareto], color='orange', marker=markers[k])
         
# ax.set_xlabel(f'({metric_direction}) {metric}')
# ax.set_ylabel(f'({other_metric_direction}) {other_metric}')
# ax.set_zlabel(f'({other_other_metric_direction}) {other_other_metric}')

# plt.title(f'Pareto frontiers of the counterfactuals (lower is better)\nExplained model: {explained_model_name}\nDataset: {dataset_name}\nCounterfactuals by method {counts.to_dict()}\n')

# plt.tight_layout()
# plt.legend()
# plt.show()