In [87]:
import pandas as pd
import numpy as np
import numpy.typing as npt
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json
from experiment_utils import load_data, get_closest_to_optimal_point, get_pareto_optimal_mask, get_ideal_point

dataset = 'german'
date = '2023-03-12'

path = os.path.join(os.getcwd(), 'experiments', date, 'scores')

In [88]:
# Load scores
list_of_scores_df , scores_df_all, scores_test_set_indices = load_data('scores', date, dataset)
print(f'Gathered scores for {len(list_of_scores_df)} instances')

Gathered scores for 23 instances


In [89]:
# Load valid_scores
list_of_valid_scores_df , valid_scores_df_all, valid_scores_test_set_indices = load_data('valid_scores', date, dataset)
print(f'Gathered valid_scores for {len(list_of_valid_scores_df)} instances')

Gathered valid_scores for 23 instances


In [90]:
# Load counterfactuals
list_of_counterfactuals_df , counterfactuals_df_all, cf_test_set_indices = load_data('counterfactuals', date, dataset)
print(f'Gathered counterfactuals for {len(list_of_counterfactuals_df)} instances')

Gathered counterfactuals for 23 instances


In [91]:
# Load valid_counterfactuals
list_of_valid_counterfactuals_df , valid_counterfactuals_df_all, valid_cf_test_set_indices = load_data('valid_counterfactuals', date, dataset)
print(f'Gathered valid_counterfactuals for {len(list_of_valid_counterfactuals_df)} instances')

Gathered valid_counterfactuals for 23 instances


In [92]:
# Load test data - original x instances
test_data_path = os.path.join(os.getcwd(), 'data', f'{dataset}_test.csv')
test_dataset = pd.read_csv(test_data_path).iloc[scores_test_set_indices]
print(f'Loaded test data for {len(test_dataset)} instances')

Loaded test data for 23 instances


In [93]:
# Load constraints for the dataset
with open(os.path.join(os.getcwd(), 'data', f'{dataset}_constraints.json'), 'r') as f:
    constraints = json.load(f)
print(f'Loaded constraints for: {constraints["dataset_shortname"]}')

Loaded constraints for: german


In [94]:
assert scores_test_set_indices == cf_test_set_indices
assert len(list_of_scores_df) == len(list_of_counterfactuals_df) == len(test_dataset)

In [95]:
from typing import List


def get_ranges(test_data: pd.DataFrame, constraints: dict) -> npt.NDArray:
    '''
    Get ranges for continous variables.
    '''
    mins = test_data[constraints['continuous_features_nonsplit']].to_numpy().min(axis=0)
    maxes = test_data[constraints['continuous_features_nonsplit']].to_numpy().max(axis=0)
    feature_ranges = maxes - mins
    return feature_ranges


def heom(x: npt.NDArray, y: npt.NDArray, ranges: npt.NDArray, continous_indices: npt.NDArray, categorical_indices: npt.NDArray) -> float:
    '''
    Calculate HEOM distance between x and y. 
    X and Y should not be normalized. 
    X should be (n, m) dimensional.
    Y should be 1-D array.
    Ranges is max-min on each continous variables (order matters). 
    '''
    distance = np.zeros(x.shape[0])

    # Continous |x-y| / range
    distance += np.sum(np.abs(x[:, continous_indices].astype('float64') - y[continous_indices].astype('float64')) / ranges, axis=1)

    # Categorical - overlap
    distance += np.sum(~np.equal(x[:, categorical_indices], y[categorical_indices]), axis=1)

    return distance

def plausibility(test_data: pd.DataFrame, 
                 x_index: int, 
                 counterfactual: pd.DataFrame | pd.Series, 
                 list_of_counterfactuals_df: List[pd.DataFrame], 
                 ranges: npt.NDArray, 
                 continous_indices: npt.NDArray | List[float], 
                 categorical_indices: npt.NDArray | List[float]
                 ):
    # Find closest instance to original_x in test_data
    n = len(test_data)
    x = test_data.iloc[0:n+1].to_numpy()
    y = test_data.iloc[x_index].to_numpy()

    all_distances = heom(x, y, ranges, continous_indices, categorical_indices)
    # find closest instance to original_x in test_data
    sorting_indices = np.argsort(all_distances)
    # we do not take 0 because it is the same instance as original_x
    closest_index = np.array(list(zip(range(n), all_distances)))[sorting_indices][1][0].astype(int)
    # counterfactuals of closest x' to x
    closest_counterfactuals = list_of_counterfactuals_df[closest_index].to_numpy()
    
    # x_counterfactuals = list_of_counterfactuals_df[x_index].to_numpy()
    # # calculate all pairs of distances between counterfactuals from x and x'
    # sum_of_distances = .0
    # for x_cf in x_counterfactuals:
    #     mean_distance = np.mean(heom(closest_counterfactuals, x_cf, ranges, continous_indices, categorical_indices))
    #     sum_of_distances += mean_distance
    # return sum_of_distances / len(x_counterfactuals)
    
    plausibility_score = np.min(heom(closest_counterfactuals, counterfactual.to_numpy(), ranges, continous_indices, categorical_indices))
    return plausibility_score
    
    
    

continous_indices = [test_dataset.columns.get_loc(c) for c in constraints['continuous_features_nonsplit']]
categorical_indices = [test_dataset.columns.get_loc(c) for c in constraints['categorical_features_nonsplit']]
ranges = get_ranges(test_dataset, constraints)

print(f'Continous indices: {continous_indices}')
print(f'Categorical indices: {categorical_indices}')
print(f'Ranges: {ranges}')

test_plaus = plausibility(test_dataset, 0, list_of_counterfactuals_df[0].iloc[0], list_of_counterfactuals_df, ranges, continous_indices, categorical_indices)
# Calculate example plausibility score
print(f'Test plausibility: {test_plaus:.2f}')

Continous indices: [0, 1, 2, 3, 4, 5, 6]
Categorical indices: [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Ranges: [   36 13946     3     3    46     1     1]
Test plausibility: 5.94


In [96]:
def sparsity(x_instance: npt.NDArray, cf_instance: npt.NDArray, continous_indices, categorical_indices) -> int:
    _sparsity = 0
    
    # Continous
    _sparsity += np.sum(~np.isclose(x_instance[continous_indices].astype('float64'), cf_instance[continous_indices].astype('float64'), atol=1e-05))
    
    # Categorical
    _sparsity += np.sum(~np.equal(x_instance[categorical_indices].astype('str'), cf_instance[categorical_indices].astype('str')))
    
    return _sparsity

In [97]:
def is_actionable(x_instance: npt.NDArray, cf_instance: npt.NDArray, continous_indices, categorical_indices, freeze_indices) -> bool:
    for freeze_index in freeze_indices:
        if freeze_index in continous_indices \
            and not np.isclose(x_instance[freeze_index].astype('float64'), cf_instance[freeze_index].astype('float64'), atol=1e-05):
            return False
        if freeze_index in categorical_indices \
            and not np.equal(x_instance.astype('str')[freeze_index], cf_instance.astype('str')[freeze_index]):
            return False
    return True

freeze_indices = [test_dataset.columns.get_loc(c) for c in constraints['non_actionable_features']]

## Combine experiment metrics

In [98]:
all_explainer_names = counterfactuals_df_all['explainer'].unique().tolist() + ['ideal_point_eucli', 'ideal_point_cheby', 'random_choice']

experiment_scores = {
    'proximity': {k: [] for k in all_explainer_names},
    'k_feasibility_3': {k: [] for k in all_explainer_names},
    'discriminative_power_9': {k: [] for k in all_explainer_names},
    'sparsity': {k: [] for k in all_explainer_names},
    'plausibility': {k: [] for k in all_explainer_names},
    'coverage': {k: 0 for k in all_explainer_names},
    'actionable': {k: 0 for k in all_explainer_names},
}

In [99]:
# Calculate plausibility for all counterfactuals
for i in range(len(test_dataset)):
    i_counterfactuals = list_of_counterfactuals_df[i]
    i_scores = list_of_scores_df[i]

    for explainer_name in all_explainer_names:
        if 'ideal_point' in explainer_name:
            # Get counterfactual closest to ideal point
            iscores = i_scores[['Proximity', 'K_Feasibility(3)', 'DiscriminativePower(9)']].to_numpy()
            
            # Apply normalization in each feature
            iscores = (iscores - iscores.min(axis=0)) / (iscores.max(axis=0) - iscores.min(axis=0))
            
            pareto_mask = get_pareto_optimal_mask(iscores, ['min', 'min', 'max'])
            ideal_point = get_ideal_point(iscores, ['min', 'min', 'max'], pareto_mask)
            
            distance_metric = 'euclidean' if 'eucli' in explainer_name else 'chebyshev'
            
            closest_idx = get_closest_to_optimal_point(iscores, ['min', 'min', 'max'], pareto_mask, ideal_point, distance_metric)
            #print(closest_idx)
            _index = closest_idx
        elif explainer_name == 'random_choice':
            # Get random counterfactual from all counterfactuals
            _index = np.random.permutation(i_scores.index)[0]
        elif explainer_name not in i_scores['explainer'].unique():
            continue
        else:
            #print(explainer_name)
            # Get random counterfactual from particular explainer
            _index = np.random.permutation(i_scores[i_counterfactuals['explainer'] == explainer_name].index)[0]
            
        _cf = i_counterfactuals.loc[_index]
        _plausibility = plausibility(test_dataset, i, _cf, list_of_counterfactuals_df, ranges, continous_indices, categorical_indices)
        experiment_scores['plausibility'][explainer_name].append(_plausibility)
        
        _sparsity = sparsity(test_dataset.iloc[i].to_numpy(), _cf.to_numpy(), continous_indices, categorical_indices)
        experiment_scores['sparsity'][explainer_name].append(_sparsity)
        
        _score = i_scores.loc[_index]
        experiment_scores['proximity'][explainer_name].append(_score['Proximity'])
        experiment_scores['k_feasibility_3'][explainer_name].append(_score['K_Feasibility(3)'])
        experiment_scores['discriminative_power_9'][explainer_name].append(_score['DiscriminativePower(9)'])
        experiment_scores['coverage'][explainer_name] += 1
        
        actionable = is_actionable(test_dataset.iloc[i].to_numpy(), _cf.to_numpy(), continous_indices, categorical_indices, freeze_indices)
        experiment_scores['actionable'][explainer_name] += int(actionable)
        

In [100]:
# average experiment scores
for metric_name, v in experiment_scores.items():
    for explainer_name, scores in v.items():
        if metric_name in ['coverage', 'actionable']:
            experiment_scores[metric_name][explainer_name] = experiment_scores[metric_name][explainer_name] / len(test_dataset)
        else:
            experiment_scores[metric_name][explainer_name] = np.mean(scores)
        print(f'{metric_name} {explainer_name}: {experiment_scores[metric_name][explainer_name]:.2f}')

proximity dice: 1.93
proximity cadex: 1.23
proximity fimap: 6.55
proximity wachter: 0.52
proximity cem: 0.38
proximity cfproto: 5.54
proximity growing-spheres: 7.93
proximity actionable-recourse: 0.75
proximity face: 4.66
proximity ideal_point_eucli: 1.49
proximity ideal_point_cheby: 1.38
proximity random_choice: 5.21
k_feasibility_3 dice: 3.91
k_feasibility_3 cadex: 3.61
k_feasibility_3 fimap: 2.92
k_feasibility_3 wachter: 3.31
k_feasibility_3 cem: 3.42
k_feasibility_3 cfproto: 4.90
k_feasibility_3 growing-spheres: 5.81
k_feasibility_3 actionable-recourse: 3.39
k_feasibility_3 face: 1.72
k_feasibility_3 ideal_point_eucli: 2.78
k_feasibility_3 ideal_point_cheby: 2.87
k_feasibility_3 random_choice: 3.96
discriminative_power_9 dice: 0.45
discriminative_power_9 cadex: 0.42
discriminative_power_9 fimap: 0.66
discriminative_power_9 wachter: 0.69
discriminative_power_9 cem: 0.65
discriminative_power_9 cfproto: 0.49
discriminative_power_9 growing-spheres: 0.63
discriminative_power_9 actionabl

In [101]:
# build dataframe from experiment scores
experiment1_df = pd.DataFrame(experiment_scores).round(2)
experiment1_df

Unnamed: 0,proximity,k_feasibility_3,discriminative_power_9,sparsity,plausibility,coverage,actionable
dice,1.93,3.91,0.45,2.17,4.67,1.0,1.0
cadex,1.23,3.61,0.42,2.65,4.45,1.0,1.0
fimap,6.55,2.92,0.66,9.57,4.1,1.0,1.0
wachter,0.52,3.31,0.69,3.17,3.99,1.0,1.0
cem,0.38,3.42,0.65,1.74,4.18,1.0,1.0
cfproto,5.54,4.9,0.49,7.3,5.41,1.0,0.91
growing-spheres,7.93,5.81,0.63,10.83,5.77,1.0,1.0
actionable-recourse,0.75,3.39,0.42,0.88,4.04,0.35,0.35
face,4.66,1.72,0.62,7.48,4.03,1.0,0.96
ideal_point_eucli,1.49,2.78,0.86,3.48,3.8,1.0,1.0


In [102]:
max_metric = ['discriminative_power_9', 'coverage', 'actionable']

def highlight_top3(s):
    #print(s)
    if s.name in max_metric:
        top = sorted(s, reverse=True)[:3]
    else:
        top = sorted(s)[:3]
    return ['font-weight: bold' if v  in top else '' for v in s]

# bold top 3 in each metric
res = experiment1_df.style.apply(highlight_top3, axis=0)
# Round to 2 decimals
res = res.format(precision=2)
res

Unnamed: 0,proximity,k_feasibility_3,discriminative_power_9,sparsity,plausibility,coverage,actionable
dice,1.93,3.91,0.45,2.17,4.67,1.0,1.0
cadex,1.23,3.61,0.42,2.65,4.45,1.0,1.0
fimap,6.55,2.92,0.66,9.57,4.1,1.0,1.0
wachter,0.52,3.31,0.69,3.17,3.99,1.0,1.0
cem,0.38,3.42,0.65,1.74,4.18,1.0,1.0
cfproto,5.54,4.9,0.49,7.3,5.41,1.0,0.91
growing-spheres,7.93,5.81,0.63,10.83,5.77,1.0,1.0
actionable-recourse,0.75,3.39,0.42,0.88,4.04,0.35,0.35
face,4.66,1.72,0.62,7.48,4.03,1.0,0.96
ideal_point_eucli,1.49,2.78,0.86,3.48,3.8,1.0,1.0


In [103]:
# pandas dataframe to latex table
print(res.to_latex())

\begin{tabular}{lrrrrrrr}
 & proximity & k_feasibility_3 & discriminative_power_9 & sparsity & plausibility & coverage & actionable \\
dice & 1.93 & 3.91 & 0.45 & \font-weightbold 2.17 & 4.67 & \font-weightbold 1.00 & \font-weightbold 1.00 \\
cadex & 1.23 & 3.61 & 0.42 & 2.65 & 4.45 & \font-weightbold 1.00 & \font-weightbold 1.00 \\
fimap & 6.55 & 2.92 & 0.66 & 9.57 & 4.10 & \font-weightbold 1.00 & \font-weightbold 1.00 \\
wachter & \font-weightbold 0.52 & 3.31 & \font-weightbold 0.69 & 3.17 & \font-weightbold 3.99 & \font-weightbold 1.00 & \font-weightbold 1.00 \\
cem & \font-weightbold 0.38 & 3.42 & 0.65 & \font-weightbold 1.74 & 4.18 & \font-weightbold 1.00 & \font-weightbold 1.00 \\
cfproto & 5.54 & 4.90 & 0.49 & 7.30 & 5.41 & \font-weightbold 1.00 & 0.91 \\
growing-spheres & 7.93 & 5.81 & 0.63 & 10.83 & 5.77 & \font-weightbold 1.00 & \font-weightbold 1.00 \\
actionable-recourse & \font-weightbold 0.75 & 3.39 & 0.42 & \font-weightbold 0.88 & 4.04 & 0.35 & 0.35 \\
face & 4.66 & \fon