# FACE Hyperparameter Results

In [1]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../../..'))

%load_ext autoreload
%autoreload 2

import joblib
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn import model_selection

import matplotlib.pyplot as plt
import seaborn as sns
from models import model_interface, model_loader, model_constants
from data import data_loader
from data.adapters import continuous_adapter
from scripts import fit_kde

In [16]:
RECOURSE_METHOD = 'face'
RESULTS_DIR = '../../../experiment_results/face/logistic_regression/give_me_credit/face_hyperparam'

# Preliminaries -- load everything

In [17]:
DATASET, DATASET_INFO = data_loader.load_data(data_loader.DatasetName('give_me_credit'), split="train")
MODEL = model_loader.load_model(model_constants.ModelType('logistic_regression'), data_loader.DatasetName('give_me_credit'))
ADAPTER = continuous_adapter.StandardizingAdapter(
    label_column = DATASET_INFO.label_column, positive_label=DATASET_INFO.positive_label
).fit(DATASET)

DROP_COLUMNS = ['step_id', 'path_id', 'run_id', 'batch_id']  # columns which are convenient to drop from the path_df

config_df = pd.read_csv(os.path.join(RESULTS_DIR, 'experiment_config_df.csv'))
path_df = pd.read_csv(os.path.join(RESULTS_DIR, f'{RECOURSE_METHOD}_paths_df.csv'))
config_df

Unnamed: 0,batch_id,run_id,run_seed,confidence_cutoff,counterfactual_mode,distance_threshold,graph_directory,max_iterations,noise_ratio,num_paths,rescale_ratio,elapsed_recourse_seconds
0,0,16,5500,0.7,True,0.75,recourse_methods/face_graphs/give_me_credit,50,,3,,1.119539
1,0,25,5403,0.7,True,0.75,recourse_methods/face_graphs/give_me_credit,50,,3,,0.777298
2,1,54,3069,0.7,True,1.00,recourse_methods/face_graphs/give_me_credit,50,,3,,1.794332
3,0,28,6779,0.7,True,0.75,recourse_methods/face_graphs/give_me_credit,50,,3,,0.728267
4,0,3,478,0.7,True,0.75,recourse_methods/face_graphs/give_me_credit,50,,3,,0.671424
...,...,...,...,...,...,...,...,...,...,...,...,...
85,2,82,171,0.7,True,1.50,recourse_methods/face_graphs/give_me_credit,50,,3,,10.006279
86,1,37,5653,0.7,True,1.00,recourse_methods/face_graphs/give_me_credit,50,,3,,2.099621
87,0,24,3069,0.7,True,0.75,recourse_methods/face_graphs/give_me_credit,50,,3,,0.940304
88,2,61,3183,0.7,True,1.50,recourse_methods/face_graphs/give_me_credit,50,,3,,8.163157


## Load or Fit a KDE

In [18]:
KDE_DIRECTORY = '../../../saved_models/kde/give_me_credit_kde.joblib'

if os.path.exists(KDE_DIRECTORY):
    KDE = joblib.load(KDE_DIRECTORY)
else:
    KDE = fit_kde.fit_kde('give_me_credit', KDE_DIRECTORY)

# Analyze the results

In [19]:
SPARSITY_EPSILON = 1e-5

def get_poi_cfes(path_df: pd.DataFrame):
    """Isolate the POIs (Points of Interest) and CFEs (Counterfactual Examples) from the full path results.
    
    POIs and CFEs are listed in the order they originally appear in. There is one POI and one CFE
    for every path that appears in the DataFrame."""
    pathscopy = path_df.copy()
    pathscopy['next_step_id'] = 0
    pathscopy.loc[:,'next_step_id'].iloc[0:-1] = pathscopy.loc[:,'step_id'].iloc[1:]
    cfes = pathscopy[pathscopy.step_id >= pathscopy.next_step_id].drop(columns='next_step_id')
    return pathscopy[pathscopy.step_id == 0].drop(columns='next_step_id'), cfes

def get_sparsity(path: pd.DataFrame):
    """Returns the maximum number of features changed in any single iteration
    along the path."""
    if path.shape[0] == 1:
        return np.nan
    path_sparsity = np.zeros(path.shape[0])
    for i in range(1, path.shape[0]):
        path_sparsity[i] = (np.abs(path.iloc[i] - path.iloc[i - 1]) > SPARSITY_EPSILON).sum()
    return np.max(path_sparsity)

def get_path_length(path: pd.DataFrame):
    """Returns the sum of euclidean distances along the path."""
    total = 0
    for i in range(1, path.shape[0]):
        total += np.linalg.norm(path.iloc[i] - path.iloc[i - 1])
    if total == 0:
        return np.nan
    return total

def get_cfe_distance(path: pd.DataFrame):
    """Returns the euclidean distance between the first and last points in the path."""
    if len(path) == 1:
        return np.nan
    return np.linalg.norm(path.iloc[-1] - path.iloc[0])


def analyze_paths(paths: pd.DataFrame, poi_kdes, cfe_kdes, cfe_probs, config_df):
    """Returns a DataFrame containing per-path results.
    
    Each row corresponds to a specific path. Each column is a result metric.
    
    Args:
        paths: The path_df DataFrame to analyze.
        poi_kdes: The KDE scores for the POIs.
        cfe_kdes: The KDE scores for the CFEs.
        config_df: The experiment_config_df for the experiment."""
    columns = ['run_id', 'path_id', 'success', 'proximity', 'path_length',
               'iteration_count', 'poi_density', 'cfe_density', 
               'actual_sparsity']
    col_idx = {}
    for i, col in enumerate(columns):
        col_idx[col] = i

    results = np.zeros((len(poi_kdes), len(columns)))

    i = 0
    for run_id in paths.run_id.unique():
        run_paths = paths[paths.run_id == run_id]
        for path_id in run_paths.path_id.unique():
            path = ADAPTER.transform(run_paths[run_paths.path_id == path_id].drop(columns=DROP_COLUMNS))
            results[i,col_idx['run_id']] = run_id
            results[i,col_idx['path_id']] = path_id

            desired_proba = config_df[config_df.run_id == run_id].confidence_cutoff.iloc[0]
            actual_proba = cfe_probs[i]

            results[i,col_idx['success']] = 1 if actual_proba >= desired_proba else 0
            results[i,col_idx['path_length']] = get_path_length(path)
            results[i,col_idx['iteration_count']] = len(path)
            results[i,col_idx['proximity']] = get_cfe_distance(path)
            results[i,col_idx['poi_density']] = poi_kdes[i]
            results[i,col_idx['cfe_density']] = cfe_kdes[i]
            results[i,col_idx['actual_sparsity']] = get_sparsity(path)
            i += 1

    return pd.DataFrame(data=results, columns=columns)

pois, cfes = get_poi_cfes(path_df)
poi_kdes = KDE.score_samples(ADAPTER.transform(pois.drop(columns=DROP_COLUMNS)))
cfe_kdes = KDE.score_samples(ADAPTER.transform(cfes.drop(columns=DROP_COLUMNS)))
cfe_probs = MODEL.predict_pos_proba(cfes.drop(columns=DROP_COLUMNS)).to_numpy()

results = analyze_paths(path_df, poi_kdes, cfe_kdes, cfe_probs, config_df)
results = results.merge(config_df, how='left', on='run_id').drop(
    columns=['max_iterations', 'noise_ratio',
             'rescale_ratio', 'run_seed', 'graph_directory', 'counterfactual_mode'])  # uninteresting columns
results

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathscopy.loc[:,'next_step_id'].iloc[0:-1] = pathscopy.loc[:,'step_id'].iloc[1:]


Unnamed: 0,run_id,path_id,success,proximity,path_length,iteration_count,poi_density,cfe_density,actual_sparsity,batch_id,confidence_cutoff,distance_threshold,num_paths,elapsed_recourse_seconds
0,16.0,0.0,1.0,2.034369,2.034369,2.0,-5.497607,-6.336970,5.0,0,0.7,0.75,3,1.119539
1,16.0,1.0,1.0,2.005167,2.005167,2.0,-5.497607,-6.710693,5.0,0,0.7,0.75,3,1.119539
2,16.0,2.0,1.0,1.983740,1.983740,2.0,-5.497607,-6.472136,5.0,0,0.7,0.75,3,1.119539
3,25.0,0.0,1.0,1.652801,1.652801,2.0,-4.544518,-5.838878,7.0,0,0.7,0.75,3,0.777298
4,25.0,1.0,1.0,1.855439,1.855439,2.0,-4.544518,-3.962193,7.0,0,0.7,0.75,3,0.777298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,61.0,1.0,1.0,1.499565,1.499565,2.0,-5.539524,-4.250454,6.0,2,0.7,1.50,3,8.163157
266,61.0,2.0,1.0,1.498950,1.498950,2.0,-5.539524,-4.907369,6.0,2,0.7,1.50,3,8.163157
267,64.0,0.0,1.0,2.699303,2.699303,2.0,-8.029804,-4.537520,7.0,2,0.7,1.50,3,6.527399
268,64.0,1.0,1.0,2.773817,2.773817,2.0,-8.029804,-4.134180,7.0,2,0.7,1.50,3,6.527399


# Choosing metrics

We must select values for:
* num_paths
* confidence_cutoff
* distance_threshold

Can we use the metrics chosen by StEP?
* num_paths=3
* confidence_cutoff=0.7

Let's see what the best-performing parameter settings look like.

In [20]:
DROP_METRICS = ['run_id', 'elapsed_recourse_seconds', 'negative_cfe_density',
                'path_id', 'batch_id', 'actual_sparsity', 'negative_success']

results['negative_cfe_density'] = -results['cfe_density']
results['negative_success'] = -results['success']
results.groupby('batch_id', as_index=False).mean().sort_values(
    ['negative_success', 'path_length', 'proximity', 'negative_cfe_density', 'iteration_count']).iloc[:5].drop(
        columns=DROP_METRICS)

Unnamed: 0,success,proximity,path_length,iteration_count,poi_density,cfe_density,confidence_cutoff,distance_threshold,num_paths
1,1.0,1.737506,1.737506,2.0,-6.328154,-5.713465,0.7,1.0,3.0
2,1.0,1.946078,1.946078,2.0,-6.328154,-5.346134,0.7,1.5,3.0
0,0.9,1.58068,1.58068,1.9,-6.328154,-6.366191,0.7,0.75,3.0


# Final Parameters

We see that success is effected, in order of significance, by:
* distance_threshold
* confidence_cutoff
* num_paths

The StEP parameters perform tied for 5th-best and succeed 10% less frequently
than the optimal parameters. To keep consistency with DiCE and StEP, we choose
these parameters.

* confidence_cutoff: 0.7
* num_paths: 3
* distance_threshold: 1.0