# DICE Basic Results

In [1]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../../..'))

%load_ext autoreload
%autoreload 2

import numba
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn import model_selection

import matplotlib.pyplot as plt
import seaborn as sns
from models import model_interface, model_loader, model_constants
from data import data_loader
from data.adapters import continuous_adapter

# Preliminaries -- load everything

In [2]:
# Identify the dataset / model pair that this notebook analyzes.
dataset_name = data_loader.DatasetName('credit_card_default')
model_type = model_constants.ModelType('logistic_regression')

# Load the data, the trained model, and fit the adapter on the loaded dataset.
DATASET, DATASET_INFO = data_loader.load_data(dataset_name)
MODEL = model_loader.load_model(model_type, dataset_name)
standardizer = continuous_adapter.StandardizingAdapter(
    label_column=DATASET_INFO.label_column,
    positive_label=DATASET_INFO.positive_label,
)
ADAPTER = standardizer.fit(DATASET)

# Experiment outputs: one CSV describing each run's configuration, and one
# CSV holding the generated paths.
results_dir = '../../../experiment_results/dice_results/dice_step_size'
config_csv = os.path.join(results_dir, 'experiment_config_df.csv')
paths_csv = os.path.join(results_dir, 'dice_paths_df.csv')

index_df = pd.read_csv(config_csv)
path_df = pd.read_csv(paths_csv)
index_df

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,batch_id,run_id,run_seed,confidence_cutoff,dataset_name,max_iterations,max_step_size,model_type,noise_ratio,num_paths,rescale_ratio
0,2,89,227,0.7,credit_card_default,30,,logistic_regression,,3,
1,0,29,227,0.7,credit_card_default,30,1.0,logistic_regression,,3,
2,2,81,9251,0.7,credit_card_default,30,,logistic_regression,,3,
3,2,64,4404,0.7,credit_card_default,30,,logistic_regression,,3,
4,0,28,6779,0.7,credit_card_default,30,1.0,logistic_regression,,3,
...,...,...,...,...,...,...,...,...,...,...,...
85,0,22,171,0.7,credit_card_default,30,1.0,logistic_regression,,3,
86,2,69,1498,0.7,credit_card_default,30,,logistic_regression,,3,
87,0,15,3439,0.7,credit_card_default,30,1.0,logistic_regression,,3,
88,0,24,3069,0.7,credit_card_default,30,1.0,logistic_regression,,3,


## Fit a KDE

In [6]:
# Pick the KDE bandwidth by 5-fold cross-validation, then fit the final KDE on
# the full transformed dataset using the selected bandwidth.
kfold = model_selection.KFold(n_splits=5)

# Shuffle the rows before splitting; the seed is fixed so that the folds (and
# therefore the selected bandwidth) are reproducible across kernel restarts.
transformed_data = ADAPTER.transform(DATASET.drop(columns='Y')).sample(
    frac=1, replace=False, random_state=0
)

bandwidths = np.logspace(-1, 0, 6)
# Mean validation score per candidate bandwidth. Kept separate from the
# per-sample `scores` computed at the end of the cell.
cv_scores = []

for bw in bandwidths:
    print("Evaluate bandwidth", bw)
    fold_scores = []
    for train_indices, val_indices in kfold.split(transformed_data):
        kde = neighbors.KernelDensity(bandwidth=bw).fit(transformed_data.iloc[train_indices])
        fold_scores.append(kde.score(transformed_data.iloc[val_indices]))
    # Average over the folds (the original divided by the number of
    # bandwidths, which is not the number of folds).
    cv_scores.append(np.mean(fold_scores))
    print(cv_scores[-1])
best_bandwidth = bandwidths[np.argmax(cv_scores)]
print("Finished! Selected bandwidth is ", best_bandwidth)
# Fit with the *selected* bandwidth, not the last loop value `bw`.
KDE = neighbors.KernelDensity(bandwidth=best_bandwidth).fit(transformed_data)
print("Scoring full dataset...")
scores = KDE.score_samples(transformed_data)

Evaluate bandwidth 0.1


KeyboardInterrupt: 

### Evaluate the KDE qualitatively

The scores are between -28 and -20. Unsurprisingly, most points have relatively high density.
Overall the histogram seems reasonable.

In [3]:
# The bandwidth is hard-coded here instead of being looked up from the search:
#   best_bandwidth = bandwidths[np.argmax(scores)]
# NOTE(review): presumably this is the value chosen by an earlier run of the
# bandwidth search -- confirm.
best_bandwidth = 0.251188643150958

# Drop column 'Y', transform the remaining columns, and shuffle the rows.
unlabeled_data = DATASET.drop(columns='Y')
transformed_data = ADAPTER.transform(unlabeled_data).sample(frac=1, replace=False)

kde_model = neighbors.KernelDensity(bandwidth=best_bandwidth)
KDE = kde_model.fit(transformed_data)

# Disabled qualitative check of the fitted densities:
#KDE_SCORES = KDE.score_samples(transformed_data)
#sns.histplot(KDE_SCORES)
#pd.DataFrame({'density': KDE_SCORES}).describe()

# Analyze the results

In [4]:
# Generate some numpy arrays so we can pass them off to numba.
# The slowest step here will be the KDE scoring.

# Order the paths dataframe so that the rows of each (run_id, path_id) path
# are contiguous and sorted by step.
ordered_paths = path_df.sort_values(['run_id', 'path_id', 'step_id'])
run_ids = ordered_paths.run_id.to_numpy()
path_ids = ordered_paths.path_id.to_numpy()

# Get the raw data -- we've already extracted the run_id and path_id.
paths = ordered_paths.drop(columns=['run_id', 'batch_id', 'step_id', 'path_id'])

# pos_proba and target_proba are calculated for every point in every path.
pos_proba = MODEL.predict_pos_proba(paths).to_numpy()
target_proba = ordered_paths.merge(
    index_df[['run_id', 'confidence_cutoff']],
    how='left',
    on='run_id',
    validate='many_to_one',
).confidence_cutoff.to_numpy()

numpy_paths = ADAPTER.transform(paths).to_numpy()

# The first path begins at boundary_indices[0], the second at
# boundary_indices[1], and so on. The final entry is the total row count, so
# no path begins at boundary_indices[-1]; it only closes the last path.
#
# A new path starts on the first row and wherever run_id OR path_id changes.
# Comparing path_id alone would merge the paths of two consecutive runs
# whenever their adjacent paths share the same path_id (e.g. one path per run).
is_path_start = np.ones(run_ids.shape[0], dtype=bool)
is_path_start[1:] = (run_ids[1:] != run_ids[:-1]) | (path_ids[1:] != path_ids[:-1])
boundary_indices = np.hstack([np.flatnonzero(is_path_start), run_ids.shape[0]])

# Run KDE over the POIs (first row of each path) and the CFEs (last row of
# each path).
pois = paths.iloc[boundary_indices[:-1]]
counterfactuals = paths.iloc[boundary_indices[1:] - 1]
poi_kde = KDE.score_samples(ADAPTER.transform(pois))
cfe_kde = KDE.score_samples(ADAPTER.transform(counterfactuals))

In [5]:
@numba.jit(nopython=True)
def get_sparsity(path: np.ndarray) -> float:
    """Return the largest number of coordinates changed by any single step.

    For every pair of consecutive rows of ``path`` the number of entries that
    differ is counted; the maximum of those counts is returned.

    Args:
        path: array whose rows are the consecutive points of one path.

    Returns:
        The maximum per-step count of changed entries as a float, or NaN when
        the path has fewer than two rows (there is no step to measure).
    """
    n_points = path.shape[0]
    # Zero or one point means no steps. This also avoids calling np.max on an
    # empty array, which raises.
    if n_points < 2:
        return np.nan
    changed_per_step = np.zeros(n_points - 1)
    for i in range(1, n_points):
        changed_per_step[i - 1] = ((path[i] - path[i - 1]) != 0).sum()
    return np.max(changed_per_step)

@numba.jit(nopython=True)
def get_path_length(path: np.ndarray) -> float:
    """Sum the norms of the differences between consecutive rows of ``path``.

    Args:
        path: array whose rows are the consecutive points of one path.

    Returns:
        The accumulated step length, or NaN when that sum is exactly zero
        (including the case where the path has no steps at all).
    """
    length = 0.0
    n_steps = path.shape[0] - 1
    for step in range(n_steps):
        length += np.linalg.norm(path[step + 1] - path[step])
    # A zero total is reported as NaN rather than as a length of 0.
    return length if length != 0 else np.nan

@numba.jit(nopython=True)
def analyze_paths(
    paths: np.ndarray,
    run_ids: np.ndarray,
    path_ids: np.ndarray,
    pos_proba: np.ndarray,
    target_proba: np.ndarray,
    boundary_indices: np.ndarray,
    poi_kde: np.ndarray,
    cfe_kde: np.ndarray
):
    """Compute one row of summary metrics for every path.

    Path ``k`` occupies rows ``boundary_indices[k]`` (inclusive) through
    ``boundary_indices[k + 1]`` (exclusive) of ``paths``.

    Args:
        paths: array holding the rows of all paths, one path after another.
        run_ids: per-row run identifier, aligned with ``paths``.
        path_ids: per-row path identifier, aligned with ``paths``.
        pos_proba: per-row value compared against ``target_proba``.
        target_proba: per-row threshold, aligned with ``paths``.
        boundary_indices: start row of each path, followed by one closing
            index that marks the end of the last path.
        poi_kde: one value per path, copied into the ``poi_density`` column.
        cfe_kde: one value per path, copied into the ``cfe_density`` column.

    Returns:
        A tuple ``(results, columns)``: ``results`` has one row per path and
        one column per entry of ``columns``; ``columns`` lists the column
        names in order.
    """
    columns = ['run_id', 'path_id', 'success', 'path_length', 'poi_density', 'cfe_density', 'sparsity']
    # Map every column name to its position in the results matrix.
    col_idx = {}
    for position, name in enumerate(columns):
        col_idx[name] = position

    n_paths = len(boundary_indices) - 1
    results = np.zeros((n_paths, len(columns)))
    for row in range(n_paths):
        first = boundary_indices[row]
        stop = boundary_indices[row + 1]
        last = stop - 1
        path = paths[first:stop]

        # Success is judged on the final row of the path only.
        if pos_proba[last] >= target_proba[last]:
            reached_target = 1
        else:
            reached_target = 0

        results[row, col_idx['run_id']] = run_ids[first]
        results[row, col_idx['path_id']] = path_ids[first]
        results[row, col_idx['success']] = reached_target
        results[row, col_idx['path_length']] = get_path_length(path)
        results[row, col_idx['poi_density']] = poi_kde[row]
        results[row, col_idx['cfe_density']] = cfe_kde[row]
        results[row, col_idx['sparsity']] = get_sparsity(path)

    return results, columns

In [6]:
# Compute the per-path metrics with the compiled helper.
numpy_results, columns = analyze_paths(
    numpy_paths,
    run_ids,
    path_ids,
    pos_proba,
    target_proba,
    boundary_indices,
    poi_kde,
    cfe_kde,
)

# Attach each run's configuration to its paths, then drop the configuration
# columns that are not needed below.
dropped_config_columns = ['dataset_name', 'max_iterations', 'model_type', 'noise_ratio', 'rescale_ratio', 'run_seed']
per_path_metrics = pd.DataFrame(data=numpy_results, columns=columns)
results = per_path_metrics.merge(index_df, how='left', on='run_id').drop(columns=dropped_config_columns)
results

Unnamed: 0,run_id,path_id,success,path_length,poi_density,cfe_density,sparsity,batch_id,confidence_cutoff,max_step_size,num_paths
0,0.0,0.0,1.0,14.000000,-1.036496,-381.132768,2.0,0,0.7,1.0,3
1,0.0,1.0,1.0,15.000000,-1.036496,-373.072976,2.0,0,0.7,1.0,3
2,0.0,2.0,1.0,12.000000,-1.036496,-32.333564,3.0,0,0.7,1.0,3
3,1.0,0.0,1.0,17.000000,-1.039537,-202.683678,2.0,0,0.7,1.0,3
4,1.0,1.0,1.0,13.000000,-1.039537,-56.756210,2.0,0,0.7,1.0,3
...,...,...,...,...,...,...,...,...,...,...,...
265,88.0,1.0,1.0,44.741088,3.196688,-8413.627287,2.0,2,0.7,,3
266,88.0,2.0,1.0,15.378054,3.196688,-1561.712336,1.0,2,0.7,,3
267,89.0,0.0,1.0,31.650337,-1.056702,-808.113308,3.0,2,0.7,,3
268,89.0,1.0,1.0,49.404998,-1.056702,-12574.889835,3.0,2,0.7,,3


# Let's look at the average metrics for each batch


In [7]:
# Average every column within each batch, then order the batches by their
# confidence cutoff and number of paths.
batch_means = results.groupby('batch_id').mean()
batch_means.sort_values(['confidence_cutoff', 'num_paths'])

Unnamed: 0_level_0,run_id,path_id,success,path_length,poi_density,cfe_density,sparsity,confidence_cutoff,max_step_size,num_paths
batch_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,14.5,1.0,0.944444,15.058283,0.081015,-361.105595,2.344444,0.7,1.0,3.0
1,44.5,1.0,1.0,16.251618,0.081015,-346.905052,2.255556,0.7,2.0,3.0
2,74.5,1.0,1.0,38.715193,0.081015,-4066.177519,2.0,0.7,,3.0
