# MRMC Basic Results

In [1]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../../..'))

%load_ext autoreload
%autoreload 2

import numba
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn import model_selection

import matplotlib.pyplot as plt
import seaborn as sns
from models import model_interface, model_loader, model_constants
from data import data_loader
from data.adapters import continuous_adapter

# Preliminaries -- load everything

In [2]:
# Load the dataset, the trained classifier for it, and an adapter fit on the dataset.
DATASET, DATASET_INFO = data_loader.load_data(
    data_loader.DatasetName('credit_card_default')
)
MODEL = model_loader.load_model(
    model_constants.ModelType('logistic_regression'),
    data_loader.DatasetName('credit_card_default'),
)
ADAPTER = continuous_adapter.StandardizingAdapter(
    label_column=DATASET_INFO.label_column,
    positive_label=DATASET_INFO.positive_label,
).fit(DATASET)

# Read the three experiment result tables from the results directory.
results_dir = '../../../experiment_results/mrmc_results/mrmc_sparsity'
cluster_df, index_df, path_df = (
    pd.read_csv(os.path.join(results_dir, csv_name))
    for csv_name in ('cluster_df.csv', 'experiment_config_df.csv', 'mrmc_paths_df.csv')
)
index_df

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,batch_id,run_id,run_seed,cluster_seed,confidence_cutoff,dataset_name,max_iterations,model_type,noise_ratio,num_clusters,rescale_ratio,sparsity,step_size,volcano_cutoff,volcano_degree
0,3,99,1498,1834823,0.7,credit_card_default,30,logistic_regression,,3,,4.0,1,0.5,2
1,1,42,4470,1834823,0.7,credit_card_default,30,logistic_regression,,3,,2.0,1,0.5,2
2,4,124,4404,1834823,0.7,credit_card_default,30,logistic_regression,,3,,5.0,1,0.5,2
3,4,142,171,1834823,0.7,credit_card_default,30,logistic_regression,,3,,5.0,1,0.5,2
4,4,127,5653,1834823,0.7,credit_card_default,30,logistic_regression,,3,,5.0,1,0.5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,2,83,211,1834823,0.7,credit_card_default,30,logistic_regression,,3,,3.0,1,0.5,2
176,1,30,7527,1834823,0.7,credit_card_default,30,logistic_regression,,3,,2.0,1,0.5,2
177,0,6,4340,1834823,0.7,credit_card_default,30,logistic_regression,,3,,1.0,1,0.5,2
178,1,33,478,1834823,0.7,credit_card_default,30,logistic_regression,,3,,2.0,1,0.5,2


## Fit a KDE

In [6]:
# Select the KDE bandwidth by 5-fold cross-validation: for every candidate
# bandwidth, fit on the training folds and score the held-out fold, then keep
# the bandwidth with the highest mean held-out score.
kfold = model_selection.KFold(n_splits=5)
# Shuffle the rows before they are split into folds.
transformed_data = ADAPTER.transform(DATASET.drop(columns='Y')).sample(frac=1, replace=False)

bandwidths = np.logspace(-1, 0, 6)
# Mean held-out score per candidate bandwidth. Kept separate from `scores`,
# which holds the per-sample scores of the final model below.
cv_scores = []

for bw in bandwidths:
    print("Evaluate bandwidth", bw)
    score = 0
    for train_indices, val_indices in kfold.split(transformed_data):
        kde = neighbors.KernelDensity(bandwidth=bw).fit(transformed_data.iloc[train_indices])
        score += kde.score(transformed_data.iloc[val_indices])
    # Average over the number of folds (not the number of candidate bandwidths).
    cv_scores.append(score / kfold.get_n_splits())
    print(cv_scores[-1])
best_bandwidth = bandwidths[np.argmax(cv_scores)]
print("Finished! Selected bandwidth is ", best_bandwidth)
# Fit the final model with the *selected* bandwidth; `bw` would be the last
# candidate tried by the loop, regardless of how it scored.
KDE = neighbors.KernelDensity(bandwidth=best_bandwidth).fit(transformed_data)
print("Scoring full dataset...")
scores = KDE.score_samples(transformed_data)

Evaluate bandwidth 0.1


KeyboardInterrupt: 

### Evaluate the KDE qualitatively

The scores are between -28 and -20. Unsurprisingly, most points have relatively high density.
Overall the histogram seems reasonable.

In [3]:
# Refit the KDE with a hard-coded bandwidth instead of repeating the search.
# The value equals the third candidate of np.logspace(-1, 0, 6); it would
# otherwise be derived as: best_bandwidth = bandwidths[np.argmax(scores)]
transformed_data = (
    ADAPTER
    .transform(DATASET.drop(columns='Y'))
    .sample(frac=1, replace=False)
)
best_bandwidth = 0.251188643150958
KDE = neighbors.KernelDensity(bandwidth=best_bandwidth)
KDE.fit(transformed_data)

# Qualitative check of the fitted densities; disabled (never runs as written).
if False:
    KDE_SCORES = KDE.score_samples(transformed_data)
    sns.histplot(KDE_SCORES)
    pd.DataFrame({'density': KDE_SCORES}).describe()

# Analyze the results

In [100]:
# Generate flat numpy arrays so we can pass them off to numba.
# The slowest step here will be the KDE scoring.

# Order the paths dataframe so every path is a contiguous, step-ordered run of rows.
ordered_paths = path_df.sort_values(['run_id', 'path_id', 'step_id'])
run_ids = ordered_paths.run_id.to_numpy()
path_ids = ordered_paths.path_id.to_numpy()

# Get the raw feature data -- run_id and path_id have already been extracted.
paths = ordered_paths.drop(columns=['run_id', 'batch_id', 'step_id', 'path_id'])

# pos_proba and target_proba are calculated for every point in every path.
pos_proba = MODEL.predict_pos_proba(paths).to_numpy()
target_proba = (
    ordered_paths
    .merge(index_df[['run_id', 'confidence_cutoff']], how='left', on='run_id', validate='many_to_one')
    .confidence_cutoff
    .to_numpy()
)

numpy_paths = ADAPTER.transform(paths).to_numpy()

# A new path starts at row 0 and wherever the run_id OR the path_id differs from
# the previous row. Comparing path_id alone would merge two adjacent runs whose
# neighbouring paths happen to share a path_id (e.g. runs with a single path 0).
starts_new_path = np.ones(run_ids.shape[0], dtype=bool)
starts_new_path[1:] = (run_ids[1:] != run_ids[:-1]) | (path_ids[1:] != path_ids[:-1])
# The first path begins at boundary_indices[0], the second at boundary_indices[1], ...
# The final entry is the total row count, so no path begins at boundary_indices[-1].
boundary_indices = np.hstack([np.flatnonzero(starts_new_path), path_ids.shape[0]])

# Run KDE over the POIs (first row of each path) and CFEs (last row of each path).
pois = paths.iloc[boundary_indices[:-1]]
counterfactuals = paths.iloc[boundary_indices[1:] - 1]
poi_kde = KDE.score_samples(ADAPTER.transform(pois))
cfe_kde = KDE.score_samples(ADAPTER.transform(counterfactuals))

In [101]:
# Coordinate changes with absolute value at or below this threshold count as "unchanged".
# NOTE(review): numba treats globals as compile-time constants, so changing this
# after get_sparsity has been compiled presumably has no effect until it is recompiled.
SPARSITY_EPSILON = 1e-9

@numba.jit(nopython=True)
def get_sparsity(path: np.ndarray) -> float:
    """Return the largest number of coordinates changed by any single step of a path.

    Args:
        path: 2-D array whose rows are the consecutive points of one path.

    Returns:
        The maximum, over consecutive row pairs, of the number of coordinates
        whose absolute difference exceeds SPARSITY_EPSILON, as a float.
        np.nan when the path has fewer than two points (no steps to measure).
    """
    # Fewer than two points means there are no steps; this also keeps np.max
    # from being called on an empty array.
    if path.shape[0] < 2:
        return np.nan
    path_sparsity = np.zeros(path.shape[0])
    for i in range(1, path.shape[0]):
        path_sparsity[i] = (np.abs(path[i] - path[i - 1]) > SPARSITY_EPSILON).sum()
    return np.max(path_sparsity)

@numba.jit(nopython=True)
def get_path_length(path: np.ndarray) -> float:
    """Sum the Euclidean lengths of all steps between consecutive rows of `path`.

    Returns np.nan when the summed length is exactly zero (for example a path
    with a single point, or one that never moves).
    """
    length = 0
    n_points = path.shape[0]
    for step in range(n_points - 1):
        length += np.linalg.norm(path[step + 1] - path[step])
    if length != 0:
        return length
    return np.nan

@numba.jit(nopython=True)
def get_cfe_distance(path: np.ndarray) -> float:
    """Euclidean distance between the first and last rows of `path`.

    Returns np.nan for a path that consists of exactly one point.
    """
    if len(path) != 1:
        return np.linalg.norm(path[-1] - path[0])
    return np.nan

@numba.jit(nopython=True)
def analyze_paths(
    paths: np.ndarray,
    run_ids: np.ndarray,
    path_ids: np.ndarray,
    pos_proba: np.ndarray,
    target_proba: np.ndarray,
    boundary_indices: np.ndarray,
    poi_kde: np.ndarray,
    cfe_kde: np.ndarray
):
    """Compute one row of summary metrics for every path.

    Args:
        paths: 2-D array holding the points of all paths stacked row-wise.
        run_ids: per-row run id, aligned with the rows of `paths`.
        path_ids: per-row path id, aligned with the rows of `paths`.
        pos_proba: per-row value compared against `target_proba` to decide success.
        target_proba: per-row threshold that `pos_proba` must reach.
        boundary_indices: row offsets; path i occupies rows
            boundary_indices[i] (inclusive) to boundary_indices[i + 1] (exclusive).
        poi_kde: one value per path, copied into the 'poi_density' column.
        cfe_kde: one value per path, copied into the 'cfe_density' column.

    Returns:
        A tuple (results, columns): `results` is a float array with one row per
        path and one column per entry of `columns`; `columns` is the list of
        column names in the same order.
    """
    columns = ['run_id', 'path_id', 'success', 'path_length', 'proximity', 'poi_density', 'cfe_density', 'actual_sparsity']
    # Map each column name to its position so cells can be addressed by name below.
    col_idx = {}
    for i, col in enumerate(columns):
        col_idx[col] = i
    results = np.zeros((len(boundary_indices) - 1, len(columns)))
    for i in range(boundary_indices.shape[0]-1):
        # Half-open row range [start_idx, end_idx) of the i-th path.
        start_idx, end_idx = boundary_indices[i:i+2]
        path = paths[start_idx:end_idx]
        results[i,col_idx['run_id']] = run_ids[start_idx]
        results[i,col_idx['path_id']] = path_ids[start_idx]
        # Success is judged on the last point of the path only.
        results[i,col_idx['success']] = 1 if pos_proba[end_idx - 1] >= target_proba[end_idx - 1] else 0
        results[i,col_idx['path_length']] = get_path_length(path)
        results[i,col_idx['proximity']] = get_cfe_distance(path)
        results[i,col_idx['poi_density']] = poi_kde[i]
        results[i,col_idx['cfe_density']] = cfe_kde[i]
        results[i,col_idx['actual_sparsity']] = get_sparsity(path)

    return results, columns

In [102]:
# Compute the per-path metrics, then attach each run's experiment configuration.
numpy_results, columns = analyze_paths(
    numpy_paths,
    run_ids,
    path_ids,
    pos_proba,
    target_proba,
    boundary_indices,
    poi_kde,
    cfe_kde,
)

results = (
    pd.DataFrame(data=numpy_results, columns=columns)
    .merge(index_df, how='left', on='run_id')
    # Remove the configuration columns that are not used in the analysis below.
    .drop(columns=[
        'dataset_name',
        'max_iterations',
        'model_type',
        'noise_ratio',
        'rescale_ratio',
        'step_size',
        'cluster_seed',
        'volcano_cutoff',
        'volcano_degree',
        'run_seed',
    ])
)
results

Unnamed: 0,run_id,path_id,success,path_length,proximity,poi_density,cfe_density,actual_sparsity,batch_id,confidence_cutoff,num_clusters,sparsity
0,0.0,0.0,0.0,10.352204,4.566623,-1.056702,-76.138072,1.0,0,0.7,3,1.0
1,0.0,1.0,0.0,12.624809,5.994554,-1.056702,-31.205549,1.0,0,0.7,3,1.0
2,0.0,2.0,1.0,16.162116,16.162116,-1.056702,-557.934594,1.0,0,0.7,3,1.0
3,1.0,0.0,0.0,9.736041,3.143489,-0.133078,-37.784682,1.0,0,0.7,3,1.0
4,1.0,1.0,1.0,10.489761,4.783662,-0.133078,-1.711673,1.0,0,0.7,3,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
535,178.0,1.0,1.0,5.000000,4.987076,0.599680,-3.266544,20.0,5,0.7,3,
536,178.0,2.0,1.0,6.000000,5.999915,0.599680,-31.415306,17.0,5,0.7,3,
537,179.0,0.0,1.0,8.000000,7.931908,-1.055231,-1.467903,20.0,5,0.7,3,
538,179.0,1.0,1.0,5.000000,4.986981,-1.055231,-2.994621,20.0,5,0.7,3,


# Let's look at the average metrics across the full batch

We see that success ratio suffers with increasingly strict sparsity requirements.
Why?

Also, the measured sparsity is slightly higher than the enforced sparsity. Why?

In [103]:
# Average every metric within each batch.
(
    results
    .groupby('batch_id')
    .mean()
    .sort_values(['confidence_cutoff', 'num_clusters'])
)

Unnamed: 0_level_0,run_id,path_id,success,path_length,proximity,poi_density,cfe_density,actual_sparsity,confidence_cutoff,num_clusters,sparsity
batch_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,14.5,1.0,0.522222,10.606038,6.625812,-0.169042,-97.491542,1.0,0.7,3.0,1.0
1,44.5,1.0,0.7,10.845534,7.16543,-0.169042,-87.74678,2.0,0.7,3.0,2.0
2,74.5,1.0,0.911111,10.486472,7.537673,-0.169042,-87.229629,3.0,0.7,3.0,3.0
3,104.5,1.0,0.977778,9.759773,7.573356,-0.169042,-82.183096,4.0,0.7,3.0,4.0
4,134.5,1.0,0.988889,9.199464,7.592576,-0.169042,-73.588066,5.0,0.7,3.0,5.0
5,164.5,1.0,1.0,7.033333,7.009076,-0.169042,-44.571499,18.922222,0.7,3.0,


In [105]:
# Count the failed paths for each enforced sparsity level.
results.loc[results.success == 0].groupby('sparsity').count()

Unnamed: 0_level_0,run_id,path_id,success,path_length,proximity,poi_density,cfe_density,actual_sparsity,batch_id,confidence_cutoff,num_clusters
sparsity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,43,43,43,43,43,43,43,43,43,43,43
2.0,27,27,27,27,27,27,27,27,27,27,27
3.0,8,8,8,8,8,8,8,8,8,8,8
4.0,2,2,2,2,2,2,2,2,2,2,2
5.0,1,1,1,1,1,1,1,1,1,1,1


# Let's look at failed runs

First take a look at an example POI

In [106]:
# Coordinate changes with absolute value at or below this threshold count as "unchanged".
SPARSITY_EPSILON = 1e-9

def get_changed_columns(path):
    """Find which columns change along a path, overall and step by step.

    The path is first passed through ADAPTER.transform; consecutive rows of the
    transformed path are then compared, and a column counts as changed in a
    step when its absolute difference exceeds SPARSITY_EPSILON.

    Args:
        path: DataFrame whose rows are the consecutive points of one path.

    Returns:
        A tuple (columns, stepwise_changes):
        * columns: every column that changed in at least one step, listed in
          the column order of the transformed path. (Deterministic, unlike the
          hash-dependent order of list(set).)
        * stepwise_changes: one list of changed column names per step, so it
          has len(path) - 1 entries.
    """
    path = ADAPTER.transform(path)
    changed = set()
    stepwise_changes = []
    for i in range(1, len(path)):
        change_mask = (np.abs(path.iloc[i] - path.iloc[i - 1])) > SPARSITY_EPSILON
        cols = path.columns[change_mask].to_list()
        stepwise_changes.append(cols)
        changed.update(cols)
    # Report the changed columns in the frame's own column order so repeated
    # runs display the same table layout.
    ordered_columns = [col for col in path.columns if col in changed]
    return ordered_columns, stepwise_changes

# Take the first failed path that was run with sparsity 1 and pull path 0 of its run.
example_run_id = (
    results
    .loc[(results.success == 0) & (results.sparsity == 1)]
    .iloc[0]
    .run_id
)
example_path = (
    path_df
    .loc[(path_df.run_id == example_run_id) & (path_df.path_id == 0)]
    .drop(columns=['step_id', 'path_id', 'run_id', 'batch_id'])
)

# The first row of the path is the point of interest.
print("POI:", example_path.iloc[0], sep="\n")

POI:
LIMIT_BAL    80000.0
AGE             42.0
PAY_1            1.0
PAY_2            4.0
PAY_3            3.0
PAY_4            2.0
PAY_5            0.0
PAY_6            0.0
BILL_AMT1    97841.0
BILL_AMT2    94992.0
BILL_AMT3    87801.0
BILL_AMT4    81545.0
BILL_AMT5    51338.0
BILL_AMT6    50826.0
PAY_AMT1         0.0
PAY_AMT2       639.0
PAY_AMT3         0.0
PAY_AMT4     50918.0
PAY_AMT5      2000.0
PAY_AMT6      2000.0
Name: 6862, dtype: float64


## Model confidence changes slowly

The recourse is working, but very very slowly. Why?

In [107]:
print("Recourse path (only changed columns):")
pos_proba = MODEL.predict_pos_proba(example_path)

changed_columns, changes = get_changed_columns(example_path)

# One space-separated label per step; the first row has no preceding step, hence None.
flat_changes = [None] + [' '.join(changelist) for changelist in changes]

display_path = example_path.copy().assign(
    changed_columns=flat_changes,
    pos_proba=pos_proba,
)
display_path[changed_columns + ['changed_columns', 'pos_proba']]

Recourse path (only changed columns):


Unnamed: 0,LIMIT_BAL,PAY_2,BILL_AMT3,BILL_AMT5,PAY_3,PAY_4,PAY_AMT4,BILL_AMT6,BILL_AMT4,changed_columns,pos_proba
6862,80000.0,4.0,87801.0,51338.0,3.0,2.0,50918.0,50826.0,81545.0,,0.163446
6863,80000.0,3.572235,87801.0,51338.0,3.0,2.0,50918.0,50826.0,81545.0,PAY_2,0.167835
6864,80000.0,3.178028,87801.0,51338.0,3.0,2.0,50918.0,50826.0,81545.0,PAY_2,0.171962
6865,80000.0,2.817554,87801.0,51338.0,3.0,2.0,50918.0,50826.0,81545.0,PAY_2,0.175807
6866,80000.0,2.817554,87801.0,51338.0,2.653559,2.0,50918.0,50826.0,81545.0,PAY_3,0.184094
6867,80000.0,2.482734,87801.0,51338.0,2.653559,2.0,50918.0,50826.0,81545.0,PAY_2,0.187857
6868,80000.0,2.482734,87801.0,51338.0,2.333426,2.0,50918.0,50826.0,81545.0,PAY_3,0.195905
6869,80000.0,2.175071,87801.0,51338.0,2.333426,2.0,50918.0,50826.0,81545.0,PAY_2,0.199528
6870,80000.0,2.175071,87801.0,51338.0,2.040807,2.0,50918.0,50826.0,81545.0,PAY_3,0.207213
6871,80000.0,1.895446,87801.0,51338.0,2.040807,2.0,50918.0,50826.0,81545.0,PAY_2,0.210645


## Step size is small

The program flow is:
* MRMC generates an unnormalized direction
* The step size is set to 1
* Low-magnitude indices are zeroed out

Notice that this means the step size will no longer be 1.

It seems that as the POI approaches the decision boundary (even as far away as 0.29 confidence), the direction magnitude is more evenly spread out, causing the step size to decrease.

In [108]:
def check_stepwise_distances(path):
    """Measure how far a path moves at each step and in total.

    The path is passed through ADAPTER.transform before any distance is taken.

    Returns:
        A tuple of two arrays with one entry per step: the Euclidean distance
        from the previous point, and the Euclidean distance from the first
        point of the path.
    """
    transformed = ADAPTER.transform(path)
    origin = transformed.iloc[0]
    step_indices = range(1, len(transformed))
    stepwise = [
        np.linalg.norm(transformed.iloc[idx] - transformed.iloc[idx - 1])
        for idx in step_indices
    ]
    from_origin = [
        np.linalg.norm(transformed.iloc[idx] - origin)
        for idx in step_indices
    ]
    return np.array(stepwise), np.array(from_origin)

stepwise_distances, counterfactual_distances = check_stepwise_distances(example_path)

# Each label is followed by its array on the next line.
print("Stepwise distances along the path:", stepwise_distances, sep="\n")
print("Distances to each counterfactual:", counterfactual_distances, sep="\n")

Stepwise distances along the path:
[0.53356304 0.49170562 0.44962835 0.43821408 0.41763017 0.40493639
 0.38375713 0.37013328 0.34878428 0.34100625 0.3399839  0.31886357
 0.3223116  0.32331451 0.31899082 0.3204394  0.32169988 0.31167714
 0.30521982 0.30484106 0.30553971 0.30579209 0.30775249 0.30475407
 0.2930161  0.29473383 0.29131471 0.29356496 0.29449669 0.29453899]
Distances to each counterfactual:
[0.53356304 1.02526866 1.47489701 1.53862041 1.94259895 2.07184985
 2.42742104 2.57944329 2.89189255 2.91192853 3.06918731 3.34599196
 3.36147987 3.37699263 3.42394364 3.43890557 3.59568274 3.60916569
 3.86216636 3.89945697 3.93658355 3.97318191 4.03573012 4.18602444
 4.21797486 4.22825968 4.45601792 4.46567756 4.51639797 4.566623  ]


# What next?

The step size becomes too small when enforcing sparsity. Instead, we can try preserving the original magnitude and just... seeing what happens

# Are there any exceptions?

Check out this run where the third path fails but has a length of 25. Let's take a look.

In [161]:
# List the failed paths from shortest to longest.
results.loc[results.success == 0].sort_values(by='path_length')


Unnamed: 0,run_id,path_id,success,path_length,proximity,poi_density,cfe_density,actual_sparsity,batch_id,confidence_cutoff,num_clusters,sparsity
12,4.0,0.0,0.0,8.959344,2.772969,-0.464024,-19.853159,1.0,0,0.7,3,1.0
48,16.0,0.0,0.0,8.959396,2.956939,-1.056702,-26.483375,1.0,0,0.7,3,1.0
51,17.0,0.0,0.0,9.139622,2.792244,0.330734,-28.088616,1.0,0,0.7,3,1.0
81,27.0,0.0,0.0,9.309215,2.909458,-0.501757,-25.817556,1.0,0,0.7,3,1.0
66,22.0,0.0,0.0,9.311193,4.030686,-1.056702,-2.435991,1.0,0,0.7,3,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
225,75.0,0.0,0.0,17.546840,10.853372,-1.056702,-7.835409,3.0,2,0.7,3,3.0
306,102.0,0.0,0.0,17.943088,10.080584,-1.056702,-2.924785,4.0,3,0.7,3,4.0
426,142.0,0.0,0.0,18.540166,11.087569,-1.056702,-3.241707,5.0,4,0.7,3,5.0
68,22.0,2.0,0.0,25.623098,25.623098,-1.056702,-1604.623926,1.0,0,0.7,3,1.0


In [110]:
# Pick the run that owns the *longest* failed path -- the one discussed in the
# surrounding text. Positional `.iloc[-2]` on the sorted frame selected the
# second-longest failure instead; idxmax selects the maximum directly and
# ignores NaN path lengths.
failed_paths = results[results.success == 0]
example_run_id = failed_paths.loc[failed_paths.path_length.idxmax(), 'run_id']
results[results.run_id == example_run_id]

Unnamed: 0,run_id,path_id,success,path_length,proximity,poi_density,cfe_density,actual_sparsity,batch_id,confidence_cutoff,num_clusters,sparsity
66,22.0,0.0,0.0,9.311193,4.030686,-1.056702,-2.435991,1.0,0,0.7,3,1.0
67,22.0,1.0,0.0,12.14535,5.468158,-1.056702,-41.075732,1.0,0,0.7,3,1.0
68,22.0,2.0,0.0,25.623098,25.623098,-1.056702,-1604.623926,1.0,0,0.7,3,1.0


## It almost crosses the boundary! But doesn't make it

This is surprising.
* The only column changed is PAY_AMT2, probably because of the cluster selection? (what do the clusters even look like?)
* The changes are very large, but cause slow model confidence change

In [155]:
# Path 2 of the selected run, reduced to its feature columns.
example_path = (
    path_df
    .loc[(path_df.run_id == example_run_id) & (path_df.path_id == 2)]
    .drop(columns=['step_id', 'path_id', 'run_id', 'batch_id'])
)

pos_proba = MODEL.predict_pos_proba(example_path)

changed_columns, changes = get_changed_columns(example_path)

# One space-separated label per step; the first row has no preceding step, hence None.
flat_changes = [None] + [' '.join(changelist) for changelist in changes]

display_path = example_path.copy().assign(
    changed_columns=flat_changes,
    pos_proba=pos_proba,
)
ADAPTER.transform(display_path[changed_columns + ['changed_columns', 'pos_proba']])

Unnamed: 0,PAY_AMT2,changed_columns,pos_proba
591,-0.204907,,0.032847
592,0.70072,PAY_AMT2,0.037694
593,1.604135,PAY_AMT2,0.04321
594,2.505252,PAY_AMT2,0.049475
595,3.403974,PAY_AMT2,0.056574
596,4.300204,PAY_AMT2,0.064599
597,5.193835,PAY_AMT2,0.073646
598,6.084756,PAY_AMT2,0.083813
599,6.972847,PAY_AMT2,0.095201
600,7.85798,PAY_AMT2,0.107909


## Let's check what happens without sparsity

We can see that the path length is large, but not as large as 25. Of course, it should be smaller since it can move along the hypotenuse. More interesting would be the counterfactual distance, which is ~17. Hmmmm...

About 95% of the magnitude of each step is explained by the top 3 features, which are PAY_AMT1, PAY_AMT2, and BILL_AMT3.

In [113]:
# Find the run that shares the example run's seed but has no sparsity value (NaN).
exception_run_seed = index_df.loc[index_df.run_id == example_run_id].iloc[0].run_seed
same_seed_without_sparsity = (index_df.run_seed == exception_run_seed) & index_df.sparsity.isna()
exception_run_id = index_df.loc[same_seed_without_sparsity].iloc[0].run_id
results.loc[results.run_id == exception_run_id]

Unnamed: 0,run_id,path_id,success,path_length,proximity,poi_density,cfe_density,actual_sparsity,batch_id,confidence_cutoff,num_clusters,sparsity
516,172.0,0.0,1.0,12.0,11.916629,-1.056702,-2.1854,20.0,5,0.7,3,
517,172.0,1.0,1.0,10.0,9.934989,-1.056702,-1.154226,20.0,5,0.7,3,
518,172.0,2.0,1.0,17.0,16.996728,-1.056702,-413.192294,20.0,5,0.7,3,


In [154]:
def get_top_k_changed_columns(path, k):
    """Find the k columns that move the most along a path and their share of each step.

    The path is first passed through ADAPTER.transform. Columns are ranked by
    the absolute difference between the last and the first row.

    Args:
        path: DataFrame whose rows are the consecutive points of one path.
        k: number of columns to select. Values up to and beyond the number of
            columns are accepted; at most all columns are returned.

    Returns:
        A tuple (top_k_columns, top_k_magnitude_ratio):
        * top_k_columns: the selected column labels, ordered from largest to
          smallest total change.
        * top_k_magnitude_ratio: array with one entry per row of the path. The
          first entry is NaN (no preceding step); every other entry is the norm
          of the step restricted to the selected columns divided by the norm of
          the full step, or NaN when the full step has zero length.
    """
    path = ADAPTER.transform(path)
    poi = path.iloc[0]
    cfe = path.iloc[-1]
    total_change = np.abs(cfe - poi).to_numpy()
    # A stable descending argsort gives a deterministic, ranked selection and,
    # unlike np.argpartition(..., k), does not raise when k equals the number
    # of columns.
    top_k_indices = np.argsort(-total_change, kind='stable')[:k]
    top_k_columns = path.columns[top_k_indices]

    # Slice the selected columns once instead of twice per loop iteration.
    top_k_path = path[top_k_columns]

    top_k_magnitude_ratio = [np.nan]
    for i in range(1, len(path)):
        full_norm = np.linalg.norm(path.iloc[i] - path.iloc[i - 1])
        top_k_norm = np.linalg.norm(top_k_path.iloc[i] - top_k_path.iloc[i - 1])
        # A zero-length step has no meaningful ratio; report NaN without
        # triggering a divide-by-zero warning.
        top_k_magnitude_ratio.append(top_k_norm / full_norm if full_norm > 0 else np.nan)
    return top_k_columns, np.array(top_k_magnitude_ratio)

# Path 2 of the matching unconstrained run, reduced to its feature columns.
exception_path = (
    path_df
    .loc[(path_df.run_id == exception_run_id) & (path_df.path_id == 2)]
    .drop(columns=['step_id', 'run_id', 'batch_id', 'path_id'])
)

pos_proba = MODEL.predict_pos_proba(exception_path)

top_k_columns, top_k_impact = get_top_k_changed_columns(exception_path, 3)

# Show only the top-3 columns next to their per-step share and the model output.
display_path = exception_path[top_k_columns].copy().assign(
    top_k_impact=top_k_impact,
    pos_proba=pos_proba,
)
ADAPTER.transform(display_path)

Unnamed: 0,PAY_AMT2,PAY_AMT1,BILL_AMT3,top_k_impact,pos_proba
7833,-0.204907,-0.269491,-0.514421,,0.032847
7834,0.70072,-0.03375,-0.313021,0.957233,0.04226
7835,1.606985,0.200431,-0.111107,0.957563,0.054206
7836,2.51391,0.432973,0.091343,0.957902,0.069268
7837,3.421518,0.66379,0.294354,0.958249,0.088101
7838,4.32983,0.892791,0.497949,0.958605,0.111411
7839,5.238871,1.119876,0.702157,0.95897,0.139903
7840,6.148665,1.34494,0.907005,0.959344,0.174202
7841,7.059237,1.56787,1.112524,0.959727,0.214749
7842,7.970612,1.788543,1.318745,0.960119,0.261666


## Summary so far...

__Without sparsity__
* PAY_AMT2: 0 -> 15
* PAY_AMT1: 0 -> 3
* BILL_AMT3: 0 -> 3

__With sparsity__
* PAY_AMT2: 0 -> 25

Once it reaches PAY_AMT2=15, why does it continue to increase PAY_AMT2 when changing PAY_AMT1 slightly would cross the decision boundary?

Hypothesis: it is running parallel to the decision boundary.

In [159]:
# Rows of cluster_df for path 2 of the unconstrained run, restricted to the top-k columns.
cluster_df.loc[
    (cluster_df.run_id == exception_run_id) & (cluster_df.path_id == 2),
    top_k_columns,
]

Unnamed: 0,PAY_AMT2,PAY_AMT1,BILL_AMT3
518,55.637317,20.359179,-0.399186
