In [1]:
from emutils.imports import *
from emutils.geometry.metrics import adist

from emutils.utils import (attrdict, in_ipynb, pandas_max, load_pickle, save_pickle)

from emutils.model.wrappers import XGBClassifierSKLWrapper

from utils import *

# Warning policy: FutureWarning messages are silenced for the whole session.
import warnings
# Optional stricter mode (disabled): turn RuntimeWarning into an exception.
# warnings.filterwarnings(action="error", category=RuntimeWarning)
warnings.simplefilter('ignore', FutureWarning)

# NumPy floating-point policy: every floating-point error category
# (divide, over, under, invalid) raises instead of emitting a warning.
# Optional (disabled): print arrays without scientific notation.
# np.set_printoptions(suppress=True)
np.seterr(all='raise')

# NOTE(review): pandas_max comes from emutils.utils; presumably it sets the
# pandas display limits (rows / columns) -- confirm against the helper.
pandas_max(100, 200)

Python  3.6.13
Python Executable: /home/ubuntu/anaconda3/envs/cfshap22/bin/python
CWD =  /home/ubuntu/air/xai/cfshap-release/cf-shap-facct22
NumPy 1.19.2 | Pandas 1.1.5 | SciPy 1.5.2 | NetworkX 2.5.1 | StatsModels 0.12.2
scikit-learn 0.24.2 | xgboost 1.3.3 
MatPlotLib 3.3.4 | Seaborn 0.11.1 | 





In [2]:
from constants import EXPLANATIONS_DIR, MODEL_DIR, DATA_DIR

# Command-line / notebook configuration.
#
# The parser is built without positional arguments: assuming ArgumentParser is
# argparse.ArgumentParser (it arrives through a star import), its first
# positional parameter is `prog`, not the argument vector. The arguments
# themselves are read from sys.argv by parse_known_args() below.
parser = ArgumentParser()

# Locations of the inputs (data, trained models) and of the stored explanations.
parser.add_argument('--data_path', type=str, default=DATA_DIR)
parser.add_argument('--model_path', type=str, default=MODEL_DIR)
parser.add_argument('--results_path', type=str, default=EXPLANATIONS_DIR)

parser.add_argument('--random_state', type=int, default=2021)
parser.add_argument('--model_type', type=str, default='xgb')

# parse_known_args() tolerates extra arguments (e.g. the ones a Jupyter kernel
# is launched with); they are collected in `unknown` instead of raising.
args, unknown = parser.parse_known_args()
args = attrdict(vars(args))

# Show graphs and stuff or not? Set from in_ipynb().
args.show = in_ipynb()

print(args)

{'data_path': './data',
 'model_path': './models',
 'model_type': 'xgb',
 'random_state': 2021,
 'results_path': './explanations',
 'show': True}


In [3]:
# Models to analyse. Each entry is unpacked by the loading cells as
# (dataset, data_version, model_version).
MODELS = [
    ('heloc', 'v2', 'v5'), 
    ('lendingclub', 'v2', 'v5'), 
    ('wines', 'v2', 'v5'), 
]

# Result versions to load for every model; each one is passed to
# load_explanations(..., results_version=...).
RESULTS = [
    'v5_close', 
    'v5_far', 
]

In [4]:
# One entry per MODELS row, keyed by the display names of (dataset, model).
# Each value carries the two display names plus everything returned by
# load_data_and_model for that row.
data = {
    (dataset_to_name(dataset), model_version_to_name(model_version)): attrdict({
        "Dataset": dataset_to_name(dataset),
        "Model": model_version_to_name(model_version),
        **load_data_and_model(dataset, data_version, model_version, args),
    })
    for dataset, data_version, model_version in tqdm(MODELS)
}

  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Load the stored explanations for every (model, results version) pair.
# Keys are (dataset name, model name, decision-boundary name); each value merges
# the matching `data` entry with whatever load_explanations returns.
explanations = {}
for dataset, data_version, model_version in tqdm(MODELS, desc='Loading Explanations'):
    for results_version in RESULTS:
        try:
            explanations[(
                dataset_to_name(dataset),
                model_version_to_name(model_version),
                result_version_to_name(results_version),
            )] = attrdict({
                'Decision Boundary': result_version_to_name(results_version),
                **data[(dataset_to_name(dataset), model_version_to_name(model_version))],
                **load_explanations(args, dataset, data_version, model_version, results_version=results_version),
            })
        except FileNotFoundError:
            # A missing results file is reported but does not stop the other loads.
            warnings.warn(f'\nCould not load explanations for {dataset} {data_version} / {model_version} / {results_version}')

# Build an 'All' entry for every (dataset, model) pair that has both the
# 'Close (<50%)' and the 'Far (>50%)' explanations, by concatenating them.
#
# The pairs are collected in first-seen order and snapshotted into a list
# because the loop adds new keys to `explanations`. Taking key[:2] directly
# also works when nothing could be loaded (empty dict -> empty list).
dataset_model_pairs = list(dict.fromkeys(key[:2] for key in explanations))
for dataset, model_name in dataset_model_pairs:
    close_key = (dataset, model_name, 'Close (<50%)')
    far_key = (dataset, model_name, 'Far (>50%)')
    if close_key not in explanations or far_key not in explanations:
        continue

    close_part = explanations[close_key]
    far_part = explanations[far_key]

    merged = close_part.copy()
    merged['Decision Boundary'] = 'All'

    # Give the merged entry its own nested containers before writing into them.
    # NOTE(review): whether attrdict.copy() is deep is not visible here; if it
    # is shallow, writing through the shared containers would overwrite the
    # stored 'Close (<50%)' entry with the concatenated arrays.
    for section in ('metadata', 'values', 'trends', 'backgrounds'):
        if section in merged:
            merged[section] = merged[section].copy()

    for field in ['x', 'pred', 'prob', 'index']:
        merged['metadata'][field] = np.concatenate(
            [close_part['metadata'][field], far_part['metadata'][field]])
    # The manifold entry is reset rather than merged.
    merged['metadata']['manifold'] = None

    for name in merged['values'].keys():
        merged['values'][name] = np.concatenate(
            [close_part['values'][name], far_part['values'][name]])

    for name in merged['trends'].keys():
        merged['trends'][name] = np.concatenate(
            [close_part['trends'][name], far_part['trends'][name]])

    if 'backgrounds' in merged:
        for name in merged['backgrounds'].keys():
            merged['backgrounds'][name] = np.concatenate(
                [close_part['backgrounds'][name], far_part['backgrounds'][name]])

    print((dataset, model_name, 'All'))
    explanations[(dataset, model_name, 'All')] = merged

Loading Explanations:   0%|          | 0/3 [00:00<?, ?it/s]

('Wine Quality (White)', 'Non-Monotonic', 'All')
('HELOC (Home Equity Line of Credit)', 'Non-Monotonic', 'All')
('Lending Club (2007-2018)', 'Non-Monotonic', 'All')


In [6]:
def logit(x):
    """Return the log-odds ``log(x / (1 - x))`` of ``x`` (scalar or array)."""
    odds = x / (1 - x)
    return np.log(odds)

# Additivity check, computed on the merged 'All' explanations only.
#
# For every attribution method, the per-instance divergence is
#     | (decision_function(x) - sum of attributions) - logit(model.threshold) |
# `additivity_divergence` holds one summary record per (dataset, model, method);
# `additivity_divergence_all` holds one record per individual divergence value.
additivity_divergence = []
additivity_divergence_all = []
for (dataset_name, model_name, result_name), d in explanations.items():
    if result_name != 'All':
        continue
    model = d.model
    X = d.metadata['x']

    # Neither the model output nor the threshold depends on the attribution
    # method, so both are computed once per entry rather than once per method.
    pred = model.decision_function(X)
    logit_thr = logit(model.threshold)

    for method, vals in tqdm(d.values.items(), desc=str((dataset_name, model_name, result_name))):
        # Sum along axis 1 (presumably the feature axis -- TODO confirm).
        shap_sum = vals.sum(axis=1)
        shap_exp = pred - shap_sum
        delta = np.abs(shap_exp - logit_thr)
        # Drop NaN entries so they do not contaminate the statistics.
        delta = delta[~np.isnan(delta)]

        additivity_divergence.append({
            'Dataset': dataset_name,
            'Model': model_name,
            'Decision Boundary': result_name,
            'Method': method,
            'Divergences': delta,
            # delta is already non-negative, no further abs() needed.
            'Mean': delta.mean(),
            'Variance': delta.var(ddof=1),
        })
        additivity_divergence_all.extend(
            {
                'Dataset': dataset_name,
                'Model': model_name,
                'Decision Boundary': result_name,
                'Method': method,
                'Divergence': elta,
            }
            for elta in delta
        )

('Wine Quality (White)', 'Non-Monotonic', 'All'):   0%|          | 0/17 [00:00<?, ?it/s]

('HELOC (Home Equity Line of Credit)', 'Non-Monotonic', 'All'):   0%|          | 0/17 [00:00<?, ?it/s]

('Lending Club (2007-2018)', 'Non-Monotonic', 'All'):   0%|          | 0/17 [00:00<?, ?it/s]

In [7]:
def condition(m):
    """Return False when `m` contains 'knn1_', 'knn7_', 'knn3_' or 'knn50_', True otherwise."""
    excluded_tags = (f'knn{k}_' for k in (1, 7, 3, 50))
    return not any(tag in m for tag in excluded_tags)

In [8]:
# Per-instance divergence records whose method passes `condition`.
results_ = [row for row in additivity_divergence_all if condition(row['Method'])]

# Fragment of the full dataset name -> short label used in tables.
map_cols = {
    'HELOC': 'HELOC',
    'Lending': 'LC',
    'Wine': 'WQ',
}


def map_colsf(c):
    """Return the label of the first `map_cols` key contained in `c` (None when no key matches)."""
    return next((label for fragment, label in map_cols.items() if fragment in c), None)


# Long-format table: one row per divergence value of the 'Non-Monotonic' model,
# sorted by dataset and method, with display names for both columns.
df2 = pd.DataFrame(results_)
df2 = (
    df2.loc[df2['Model'] == 'Non-Monotonic', ['Method', 'Dataset', 'Divergence']]
    .sort_values(['Dataset', 'Method'])
    .assign(
        Method=lambda frame: frame['Method'].apply(method_to_name),
        Dataset=lambda frame: frame['Dataset'].apply(map_colsf),
    )
)

In [9]:
# Per-method summary records whose method passes `condition`.
results_ = [row for row in additivity_divergence if condition(row['Method'])]

# Fragment of the full dataset name -> short label used in tables.
map_cols = {
    'HELOC': 'HELOC',
    'Lending': 'LC',
    'Wine': 'WQ',
}


def map_colsf(c):
    """Return the label of the first `map_cols` key contained in `c` (None when no key matches)."""
    return next((label for fragment, label in map_cols.items() if fragment in c), None)


# Summary table for the 'Non-Monotonic' model; selecting the four columns also
# discards the raw 'Divergences' arrays.
df = pd.DataFrame(results_)
df = (
    df.loc[df['Model'] == 'Non-Monotonic', ['Method', 'Dataset', 'Mean', 'Variance']]
    .sort_values(['Dataset', 'Method'])
    .assign(
        Method=lambda frame: frame['Method'].apply(method_to_name),
        Dataset=lambda frame: frame['Dataset'].apply(map_colsf),
    )
)

# Methods as rows, datasets as columns, mean divergence as cell value. The
# outer column level created by values=['Mean'] is dropped.
pivot = df.pivot(index='Method', columns='Dataset', values=['Mean'])
pivot.columns = pivot.columns.droplevel()
pivot

Dataset,HELOC,LC,WQ
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CF-SHAP $10$-NNQ+L1,0.392888,0.191904,0.488839
CF-SHAP $100$-NNQ+L1,0.52461,0.268826,0.663118
CF-SHAP $100$-NNQ+L1$^*$,0.067102,0.080836,0.076111
CF-SHAP $1000$-NNQ+L1,0.778727,0.445226,0.845896
CF-SHAP $20$-NNQ+L1,0.424532,0.209391,0.53725
CF-SHAP $250$-NNQ+L1,0.604559,0.322399,0.740091
CF-SHAP $5$-NNQ+L1,0.364897,0.177237,0.455308
CF-SHAP $500$-NNQ+L1,0.682279,0.376471,0.797765
SHAP D-LAB,0.33096,0.03721,0.56772
SHAP D-LAB (n = 100),0.38454,0.03472,0.606445


In [10]:
# Render the pivot table as LaTeX, then post-process the text:
#   * \toprule and \midrule become \hline, the \bottomrule line is removed;
#   * the 'Dataset' / 'Method' axis labels are blanked out.
latex_table = pivot.to_latex(bold_rows=False, escape=False, column_format='rllll')
latex_table = latex_table.replace('toprule', 'hline').replace('midrule', 'hline')
latex_table = latex_table.replace('\\bottomrule\n', '')
latex_table = latex_table.replace('Dataset', '').replace('Method', '')
print(latex_table)

\begin{tabular}{rllll}
\hline
 &     HELOC &        LC &        WQ \\
                   &           &           &           \\
\hline
CF-SHAP $10$-NNQ+L1      &  0.392888 &  0.191904 &  0.488839 \\
CF-SHAP $100$-NNQ+L1     &  0.524610 &  0.268826 &  0.663118 \\
CF-SHAP $100$-NNQ+L1$^*$ &  0.067102 &  0.080836 &  0.076111 \\
CF-SHAP $1000$-NNQ+L1    &  0.778727 &  0.445226 &  0.845896 \\
CF-SHAP $20$-NNQ+L1      &  0.424532 &  0.209391 &  0.537250 \\
CF-SHAP $250$-NNQ+L1     &  0.604559 &  0.322399 &  0.740091 \\
CF-SHAP $5$-NNQ+L1       &  0.364897 &  0.177237 &  0.455308 \\
CF-SHAP $500$-NNQ+L1     &  0.682279 &  0.376471 &  0.797765 \\
SHAP D-LAB               &  0.330960 &  0.037210 &  0.567720 \\
SHAP D-LAB (n = 100)     &  0.384540 &  0.034720 &  0.606445 \\
SHAP D-PRED              &  0.940211 &  0.693693 &  0.880631 \\
SHAP D-PRED (n = 100)    &  1.000237 &  0.662427 &  0.852235 \\
SHAP TRAIN               &  0.437677 &  0.102876 &  0.456406 \\
SHAP TRAIN (n = 100)     &  0.412