In [1]:
import joblib 
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# --- Working-directory shuffle around the project-local import -----------------
# If the current path contains 'src' or 'results' (substring match on the whole
# path, not just the last component), step up one directory.
if 'src' in os.getcwd() or 'results' in os.getcwd():
    os.chdir('..')
    print(os.getcwd())
# Enter 'src' before importing; presumably experiments_helpers lives there and is
# only importable from that directory -- TODO confirm.
if 'src' not in os.getcwd():
    os.chdir('src')

from experiments_helpers import ExperimentResults

# Step back up after the import so that relative paths such as `folder` below are
# resolved from the parent directory.
if 'src' in os.getcwd() or 'results' in os.getcwd():
    os.chdir('..')
    print(os.getcwd())

# Location of the serialized experiment results (relative to the cwd set above).
folder = 'results_experiment_corr'
name = 'mlp-torch-fico-statrob-TwoSamplesOneDatasetExperimentData-gs-config_a060.yml_0'

# Load one stored experiment run from its .joblib file.
es_base: ExperimentResults = ExperimentResults.load_results_from_file(f'{folder}/{name}.joblib')

# Print the metric summary (mean and std per metric) for this run.
es_base.pretty_print_robust_vs_base()

/home/ignacy/research/robust-cf
/home/ignacy/research/robust-cf/src


2024-03-05 22:31:18.123580: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-05 22:31:18.153654: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


/home/ignacy/research/robust-cf
ExperimentResults with 32 metrics and 0 artifacts.
############################## Metrics ##############################
------------------------- Base metrics -------------------------
validity: 1.00 (std: 0.00)
proximityL1: 1.72 (std: 1.53)
proximityL2: 0.45 (std: 0.39)
lof: -1.38 (std: 0.38)
cf_counterfactual_stability: 0.41 (std: 0.08)
dpow: 0.47 (std: 0.14)
plausibility: 0.87 (std: 0.33)
generation_time: 0.03 (std: 0.02)
------------------------- Base metrics 2 -------------------------
validity_2: 0.36 (std: 0.48)
proximityL1_2: 1.72 (std: 1.53)
proximityL2_2: 0.45 (std: 0.39)
lof_2: -1.46 (std: 0.43)
cf_counterfactual_stability_2: 0.47 (std: 0.08)
dpow_2: 0.52 (std: 0.10)
plausibility_2: 0.88 (std: 0.34)
------------------------- Robust metrics -------------------------
robust_validity
robust_validity: 1.00 (std: 0.00)
robust_proximityL1
robust_proximityL1: 1.93 (std: 1.47)
robust_proximityL2
robust_proximityL2: 0.50 (std: 0.38)
robust_lof
robust_

In [None]:
# Display the loaded run's results as a pandas object (rich notebook output).
es_base.get_results_as_pandas()

In [None]:
# Mean of every proximity column, computed over rows that have no nulls and
# whose robust counterfactual has robust_validity > 0.
res = es_base.get_results_as_pandas().dropna()  # TODO investigate what causes nulls
valid_on_base = res['robust_validity'] > 0  # TODO fix so robust cf is always valid for model 1
res = res.loc[valid_on_base]
cols = [column for column in res.columns if 'prox' in column]
res[cols].mean()

In [None]:
# Validity summary (mean / std per validity column) for each Statrob alpha
# threshold listed in `threshs`.
name = 'mlp-torch-fico-statrob-TwoSamplesOneDatasetExperimentData-config_a0'
threshs = np.array([60])

# One entry per threshold: column means, column stds, and the filtered frame.
robs, robs_std, results_dfs = [], [], []

for thresh in threshs:
    path = f'{folder}/{name}{thresh}.yml_0.joblib'
    print(path)
    er = ExperimentResults.load_results_from_file(path)
    res = er.get_results_as_pandas().dropna()  # TODO investigate what causes nulls
    res = res.loc[res['robust_validity'] > 0]  # TODO fix so robust cf is always valid for model 1
    cols = [c for c in res.columns if 'validity' in c]
    validity = res[cols]
    robs.append(validity.mean().to_numpy())
    robs_std.append(validity.std().to_numpy())
    results_dfs.append(res)

# Display names for the four validity columns, and the alpha value per row
# (threshold digits become the fractional part, e.g. 60 -> 0.6).
validity_names = ['validity_orig_1', 'validity_orig_2', 'validity_statrob_1', 'validity_statrob_2']
alpha_rob = [float(f'0.{t}') for t in threshs]

df_stdev = pd.DataFrame(robs_std, columns=cols)
df_stdev.columns = validity_names
df_stdev['alpha-Rob'] = alpha_rob
print('stddevs:', df_stdev.round(3))

df = pd.DataFrame(robs, columns=cols).round(3)
df.columns = validity_names
df['alpha-Rob'] = alpha_rob
df

In [None]:
# Scatter plot of empirical validity on the changed model against the alpha
# threshold, for the plain GS counterfactuals and for the robustified ones.
# Error bars show the per-threshold standard deviation from `df_stdev`.
fig, ax = plt.subplots(figsize=(10,5))
method = 'Statrob' if 'statrob' in name else 'Robx'

# Robustified counterfactuals (validity on the changed model).
sns.scatterplot(data=df, x='alpha-Rob', y='validity_statrob_2', s=100, legend=False,
                label=f'GS + {method}', ax=ax)
ax.errorbar(df['alpha-Rob'], df['validity_statrob_2'], yerr=df_stdev['validity_statrob_2'],
            fmt='o', color='blue', capsize=5, alpha=0.5, label=f'GS + {method} std')

# Baseline counterfactuals (validity on the changed model).
sns.scatterplot(data=df, x='alpha-Rob', y='validity_orig_2', s=100, legend=False,
                label='GS', ax=ax)
ax.errorbar(df['alpha-Rob'], df['validity_orig_2'], yerr=df_stdev['validity_orig_2'],
            fmt='o', color='orange', capsize=5, alpha=0.5, label='GS std')

ax.set_title(f'Empirical Validity of {method} and its alpha confidence thresholds')
ax.set_xlabel('alpha-Robustness')
ax.set_ylabel('Empirical Validity on Changed Model')
# Validity is a fraction, so pin the y-axis to [0, 1].
ax.set_ylim(0, 1)

# Print the numeric value next to each point.
for i, row in df.iterrows():
    ax.text(row['alpha-Rob'], row['validity_statrob_2'] - 0.02, f'{row["validity_statrob_2"]:.2f}',
            ha='left', va='top', color='blue')
    ax.text(row['alpha-Rob'], row['validity_orig_2'], f'{row["validity_orig_2"]:.2f}',
            ha='right', va='bottom', color='orange')

# Move the legend outside the axes.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='Method')

# Lay out BEFORE saving: the saved PNG must include the layout adjustment,
# otherwise the legend placed outside the axes can be cut off in the file.
fig.tight_layout()
fig.savefig(f'{folder}/{name}_val_rob_corr.png')
plt.show()

In [None]:
# Bar plot of validity on the original vs. changed model for ONE alpha threshold
# of the Statrob sweep, with the matching standard deviations.

# Which row of `df` / `df_stdev` to plot. -1 = the last (highest) threshold that
# was computed; a fixed position such as 3 fails when fewer thresholds were run.
ROW = -1

# Keep only the four validity columns. `df_stdev` may also carry 'alpha-Rob';
# leaving it in would yield 5 values for a 4-row table and raise a ValueError.
validity_row = df.drop(columns='alpha-Rob').iloc[ROW]
stddev_row = df_stdev.drop(columns='alpha-Rob', errors='ignore').iloc[ROW]
alpha = df['alpha-Rob'].iloc[ROW]

table = validity_row.to_frame().round(3)
# table.index = ['original_1', 'original_2', 'statrob_1', 'statrob_2']
table.columns = ['Validity']
table['StdDev'] = stddev_row.to_numpy().round(3)
table['model'] = ['original', 'changed', 'original', 'changed']
# Build the label from the plotted row so it always names the alpha that is shown.
robust_label = f'GrowingSpheres + Statrobxplus({alpha:g}-Rob)'
table['Method'] = ['GrowingSpheres', 'GrowingSpheres', robust_label, robust_label]



# Plotting the bar plot
sns.catplot(
    data=table, kind="bar",
    y="Validity", x="Method", hue="model",
    palette="dark", alpha=.6, height=6, legend=True, legend_out=False, aspect=2
)

# Adding error bars based on StdDev (only for the odd rows, i.e. the 'changed' model)
for i, (validity, std_dev) in enumerate(zip(table['Validity'], table['StdDev'])):
    if i % 2 == 1:
        plt.errorbar(i / 2 - 0.3 , validity, yerr=std_dev, fmt='none', ecolor='black', capsize=5, alpha=0.3)

# Adding numbers on top of bars
ax = plt.gca()
for cont in ax.containers[:2]:
    for bar in cont:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}', ha='center', va='bottom')

# Adjusting legend position
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')

plt.title('Validity of counterfactuals on the original and changed model. \nExperiment: Same data, same model with different seed')
plt.tight_layout()
plt.savefig(f'{folder}/{name}_barplot.png')
plt.show()


print(table)

In [None]:
# Validity summary (mean / std / non-null count per validity column) for each
# RobX threshold listed in `threshs`.
name = 'torch-fico-robx-var01-config0'
threshs = np.array([6,7,8,9])

# One entry per threshold: column means, column stds, and non-null counts.
robs, robs_std, notna_count = [], [], []

for thresh in threshs:
    path = f'{folder}/{name}{thresh}_0.joblib'
    print(path)
    er = ExperimentResults.load_results_from_file(path)
    # No null / robust_validity filtering in this sweep; the number of non-null
    # values per column is recorded instead.
    # TODO investigate what causes nulls
    # TODO fix so robust cf is always valid for model 1
    res = er.get_results_as_pandas()
    cols = [c for c in res.columns if 'validity' in c]
    validity = res[cols]
    robs.append(validity.mean().to_numpy())
    robs_std.append(validity.std().to_numpy())
    notna_count.append(validity.notna().sum().to_numpy())

df_stdev = pd.DataFrame(robs_std, columns=cols)
print('stddevs:', df_stdev.round(3))

df = pd.DataFrame(robs, columns=cols).round(3)
df.columns = ['validity_orig_1', 'validity_orig_2', 'validity_statrob_1', 'validity_statrob_2']
# Threshold digits 6..9 map to 0.6..0.9.
df['alpha-Rob'] = threshs / 10
df

In [None]:
# Bar plot of validity on the original vs. changed model for ONE threshold of
# the RobX sweep, with the matching standard deviations.

# Which row of `df` / `df_stdev` to plot. Row 3 corresponds to threshs[3] = 9,
# i.e. 0.9 in the 'alpha-Rob' column. Change ROW to plot another threshold.
ROW = 3

validity_row = df.drop(columns='alpha-Rob').iloc[ROW]
# `errors='ignore'`: df_stdev normally has no 'alpha-Rob' column in this sweep.
stddev_row = df_stdev.drop(columns='alpha-Rob', errors='ignore').iloc[ROW]
tau = df['alpha-Rob'].iloc[ROW]

table = validity_row.to_frame().round(3)
# table.index = ['original_1', 'original_2', 'statrob_1', 'statrob_2']
table.columns = ['Validity']
table['StdDev'] = stddev_row.to_numpy().round(3)
table['model'] = ['original', 'changed', 'original', 'changed']
# Build the label from the plotted row. A hard-coded 'tau=0.6' here would
# mislabel the figure, because ROW = 3 is the 0.9 row.
# NOTE(review): if tau=0.6 was the intended data, set ROW = 0 instead.
robust_label = f'GrowingSpheres + RobX(tau={tau:g},std=0.1)'
table['Method'] = ['GrowingSpheres', 'GrowingSpheres', robust_label, robust_label]



# Plotting the bar plot
sns.catplot(
    data=table, kind="bar",
    y="Validity", x="Method", hue="model",
    palette="dark", alpha=.6, height=6, legend=True, legend_out=False, aspect=2
)

# Adding error bars based on StdDev (only for the odd rows, i.e. the 'changed' model)
for i, (validity, std_dev) in enumerate(zip(table['Validity'], table['StdDev'])):
    if i % 2 == 1:
        plt.errorbar(i / 2 - 0.3 , validity, yerr=std_dev, fmt='none', ecolor='black', capsize=5, alpha=0.3)

# Adding numbers on top of bars
ax = plt.gca()
for cont in ax.containers[:2]:
    for bar in cont:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}', ha='center', va='bottom')

# Adjusting legend position
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')

plt.title('Validity of counterfactuals on the original and changed model. \nExperiment: Same data, same model with different seed')
plt.tight_layout()
plt.savefig(f'{folder}/{name}_barplot.png')
plt.show()


print(table)

In [None]:
# Table of non-null validity counts: one row per tau, one column per validity metric.
tau_labels = [f'tau=0.{digit}' for digit in (6, 7, 8, 9)]
count_labels = [
    'validity_base_original',
    'validity_base_changed',
    'validity_robx_original',
    'validity_robx_changed',
]

notna_df = pd.DataFrame(notna_count, columns=cols)
notna_df.index, notna_df.columns = tau_labels, count_labels
print('How many instances were correctly returned by the explainer? For how many instances was RobX able to find a valid CF?')
notna_df