In [1]:
import copy
import pandas as pd
import numpy as np
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap, GeneralClassifier
from cfmining.action_set import ActionSet
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice
from cfmining.criteria import *

from experiments_helper import run_experiments, format_df_table, summarize_results, get_data_model, get_action_set


%load_ext autoreload
%autoreload 2

In [2]:
# Seed passed as `random_state` when sampling the individuals to explain,
# so every method is evaluated on the same sample.
SEED = 0

In [3]:
# Cap handed to MAPOFCEM / MAPOCAM through their `max_changes` argument.
max_changes = 3
# Name of the single-objective criterion given to the search methods;
# it is also embedded in the result CSV file names.
objective = "abs_diff"

## Single objective

### MAPOFCEM

In [4]:
# Single-objective MAPOFCEM runs on the MLP classifier, one pass per dataset.
for dataset_name in [
    "german",
    "taiwan",
    "adult",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    # Draw 50 individuals with a fixed random_state so runs are repeatable.
    individuals = individuals.sample(n = 50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    # Wrapper that bundles the model with the outlier detector and training data,
    # configured with the "deep_pipe" SHAP explainer.
    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="deep_pipe",
    )

    method = MAPOFCEM(
        action_set = action_set,
        classifier = model_wrap,
        compare = objective,
        # NOTE(review): with max_changes == 3 both branches evaluate to 3, so the
        # "taiwan" override is currently a no-op; it only matters if max_changes
        # is raised in the configuration cell.
        max_changes = max_changes if dataset_name != "taiwan" else 3,
        outlier_contamination = dataset.outlier_contamination,
        estimate_outlier=True,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Use the dataset *name* for the directory: interpolating the `dataset`
        # object would format its str()/repr instead, and the results cells read
        # from ../results/mlp/<dataset_name>/.
        output_file=f"../results/mlp/{dataset_name}/mapofcem_deep_{objective}.csv"
    );


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [05:00<00:00,  6.01s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:27<00:00,  1.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:40<00:00,  1.24it/s]


### MAPOCAM

In [None]:
# Baseline: single-objective MAPOCAM, evaluated per dataset on the MLP classifier.
for dataset_name in ["german", "taiwan", "adult"]:
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # Build the action set, then force flip_direction to 1 on every feature and
    # rebuild its grid.
    # NOTE(review): presumably this restricts the grid to one direction for
    # MAPOCAM -- confirm against the ActionSet implementation.
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)
    for feature in action_set:
        feature.flip_direction = 1
        feature.update_grid()

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = MAPOCAM(
        action_set=action_set,
        model=model_wrap,
        criteria=objective,
        max_changes=max_changes,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/mapocam_{objective}.csv",
    );


  4%|██████▌                                                                                                                                                              | 2/50 [00:14<04:50,  6.06s/it]

### DICE

In [None]:
# Baseline: DiCE producing a single counterfactual (n_cfs = 1) per individual.
for dataset_name in (
    "german",
    "taiwan",
    "adult",
):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # The wrapped classifier is only passed to run_experiments;
    # Dice itself receives the raw model.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Dice(
        X_train,
        Y_train,
        model,
        n_cfs=1,
        mutable_features=dataset.mutable_features,
        continuous_features=dataset.continuous_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/dice.csv",
    )


### NICE

In [15]:
# Baseline: NICE, evaluated per dataset on the MLP classifier.
for dataset_name in (
    "german",
    "taiwan",
    "adult",
):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # The wrapped classifier is only passed to run_experiments;
    # Nice itself receives the raw model.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Nice(
        X_train,
        Y_train,
        model=model,
        cat_features=dataset.categoric_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/nice.csv",
    );

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 29.36it/s]
PermutationExplainer explainer: 101it [00:11,  1.28it/s]                                                                                                                                                 
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 19.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 28.79it/s]


### Results

In [16]:
# Base names of the single-objective result CSVs (read from
# ../results/mlp/<dataset>/<name>.csv) that the tables below compare.
method_list = ["mapofcem_deep_abs_diff", "mapocam_abs_diff", "dice", "nice"]

In [17]:
# Single-objective comparison table for the "german" dataset.
dataset = "german"

per_method = []
for method in method_list:
    # Load the raw per-individual results and summarise them in one step.
    summary = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    summary["method"] = method
    per_method.append(summary)

results = pd.concat(per_method)
# Report all columns but the last one.
columns_to_show = results.columns.tolist()[:-1]
format_df_table(results, "method", columns_to_show)

Unnamed: 0,method,percentile_costs,lp_costs,max_dist_costs,abs_diff_costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.436 (+-0.269) | 0.886,4.336 (+-1.244) | 6.128,3.639 (+-0.985) | 4.807,0.222 (+-0.08) | 0.319,2.04 (+-0.45) | 2.0,0.14 (+-0.351) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.176 (+-0.027) | 0.186
1,mapocam_abs_diff,0.391 (+-0.227) | 0.859,2.29 (+-1.459) | 4.778,2.09 (+-1.236) | 4.028,0.108 (+-0.085) | 0.265,1.98 (+-0.742) | 3.0,0.1 (+-0.303) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,33.04 (+-38.475) | 122.782
2,mapofcem_deep_abs_diff,0.401 (+-0.22) | 0.86,2.134 (+-1.241) | 4.485,1.958 (+-1.1) | 4.028,0.099 (+-0.068) | 0.246,1.917 (+-0.71) | 3.0,0.021 (+-0.144) | 0.0,0.0 (+-0.0) | 0.0,0.96 (+-0.198) | 1.0,6.006 (+-5.883) | 17.452
3,nice,0.459 (+-0.254) | 0.8,3.273 (+-1.586) | 6.306,2.846 (+-1.274) | 5.103,0.175 (+-0.122) | 0.396,2.48 (+-1.474) | 4.55,0.06 (+-0.24) | 0.55,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.031 (+-0.016) | 0.053


In [18]:
# Single-objective comparison table for the "taiwan" dataset.
dataset = "taiwan"

per_method = []
for method in method_list:
    # Load the raw per-individual results and summarise them in one step.
    summary = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    summary["method"] = method
    per_method.append(summary)

results = pd.concat(per_method)
# Report all columns but the last one.
columns_to_show = results.columns.tolist()[:-1]
format_df_table(results, "method", columns_to_show)

Unnamed: 0,method,percentile_costs,lp_costs,max_dist_costs,abs_diff_costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.663 (+-0.252) | 0.933,24.993 (+-13.406) | 47.144,24.775 (+-13.473) | 47.135,2.421 (+-1.24) | 4.342,1.68 (+-0.471) | 2.0,0.98 (+-0.141) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.226 (+-0.009) | 0.24
1,mapocam_abs_diff,0.328 (+-0.071) | 0.441,2.093 (+-0.963) | 3.546,1.832 (+-0.819) | 3.101,0.251 (+-0.123) | 0.431,2.0 (+-0.535) | 3.0,0.02 (+-0.141) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,86.972 (+-83.696) | 236.193
2,mapofcem_deep_abs_diff,0.33 (+-0.071) | 0.441,2.115 (+-0.96) | 3.554,1.854 (+-0.813) | 3.115,0.253 (+-0.123) | 0.431,1.98 (+-0.52) | 3.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,0.545 (+-0.645) | 0.893
3,nice,0.587 (+-0.244) | 0.926,3.827 (+-2.373) | 8.831,3.191 (+-2.17) | 8.217,0.556 (+-0.348) | 1.126,3.62 (+-1.354) | 6.0,0.38 (+-0.49) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.047 (+-0.014) | 0.072


In [19]:
# Single-objective comparison table for the "adult" dataset.
dataset = "adult"

per_method = []
for method in method_list:
    # Load the raw per-individual results and summarise them in one step.
    summary = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    summary["method"] = method
    per_method.append(summary)

results = pd.concat(per_method)
# Report all columns but the last one.
columns_to_show = results.columns.tolist()[:-1]
format_df_table(results, "method", columns_to_show)

Unnamed: 0,method,percentile_costs,lp_costs,max_dist_costs,abs_diff_costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.847 (+-0.122) | 0.929,8.893 (+-3.363) | 13.949,8.571 (+-3.43) | 13.402,1.261 (+-0.477) | 1.965,1.54 (+-0.503) | 2.0,0.96 (+-0.198) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.152 (+-0.005) | 0.161
1,mapocam_abs_diff,0.658 (+-0.213) | 0.879,1.897 (+-0.817) | 3.044,1.702 (+-0.651) | 2.291,0.29 (+-0.163) | 0.537,1.56 (+-0.733) | 3.0,0.34 (+-0.479) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,4.673 (+-4.673) | 13.218
2,mapofcem_deep_abs_diff,0.62 (+-0.204) | 0.87,1.927 (+-0.871) | 2.973,1.768 (+-0.775) | 2.231,0.298 (+-0.161) | 0.536,1.776 (+-0.848) | 3.0,0.02 (+-0.143) | 0.0,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,0.802 (+-0.452) | 1.195
3,nice,0.774 (+-0.264) | 0.891,2.691 (+-1.316) | 4.349,2.464 (+-1.221) | 4.034,0.436 (+-0.245) | 0.778,2.04 (+-0.88) | 3.0,0.7 (+-0.463) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.032 (+-0.009) | 0.042


## Multi-objective

### MAPOFCEM

In [None]:
# Multi-objective MAPOFCEM runs on the MLP classifier, one pass per dataset.
max_changes = 3
for dataset_name in [
    "german",
    "taiwan",
    "adult",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    # Draw 50 individuals with a fixed random_state so runs are repeatable.
    individuals = individuals.sample(n = 50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    # Wrapper that bundles the model with the outlier detector and training data,
    # configured with the "deep_pipe" SHAP explainer.
    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="deep_pipe",
    )

    # Calculators shared by the criteria that compare_call builds.
    range_calc = RangeCalculator(action_set)
    perc_calc = PercentileCalculator(action_set = action_set)

    def compare_call(pivot):
        """Build the multi-objective criterion for one individual.

        Combines the max-distance, number-of-changes and percentile criteria,
        all constructed around `pivot`, into a single MultiCriterion.
        """
        criteria_list = [
            MaxDistCriterion(
                pivot,
                range_calc,
            ),
            NumberChangesCriterion(pivot),
            PercentileCriterion(
                pivot,
                perc_calc,
            )
        ]
        return MultiCriterion(criteria_list, pivot)

    method = MAPOFCEM(
        action_set = action_set,
        classifier = model_wrap,
        compare = compare_call,
        max_changes = max_changes,
        outlier_contamination= dataset.outlier_contamination,
        estimate_outlier=True,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Use the dataset *name* for the directory: interpolating the `dataset`
        # object would format its str()/repr instead, and the DICE cell and the
        # results cells read this file from ../results/mlp/<dataset_name>/.
        output_file=f"../results/mlp/{dataset_name}/mapofcem_deep_multi.csv"
    );


 48%|██████████████████████████████████████████████████████████████████████████████▋                                                                                     | 24/50 [02:54<04:14,  9.80s/it]

### MAPOCAM

In [None]:
# Baseline: multi-objective MAPOCAM, evaluated per dataset on the MLP classifier.
max_changes = 3
for dataset_name in ["german", "taiwan", "adult"]:
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # Build the action set, then force flip_direction to 1 on every feature and
    # rebuild its grid.
    # NOTE(review): presumably this restricts the grid to one direction for
    # MAPOCAM -- confirm against the ActionSet implementation.
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)
    for feature in action_set:
        feature.flip_direction = 1
        feature.update_grid()

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    # Calculators shared by the criteria that compare_call builds.
    range_calc = RangeCalculator(action_set)
    perc_calc = PercentileCalculator(action_set=action_set)

    def compare_call(pivot):
        """Return a MultiCriterion over max-distance, number-of-changes and
        percentile criteria, all constructed around `pivot`."""
        return MultiCriterion(
            [
                MaxDistCriterion(pivot, range_calc),
                NumberChangesCriterion(pivot),
                PercentileCriterion(pivot, perc_calc),
            ],
            pivot,
        )

    method = MAPOCAM(
        action_set=action_set,
        model=model_wrap,
        criteria=compare_call,
        max_changes=max_changes,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/mapocam_multi.csv",
    );


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [41:21<00:00, 49.63s/it]
PermutationExplainer explainer: 101it [00:12,  1.55it/s]                                                                                                                                                 
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 42/50 [1:23:26<14:49, 111.18s/it]

### DICE

In [None]:
# Baseline: DiCE asked for as many counterfactuals as MAPOFCEM's largest solution set.
for dataset_name in (
    "german",
    "taiwan",
    "adult",
):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # Requires the multi-objective MAPOFCEM results to already exist on disk:
    # n_cfs is the maximum `n_solutions` found in their summary.
    mapofcem_summary = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset_name}/mapofcem_deep_multi.csv"),
        dataset_name,
    )
    n_cfs = mapofcem_summary.n_solutions.max()

    # The wrapped classifier is only passed to run_experiments;
    # Dice itself receives the raw model.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Dice(
        X_train,
        Y_train,
        model,
        n_cfs=n_cfs,
        mutable_features=dataset.mutable_features,
        continuous_features=dataset.continuous_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/dice_multi.csv",
    )


In [12]:
# Multi-objective comparison table for the "german" dataset.
dataset = "german"

per_method = []
for method in ["mapofcem_deep_multi", "mapocam_multi", "dice_multi"]:
    # Load the raw per-individual results and summarise them in one step.
    summary = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    summary["method"] = method
    per_method.append(summary)

results = pd.concat(per_method)
# Report all columns but the last one.
columns_to_show = results.columns.tolist()[:-1]
format_df_table(results, "method", columns_to_show)

Unnamed: 0,method,percentile_costs,lp_costs,max_dist_costs,n_changes,outlier,diversity,n_solutions,time
0,dice_multi,0.475 (+-0.156) | 0.715,1.884 (+-0.626) | 3.11,0.959 (+-0.1) | 1.006,2.072 (+-0.608) | 3.33,0.084 (+-0.222) | 0.4,0.735 (+-0.172) | 0.971,5.0 (+-0.0) | 5.0,0.225 (+-0.059) | 0.319
1,mapocam_multi,0.176 (+-0.147) | 0.456,1.257 (+-0.603) | 2.017,0.852 (+-0.206) | 1.0,1.601 (+-0.541) | 2.433,0.068 (+-0.223) | 0.5,0.489 (+-0.485) | 0.998,1.92 (+-1.192) | 4.0,28.192 (+-16.771) | 53.969
2,mapofcem_deep_multi,0.2 (+-0.149) | 0.456,1.327 (+-0.686) | 2.5,0.852 (+-0.206) | 1.0,1.697 (+-0.633) | 3.0,0.031 (+-0.158) | 0.0,0.548 (+-0.48) | 0.998,2.0 (+-1.195) | 4.0,2.832 (+-1.813) | 7.001


In [13]:
# Multi-objective comparison table for the "taiwan" dataset.
dataset = "taiwan"

per_method = []
for method in ["mapofcem_deep_multi", "mapocam_multi", "dice_multi"]:
    # Load the raw per-individual results and summarise them in one step.
    summary = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    summary["method"] = method
    per_method.append(summary)

results = pd.concat(per_method)
# Report all columns but the last one.
columns_to_show = results.columns.tolist()[:-1]
format_df_table(results, "method", columns_to_show)

Unnamed: 0,method,percentile_costs,lp_costs,max_dist_costs,n_changes,outlier,diversity,n_solutions,time
0,dice_multi,0.643 (+-0.176) | 0.903,6.932 (+-1.826) | 9.242,6.325 (+-1.683) | 8.484,1.624 (+-0.178) | 1.889,0.847 (+-0.21) | 1.0,0.988 (+-0.049) | 0.999,9.0 (+-0.0) | 9.0,1.52 (+-6.035) | 0.335
1,mapocam_multi,0.352 (+-0.102) | 0.507,0.952 (+-0.516) | 1.69,0.574 (+-0.255) | 0.9,2.243 (+-0.346) | 2.667,0.031 (+-0.098) | 0.296,0.835 (+-0.29) | 1.0,3.3 (+-1.799) | 6.55,120.226 (+-111.701) | 306.863
2,mapofcem_deep_multi,0.349 (+-0.105) | 0.51,0.855 (+-0.429) | 1.515,0.537 (+-0.223) | 0.9,2.173 (+-0.329) | 2.617,0.017 (+-0.08) | 0.0,0.77 (+-0.359) | 1.0,2.9 (+-1.982) | 6.55,8.378 (+-6.171) | 16.855


In [14]:
# Multi-objective comparison table for the "adult" dataset.
dataset = "adult"

per_method = []
for method in ["mapofcem_deep_multi", "mapocam_multi", "dice_multi"]:
    # Load the raw per-individual results and summarise them in one step.
    summary = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    summary["method"] = method
    per_method.append(summary)

results = pd.concat(per_method)
# Report all columns but the last one.
columns_to_show = results.columns.tolist()[:-1]
format_df_table(results, "method", columns_to_show)

Unnamed: 0,method,percentile_costs,lp_costs,max_dist_costs,n_changes,outlier,diversity,n_solutions,time
0,dice_multi,0.805 (+-0.069) | 0.873,2.895 (+-0.633) | 3.708,2.589 (+-0.578) | 3.462,1.508 (+-0.139) | 1.738,0.628 (+-0.228) | 1.0,0.978 (+-0.101) | 0.998,17.0 (+-0.0) | 17.0,0.243 (+-0.012) | 0.266
1,mapocam_multi,0.576 (+-0.186) | 0.78,0.352 (+-0.295) | 0.905,0.239 (+-0.175) | 0.55,1.772 (+-0.459) | 2.429,0.042 (+-0.108) | 0.303,0.939 (+-0.24) | 1.0,6.7 (+-5.1) | 17.0,1.08 (+-1.665) | 4.779
2,mapofcem_deep_multi,0.572 (+-0.183) | 0.762,0.343 (+-0.281) | 0.87,0.24 (+-0.177) | 0.562,1.765 (+-0.45) | 2.389,0.02 (+-0.056) | 0.167,0.939 (+-0.24) | 1.0,6.3 (+-4.418) | 14.55,2.998 (+-4.112) | 11.535
