In [9]:
import copy
import pandas as pd
import numpy as np
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap, GeneralClassifier
from cfmining.action_set import ActionSet
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice

from experiments_helper import run_experiments, format_df_table, summarize_results, get_data_model, get_action_set


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Seed passed as `random_state` whenever the notebook samples the individuals
# to explain, so the sampled subset is reproducible across cells and re-runs.
SEED = 0

## 1 objective

### MAPOFCEM

In [None]:
# Single-objective MAPOFCEM run (compare="percentile") using the "deep_pipe"
# SHAP explainer, repeated for each dataset with the MLP classifier.
max_changes = np.inf
for dataset_name in [
    "german",
    "taiwan",
    "adult",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    # invert individuals ordering
    individuals = individuals.iloc[::-1]
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="deep_pipe",
    )

    method = MAPOFCEM(
        action_set=action_set,
        classifier=model_wrap,
        compare="percentile",
        # "taiwan" is capped at 3 changes; the other datasets use `max_changes`.
        max_changes=max_changes if dataset_name != "taiwan" else 3,
        outlier_contamination=dataset.outlier_contamination,
        estimate_outlier=False,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Build the path from the dataset *name* (as the MAPOCAM/DICE/NICE cells
        # and the Results cells do) instead of formatting the dataset object,
        # whose string form is not guaranteed to match the results directory.
        output_file=f"../results/mlp/{dataset_name}/mapofcem_percentile.csv",
    )


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [24:13<00:00, 29.07s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [16:45<00:00, 20.11s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:08<00:00,  6.23it/s]


In [None]:
# Single-objective MAPOFCEM run (compare="percentile") with
# `estimate_proba_max=True`, using the "deep_pipe" SHAP explainer on each
# dataset with the MLP classifier.
max_changes = np.inf
for dataset_name in [
    "taiwan",
    "adult",
    "german",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    # invert individuals ordering
    individuals = individuals.iloc[::-1]
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="deep_pipe",
    )

    method = MAPOFCEM(
        action_set=action_set,
        classifier=model_wrap,
        compare="percentile",
        # "taiwan" is capped at 3 changes; the other datasets use `max_changes`.
        max_changes=max_changes if dataset_name != "taiwan" else 3,
        outlier_contamination=dataset.outlier_contamination,
        estimate_outlier=False,
        estimate_proba_max=True,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Build the path from the dataset *name* (as the MAPOCAM/DICE/NICE cells
        # do) instead of formatting the dataset object, whose string form is
        # not guaranteed to match the results directory.
        output_file=f"../results/mlp/{dataset_name}/mapofcem_percentile_estimate_proba_max.csv",
    )


 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 46/50 [15:37<01:24, 21.10s/it]

In [None]:
# Single-objective MAPOFCEM run (compare="percentile") using the
# "permutation" SHAP explainer, repeated for each dataset with the MLP
# classifier.
max_changes = np.inf
for dataset_name in [
    "german",
    "taiwan",
    "adult",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    # invert individuals ordering
    individuals = individuals.iloc[::-1]
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="permutation",
    )

    method = MAPOFCEM(
        action_set=action_set,
        classifier=model_wrap,
        compare="percentile",
        # "taiwan" is capped at 3 changes; the other datasets use `max_changes`.
        max_changes=max_changes if dataset_name != "taiwan" else 3,
        outlier_contamination=dataset.outlier_contamination,
        estimate_outlier=False,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Build the path from the dataset *name* (as the MAPOCAM/DICE/NICE cells
        # and the Results cells do) instead of formatting the dataset object,
        # whose string form is not guaranteed to match the results directory.
        output_file=f"../results/mlp/{dataset_name}/mapofcem_percentile_perm.csv",
    )


### MAPOCAM

In [None]:
# Single-objective MAPOCAM baseline (criteria="percentile") on each dataset,
# using the MLP classifier.
max_changes = np.inf
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    action_set = get_action_set(dataset, X_train, default_step_size=0.05)
    # Force flip_direction to 1 on every feature, then call update_grid()
    # (presumably so the grid reflects the new direction - confirm in cfmining).
    for feat in action_set:
        feat.flip_direction = 1
        feat.update_grid()

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = MAPOCAM(
        action_set=action_set,
        model=model_wrap,
        criteria="percentile",
        # "taiwan" is capped at 3 changes; the other datasets use `max_changes`.
        max_changes=3 if dataset_name == "taiwan" else max_changes,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/mapocam_percentile.csv",
    )


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [07:36<00:00,  9.12s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [17:00<00:00, 20.42s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  4.13it/s]


### DICE

In [None]:
# DiCE baseline: one counterfactual (n_cfs=1) per sampled individual, for each
# dataset with the MLP classifier.
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # The wrapped model is what run_experiments receives; Dice itself is
    # given the raw `model`.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Dice(
        X_train,
        Y_train,
        model,
        n_cfs=1,
        mutable_features=dataset.mutable_features,
        continuous_features=dataset.continuous_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/dice.csv",
    )


### NICE

In [85]:
# Setup-only cell for the "taiwan" dataset: loads the data/model, samples the
# individuals and builds the wrapped classifier and action set. No method is
# run here.
dataset_name = "taiwan"
(
    dataset,
    X_train,
    Y_train,
    model,
    outlier_detection,
    individuals,
) = get_data_model(dataset_name, "MLPClassifier")
# Fixed-seed sample of 50 individuals.
individuals = individuals.sample(n=50, random_state=SEED)
outlier_detection.contamination = dataset.outlier_contamination

model_wrap = GeneralClassifier(model, outlier_detection, X_train)
action_set = get_action_set(dataset, X_train, default_step_size=0.05)

In [28]:
# NICE baseline on each dataset, using the MLP classifier.
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # The wrapped model is what run_experiments receives; Nice itself is
    # given the raw `model`.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Nice(
        X_train,
        Y_train,
        model=model,
        cat_features=dataset.categoric_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/nice.csv",
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 41.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 23.29it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 34.24it/s]


### Results

In [87]:
# Summary table of the single-objective methods on the "german" dataset.
dataset = "german"
method_names = (
    "mapofcem_percentile",
    "mapocam_percentile",
    "mapofcem_percentile_perm",
    "dice",
    "nice",
)
results = []
for method in method_names:
    # Load the per-individual results written by run_experiments and summarize them.
    csv_path = f"../results/mlp/{dataset}/{method}.csv"
    results_cur = summarize_results(pd.read_csv(csv_path), dataset)
    results_cur["method"] = method
    results.append(results_cur)
results = pd.concat(results)
# Pass every column except the last one (the "method" label added above,
# assuming summarize_results does not already return such a column).
format_df_table(results, "method", results.columns.tolist()[:-1])

Unnamed: 0,method,costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.456 (+-0.276) | 0.91,2.0 (+-0.452) | 2.55,0.12 (+-0.328) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.174 (+-0.022) | 0.203
1,mapocam_percentile,0.063 (+-0.023) | 0.103,2.04 (+-1.16) | 4.55,0.18 (+-0.388) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,9.121 (+-40.07) | 21.08
2,mapofcem_percentile,0.069 (+-0.033) | 0.14,2.22 (+-1.433) | 5.0,0.1 (+-0.303) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,29.063 (+-131.733) | 65.315
3,mapofcem_percentile_perm,0.069 (+-0.033) | 0.14,2.66 (+-1.451) | 5.0,0.1 (+-0.303) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,16.532 (+-70.975) | 41.773
4,nice,0.413 (+-0.288) | 0.868,2.06 (+-1.284) | 4.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.022 (+-0.012) | 0.039


In [88]:
# Summary table of the single-objective methods on the "taiwan" dataset.
dataset = "taiwan"
method_names = (
    "mapofcem_percentile",
    "mapocam_percentile",
    "mapofcem_percentile_perm",
    "dice",
    "nice",
)
results = []
for method in method_names:
    # Load the per-individual results written by run_experiments and summarize them.
    csv_path = f"../results/mlp/{dataset}/{method}.csv"
    results_cur = summarize_results(pd.read_csv(csv_path), dataset)
    results_cur["method"] = method
    results.append(results_cur)
results = pd.concat(results)
# Pass every column except the last one (the "method" label added above,
# assuming summarize_results does not already return such a column).
format_df_table(results, "method", results.columns.tolist()[:-1])

Unnamed: 0,method,costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.656 (+-0.215) | 0.923,1.7 (+-0.463) | 2.0,0.88 (+-0.328) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.238 (+-0.009) | 0.258
1,mapocam_percentile,0.262 (+-0.079) | 0.338,2.28 (+-0.536) | 3.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,20.415 (+-19.029) | 59.75
2,mapofcem_percentile,0.262 (+-0.079) | 0.338,2.22 (+-0.616) | 3.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,20.101 (+-20.808) | 65.305
3,mapofcem_percentile_perm,0.262 (+-0.079) | 0.338,2.22 (+-0.616) | 3.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,20.192 (+-20.821) | 65.425
4,nice,0.708 (+-0.164) | 0.923,3.46 (+-1.249) | 6.0,0.4 (+-0.495) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.04 (+-0.011) | 0.064


In [89]:
# Summary table of the single-objective methods on the "adult" dataset.
dataset = "adult"
method_names = (
    "mapofcem_percentile",
    "mapocam_percentile",
    "mapofcem_percentile_perm",
    "dice",
    "nice",
)
results = []
for method in method_names:
    # Load the per-individual results written by run_experiments and summarize them.
    csv_path = f"../results/mlp/{dataset}/{method}.csv"
    results_cur = summarize_results(pd.read_csv(csv_path), dataset)
    results_cur["method"] = method
    results.append(results_cur)
results = pd.concat(results)
# Pass every column except the last one (the "method" label added above,
# assuming summarize_results does not already return such a column).
format_df_table(results, "method", results.columns.tolist()[:-1])

Unnamed: 0,method,costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.781 (+-0.171) | 0.891,1.4 (+-0.495) | 2.0,0.54 (+-0.503) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.139 (+-0.006) | 0.149
1,mapocam_percentile,0.547 (+-0.244) | 0.692,1.74 (+-0.487) | 2.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.239 (+-0.324) | 0.912
2,mapofcem_percentile,0.547 (+-0.244) | 0.692,1.74 (+-0.487) | 2.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.16 (+-0.144) | 0.456
3,mapofcem_percentile_perm,0.547 (+-0.244) | 0.692,1.74 (+-0.487) | 2.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.156 (+-0.253) | 0.437
4,nice,0.634 (+-0.352) | 0.883,1.66 (+-0.688) | 3.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.027 (+-0.006) | 0.038


## Multi-objective

In [None]:
# Multi-objective MAPOFCEM run (compare="non_dom") using the "deep_pipe" SHAP
# explainer, limited to 3 changes, on each dataset with the MLP classifier.
max_changes = 3
for dataset_name in [
    "german",
    "taiwan",
    "adult",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="deep_pipe",
    )

    method = MAPOFCEM(
        action_set=action_set,
        classifier=model_wrap,
        compare="non_dom",
        max_changes=max_changes,
        outlier_contamination=dataset.outlier_contamination,
        estimate_outlier=False,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Build the path from the dataset *name* (as the MAPOCAM non_dom cell
        # does) instead of formatting the dataset object, whose string form is
        # not guaranteed to match the results directory.
        output_file=f"../results/mlp/{dataset_name}/mapofcem_non_dom.csv",
    )


In [None]:
# Multi-objective MAPOCAM baseline (criteria="non_dom"), limited to 3 changes,
# on each dataset with the MLP classifier.
max_changes = 3
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    # Fixed-seed sample of 50 individuals to explain.
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    action_set = get_action_set(dataset, X_train, default_step_size=0.05)
    # Force flip_direction to 1 on every feature, then call update_grid()
    # (presumably so the grid reflects the new direction - confirm in cfmining).
    for feat in action_set:
        feat.flip_direction = 1
        feat.update_grid()

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = MAPOCAM(
        action_set=action_set,
        model=model_wrap,
        criteria="non_dom",
        max_changes=max_changes,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/mapocam_non_dom.csv",
    )
