In [5]:
import copy
import pandas as pd
import numpy as np
import sys
sys.path.append("../")

from cfmining.algorithms import P2CE
from cfmining.predictors import GeneralClassifier_Shap, GeneralClassifier
from cfmining.action_set import ActionSet
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice
from cfmining.criteria import *

from experiments_helper import run_experiments, format_df_table, summarize_results, get_data_model, get_action_set


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Seed passed as `random_state` when sampling the 50 individuals evaluated
# in every experiment cell, so each method sees the same sample.
SEED = 0

In [7]:
# Upper bound forwarded as `max_changes` to P2CE and MAPOCAM.
max_changes = 3
# Single-objective criterion name: passed as `compare` to P2CE and as
# `criteria` to MAPOCAM, and embedded in the result file names.
objective = "abs_diff"

## Single objective

### P2CE

In [4]:
# P2CE, single objective: for each dataset, sample 50 individuals (fixed
# seed), wrap the MLP with the SHAP-enabled classifier wrapper, and write the
# results of `run_experiments` to one CSV per dataset.
for dataset_name in [
    "german",
    "taiwan",
    "adult",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n = 50, random_state=SEED)
    # Align the outlier detector's contamination with the dataset's setting.
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="deep_pipe",
    )

    method = P2CE(
        action_set = action_set,
        classifier = model_wrap,
        compare = objective,
        # "taiwan" is pinned to 3 changes regardless of the global
        # `max_changes` (which is currently also 3).
        max_changes = max_changes if dataset_name != "taiwan" else 3,
        outlier_contamination = dataset.outlier_contamination,
        estimate_outlier=True,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Key the path by the dataset *name* (a string), not the dataset
        # object: the results cells read from ../results/mlp/<name>/..., and
        # every other experiment cell writes to that same layout.
        output_file=f"../results/mlp/{dataset_name}/p2ce_deep_{objective}.csv"
    );


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:44<00:00,  5.69s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:37<00:00,  1.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:32<00:00,  1.86s/it]


### MAPOCAM

In [None]:
# MAPOCAM baseline, single objective: same sampling and output layout as the
# other experiment cells, using the plain classifier wrapper.
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    action_set = get_action_set(dataset, X_train, default_step_size=0.05)
    # Set flip_direction to 1 on every feature and rebuild its grid.
    # NOTE(review): presumably required by MAPOCAM — confirm the rationale.
    for feat in action_set:
        feat.flip_direction = 1
        feat.update_grid()

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = MAPOCAM(
        action_set=action_set,
        model=model_wrap,
        criteria=objective,
        max_changes=max_changes,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/mapocam_{objective}.csv",
    );


  4%|██████▌                                                                                                                                                              | 2/50 [00:14<04:50,  6.06s/it]

### DICE

In [None]:
# DiCE baseline, one counterfactual per individual (n_cfs = 1). Results go to
# ../results/mlp/<dataset_name>/dice.csv.
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # The wrapper is only handed to run_experiments; Dice itself receives
    # the raw model.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Dice(
        X_train,
        Y_train,
        model,
        n_cfs=1,
        mutable_features=dataset.mutable_features,
        continuous_features=dataset.continuous_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/dice.csv",
    )


### NICE

In [15]:
# NICE baseline. Results go to ../results/mlp/<dataset_name>/nice.csv.
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # The wrapper is only handed to run_experiments; Nice itself receives
    # the raw model.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Nice(
        X_train,
        Y_train,
        model=model,
        cat_features=dataset.categoric_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/nice.csv",
    );

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 29.36it/s]
PermutationExplainer explainer: 101it [00:11,  1.28it/s]                                                                                                                                                 
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 19.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 28.79it/s]


### Results

In [16]:
# Result file stems (without ".csv") loaded by the single-objective results
# cells from ../results/mlp/<dataset>/.
method_list = ["p2ce_deep_abs_diff", "mapocam_abs_diff", "dice", "nice"]

In [17]:
# Single-objective comparison table for the "german" dataset: load each
# method's CSV, summarize it, tag it with the method name, and stack them.
dataset = "german"
summaries = []
for method in method_list:
    results_cur = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    summaries.append(results_cur)
results = pd.concat(summaries)
# "method" was appended last, so every column but the last is a metric.
format_df_table(results, "method", results.columns[:-1].tolist())

In [18]:
# Single-objective comparison table for the "taiwan" dataset: load each
# method's CSV, summarize it, tag it with the method name, and stack them.
dataset = "taiwan"
summaries = []
for method in method_list:
    results_cur = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    summaries.append(results_cur)
results = pd.concat(summaries)
# "method" was appended last, so every column but the last is a metric.
format_df_table(results, "method", results.columns[:-1].tolist())

In [19]:
# Single-objective comparison table for the "adult" dataset: load each
# method's CSV, summarize it, tag it with the method name, and stack them.
dataset = "adult"
summaries = []
for method in method_list:
    results_cur = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    summaries.append(results_cur)
results = pd.concat(summaries)
# "method" was appended last, so every column but the last is a metric.
format_df_table(results, "method", results.columns[:-1].tolist())

## Multi-objective

### P2CE

In [8]:
# P2CE, multi-objective: same setup as the single-objective run, but the
# comparison is a MultiCriterion built from three criteria.
max_changes = 3
for dataset_name in [
    "german",
    "taiwan",
    "adult",
]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n = 50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="deep_pipe",
    )

    # Calculators shared by the criteria; both are built from this
    # dataset's action set.
    range_calc = RangeCalculator(action_set)
    perc_calc = PercentileCalculator(action_set = action_set)

    def compare_call(pivot):
        """Build the multi-criterion comparator for ``pivot``.

        Combines MaxDistCriterion (using ``range_calc``),
        NumberChangesCriterion, and PercentileCriterion (using
        ``perc_calc``) into a single MultiCriterion.

        NOTE(review): ``pivot`` is presumably the individual being
        explained — confirm against how P2CE invokes ``compare``.
        """
        criteria_list = [
            MaxDistCriterion(
                pivot,
                range_calc,
            ),
            NumberChangesCriterion(pivot),
            PercentileCriterion(
                pivot,
                perc_calc,
            )
        ]
        return MultiCriterion(criteria_list, pivot)

    method = P2CE(
        action_set = action_set,
        classifier = model_wrap,
        compare = compare_call,
        max_changes = max_changes,
        outlier_contamination= dataset.outlier_contamination,
        estimate_outlier=True,
        time_limit=np.inf,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        # Key the path by the dataset *name* (a string), not the dataset
        # object: the results cells read from ../results/mlp/<name>/..., and
        # every other experiment cell writes to that same layout.
        output_file=f"../results/mlp/{dataset_name}/p2ce_deep_multi.csv"
    );


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [18:59<00:00, 22.80s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:47<00:00,  1.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [05:38<00:00,  6.76s/it]


### MAPOCAM

In [None]:
# MAPOCAM baseline, multi-objective: the criteria are supplied as a callable
# that builds a MultiCriterion for a given pivot.
max_changes = 3
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    action_set = get_action_set(dataset, X_train, default_step_size=0.05)
    # Set flip_direction to 1 on every feature and rebuild its grid.
    # NOTE(review): presumably required by MAPOCAM — confirm the rationale.
    for feat in action_set:
        feat.flip_direction = 1
        feat.update_grid()

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    # Calculators shared by the criteria; both are built from this
    # dataset's action set.
    range_calc = RangeCalculator(action_set)
    perc_calc = PercentileCalculator(action_set=action_set)

    def compare_call(pivot):
        """Return a MultiCriterion over max-distance, number-of-changes and
        percentile criteria, all constructed for ``pivot``."""
        return MultiCriterion(
            [
                MaxDistCriterion(pivot, range_calc),
                NumberChangesCriterion(pivot),
                PercentileCriterion(pivot, perc_calc),
            ],
            pivot,
        )

    method = MAPOCAM(
        action_set=action_set,
        model=model_wrap,
        criteria=compare_call,
        max_changes=max_changes,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/mapocam_multi.csv",
    );


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [41:21<00:00, 49.63s/it]
PermutationExplainer explainer: 101it [00:12,  1.55it/s]                                                                                                                                                 
 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 42/50 [1:23:26<14:49, 111.18s/it]

### DICE

In [None]:
# DiCE baseline for the multi-objective comparison: request several
# counterfactuals per individual instead of one.
n_cfs = 4

for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "MLPClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # The wrapper is only handed to run_experiments; Dice itself receives
    # the raw model.
    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Dice(
        X_train,
        Y_train,
        model,
        n_cfs=n_cfs,
        mutable_features=dataset.mutable_features,
        continuous_features=dataset.continuous_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/mlp/{dataset_name}/dice_multi.csv",
    )


In [12]:
# Multi-objective comparison table for the "german" dataset: load each
# method's CSV, summarize it, tag it with the method name, and stack them.
dataset = "german"
summaries = []
for method in ("p2ce_deep_multi", "mapocam_multi", "dice_multi"):
    results_cur = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    summaries.append(results_cur)
results = pd.concat(summaries)
# "method" was appended last, so every column but the last is a metric.
format_df_table(results, "method", results.columns[:-1].tolist())

In [13]:
# Multi-objective comparison table for the "taiwan" dataset: load each
# method's CSV, summarize it, tag it with the method name, and stack them.
dataset = "taiwan"
summaries = []
for method in ("p2ce_deep_multi", "mapocam_multi", "dice_multi"):
    results_cur = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    summaries.append(results_cur)
results = pd.concat(summaries)
# "method" was appended last, so every column but the last is a metric.
format_df_table(results, "method", results.columns[:-1].tolist())

In [14]:
# Multi-objective comparison table for the "adult" dataset: load each
# method's CSV, summarize it, tag it with the method name, and stack them.
dataset = "adult"
summaries = []
for method in ("p2ce_deep_multi", "mapocam_multi", "dice_multi"):
    results_cur = summarize_results(
        pd.read_csv(f"../results/mlp/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    summaries.append(results_cur)
results = pd.concat(summaries)
# "method" was appended last, so every column but the last is a metric.
format_df_table(results, "method", results.columns[:-1].tolist())