In [60]:
import copy
import pandas as pd
import numpy as np
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap, GeneralClassifier
from cfmining.action_set import ActionSet
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice

from experiments_helper import get_data_model, run_experiments, format_df_table, summarize_results, get_action_set


%load_ext autoreload
%autoreload 2

SEED = 0

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Single Objective

### MAPOFCEM

In [None]:
# MAPOFCEM with the "percentile" comparison on an LGBMClassifier.
# For each dataset: load data/model, sample 50 individuals, build the action
# set, wrap the model together with the outlier detector, and write the run's
# results to ../results/lgbm/<dataset_name>/mapofcem_percentile.csv.
max_changes = np.inf  # presumably "no limit on the number of changes" -- confirm in MAPOFCEM
for dataset_name in ["german", "taiwan", "adult"]:
    dataset, X_train, Y_train, model, outlier_detection, individuals = get_data_model(dataset_name, "LGBMClassifier")
    individuals = individuals.sample(n = 50, random_state=SEED)
    # Align the detector's contamination with the value stored on the dataset.
    outlier_detection.contamination = dataset.outlier_contamination
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)

    model_wrap = GeneralClassifier_Shap(
        model,
        outlier_detection,
        X_train,
        shap_explainer="tree_pipe",
    )

    method = MAPOFCEM(
        action_set = action_set,
        classifier = model_wrap,
        compare = "percentile",
        max_changes = max_changes,
        outlier_contamination= dataset.outlier_contamination,
        estimate_outlier=False,
        time_limit=np.inf,
    )

    # The path must be built from `dataset_name` (the string), not `dataset`
    # (the object returned by get_data_model): the other experiment cells and
    # the Results cells all use ../results/lgbm/<dataset_name>/...
    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/lgbm/{dataset_name}/mapofcem_percentile.csv"
    )


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:55<00:00,  4.71s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:00<00:00,  3.62s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.31it/s]


### MAPOCAM

In [None]:
# MAPOCAM baseline with the "percentile" criteria on an LGBMClassifier.
# One results CSV is written per dataset under ../results/lgbm/<dataset_name>/.
max_changes = np.inf
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "LGBMClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    # Build the action set, then force flip_direction to 1 on every feature
    # and rebuild each feature's grid.
    action_set = get_action_set(dataset, X_train, default_step_size=0.05)
    for feat in action_set:
        feat.flip_direction = 1
        feat.update_grid()

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = MAPOCAM(
        action_set=action_set,
        model=model_wrap,
        criteria="percentile",
        max_changes=max_changes,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/lgbm/{dataset_name}/mapocam_percentile.csv",
    )


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [09:01<00:00, 10.83s/it]
PermutationExplainer explainer: 101it [00:11,  1.06it/s]                                                                                                                                 
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [07:47<00:00,  9.35s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:49<00:00,  1.01it/s]


### NICE

In [77]:
# NICE baseline on an LGBMClassifier.
# NICE receives the raw model; run_experiments receives the wrapped model
# (classifier + outlier detector + training data).
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "LGBMClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Nice(
        X_train,
        Y_train,
        model=model,
        cat_features=dataset.categoric_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/lgbm/{dataset_name}/nice.csv",
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 44.52it/s]
PermutationExplainer explainer: 101it [00:11,  1.06it/s]                                                                                                                                 
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 29.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 31.14it/s]


### DICE

In [None]:
# DiCE baseline on an LGBMClassifier, requesting a single counterfactual
# (n_cfs = 1) per individual.
for dataset_name in ("german", "taiwan", "adult"):
    (
        dataset,
        X_train,
        Y_train,
        model,
        outlier_detection,
        individuals,
    ) = get_data_model(dataset_name, "LGBMClassifier")
    individuals = individuals.sample(n=50, random_state=SEED)
    outlier_detection.contamination = dataset.outlier_contamination

    model_wrap = GeneralClassifier(model, outlier_detection, X_train)

    method = Dice(
        X_train,
        Y_train,
        model,
        n_cfs=1,
        mutable_features=dataset.mutable_features,
        continuous_features=dataset.continuous_features,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_wrap,
        output_file=f"../results/lgbm/{dataset_name}/dice.csv",
    )


### Results

In [83]:
# Summary table for the "german" dataset: read each method's results CSV,
# summarize it, tag it with the method name, and stack the summaries.
dataset = "german"
results = []
for method in ("mapofcem_percentile", "mapocam_percentile", "dice", "nice"):
    results_cur = summarize_results(
        pd.read_csv(f"../results/lgbm/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    results.append(results_cur)
results = pd.concat(results)
# "method" was added last, so dropping the final column leaves the metric columns.
format_df_table(results, "method", results.columns[:-1].tolist())

Unnamed: 0,method,costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.506 (+-0.265) | 0.911,1.78 (+-0.465) | 2.0,0.14 (+-0.351) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.175 (+-0.011) | 0.187
1,mapocam_percentile,0.062 (+-0.033) | 0.149,2.16 (+-1.149) | 4.55,0.16 (+-0.37) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,10.829 (+-59.039) | 21.395
2,mapofcem_percentile,0.066 (+-0.038) | 0.149,2.24 (+-1.349) | 5.0,0.08 (+-0.274) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,4.707 (+-25.488) | 8.696
3,nice,0.274 (+-0.211) | 0.742,1.4 (+-0.7) | 3.0,0.02 (+-0.141) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.019 (+-0.008) | 0.037


In [84]:
# Summary table for the "taiwan" dataset: read each method's results CSV,
# summarize it, tag it with the method name, and stack the summaries.
dataset = "taiwan"
results = []
for method in ("mapofcem_percentile", "mapocam_percentile", "dice", "nice"):
    results_cur = summarize_results(
        pd.read_csv(f"../results/lgbm/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    results.append(results_cur)
results = pd.concat(results)
# "method" was added last, so dropping the final column leaves the metric columns.
format_df_table(results, "method", results.columns[:-1].tolist())

Unnamed: 0,method,costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.578 (+-0.262) | 0.923,1.68 (+-0.471) | 2.0,0.7 (+-0.463) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.236 (+-0.008) | 0.254
1,mapocam_percentile,0.042 (+-0.024) | 0.087,1.96 (+-1.309) | 5.0,0.04 (+-0.198) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,9.347 (+-32.235) | 31.686
2,mapofcem_percentile,0.044 (+-0.026) | 0.101,2.1 (+-1.313) | 5.0,0.02 (+-0.141) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,3.618 (+-12.789) | 25.578
3,nice,0.208 (+-0.193) | 0.565,1.82 (+-1.063) | 4.0,0.04 (+-0.198) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.031 (+-0.011) | 0.052


In [85]:
# Summary table for the "adult" dataset: read each method's results CSV,
# summarize it, tag it with the method name, and stack the summaries.
dataset = "adult"
results = []
for method in ("mapofcem_percentile", "mapocam_percentile", "dice", "nice"):
    results_cur = summarize_results(
        pd.read_csv(f"../results/lgbm/{dataset}/{method}.csv"),
        dataset,
    )
    results_cur["method"] = method
    results.append(results_cur)
results = pd.concat(results)
# "method" was added last, so dropping the final column leaves the metric columns.
format_df_table(results, "method", results.columns[:-1].tolist())

Unnamed: 0,method,costs,n_changes,outlier,diversity,n_solutions,time
0,dice,0.861 (+-0.125) | 0.937,1.68 (+-0.471) | 2.0,0.84 (+-0.37) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.149 (+-0.006) | 0.158
1,mapocam_percentile,0.491 (+-0.315) | 0.866,1.62 (+-0.725) | 3.0,0.1 (+-0.303) | 1.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.984 (+-1.858) | 6.097
2,mapofcem_percentile,0.542 (+-0.31) | 0.866,1.48 (+-0.505) | 2.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.764 (+-1.585) | 4.872
3,nice,0.442 (+-0.376) | 0.817,1.28 (+-0.454) | 2.0,0.0 (+-0.0) | 0.0,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.029 (+-0.006) | 0.038


## Multiple Objectives