In [1]:
import pandas as pd
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap
from cfmining.action_set import ActionSet
from cfmining.utils import get_data_model, FakeOutlierDetection

from experiments_helper import run_experiments, summarize_results, format_df_table


%load_ext autoreload
%autoreload 2

In [2]:
# Seed passed as `random_state` when sampling the 50 individuals for each
# dataset, so the same individuals are picked on every run.
SEED = 0

## German

In [3]:
# Fetch everything needed for the German experiments in one call; the five
# returned objects are named after how the cells below use them.
(
    X_train,
    Y_train,
    model,
    outlier_detection,
    individuals,
) = get_data_model("german", "LGBMClassifier")

# Keep a fixed, seeded sample of 50 individuals for the experiments.
individuals = individuals.sample(n=50, random_state=SEED)

In [4]:
# Columns that are excluded from the list of mutable features handed to ActionSet.
not_mutable_features = [
    "Age",
    "OwnsHouse",
    "isMale",
    "JobClassIsSkilled",
    "Single",
    "ForeignWorker",
    "RentsHouse",
]

# Every remaining training column, in the original column order.
mutable_features = [
    column for column in X_train.columns if column not in not_mutable_features
]

action_set = ActionSet(
    X=X_train,
    default_step_size=0.05,
    mutable_features=mutable_features,
)

# Predictor object passed to MAPOFCEM and run_experiments in the cells below;
# built here with the real outlier detector returned by get_data_model.
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)

In [None]:
# Sweep over outlier percentiles for the German dataset. For each percentile,
# MAPOFCEM is run twice -- with estimate_outlier=True (files tagged "v2") and
# with estimate_outlier=False (files tagged "v1") -- and run_experiments is
# given one CSV path per run.
outlier_percentile_list = [0.005, 0.01, 0.05, 0.075, 0.1]

# (file tag, estimate_outlier) pairs; v2 is executed before v1 for each percentile.
variants = [("v2", True), ("v1", False)]

for outlier_percentile in outlier_percentile_list:
    # Integer tag used in the file name (percentile expressed in thousandths).
    # round() instead of int(): int() truncates, so a float product that lands
    # just below an integer (the classic 0.29 * 100 == 28.999999999999996)
    # would yield an off-by-one tag. Identical to int() for the values above.
    perc = round(outlier_percentile * 1000)

    for version, estimate_outlier in variants:
        method = MAPOFCEM(
            action_set,
            model_shap,
            compare="percentile",
            estimate_prob_max=False,
            estimate_outlier=estimate_outlier,
            max_changes=3,
            outlier_percentile=outlier_percentile,
        )
        run_experiments(
            method,
            individuals,
            model_shap,
            f"../results/outlier_percentile/german/mapofcem_{version}_{perc}.csv",
        )



In [None]:
# Baseline for the German dataset (file tag 0): the same two MAPOFCEM variants,
# but with the predictor built on FakeOutlierDetection() instead of the real
# outlier detector.
# NOTE: this rebinds `model_shap`. Re-run the cell that builds it with
# `outlier_detection` before re-running the percentile sweep, otherwise the
# sweep would silently use the fake detector.
model_shap = GeneralClassifier_Shap(
    model,
    FakeOutlierDetection(),
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)
perc = 0

# This cell used to read `outlier_percentile` left over from the sweep loop
# (whose final value is 0.1), which raises NameError on a fresh kernel when the
# sweep cell is skipped. Pin the same value explicitly so the cell is
# self-contained and produces the same runs as before.
outlier_percentile = 0.1

# (file tag, estimate_outlier) pairs; v2 is executed before v1.
for version, estimate_outlier in [("v2", True), ("v1", False)]:
    method = MAPOFCEM(
        action_set,
        model_shap,
        compare="percentile",
        estimate_prob_max=False,
        estimate_outlier=estimate_outlier,
        max_changes=3,
        outlier_percentile=outlier_percentile,
    )
    run_experiments(
        method,
        individuals,
        model_shap,
        f"../results/outlier_percentile/german/mapofcem_{version}_{perc}.csv",
    )

### Results

In [32]:
# Gather the German result files (baseline tag 0 plus every swept percentile),
# summarize each one, and show a single table indexed by method.
outlier_percentile_list = [0, 0.005, 0.01, 0.05, 0.075, 0.1]
all_results = []
for i, outlier_percentile in enumerate(outlier_percentile_list):
    # Must match the tag used when the files were written. round() rather than
    # int() so a float product just below an integer cannot be truncated to the
    # wrong tag; identical to int() for the values listed above.
    perc = round(outlier_percentile * 1000)

    # v2 is appended before v1 for every percentile.
    for version in ("v2", "v1"):
        results = pd.read_csv(
            f"../results/outlier_percentile/german/mapofcem_{version}_{perc}.csv"
        )
        # NOTE(review): summarize_results always receives outlier_percentile=0.05,
        # regardless of the percentile used for the run -- presumably a common
        # evaluation threshold; confirm this is intended.
        results = summarize_results(results, dataset="german", outlier_percentile=0.05)
        results["outlier percentile"] = outlier_percentile
        # Method label carries the position in outlier_percentile_list, not the value.
        results["method"] = f"MAPOFCEM_{version}_{i}"
        all_results.append(results)

results = pd.concat(all_results)
# Pass every column except the last one (presumably "method", which was added last).
format_df_table(results, "method", results.columns[:-1])

Unnamed: 0,method,costs,n_changes,outlier,outliers_score,diversity,n_solutions,time,outlier percentile
0,MAPOFCEM_v1_0,0.077 (+-0.077),2.04 (+-0.856),0.18 (+-0.388),0.519 (+-0.045),0.0 (+-0.0),1.0 (+-0.0),0.189 (+-0.25),0.0 (+-0.0)
1,MAPOFCEM_v1_1,0.091 (+-0.085),2.14 (+-0.783),0.2 (+-0.404),0.517 (+-0.05),0.0 (+-0.0),1.0 (+-0.0),0.241 (+-0.335),0.005 (+-0.0)
2,MAPOFCEM_v1_2,0.095 (+-0.09),2.1 (+-0.789),0.12 (+-0.328),0.511 (+-0.043),0.0 (+-0.0),1.0 (+-0.0),0.256 (+-0.36),0.01 (+-0.0)
3,MAPOFCEM_v1_3,0.105 (+-0.103),2.08 (+-0.778),0.04 (+-0.198),0.507 (+-0.039),0.0 (+-0.0),1.0 (+-0.0),0.325 (+-0.501),0.05 (+-0.0)
4,MAPOFCEM_v1_4,0.136 (+-0.176),2.083 (+-0.739),0.0 (+-0.0),0.499 (+-0.035),0.0 (+-0.0),0.96 (+-0.198),0.581 (+-1.437),0.075 (+-0.0)
5,MAPOFCEM_v1_5,0.14 (+-0.174),2.062 (+-0.755),0.0 (+-0.0),0.493 (+-0.03),0.0 (+-0.0),0.96 (+-0.198),0.598 (+-1.445),0.1 (+-0.0)
6,MAPOFCEM_v2_0,0.077 (+-0.077),2.04 (+-0.856),0.18 (+-0.388),0.519 (+-0.045),0.0 (+-0.0),1.0 (+-0.0),0.192 (+-0.252),0.0 (+-0.0)
7,MAPOFCEM_v2_1,0.091 (+-0.085),2.14 (+-0.783),0.2 (+-0.404),0.517 (+-0.05),0.0 (+-0.0),1.0 (+-0.0),0.265 (+-0.348),0.005 (+-0.0)
8,MAPOFCEM_v2_2,0.095 (+-0.09),2.1 (+-0.789),0.12 (+-0.328),0.511 (+-0.043),0.0 (+-0.0),1.0 (+-0.0),0.279 (+-0.373),0.01 (+-0.0)
9,MAPOFCEM_v2_3,0.105 (+-0.103),2.08 (+-0.778),0.04 (+-0.198),0.507 (+-0.039),0.0 (+-0.0),1.0 (+-0.0),0.346 (+-0.51),0.05 (+-0.0)


## Taiwan

In [5]:
# Fetch everything needed for the Taiwan experiments in one call; the five
# returned objects are named after how the cells below use them. This replaces
# the German objects bound to the same names earlier in the notebook.
(
    X_train,
    Y_train,
    model,
    outlier_detection,
    individuals,
) = get_data_model("taiwan", "LGBMClassifier")

# Keep a fixed, seeded sample of 50 individuals for the experiments.
individuals = individuals.sample(n=50, random_state=SEED)

In [6]:
# Columns that are excluded from the list of mutable features handed to ActionSet.
not_mutable_features = [
    "Single",
    "Age_in_25_to_40",
    "Married",
    "Age_lt_25",
    "Age_in_40_to_59",
    "Age_geq_60",
    "EducationLevel",
]

# Every remaining training column, in the original column order.
mutable_features = [
    column for column in X_train.columns if column not in not_mutable_features
]

action_set = ActionSet(
    X=X_train,
    default_step_size=0.05,
    mutable_features=mutable_features,
)

# Predictor object passed to MAPOFCEM and run_experiments in the cells below;
# built here with the real outlier detector returned by get_data_model.
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)

In [7]:
# Sweep over outlier percentiles for the Taiwan dataset. For each percentile,
# MAPOFCEM is run twice -- with estimate_outlier=True (files tagged "v2") and
# with estimate_outlier=False (files tagged "v1") -- and run_experiments is
# given one CSV path per run.
outlier_percentile_list = [0.005, 0.01, 0.05, 0.075, 0.1]

# (file tag, estimate_outlier) pairs; v2 is executed before v1 for each percentile.
variants = [("v2", True), ("v1", False)]

for outlier_percentile in outlier_percentile_list:
    # Integer tag used in the file name (percentile expressed in thousandths).
    # round() instead of int(): int() truncates, so a float product that lands
    # just below an integer (the classic 0.29 * 100 == 28.999999999999996)
    # would yield an off-by-one tag. Identical to int() for the values above.
    perc = round(outlier_percentile * 1000)

    for version, estimate_outlier in variants:
        method = MAPOFCEM(
            action_set,
            model_shap,
            compare="percentile",
            estimate_prob_max=False,
            estimate_outlier=estimate_outlier,
            max_changes=3,
            outlier_percentile=outlier_percentile,
            time_limit=180,  # presumably seconds -- TODO confirm against MAPOFCEM
        )
        run_experiments(
            method,
            individuals,
            model_shap,
            f"../results/outlier_percentile/taiwan/mapofcem_{version}_{perc}.csv",
        )



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:49<00:00,  4.58s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:49<00:00,  4.59s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:51<00:00,  4.64s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:49<00:00,  4.60s/it]
100%|███████████████

In [8]:
# Baseline for the Taiwan dataset (file tag 0): the same two MAPOFCEM variants,
# but with the predictor built on FakeOutlierDetection() instead of the real
# outlier detector.
# NOTE: this rebinds `model_shap`. Re-run the cell that builds it with
# `outlier_detection` before re-running the percentile sweep, otherwise the
# sweep would silently use the fake detector.
model_shap = GeneralClassifier_Shap(
    model,
    FakeOutlierDetection(),
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)
perc = 0

# This cell used to read `outlier_percentile` left over from the sweep loop
# (whose final value is 0.1), which raises NameError on a fresh kernel when the
# sweep cell is skipped. Pin the same value explicitly so the cell is
# self-contained and produces the same runs as before.
outlier_percentile = 0.1

# (file tag, estimate_outlier) pairs; v2 is executed before v1.
for version, estimate_outlier in [("v2", True), ("v1", False)]:
    method = MAPOFCEM(
        action_set,
        model_shap,
        compare="percentile",
        estimate_prob_max=False,
        estimate_outlier=estimate_outlier,
        max_changes=3,
        outlier_percentile=outlier_percentile,
        time_limit=180,  # presumably seconds -- TODO confirm against MAPOFCEM
    )
    run_experiments(
        method,
        individuals,
        model_shap,
        f"../results/outlier_percentile/taiwan/mapofcem_{version}_{perc}.csv",
    )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:36<00:00,  1.39it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:35<00:00,  1.39it/s]


### Results

In [10]:
# Gather the Taiwan result files (baseline tag 0 plus every swept percentile),
# summarize each one, and show a single table indexed by method.
outlier_percentile_list = [0, 0.005, 0.01, 0.05, 0.075, 0.1]
all_results = []
for i, outlier_percentile in enumerate(outlier_percentile_list):
    # Must match the tag used when the files were written. round() rather than
    # int() so a float product just below an integer cannot be truncated to the
    # wrong tag; identical to int() for the values listed above.
    perc = round(outlier_percentile * 1000)

    # v2 is appended before v1 for every percentile.
    for version in ("v2", "v1"):
        results = pd.read_csv(
            f"../results/outlier_percentile/taiwan/mapofcem_{version}_{perc}.csv"
        )
        # NOTE(review): summarize_results always receives outlier_percentile=0.05,
        # regardless of the percentile used for the run -- presumably a common
        # evaluation threshold; confirm this is intended.
        results = summarize_results(results, dataset="taiwan", outlier_percentile=0.05)
        results["outlier percentile"] = outlier_percentile
        # Method label carries the position in outlier_percentile_list, not the value.
        results["method"] = f"MAPOFCEM_{version}_{i}"
        all_results.append(results)

results = pd.concat(all_results)
# Pass every column except the last one (presumably "method", which was added last).
format_df_table(results, "method", results.columns[:-1])

Unnamed: 0,method,costs,n_changes,outlier,outliers_score,diversity,n_solutions,time,outlier percentile
0,MAPOFCEM_v1_0,0.062 (+-0.073) | 0.141,2.4 (+-0.833) | 3.0,0.06 (+-0.24) | 0.55,0.453 (+-0.053) | 0.548,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.714 (+-0.862) | 3.014,0.0 (+-0.0) | 0.0
1,MAPOFCEM_v1_1,0.059 (+-0.059) | 0.141,2.48 (+-0.789) | 3.0,0.06 (+-0.24) | 0.55,0.454 (+-0.051) | 0.55,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,4.589 (+-25.368) | 3.541,0.005 (+-0.0) | 0.005
2,MAPOFCEM_v1_2,0.06 (+-0.059) | 0.141,2.469 (+-0.793) | 3.0,0.041 (+-0.2) | 0.0,0.45 (+-0.043) | 0.535,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,4.592 (+-25.373) | 3.541,0.01 (+-0.0) | 0.01
3,MAPOFCEM_v1_3,0.065 (+-0.066) | 0.218,2.469 (+-0.793) | 3.0,0.041 (+-0.2) | 0.0,0.449 (+-0.04) | 0.535,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,8.123 (+-35.485) | 7.489,0.05 (+-0.0) | 0.05
4,MAPOFCEM_v1_4,0.065 (+-0.061) | 0.144,2.458 (+-0.798) | 3.0,0.0 (+-0.0) | 0.0,0.446 (+-0.036) | 0.524,0.0 (+-0.0) | 0.0,0.96 (+-0.198) | 1.0,11.178 (+-40.729) | 89.793,0.075 (+-0.0) | 0.075
5,MAPOFCEM_v1_5,0.071 (+-0.073) | 0.247,2.447 (+-0.802) | 3.0,0.0 (+-0.0) | 0.0,0.444 (+-0.033) | 0.509,0.0 (+-0.0) | 0.0,0.94 (+-0.24) | 1.0,15.926 (+-49.29) | 180.02,0.1 (+-0.0) | 0.1
6,MAPOFCEM_v2_0,0.062 (+-0.073) | 0.141,2.4 (+-0.833) | 3.0,0.06 (+-0.24) | 0.55,0.453 (+-0.053) | 0.548,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.718 (+-0.869) | 3.039,0.0 (+-0.0) | 0.0
7,MAPOFCEM_v2_1,0.059 (+-0.059) | 0.141,2.48 (+-0.789) | 3.0,0.06 (+-0.24) | 0.55,0.454 (+-0.051) | 0.55,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,4.578 (+-24.909) | 3.714,0.005 (+-0.0) | 0.005
8,MAPOFCEM_v2_2,0.06 (+-0.059) | 0.141,2.469 (+-0.793) | 3.0,0.041 (+-0.2) | 0.0,0.45 (+-0.043) | 0.535,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,4.633 (+-25.365) | 3.706,0.01 (+-0.0) | 0.01
9,MAPOFCEM_v2_3,0.064 (+-0.062) | 0.185,2.469 (+-0.793) | 3.0,0.041 (+-0.2) | 0.0,0.449 (+-0.041) | 0.535,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,7.077 (+-30.62) | 7.581,0.05 (+-0.0) | 0.05
