In [1]:
import copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap
from cfmining.action_set import ActionSet
from cfmining.utils import get_data_model
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice

from experiments_helper import summarize_results, format_df_table, run_experiments

%load_ext autoreload
%autoreload 2

In [2]:
# Seed passed as random_state when sampling the 50 evaluated individuals in each section.
SEED = 0

## LGBM

### German

In [3]:
# Load the German data together with the LGBM model, then keep a seeded sample of 50 individuals.
(X_train, Y_train, model, outlier_detection, individuals) = get_data_model(
    "german",
    "LGBMClassifier",
)
individuals = individuals.sample(n=50, random_state=SEED)

In [4]:
# Features that are kept fixed; every other training column is treated as mutable.
not_mutable_features = [
    "Age",
    "OwnsHouse",
    "isMale",
    "JobClassIsSkilled",
    "Single",
    "ForeignWorker",
    "RentsHouse",
]
mutable_features = [col for col in X_train.columns if col not in not_mutable_features]
# Columns listed as continuous; the remaining training columns are taken as categoric.
continuous_features = [
    "Age",
    "LoanDuration",
    "LoanAmount",
    "LoanRateAsPercentOfIncome",
    "YearsAtCurrentHome",
    "NumberOfLiableIndividuals",
    "NumberOfOtherLoansAtBank",
]
categoric_features = [name for name in X_train.columns if name not in continuous_features]

# Action set built from the training data, and the SHAP-enabled classifier wrapper (tree explainer).
action_set = ActionSet(X=X_train, default_step_size=0.05, mutable_features=mutable_features)
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)

#### MAPOFCEM

In [21]:
# MAPOFCEM configured with the "non_dom" comparison and max_changes=3.
method = MAPOFCEM(action_set=action_set, classifier=model_shap, compare="non_dom", max_changes=3)

# Run on the sampled individuals; output_file is the CSV the Dice and Results cells read back.
# The trailing ";" suppresses the cell's displayed return value.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/german/mapofcem_non_dom.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:46<00:00,  2.13s/it]


In [5]:
# MAPOFCEM configured with the "percentile_change" comparison and max_changes=3.
method = MAPOFCEM(
    action_set=action_set, classifier=model_shap, compare="percentile_change", max_changes=3
)

# Run on the sampled individuals; output_file is the CSV the Results cell reads back.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/german/mapofcem_percentile_change.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:25<00:00,  1.95it/s]


#### MAPOCAM

In [25]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "non_dom" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="non_dom", max_changes=3)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/german/mapocam_non_dom.csv",
);

  0%|                                                                                                                                                                                                                        | 0/50 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [12:37<00:00, 15.15s/it]


In [6]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "percentile_change" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="percentile_change", max_changes=3)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/german/mapocam_percentile_change.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:46<00:00,  1.07it/s]


#### Dice

In [None]:
# Number of counterfactuals requested from Dice: the mean number of solutions in the
# MAPOFCEM (non_dom) results; int() truncates that mean.
# NOTE(review): the MLP Dice cells round up with np.ceil instead — confirm which is intended.
results = pd.read_csv("../results/multi_obj/lgbm/german/mapofcem_non_dom.csv")
n_cfs = int(results["n_solutions"].mean())

# Dice baseline built on the underlying classifier (model_shap.clf).
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs=n_cfs,
    mutable_features=mutable_features,
    continuous_features=continuous_features,
)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/german/dice.csv",
);

#### Results

In [7]:
# Summarise the saved results of every method for this dataset and stack them into one table.
dataset = "german"
results = []
for method in (
    "mapofcem_non_dom",
    "mapofcem_percentile_change",
    "mapocam_non_dom",
    "mapocam_percentile_change",
    "dice",
):
    results_cur = summarize_results(
        pd.read_csv(f"../results/multi_obj/lgbm/{dataset}/{method}.csv"),
        dataset,
        0.05,
    )
    results_cur["method"] = method  # label the summary with the method it belongs to
    results.append(results_cur)
results = pd.concat(results)
# Format all columns but the last one, with "method" as the second argument.
format_df_table(results, "method", results.columns[:-1].tolist())

Unnamed: 0,method,costs,n_changes,outlier,outliers_score,diversity,n_solutions,time
0,dice,0.508 (+-0.123) | 0.698,1.773 (+-0.337) | 2.08,4.08 (+-6.272) | 18.55,0.508 (+-0.045) | 0.581,0.138 (+-0.184) | 0.548,22.0 (+-0.0) | 22.0,0.303 (+-0.074) | 0.435
1,mapocam_non_dom,0.436 (+-0.13) | 0.627,2.092 (+-0.637) | 2.841,10.32 (+-18.098) | 43.05,0.514 (+-0.043) | 0.587,0.067 (+-0.093) | 0.236,36.4 (+-26.026) | 81.05,15.135 (+-13.875) | 39.759
2,mapocam_percentile_change,0.112 (+-0.106) | 0.32,1.29 (+-0.392) | 2.0,0.3 (+-0.58) | 1.55,0.511 (+-0.05) | 0.597,0.434 (+-0.495) | 1.0,1.5 (+-0.614) | 2.55,0.929 (+-2.667) | 3.017
3,mapofcem_non_dom,0.434 (+-0.147) | 0.628,2.053 (+-0.615) | 2.821,1.367 (+-2.243) | 6.0,0.503 (+-0.032) | 0.549,0.157 (+-0.2) | 0.618,22.74 (+-18.045) | 61.85,2.12 (+-1.916) | 6.275
4,mapofcem_percentile_change,0.15 (+-0.146) | 0.465,1.36 (+-0.535) | 2.775,0.02 (+-0.141) | 0.0,0.496 (+-0.036) | 0.546,0.376 (+-0.485) | 1.0,1.38 (+-0.49) | 2.0,0.511 (+-0.769) | 1.772


### Taiwan

In [29]:
# Load the Taiwan data with the LGBM model. The model name is passed explicitly, as in the
# other sections, so this LGBM section does not depend on get_data_model's default.
X_train, Y_train, model, outlier_detection, individuals = get_data_model("taiwan", "LGBMClassifier")
# Keep a seeded sample of 50 individuals for the experiments.
individuals = individuals.sample(n = 50, random_state=SEED)

In [30]:
# Features that are kept fixed; every other training column is treated as mutable.
not_mutable_features = ['Single', 'Age_in_25_to_40', 'Married', 'Age_lt_25', 'Age_in_40_to_59', 'Age_geq_60', 'EducationLevel']
mutable_features = [feat for feat in X_train.columns if feat not in not_mutable_features]
# Columns listed as continuous. Each name appears exactly once: a repeated
# "MostRecentPaymentAmount" entry was removed because this list is handed to Dice.
continuous_features = ["LIMIT_BAL", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
                       "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1",
                       "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5",
                       "PAY_AMT6", "MaxBillAmountOverLast6Months", "MaxPaymentAmountOverLast6Months",
                       "MostRecentBillAmount", "MostRecentPaymentAmount", "TotalMonthsOverdue",
                       "MonthsWithZeroBalanceOverLast6Months", "MonthsWithLowSpendingOverLast6Months", "MonthsWithHighSpendingOverLast6Months", "TotalOverdueCounts"]
# The remaining training columns are taken as categoric.
categoric_features = [col for col in X_train.columns if col not in continuous_features]
# Action set built from the training data, and the SHAP-enabled classifier wrapper (tree explainer).
action_set = ActionSet(X = X_train, default_step_size = 0.05, mutable_features = mutable_features)
model_shap = GeneralClassifier_Shap(model, outlier_detection, X_train, shap_explainer="tree", threshold = 0.5)

#### MAPOFCEM

In [32]:
# MAPOFCEM with the "non_dom" comparison, max_changes=3 and outlier_percentile=0.01.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="non_dom",
    max_changes=3,
    outlier_percentile=0.01,
)

# Run on the sampled individuals and keep the return value in `results`;
# output_file is the CSV the Dice and Results cells read back.
results = run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/taiwan/mapofcem_non_dom.csv",
);

  0%|                                                                                                                                                                                                                        | 0/50 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [2:01:21<00:00, 145.63s/it]


In [42]:
# MAPOFCEM with the "percentile" comparison, max_changes=3 and outlier_percentile=0.01.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    max_changes=3,
    outlier_percentile=0.01,
)

# Run on the sampled individuals and keep the return value in `results`;
# output_file is the CSV the Results cell reads back.
results = run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/taiwan/mapofcem_percentile.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:48<00:00,  4.57s/it]


#### MAPOCAM

In [None]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "non_dom" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="non_dom", max_changes=3)

# This experiment did not finish running; note that only the first 5 sampled
# individuals are passed here.
run_experiments(
    method,
    individuals=individuals.iloc[:5],
    model=model_shap,
    output_file="../results/multi_obj/lgbm/taiwan/mapocam_non_dom.csv",
);

In [41]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "percentile" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="percentile", max_changes=3)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/taiwan/mapocam_percentile.csv",
);

  0%|                                                                                                                                                                                                                        | 0/50 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [06:17<00:00,  7.55s/it]


#### Dice

In [None]:
# Number of counterfactuals requested from Dice: the mean number of solutions in the
# MAPOFCEM (non_dom) results; int() truncates that mean.
# NOTE(review): the MLP Dice cells round up with np.ceil instead — confirm which is intended.
results = pd.read_csv("../results/multi_obj/lgbm/taiwan/mapofcem_non_dom.csv")
n_cfs = int(results["n_solutions"].mean())

# Dice baseline built on the underlying classifier (model_shap.clf).
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs=n_cfs,
    mutable_features=mutable_features,
    continuous_features=continuous_features,
)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/lgbm/taiwan/dice.csv",
);

#### Results

In [43]:
# Summarise the saved results of each listed method for this dataset and stack them.
# mapocam_non_dom is not part of the list read here.
dataset = "taiwan"
metrics = []
for method in ("mapofcem_non_dom", "mapofcem_percentile", "mapocam_percentile", "dice"):
    results = pd.read_csv(f"../results/multi_obj/lgbm/{dataset}/{method}.csv")
    metrics.append(summarize_results(results, dataset))
    # Label the summary just appended with the method it belongs to.
    metrics[-1]["model"] = method
metrics = pd.concat(metrics)
# Format all columns but the last one, with "model" as the second argument.
format_df_table(metrics, "model", metrics.columns[:-1].tolist())

Unnamed: 0,model,costs,n_changes,outlier,outliers_score,diversity,n_solutions,time
0,dice,0.68 (+-0.145) | 0.887,2.14 (+-1.143) | 4.963,39.12 (+-49.596) | 144.6,0.515 (+-0.047) | 0.592,0.242 (+-0.322) | 0.87,151.0 (+-0.0) | 151.0,4.118 (+-4.789) | 15.324
1,mapocam_percentile,0.054 (+-0.05) | 0.141,2.54 (+-0.762) | 3.0,0.06 (+-0.24) | 0.55,0.454 (+-0.053) | 0.548,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,7.542 (+-20.91) | 48.562
2,mapofcem_non_dom,0.38 (+-0.099) | 0.541,2.517 (+-0.468) | 2.927,9.959 (+-43.448) | 23.2,0.457 (+-0.041) | 0.534,0.447 (+-0.3) | 0.898,151.0 (+-100.66) | 359.7,145.577 (+-59.874) | 180.083
3,mapofcem_percentile,0.058 (+-0.051) | 0.141,2.49 (+-0.767) | 3.0,0.041 (+-0.2) | 0.0,0.448 (+-0.043) | 0.532,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,4.566 (+-25.367) | 3.593


## MLP

### German

In [8]:
# Load the German data together with the MLP model, then keep a seeded sample of 50 individuals.
(X_train, Y_train, model, outlier_detection, individuals) = get_data_model(
    "german",
    "MLPClassifier",
)
individuals = individuals.sample(n=50, random_state=SEED)

In [9]:
# Features that are kept fixed; every other training column is treated as mutable.
not_mutable_features = [
    "Age",
    "OwnsHouse",
    "isMale",
    "JobClassIsSkilled",
    "Single",
    "ForeignWorker",
    "RentsHouse",
]
mutable_features = [col for col in X_train.columns if col not in not_mutable_features]
# Columns listed as continuous; the remaining training columns are taken as categoric.
continuous_features = [
    "Age",
    "LoanDuration",
    "LoanAmount",
    "LoanRateAsPercentOfIncome",
    "YearsAtCurrentHome",
    "NumberOfLiableIndividuals",
    "NumberOfOtherLoansAtBank",
]
categoric_features = [name for name in X_train.columns if name not in continuous_features]

# Action set built from the training data, and the SHAP-enabled classifier wrapper
# (permutation explainer for the MLP).
action_set = ActionSet(X=X_train, default_step_size=0.05, mutable_features=mutable_features)
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="permutation",
    threshold=0.5,
)

#### MAPOFCEM

In [55]:
# MAPOFCEM configured with the "non_dom" comparison and max_changes=3.
method = MAPOFCEM(action_set=action_set, classifier=model_shap, compare="non_dom", max_changes=3)

# Run on the sampled individuals; output_file is the CSV the Dice and Results cells read back.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/german/mapofcem_non_dom.csv",
);

  0%|                                                                                                                                                                                                                        | 0/50 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [21:34<00:00, 25.90s/it]


In [11]:
# MAPOFCEM configured with the "percentile_change" comparison and max_changes=3.
method = MAPOFCEM(
    action_set=action_set, classifier=model_shap, compare="percentile_change", max_changes=3
)

# Run on the sampled individuals; output_file is the CSV the Results cell reads back.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/german/mapofcem_percentile_change.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:37<00:00,  1.95s/it]


#### MAPOCAM

In [47]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "non_dom" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="non_dom", max_changes=3)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/german/mapocam_non_dom.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [32:29<00:00, 38.98s/it]


In [12]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "percentile_change" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="percentile_change", max_changes=3)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/german/mapocam_percentile_change.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:40<00:00,  2.01s/it]


#### Dice

In [None]:
# Number of counterfactuals requested from Dice: the mean number of solutions in the
# MAPOFCEM (non_dom) results, rounded up with np.ceil.
# NOTE(review): the LGBM Dice cells truncate with int() instead — confirm which is intended.
results = pd.read_csv("../results/multi_obj/mlp/german/mapofcem_non_dom.csv")
n_cfs = int(np.ceil(results["n_solutions"].mean()))

# Dice baseline built on the underlying classifier (model_shap.clf).
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs=n_cfs,
    mutable_features=mutable_features,
    continuous_features=continuous_features,
)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/german/dice.csv",
);

#### Results

In [16]:
# Summarise the saved results of every method for this dataset and stack them into one table.
dataset = "german"
results = []
for method in (
    "mapofcem_non_dom",
    "mapocam_non_dom",
    "mapofcem_percentile_change",
    "mapocam_percentile_change",
    "dice",
):
    results_cur = summarize_results(
        pd.read_csv(f"../results/multi_obj/mlp/{dataset}/{method}.csv"),
        dataset,
        0.05,
    )
    results_cur["method"] = method  # label the summary with the method it belongs to
    results.append(results_cur)
results = pd.concat(results)
# Format all columns but the last one, with "method" as the second argument.
format_df_table(results, "method", results.columns[:-1].tolist())

Unnamed: 0,method,costs,n_changes,outlier,outliers_score,diversity,n_solutions,time
0,dice,0.513 (+-0.118) | 0.74,2.372 (+-1.131) | 4.582,9.56 (+-12.221) | 37.55,0.52 (+-0.045) | 0.598,0.003 (+-0.008) | 0.016,41.0 (+-0.0) | 41.0,0.531 (+-0.212) | 1.018
1,mapocam_non_dom,0.514 (+-0.136) | 0.678,2.292 (+-0.626) | 2.989,5.9 (+-10.622) | 29.1,0.509 (+-0.045) | 0.581,0.062 (+-0.109) | 0.243,47.34 (+-38.204) | 114.55,38.965 (+-20.562) | 65.718
2,mapocam_percentile_change,0.087 (+-0.124) | 0.245,1.37 (+-0.579) | 2.55,0.26 (+-0.443) | 1.0,0.524 (+-0.045) | 0.6,0.096 (+-0.291) | 0.937,1.1 (+-0.303) | 2.0,2.006 (+-6.998) | 9.34
3,mapofcem_non_dom,0.517 (+-0.123) | 0.664,2.304 (+-0.601) | 2.926,1.771 (+-3.639) | 10.95,0.499 (+-0.033) | 0.545,0.109 (+-0.2) | 0.439,40.1 (+-31.255) | 86.75,25.885 (+-15.407) | 48.052
4,mapofcem_percentile_change,0.094 (+-0.091) | 0.265,1.375 (+-0.579) | 2.5,0.167 (+-0.377) | 1.0,0.514 (+-0.037) | 0.564,0.123 (+-0.328) | 0.991,1.08 (+-0.396) | 2.0,1.944 (+-3.707) | 4.383


### Taiwan

In [17]:
# Load the Taiwan data together with the MLP model, then keep a seeded sample of 50 individuals.
(X_train, Y_train, model, outlier_detection, individuals) = get_data_model(
    "taiwan",
    "MLPClassifier",
)
individuals = individuals.sample(n=50, random_state=SEED)

In [18]:
# Features that are kept fixed; every other training column is treated as mutable.
not_mutable_features = ['Single', 'Age_in_25_to_40', 'Married', 'Age_lt_25', 'Age_in_40_to_59', 'Age_geq_60', 'EducationLevel']
mutable_features = [feat for feat in X_train.columns if feat not in not_mutable_features]
# Columns listed as continuous. Each name appears exactly once: a repeated
# "MostRecentPaymentAmount" entry was removed because this list is handed to Dice.
continuous_features = ["LIMIT_BAL", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
                       "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1",
                       "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5",
                       "PAY_AMT6", "MaxBillAmountOverLast6Months", "MaxPaymentAmountOverLast6Months",
                       "MostRecentBillAmount", "MostRecentPaymentAmount", "TotalMonthsOverdue",
                       "MonthsWithZeroBalanceOverLast6Months", "MonthsWithLowSpendingOverLast6Months", "MonthsWithHighSpendingOverLast6Months", "TotalOverdueCounts"]
# The remaining training columns are taken as categoric.
categoric_features = [col for col in X_train.columns if col not in continuous_features]
# Action set built from the training data, and the SHAP-enabled classifier wrapper
# (permutation explainer for the MLP).
action_set = ActionSet(X = X_train, default_step_size = 0.05, mutable_features = mutable_features)
model_shap = GeneralClassifier_Shap(model, outlier_detection, X_train, shap_explainer="permutation", threshold = 0.5)

PermutationExplainer explainer: 1001it [00:29, 23.56it/s]                                                                                                                                                                                           


#### MAPOFCEM

In [None]:
# MAPOFCEM configured with the "non_dom" comparison and max_changes=3.
method = MAPOFCEM(action_set=action_set, classifier=model_shap, compare="non_dom", max_changes=3)

# Run on the sampled individuals; output_file is the CSV the Results cell reads back.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/taiwan/mapofcem_non_dom.csv",
);

In [20]:
# MAPOFCEM configured with the "percentile_change" comparison and max_changes=3.
method = MAPOFCEM(
    action_set=action_set, classifier=model_shap, compare="percentile_change", max_changes=3
)

# Run on the sampled individuals; output_file is the CSV the Dice cell of this section reads back.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/taiwan/mapofcem_percentile_change.csv",
);

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [1:18:11<00:00, 93.83s/it]


#### MAPOCAM

In [None]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "non_dom" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="non_dom", max_changes=3)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/taiwan/mapocam_non_dom.csv",
);

In [21]:
# NOTE(review): this flag is set on the shared model_shap and is never set back in the
# visible cells, so any cell run afterwards with this model_shap sees it too — confirm intended.
model_shap.use_predict_max = False  # to not use max prediction
# Small fix to the action set: work on a deep copy so action_set itself is left untouched,
# set flip_direction to 1 on every feature and call update_grid() on it.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM baseline with the "percentile_change" criteria and max_changes=3.
method = MAPOCAM(action_set_, model_shap, criteria="percentile_change", max_changes=3)

run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/multi_obj/mlp/taiwan/mapocam_percentile_change.csv",
);

 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                             | 27/50 [1:35:12<1:21:05, 211.56s/it]


KeyboardInterrupt: 

#### Dice

In [None]:
# Number of counterfactuals requested from Dice: the mean number of MAPOFCEM solutions,
# rounded up with np.ceil.
# NOTE(review): the other Dice cells read mapofcem_non_dom.csv while this one reads
# mapofcem_percentile_change.csv — confirm which file is intended.
results = pd.read_csv("../results/multi_obj/mlp/taiwan/mapofcem_percentile_change.csv")
n_cfs = int(np.ceil(results["n_solutions"].mean()))

# Dice baseline built on the underlying classifier (model_shap.clf).
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs = n_cfs,
    mutable_features = mutable_features,
    continuous_features = continuous_features,
)

# Write to the Taiwan folder: this section loads the Taiwan data with the MLP model, and
# the mlp/german path would overwrite the German MLP Dice results with Taiwan ones.
run_experiments(
    method,
    individuals = individuals,
    model = model_shap,
    output_file="../results/multi_obj/mlp/taiwan/dice.csv"
);

#### Results

In [None]:
# Summary table for the MLP / Taiwan section. The dataset must be "taiwan" here; with
# "german" this cell re-reads and re-summarises the German MLP results instead.
dataset = "taiwan"
results = []
for method in ["mapofcem_non_dom", "mapocam_non_dom", "dice"]:
    results_cur = pd.read_csv(f"../results/multi_obj/mlp/{dataset}/{method}.csv")
    # NOTE(review): the LGBM Taiwan results cell calls summarize_results without the
    # third argument — confirm 0.05 is the intended value for Taiwan.
    results_cur = summarize_results(results_cur, dataset, 0.05)
    results_cur["method"] = method  # label the summary with the method it belongs to
    results.append(results_cur)
results = pd.concat(results)
# Format all columns but the last one, with "method" as the second argument.
format_df_table(results, "method", results.columns.tolist()[:-1])