In [44]:
import copy
import pandas as pd
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap
from cfmining.action_set import ActionSet
from cfmining.utils import get_data_model
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice

from experiments_helper import run_experiments, format_df_table, summarize_results


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Seed passed as `random_state` when sampling the individuals to explain.
SEED: int = 0

## German

In [4]:
# Load the German data, the LGBMClassifier model and the outlier detector,
# then keep a reproducible sample of 50 individuals to explain.
(
    X_train,
    Y_train,
    model,
    outlier_detection,
    individuals,
) = get_data_model("german", "LGBMClassifier")
individuals = individuals.sample(n=50, random_state=SEED)

In [21]:
# Columns excluded from `mutable_features` (and therefore from the action set).
not_mutable_features = [
    "Age",
    "OwnsHouse",
    "isMale",
    "JobClassIsSkilled",
    "Single",
    "ForeignWorker",
    "RentsHouse",
]
mutable_features = [
    column for column in X_train.columns if column not in not_mutable_features
]

# Columns declared continuous; every remaining column goes to `categoric_features`.
continuous_features = [
    "Age",
    "LoanDuration",
    "LoanAmount",
    "LoanRateAsPercentOfIncome",
    "YearsAtCurrentHome",
    "NumberOfLiableIndividuals",
    "NumberOfOtherLoansAtBank",
]
categoric_features = [
    column for column in X_train.columns if column not in continuous_features
]

# Action grid over the mutable features and the SHAP-backed classifier wrapper
# shared by the experiment cells below.
action_set = ActionSet(
    X=X_train,
    default_step_size=0.05,
    mutable_features=mutable_features,
)
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)

### MAPOFCEM

In [9]:
# MAPOFCEM on the German sample: percentile comparison, at most 3 changes.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    max_changes=3,
)

# Results are written to CSV; the trailing `;` hides the call's return value.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/lgbm/german/mapofcem_percentile.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:17<00:00,  2.79it/s]


### MAPOCAM

In [10]:
# MAPOCAM baseline on the German sample.
#
# `model_shap` is shared with every other experiment cell, so the flag is
# switched off only while this baseline runs and is put back afterwards.
# Without the restore, the outcome of later cells would depend on whether
# this cell had been executed before them.
use_predict_max_before = model_shap.use_predict_max
model_shap.use_predict_max = False  # to not use max prediction
try:
    # little fix to action set: applied to a deep copy so that `action_set`
    # itself is left untouched for the other methods.
    action_set_ = copy.deepcopy(action_set)
    for feat in action_set_:
        feat.flip_direction = 1
        feat.update_grid()

    method = MAPOCAM(
        action_set_,
        model_shap,
        criteria="percentile",
        max_changes=3,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_shap,
        output_file="../results/lgbm/german/mapocam_percentile.csv",
    )
finally:
    # Restore the shared classifier wrapper even if the run fails midway.
    model_shap.use_predict_max = use_predict_max_before

  0%|                                                                                                                                                                                                                        | 0/50 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:23<00:00,  2.17it/s]


### Bruteforce

In [None]:
# Bruteforce baseline on the German sample.
#
# `model_shap` is shared with every other experiment cell, so the flag is
# switched off only while this baseline runs and is put back afterwards.
# Without the restore, the outcome of later cells would depend on whether
# this cell had been executed before them.
use_predict_max_before = model_shap.use_predict_max
model_shap.use_predict_max = False  # to not use prediction max
try:
    # little fix to action set: applied to a deep copy so that `action_set`
    # itself is left untouched for the other methods.
    action_set_ = copy.deepcopy(action_set)
    for feat in action_set_:
        feat.flip_direction = 1
        feat.update_grid()

    method = Bruteforce(
        action_set_,
        model_shap,
        criteria="percentile",
        max_changes=3,
    )

    # Called inside the `try`, so the return value is not echoed as cell
    # output (matching the cells that end with `;`).
    run_experiments(
        method,
        individuals=individuals,
        model=model_shap,
        output_file="../results/lgbm/german/bruteforce_percentile.csv",
    )
finally:
    # Restore the shared classifier wrapper even if the run fails midway.
    model_shap.use_predict_max = use_predict_max_before

### DICE

In [None]:
# DiCE baseline on the German sample, asking for one counterfactual per
# individual. Dice receives `model_shap.clf` rather than the wrapper itself;
# the wrapper is still what `run_experiments` gets as `model`.
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs=1,
    mutable_features=mutable_features,
    continuous_features=continuous_features,
)

# Trailing `;` keeps the return value out of the cell output, as in the
# other experiment cells.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/lgbm/german/dice.csv",
);

### NICE

In [35]:
# NICE baseline on the German sample; it is built from the raw `model`,
# while `run_experiments` receives the `model_shap` wrapper.
method = Nice(X_train, Y_train, model=model, cat_features=categoric_features)

# Results are written to CSV; the trailing `;` hides the call's return value.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/lgbm/german/nice.csv",
);

  0%|                                                                                                                                                                                                                        | 0/50 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 110.87it/s]


### Results

In [45]:
# Summarise the stored German results, one summary frame per method.
dataset = "german"
method_names = ["mapofcem_percentile", "mapocam_percentile", "dice", "nice"]

# The loop variable is `method_name`, not `method`, so the explainer object
# bound to `method` by the experiment cells is not replaced by a string.
# NOTE(review): 0.05 is the third positional argument of `summarize_results`;
# its meaning is not visible in this notebook - confirm against the helper.
summaries = [
    summarize_results(
        pd.read_csv(f"../results/lgbm/{dataset}/{method_name}.csv"),
        dataset,
        0.05,
    ).assign(method=method_name)
    for method_name in method_names
]
results = pd.concat(summaries)

# Every column except the grouping key is reported as a metric; selecting by
# name does not rely on "method" happening to be the last column.
metric_columns = [column for column in results.columns if column != "method"]
format_df_table(results, "method", metric_columns)

Unnamed: 0,method,costs,n_changes,outlier,outliers_score,diversity,n_solutions,time
0,dice,0.52 (+-0.27) | 0.91,1.72 (+-0.497) | 2.0,0.14 (+-0.351) | 1.0,0.502 (+-0.054) | 0.6,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.198 (+-0.038) | 0.21
1,mapocam_percentile,0.065 (+-0.041) | 0.114,2.14 (+-0.881) | 3.0,0.22 (+-0.418) | 1.0,0.521 (+-0.05) | 0.608,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.458 (+-1.258) | 1.462
2,mapofcem_percentile,0.099 (+-0.101) | 0.321,2.08 (+-0.877) | 3.0,0.06 (+-0.24) | 0.55,0.509 (+-0.035) | 0.557,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.355 (+-0.599) | 1.276
3,nice,0.385 (+-0.265) | 0.782,1.42 (+-0.673) | 2.55,0.12 (+-0.328) | 1.0,0.481 (+-0.05) | 0.57,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.005 (+-0.002) | 0.009


## Taiwan

In [46]:
# Load the Taiwan data, model and outlier detector. The classifier is named
# explicitly, as in the German section, so that it matches the `lgbm` results
# directory these experiments write to instead of relying on the helper's
# default.
(
    X_train,
    Y_train,
    model,
    outlier_detection,
    individuals,
) = get_data_model("taiwan", "LGBMClassifier")
# Reproducible sample of 50 individuals to explain.
individuals = individuals.sample(n=50, random_state=SEED)

In [58]:
# Columns excluded from `mutable_features` (and therefore from the action set).
not_mutable_features = [
    "Single",
    "Age_in_25_to_40",
    "Married",
    "Age_lt_25",
    "Age_in_40_to_59",
    "Age_geq_60",
    "EducationLevel",
]
mutable_features = [
    column for column in X_train.columns if column not in not_mutable_features
]

# Columns declared continuous; every remaining column goes to
# `categoric_features`. Each name appears once: the list is passed to Dice,
# and a repeated "MostRecentPaymentAmount" entry has been dropped.
continuous_features = [
    "LIMIT_BAL",
    "BILL_AMT1",
    "BILL_AMT2",
    "BILL_AMT3",
    "BILL_AMT4",
    "BILL_AMT5",
    "BILL_AMT6",
    "PAY_AMT1",
    "PAY_AMT2",
    "PAY_AMT3",
    "PAY_AMT4",
    "PAY_AMT5",
    "PAY_AMT6",
    "MaxBillAmountOverLast6Months",
    "MaxPaymentAmountOverLast6Months",
    "MostRecentBillAmount",
    "MostRecentPaymentAmount",
    "TotalMonthsOverdue",
    "MonthsWithZeroBalanceOverLast6Months",
    "MonthsWithLowSpendingOverLast6Months",
    "MonthsWithHighSpendingOverLast6Months",
    "TotalOverdueCounts",
]
categoric_features = [
    column for column in X_train.columns if column not in continuous_features
]

# Action grid over the mutable features and the SHAP-backed classifier wrapper
# shared by the Taiwan experiment cells below.
action_set = ActionSet(
    X=X_train,
    default_step_size=0.05,
    mutable_features=mutable_features,
)
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)

### MAPOFCEM

In [63]:
# MAPOFCEM on the Taiwan sample: percentile comparison, at most 3 changes,
# with `outlier_percentile` set to 0.01 for this dataset.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    max_changes=3,
    outlier_percentile=0.01,
)

# Results are written to CSV; the trailing `;` hides the call's return value.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/lgbm/taiwan/mapofcem_percentile.csv",
);

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:38<00:00,  4.38s/it]


### MAPOCAM

In [None]:
# MAPOCAM baseline on the Taiwan sample.
#
# `model_shap` is shared with every other experiment cell, so the flag is
# switched off only while this baseline runs and is put back afterwards.
# Without the restore, the outcome of later cells would depend on whether
# this cell had been executed before them.
use_predict_max_before = model_shap.use_predict_max
model_shap.use_predict_max = False  # to not use max prediction
try:
    # little fix to action set: applied to a deep copy so that `action_set`
    # itself is left untouched for the other methods.
    action_set_ = copy.deepcopy(action_set)
    for feat in action_set_:
        feat.flip_direction = 1
        feat.update_grid()

    method = MAPOCAM(
        action_set_,
        model_shap,
        criteria="percentile",
        max_changes=3,
    )

    run_experiments(
        method,
        individuals=individuals,
        model=model_shap,
        output_file="../results/lgbm/taiwan/mapocam_percentile.csv",
    )
finally:
    # Restore the shared classifier wrapper even if the run fails midway.
    model_shap.use_predict_max = use_predict_max_before

### DICE

In [None]:
# DiCE baseline on the Taiwan sample, asking for one counterfactual per
# individual. Dice receives `model_shap.clf` rather than the wrapper itself;
# the wrapper is still what `run_experiments` gets as `model`.
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs=1,
    mutable_features=mutable_features,
    continuous_features=continuous_features,
)

# Trailing `;` keeps the return value out of the cell output, as in the
# other experiment cells.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/lgbm/taiwan/dice.csv",
);

### NICE

In [None]:
# NICE baseline on the Taiwan sample; it is built from the raw `model`,
# while `run_experiments` receives the `model_shap` wrapper.
method = Nice(
    X_train,
    Y_train,
    model=model,
    cat_features=categoric_features,
)

# Trailing `;` keeps the return value out of the cell output, as in the
# other experiment cells.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap,
    output_file="../results/lgbm/taiwan/nice.csv",
);

### Results

In [64]:
# Summarise the stored Taiwan results, one summary frame per method.
dataset = "taiwan"
method_names = ["mapofcem_percentile", "mapocam_percentile", "dice", "nice"]

# The loop variable is `method_name`, not `method`, so the explainer object
# bound to `method` by the experiment cells is not replaced by a string.
# NOTE(review): 0.05 is the third positional argument of `summarize_results`;
# its meaning is not visible in this notebook - confirm against the helper.
summaries = [
    summarize_results(
        pd.read_csv(f"../results/lgbm/{dataset}/{method_name}.csv"),
        dataset,
        0.05,
    ).assign(method=method_name)
    for method_name in method_names
]
results = pd.concat(summaries)

# Every column except the grouping key is reported as a metric; selecting by
# name does not rely on "method" happening to be the last column.
metric_columns = [column for column in results.columns if column != "method"]
format_df_table(results, "method", metric_columns)

Unnamed: 0,method,costs,n_changes,outlier,outliers_score,diversity,n_solutions,time
0,dice,0.608 (+-0.294) | 0.926,1.64 (+-0.485) | 2.0,0.2 (+-0.404) | 1.0,0.504 (+-0.054) | 0.602,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.239 (+-0.009) | 0.252
1,mapocam_percentile,0.054 (+-0.05) | 0.141,2.56 (+-0.76) | 3.0,0.06 (+-0.24) | 0.55,0.454 (+-0.053) | 0.548,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,7.664 (+-21.23) | 49.131
2,mapofcem_percentile,0.065 (+-0.079) | 0.147,2.49 (+-0.794) | 3.0,0.041 (+-0.2) | 0.0,0.45 (+-0.043) | 0.534,0.0 (+-0.0) | 0.0,0.98 (+-0.141) | 1.0,4.374 (+-25.374) | 4.45
3,nice,0.238 (+-0.203) | 0.671,2.22 (+-1.582) | 5.55,0.06 (+-0.24) | 0.55,0.457 (+-0.055) | 0.55,0.0 (+-0.0) | 0.0,1.0 (+-0.0) | 1.0,0.018 (+-0.004) | 0.026
