In [1]:
import json
import os
import copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap
from cfmining.action_set import ActionSet
from cfmining.utils import get_data_model
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice


%load_ext autoreload
%autoreload 2

In [2]:
# Seed passed as `random_state` when sampling the individuals to explain,
# so every run benchmarks the same 50 rows.
SEED = 0

## Helper

In [3]:
def run_experiments(
    method,
    individuals,
    model,
    output_file=None,
):
    """Run a counterfactual method on every individual and record the outcome.

    For each row of ``individuals`` the cache of ``model`` is cleared (when the
    model offers ``clear_cache``), ``method.fit`` is timed on the row's values
    and one record is stored with the keys ``individual``, ``prob``, ``time``
    (seconds), ``n_solutions`` and ``solutions``.

    Parameters
    ----------
    method : object
        Must expose ``fit(values)`` and, after fitting, a sized ``solutions``
        attribute.
    individuals : pandas.DataFrame
        One individual per row; rows are accessed positionally.
    model : object
        Must expose ``predict_proba(values)``; ``clear_cache()`` is optional.
    output_file : str, optional
        CSV path. When given, missing parent folders are created and the file
        is rewritten after every individual so partial results survive an
        interruption.

    Returns
    -------
    pandas.DataFrame or None
        The results table when ``output_file`` is None; otherwise None (the
        table is only written to disk).
    """
    if output_file is not None:
        # A bare file name has no parent folder; os.makedirs("") would raise.
        folder = os.path.dirname(output_file)
        if folder:
            os.makedirs(folder, exist_ok=True)

    results = []
    for i in tqdm(range(len(individuals))):
        values = individuals.iloc[i].values

        # Best effort: models without a cache simply have no clear_cache().
        # Only that case is ignored, so interrupts and real errors still surface.
        try:
            model.clear_cache()
        except AttributeError:
            pass

        # perf_counter is monotonic, unlike the wall clock, so the measured
        # duration cannot be distorted by system clock adjustments.
        start = time.perf_counter()
        method.fit(values)
        elapsed = time.perf_counter() - start

        solutions = method.solutions
        results.append({
            "individual": values.tolist(),
            "prob": model.predict_proba(values),
            "time": elapsed,
            "n_solutions": len(solutions),
            "solutions": solutions,
        })

        # Checkpoint after every individual.
        if output_file is not None:
            pd.DataFrame(results).to_csv(output_file, index=False)

    results = pd.DataFrame(results)
    if output_file is None:
        return results

    # Final write; also produces the file when `individuals` is empty.
    results.to_csv(output_file, index=False)
    return None

## German

In [4]:
# Fetch everything needed for the German experiments with the "LGBMClassifier"
# option; the unpacked names reflect what get_data_model hands back.
(
    X_train,
    Y_train,
    model,
    outlier_detection,
    individuals,
) = get_data_model("german", "LGBMClassifier")

# Benchmark on a reproducible sample of 50 individuals.
individuals = individuals.sample(n=50, random_state=SEED)

In [5]:
# Columns excluded from the mutable set; every other column of X_train is mutable.
not_mutable_features = [
    "Age",
    "OwnsHouse",
    "isMale",
    "JobClassIsSkilled",
    "Single",
    "ForeignWorker",
    "RentsHouse",
]
mutable_features = [
    column for column in X_train.columns if column not in not_mutable_features
]

# Columns listed as continuous; all remaining columns are treated as categoric.
continuous_features = [
    "Age",
    "LoanDuration",
    "LoanAmount",
    "LoanRateAsPercentOfIncome",
    "YearsAtCurrentHome",
]
categoric_features = [
    column for column in X_train.columns if column not in continuous_features
]

# Action set built from the training data, restricted to the mutable columns.
action_set = ActionSet(
    X=X_train,
    default_step_size=0.05,
    mutable_features=mutable_features,
)

# Classifier wrapper combining the model with the dataset's outlier detector.
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)
class FakeOutlierDetection:
    """Stand-in outlier detector whose ``predict`` always answers ``[1]``.

    Presumably 1 marks an inlier (scikit-learn convention), which would make
    the outlier check a no-op -- confirm against the classifier wrapper.
    """

    def predict(self, X):
        """Ignore ``X`` and return the one-element list ``[1]``."""
        constant_label = 1
        return [constant_label]
# Same wrapper configuration as model_shap, but with the constant
# FakeOutlierDetection in place of the real outlier detector.
model_shap_no_outlier = GeneralClassifier_Shap(
    model,
    FakeOutlierDetection(),
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)

### MAPOFCEM

In [None]:
# MAPOFCEM on the outlier-aware wrapper, "percentile" comparison, max_changes=3;
# all other options keep their library defaults.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    max_changes=3,
)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/german/mapofcem_percentile.csv",
)

In [None]:
# MAPOFCEM variant with estimate_outlier switched on; results are stored
# under the "v2" file name.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    estimate_outlier=True,
    max_changes=3,
)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/german/mapofcem_v2_percentile.csv",
)

In [None]:
# MAPOFCEM ablation without outlier filtering: the classifier wrapper uses the
# constant FakeOutlierDetection, outlier estimation is off and
# outlier_percentile is 0.0.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap_no_outlier,
    compare="percentile",
    estimate_outlier=False,
    max_changes=3,
    outlier_percentile=0.0,
)

# run_experiments calls clear_cache() and predict_proba() on the `model` it is
# given, so it must receive the same wrapper the method searches with.
# Passing model_shap here would reset a different object and leave any cache
# held by model_shap_no_outlier untouched between individuals.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap_no_outlier,
    output_file="../results/lgbm/german/mapofcem_no_outlier_percentile.csv",
)

### MAPOCAM

In [9]:
# MAPOCAM should not use the max-prediction shortcut. NOTE: this sets a flag
# on the shared model_shap object, so it stays off for any cell run afterwards
# until model_shap is rebuilt.
model_shap.use_predict_max = False

# Adjust a deep copy so the original action_set is left untouched: every
# feature gets flip_direction = 1, followed by a call to update_grid().
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

method = MAPOCAM(action_set_, model_shap, criteria="percentile", max_changes=3)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/german/mapocam_percentile.csv",
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:22<00:00,  2.17it/s]


### Bruteforce

In [None]:
# Bruteforce should not use the max-prediction shortcut. NOTE: this sets a flag
# on the shared model_shap object, so it stays off for any cell run afterwards
# until model_shap is rebuilt.
model_shap.use_predict_max = False

# Adjust a deep copy so the original action_set is left untouched: every
# feature gets flip_direction = 1, followed by a call to update_grid().
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

method = Bruteforce(action_set_, model_shap, criteria="percentile", max_changes=3)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/german/bruteforce_percentile.csv",
)

### DICE

In [None]:
# DiCE baseline: receives the classifier stored in model_shap.clf rather than
# the wrapper itself, and is asked for a single counterfactual (n_cfs=1).
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs=1,
    mutable_features=mutable_features,
)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/german/dice_1sol.csv",
)

### NICE

In [12]:
# NICE baseline: built on the raw model and told which columns are categoric.
method = Nice(X_train, Y_train, model=model, cat_features=categoric_features)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/german/nice.csv",
)

  0%|                                                                                                                  | 0/50 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 115.67it/s]


## Taiwan

In [10]:
# Fetch everything needed for the Taiwan experiments (no model name is passed
# here, unlike the German section).
(
    X_train,
    Y_train,
    model,
    outlier_detection,
    individuals,
) = get_data_model("taiwan")

# Cast all columns to int so that bool columns become 0/1.
X_train = X_train.astype(int)

# Same cast for the individuals, then keep a reproducible sample of 50 rows.
individuals = individuals.astype(int).sample(n=50, random_state=SEED)

In [11]:
# Columns excluded from the mutable set; every other column of X_train is mutable.
not_mutable_features = [
    "Single",
    "Age_in_25_to_40",
    "Married",
    "Age_lt_25",
    "Age_in_40_to_59",
    "Age_geq_60",
    "EducationLevel",
]
mutable_features = [
    column for column in X_train.columns if column not in not_mutable_features
]

# Columns listed as continuous; all remaining columns are treated as categoric.
# NOTE(review): "MostRecentPaymentAmount" appears twice. That is harmless for
# the membership test below, but check whether another column was intended.
continuous_features = [
    "LIMIT_BAL",
    "BILL_AMT1",
    "BILL_AMT2",
    "BILL_AMT3",
    "BILL_AMT4",
    "BILL_AMT5",
    "BILL_AMT6",
    "PAY_AMT1",
    "PAY_AMT2",
    "PAY_AMT3",
    "PAY_AMT4",
    "PAY_AMT5",
    "PAY_AMT6",
    "MaxBillAmountOverLast6Months",
    "MaxPaymentAmountOverLast6Months",
    "MostRecentBillAmount",
    "MostRecentPaymentAmount",
    "MostRecentPaymentAmount",
    "TotalMonthsOverdue",
]
categoric_features = [
    column for column in X_train.columns if column not in continuous_features
]

# Action set built from the training data, restricted to the mutable columns.
action_set = ActionSet(
    X=X_train,
    default_step_size=0.1,
    mutable_features=mutable_features,
)

# Classifier wrapper combining the model with the dataset's outlier detector.
model_shap = GeneralClassifier_Shap(
    model,
    outlier_detection,
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)
class FakeOutlierDetection:
    """Stand-in outlier detector whose ``predict`` always answers ``[1]``.

    Presumably 1 marks an inlier (scikit-learn convention), which would make
    the outlier check a no-op -- confirm against the classifier wrapper.
    """

    def predict(self, X):
        """Ignore ``X`` and return the one-element list ``[1]``."""
        constant_label = 1
        return [constant_label]
# Same wrapper configuration as model_shap, but with the constant
# FakeOutlierDetection in place of the real outlier detector.
model_shap_no_outlier = GeneralClassifier_Shap(
    model,
    FakeOutlierDetection(),
    X_train,
    shap_explainer="tree",
    threshold=0.5,
)

### MAPOFCEM

In [None]:
# MAPOFCEM on the outlier-aware wrapper with outlier_percentile=0.01 and
# estimate_prob_max turned off.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    max_changes=3,
    outlier_percentile=0.01,
    estimate_prob_max=False,
)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/taiwan/mapofcem_percentile.csv",
)

In [None]:
# MAPOFCEM variant with estimate_outlier switched on (outlier_percentile=0.01);
# results are stored under the "v2" file name.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    max_changes=3,
    outlier_percentile=0.01,
    estimate_outlier=True,
)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/taiwan/mapofcem_v2_percentile.csv",
)

In [None]:
# MAPOFCEM ablation without outlier filtering: the classifier wrapper uses the
# constant FakeOutlierDetection, outlier estimation is off,
# outlier_percentile is 0.0 and estimate_prob_max is off.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap_no_outlier,
    compare="percentile",
    estimate_outlier=False,
    max_changes=3,
    outlier_percentile=0.0,
    estimate_prob_max=False,
)

# run_experiments calls clear_cache() and predict_proba() on the `model` it is
# given, so it must receive the same wrapper the method searches with.
# Passing model_shap here would reset a different object and leave any cache
# held by model_shap_no_outlier untouched between individuals.
run_experiments(
    method,
    individuals=individuals,
    model=model_shap_no_outlier,
    output_file="../results/lgbm/taiwan/mapofcem_no_outlier_percentile.csv",
)

### MAPOCAM

In [17]:
# MAPOCAM should not use the max-prediction shortcut. NOTE: this sets a flag
# on the shared model_shap object, so it stays off for any cell run afterwards
# until model_shap is rebuilt.
model_shap.use_predict_max = False

# Adjust a deep copy so the original action_set is left untouched: every
# feature gets flip_direction = 1, followed by a call to update_grid().
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

method = MAPOCAM(action_set_, model_shap, criteria="percentile", max_changes=3)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/taiwan/mapocam_percentile.csv",
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:40<00:00,  2.01s/it]


### DICE

In [None]:
# DiCE baseline: receives the classifier stored in model_shap.clf rather than
# the wrapper itself, and is asked for a single counterfactual (n_cfs=1).
method = Dice(
    X_train,
    Y_train,
    model_shap.clf,
    n_cfs=1,
    mutable_features=mutable_features,
)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/taiwan/dice_1sol.csv",
)

### NICE

In [19]:
# NICE baseline: built on the raw model and told which columns are categoric.
method = Nice(X_train, Y_train, model=model, cat_features=categoric_features)

run_experiments(
    method, individuals=individuals, model=model_shap,
    output_file="../results/lgbm/taiwan/nice.csv",
)

  0%|                                                                                                                  | 0/50 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 45.45it/s]
