In [43]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import joblib
import time
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import GeneralClassifier_Shap
from cfmining.action_set import ActionSet

import dice_ml


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
def get_mapofcem_results(action_set, individuals, model, outlier_detection, criteria_type = "Percentile", criteria = None, percCalc = None):
    results = []

    for i in tqdm(range(len(individuals))):
        individual = individuals.iloc[i]
        start = time.time()
        if criteria_type == "Percentile":
            criteria = PercentileChangesCriterion(individual, percCalc)
        
        start = time.time()
        mapofcem = MAPOFCEM(
            action_set, 
            individual.values, 
            model, 
            outlier_detection=outlier_detection,
            estimate_outlier=True,
            total_CFs = 3,
            max_changes= 3, 
            compare = criteria,
        )
        mapofcem.fit()
        end = time.time()

        solutions = mapofcem.solutions
        if len(solutions) > 0:
            costs = [criteria.f(solution) for solution in solutions]
        else:
            costs = [np.inf]

        results.append({
            "individual" : individual.values,
            "prob" : model.predict_proba(individual.values),
            "time" : end - start,
            "n_solutions" : len(mapofcem.solutions),
            "solutions" : mapofcem.solutions,
            "costs" : costs,
            "min_cost" : np.min(costs),
            "mean_cost" : np.mean(costs),
        })

    results = pd.DataFrame(results)
    return results

In [76]:
def get_dice_results(
    dice_model,
    dice_data,
    individuals,
    mutable_features,
    outlier_detection,
    criteria_type = "Percentile",
    criteria = None,
    percCalc = None,
):
    exp = dice_ml.Dice(dice_data, dice_model)
    results = []
    
    for i in tqdm(range(len(individuals))):
        individual = individuals.iloc[[i]]
        start = time.time()
        if criteria_type == "Percentile":
            criteria = PercentileChangesCriterion(individual.iloc[0], percCalc)
        
        start = time.time()
        dice_exp = exp.generate_counterfactuals(
            individual,
            total_CFs = 3,
            desired_class = "opposite",
        )
        end = time.time()

        # convert dice solutions to an adequated format
        solutions = json.loads(dice_exp.to_json())["cfs_list"][0]

        # remove last element of all solutions
        solutions = [solution[:-1] for solution in solutions]

        if len(solutions) > 0:
            costs = [criteria.f(solution) for solution in solutions]
        else:
            costs = [np.inf]

        results.append({
            "individual" : individual.values,
            "prob" : dice_model.model.predict_proba(individual.values)[0, 1],
            "time" : end - start,
            "n_solutions" : len(solutions),
            "solutions" : solutions,
            "cost" : costs,
            "min_cost" : np.min(costs),
            "mean_cost" : np.mean(costs),
        })
    results = pd.DataFrame(results)
    return results

## German

In [48]:
VAL_RATIO = 1/7
TEST_RATIO = 0.3
SEED = 0

df = pd.read_csv("../data/german.csv")
X = df.drop("GoodCustomer", axis=1)
Y = df["GoodCustomer"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_RATIO, random_state=SEED, shuffle=True)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=VAL_RATIO, random_state=SEED, shuffle=True)

model = joblib.load("../models/german/LGBMClassifier.pkl")
isolation_tree = joblib.load("../models/german/IsolationForest.pkl")

denied_individ = model.predict(X_test) == 0
individuals = X_test.iloc[denied_individ].reset_index(drop = True)

In [49]:
action_set = ActionSet(X = X_train)
not_mutable_features = ['Age', 'JobClassIsSkilled', 'OwnsHouse', 'isMale', 'JobClassIsSkilled', 'Single', 'ForeignWorker', 'RentsHouse']
mutable_features = [feat for feat in X_train.columns if feat not in not_mutable_features]
continuous_features = ['Age', 'LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome','YearsAtCurrentHome']
for feat in action_set:
    if feat.name in not_mutable_features:
        feat.mutable = False
    if not feat.name in not_mutable_features:
        feat.mutable = True

    feat.step_direction = 0
    feat.update_grid()

## mapofcem

In [50]:
percCalc = PercentileCalculator(action_set = action_set)
model_shap = GeneralClassifier_Shap(model, X_train, ['ForeignWorker', 'Single'], tree = True)

In [51]:
results_mapofcem = get_mapofcem_results(
    action_set, 
    individuals, 
    model_shap, 
    isolation_tree, 
    criteria_type = "Percentile", 
    criteria = None, 
    percCalc = percCalc
)

  0%|          | 0/66 [00:00<?, ?it/s]

  5%|▍         | 3/66 [00:04<01:26,  1.38s/it]

Stoped due to maximum CFs


  9%|▉         | 6/66 [03:38<50:30, 50.51s/it]  

Stoped due to maximum CFs


 11%|█         | 7/66 [04:03<41:33, 42.26s/it]

Stoped due to maximum CFs


 29%|██▉       | 19/66 [08:18<14:56, 19.07s/it]

Stoped due to maximum CFs


 30%|███       | 20/66 [08:48<17:06, 22.33s/it]

Stoped due to maximum CFs


 32%|███▏      | 21/66 [08:50<12:06, 16.14s/it]

Stoped due to maximum CFs


 44%|████▍     | 29/66 [15:19<35:39, 57.83s/it]   

Stoped due to maximum CFs


 48%|████▊     | 32/66 [15:21<11:26, 20.19s/it]

Stoped due to maximum CFs


 53%|█████▎    | 35/66 [16:05<08:04, 15.61s/it]

Stoped due to maximum CFs


 56%|█████▌    | 37/66 [16:07<03:58,  8.24s/it]

Stoped due to maximum CFs


 65%|██████▌   | 43/66 [17:18<06:26, 16.81s/it]

Stoped due to maximum CFs


 70%|██████▉   | 46/66 [17:30<02:33,  7.69s/it]

Stoped due to maximum CFs


 71%|███████   | 47/66 [19:06<10:50, 34.23s/it]

Stoped due to maximum CFs


 79%|███████▉  | 52/66 [19:29<02:01,  8.68s/it]

Stoped due to maximum CFs


 86%|████████▋ | 57/66 [22:07<02:07, 14.17s/it]

Stoped due to maximum CFs


 92%|█████████▏| 61/66 [25:49<04:06, 49.37s/it]

Stoped due to maximum CFs


100%|██████████| 66/66 [25:54<00:00, 23.55s/it]


In [67]:
results_mapofcem[["prob", "time", "n_solutions", "min_cost", "mean_cost"]].mean()

prob            0.319699
time           23.548833
n_solutions     1.666667
min_cost        0.052203
mean_cost       0.758347
dtype: float64

In [22]:
r_ = results_mapofcem[["time", "n_solutions", "cost"]] 
r_mean = r_.mean().to_frame().T,
r_std = r_.std().to_frame().T
r_mean, r_std

((       time  n_solutions      cost
  0  0.957373          2.4  0.352023,),
        time  n_solutions      cost
 0  0.743886     0.894427  0.368422)

## dice

In [54]:
dice_model = dice_ml.Model(
    model = model,
    backend = "sklearn",
    model_type = "classifier"
)
X_train_extended = X_train.copy()
X_train_extended["target"] = Y_train
dice_data = dice_ml.Data(
    dataframe = X_train_extended,
    continuous_features = X_train.columns.tolist(),#continuous_features,
    outcome_name = "target",
)


In [77]:
results_dice = get_dice_results(
    dice_model,
    dice_data,
    individuals,
    mutable_features,
    outlier_detection = isolation_tree,
    criteria_type = "Percentile",
    criteria = None,
    percCalc = percCalc,
)


  0%|          | 0/66 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  9.64it/s]
100%|██████████| 1/1 [00:00<00:00, 10.92it/s]]
100%|██████████| 1/1 [00:00<00:00, 10.76it/s]]
100%|██████████| 1/1 [00:00<00:00, 10.55it/s]]
100%|██████████| 1/1 [00:00<00:00, 11.65it/s]]
100%|██████████| 1/1 [00:00<00:00, 12.33it/s]
100%|██████████| 1/1 [00:00<00:00,  9.75it/s]]
100%|██████████| 1/1 [00:00<00:00, 10.52it/s]]
100%|██████████| 1/1 [00:00<00:00,  9.05it/s]]
100%|██████████| 1/1 [00:00<00:00, 10.40it/s]]
100%|██████████| 1/1 [00:00<00:00,  4.80it/s]s]
100%|██████████| 1/1 [00:00<00:00, 10.98it/s]s]
100%|██████████| 1/1 [00:00<00:00, 12.11it/s]s]
100%|██████████| 1/1 [00:00<00:00, 13.64it/s]
100%|██████████| 1/1 [00:00<00:00, 13.51it/s]s]
100%|██████████| 1/1 [00:00<00:00, 14.02it/s]
100%|██████████| 1/1 [00:00<00:00, 10.84it/s]s]
100%|██████████| 1/1 [00:00<00:00, 12.73it/s]s]
100%|██████████| 1/1 [00:00<00:00, 13.36it/s]
100%|██████████| 1/1 [00:00<00:00, 12.40it/s]s]
100%|██████████| 1/1 [00:00<00:00, 13.36it/s]
100%|███████

In [78]:
results_dice[["prob", "time", "n_solutions", "min_cost", "mean_cost"]].mean()

prob           0.319699
time           0.108277
n_solutions    3.000000
min_cost       0.265085
mean_cost      1.127524
dtype: float64

In [51]:
r_ = results_dice[["time", "n_solutions", "cost"]] 
r_mean = r_.mean().to_frame().T,
r_std = r_.std().to_frame().T
r_mean, r_std

((       time  n_solutions      cost
  0  0.108025          3.0  0.430318,),
        time  n_solutions      cost
 0  0.070547          0.0  0.300087)