In [1]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import joblib
import time
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import GeneralClassifier_Shap
from cfmining.action_set import ActionSet

import dice_ml


%load_ext autoreload
%autoreload 2

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
def get_mapofcem_results(
        mapofcem_setter,
        individuals, 
        model, 
        outlier_detection, 
        output_file = None,
    ):
    """Run MAPOFCEM once per individual and collect one result row for each.

    Parameters
    ----------
    mapofcem_setter : callable
        Called as ``mapofcem_setter(individual_values, model)``; must return an
        object exposing ``fit()`` and a ``solutions`` attribute.
    individuals : pandas.DataFrame
        One row per individual to explain.
    model : object
        Passed through to ``mapofcem_setter`` and also queried with
        ``predict_proba(individual_values)`` for the "prob" column.
    outlier_detection : object
        Must expose ``predict``; a solution is counted as an outlier when the
        prediction equals ``-1``.
    output_file : optional
        When not ``None`` the result table is also written there with
        ``DataFrame.to_csv(output_file, index=False)``.

    Returns
    -------
    pandas.DataFrame
        Columns: "individual", "prob", "time", "n_solutions", "solutions",
        "outlier". The table is returned whether or not it was written to
        ``output_file``.
    """
    results = []

    for i in tqdm(range(len(individuals))):
        individual = individuals.iloc[i]

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments during a long run.
        start = time.perf_counter()
        mapofcem = mapofcem_setter(individual.values, model)
        mapofcem.fit()
        end = time.perf_counter()

        # Read the solutions once and reuse them for every derived column.
        solutions = mapofcem.solutions

        # Each predict call receives a single-row 2-D array; summing the
        # boolean results counts how many solutions were flagged (-1).
        # With no solutions the sum is the plain integer 0.
        is_outlier = sum(
            [outlier_detection.predict(np.array(solution)[None, :]) == -1 for solution in solutions],
            0,
        )

        results.append({
            "individual" : individual.values,
            "prob" : model.predict_proba(individual.values),
            "time" : end - start,
            "n_solutions" : len(solutions),
            "solutions" : solutions,
            "outlier" : is_outlier,
        })

    results = pd.DataFrame(results)
    if output_file is not None:
        results.to_csv(output_file, index=False)
    # Always hand the table back so callers can persist and inspect in one call.
    return results

In [3]:
def get_dice_results(
    dice_model,
    dice_data,
    individuals,
    mutable_features,
    outlier_detection,
    total_CFs,
    sparsity_weight = 0.2,
    output_file = None,
):
    """Generate DiCE counterfactuals per individual and collect one row for each.

    Parameters
    ----------
    dice_model, dice_data
        Passed to ``dice_ml.Dice(dice_data, dice_model)``. ``dice_model.model``
        must expose ``predict_proba``; column 1 of its first row is stored as
        "prob".
    individuals : pandas.DataFrame
        One row per individual to explain.
    mutable_features : list
        Forwarded as ``features_to_vary``.
    outlier_detection : object
        Must expose ``predict``; a solution is counted as an outlier when the
        prediction equals ``-1``.
    total_CFs
        Forwarded to ``generate_counterfactuals``.
    sparsity_weight : float
        Forwarded to ``generate_counterfactuals``.
    output_file : optional
        When not ``None`` the result table is also written there with
        ``DataFrame.to_csv(output_file, index=False)``.

    Returns
    -------
    pandas.DataFrame
        Columns: "individual", "prob", "time", "n_solutions", "solutions",
        "outlier". "individual" holds the values of a one-row frame, i.e. a
        2-D array with a single row. The table is returned whether or not it
        was written to ``output_file``.
    """
    exp = dice_ml.Dice(dice_data, dice_model)
    results = []
    
    for i in tqdm(range(len(individuals))):
        # Double brackets keep a one-row DataFrame instead of a Series.
        individual = individuals.iloc[[i]]

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments during a long run.
        start = time.perf_counter()
        dice_exp = exp.generate_counterfactuals(
            individual,
            total_CFs = total_CFs,
            desired_class = "opposite",
            sparsity_weight = sparsity_weight,
            features_to_vary=mutable_features,
        )
        end = time.perf_counter()

        # Convert the DiCE output to a plain list of feature vectors.
        # NOTE(review): the entry is presumably null when DiCE finds no
        # counterfactual for this individual - confirm against dice_ml;
        # treating it as "no solutions" keeps the remaining rows usable.
        solutions = json.loads(dice_exp.to_json())["cfs_list"][0] or []

        # Drop the last element of every solution
        # (presumably the outcome column - verify against dice_data).
        solutions = [solution[:-1] for solution in solutions]

        # Summing the boolean predictions counts the flagged (-1) solutions;
        # with no solutions the sum is the plain integer 0.
        is_outlier = sum(
            [outlier_detection.predict(np.array(solution)[None, :]) == -1 for solution in solutions],
            0,
        )

        results.append({
            "individual" : individual.values,
            "prob" : dice_model.model.predict_proba(individual.values)[0, 1],
            "time" : end - start,
            "n_solutions" : len(solutions),
            "solutions" : solutions,
            "outlier" : is_outlier,
        })
    results = pd.DataFrame(results)
    if output_file is not None:
        results.to_csv(output_file, index=False)
    # Always hand the table back so callers can persist and inspect in one call.
    return results

In [4]:
def mapofcem_wrapper(
        action_set,
        criteria,
        criteria_param,
        outlier_detection,
        estimate_outlier,
        total_CFs,
        max_changes
):  
    """Build a factory that creates a configured MAPOFCEM for one individual.

    Parameters
    ----------
    action_set, outlier_detection, estimate_outlier, total_CFs, max_changes
        Forwarded unchanged to ``MAPOFCEM``.
    criteria : str
        One of "Percentile", "NonDom" or "PercentileChanges"; selects which
        criterion class is used as the ``compare`` argument.
    criteria_param
        Argument handed to the selected criterion class.

    Returns
    -------
    callable
        ``f_(ind, model)`` returning a new ``MAPOFCEM`` instance.

    Raises
    ------
    ValueError
        If ``criteria`` is not one of the supported names. Raised when the
        wrapper is built, instead of failing with an unbound local variable
        on the first individual.
    """
    valid_criteria = ("Percentile", "NonDom", "PercentileChanges")
    if criteria not in valid_criteria:
        raise ValueError(
            f"Unknown criteria {criteria!r}; expected one of {valid_criteria}"
        )

    def f_(ind, model):
        # "NonDom" is the only criterion built without the individual.
        if criteria == "Percentile":
            compare = PercentileCriterion(ind, criteria_param)
        elif criteria == "NonDom":
            compare = NonDomCriterion(criteria_param)
        else:  # "PercentileChanges" (guaranteed by the validation above)
            compare = PercentileChangesCriterion(ind, criteria_param)

        return MAPOFCEM(
            action_set,
            ind,
            model,
            outlier_detection=outlier_detection,
            estimate_outlier=estimate_outlier,
            total_CFs=total_CFs,
            max_changes=max_changes,
            compare = compare
        )
    return f_

## German

In [5]:
# Split configuration: 30% of the data is held out for testing, then 1/7 of
# the remaining 70% (10% of the full data) becomes the validation set.
VAL_RATIO = 1/7
TEST_RATIO = 0.3
SEED = 0

# Load the German credit data and separate the target column.
df = pd.read_csv("../data/german.csv")
Y = df["GoodCustomer"]
X = df.drop(columns="GoodCustomer")

# Two-stage split with the same seed: (train + val) / test, then train / val.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_RATIO,
    random_state=SEED,
    shuffle=True,
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=VAL_RATIO,
    random_state=SEED,
    shuffle=True,
)

# Pre-trained classifier and outlier detector.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
model = joblib.load("../models/german/LGBMClassifier.pkl")
isolation_tree = joblib.load("../models/german/IsolationForest.pkl")

# Keep only the test individuals the model predicts as class 0.
denied_individ = model.predict(X_test) == 0
individuals = X_test.loc[denied_individ].reset_index(drop=True)

In [6]:
# Action set built from the training data; it defines the per-feature grids.
action_set = ActionSet(X = X_train)

# Features that must never be changed by a counterfactual.
not_mutable_features = ['Age', 'OwnsHouse', 'isMale', 'JobClassIsSkilled', 'Single', 'ForeignWorker', 'RentsHouse']
mutable_features = [
    column
    for column in X_train.columns
    if column not in not_mutable_features
]
continuous_features = ['Age', 'LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome','YearsAtCurrentHome']

for feat in action_set:
    # A feature is mutable exactly when it is not in the frozen list.
    feat.mutable = feat.name not in not_mutable_features
    feat.step_direction = 0
    feat.update_grid()

In [7]:
# Additional features frozen on top of `not_mutable_features` in the
# experiment loops. Those loops take a prefix of this list
# (`extra_not_mutable_features[:n]` with n in 0, 4, 8, 12), so the order of
# the entries determines which features are frozen at each step.
extra_not_mutable_features = [
    'LoanRateAsPercentOfIncome', 
    'NumberOfOtherLoansAtBank', 
    'NumberOfLiableIndividuals', 
    'HasTelephone',
    'CheckingAccountBalance_geq_0', 
    'CheckingAccountBalance_geq_200',
    'SavingsAccountBalance_geq_100', 
    'SavingsAccountBalance_geq_500',
    'MissedPayments', 
    'CriticalAccountOrLoansElsewhere', 
    'HasCoapplicant', 
    'HasGuarantor',
]

## mapofcem

In [8]:
# Percentile calculator built on the configured action set; it is passed as
# `criteria_param` to the MAPOFCEM criteria in the experiment cells.
percCalc = PercentileCalculator(action_set=action_set)

# Classifier wrapper handed to MAPOFCEM as the model (tree=True is passed
# through to GeneralClassifier_Shap).
model_shap = GeneralClassifier_Shap(
    model,
    X_train,
    tree=True,
)

Experiment to find the best solution

In [None]:
# Run MAPOFCEM with the "Percentile" criterion while freezing progressively
# more features; one CSV is written per setting.
for i, n_extra_not_mutable in enumerate([0, 4, 8, 12]):
    # Frozen features = base list + the first n extra ones.
    extra_not_mut = extra_not_mutable_features[:n_extra_not_mutable]
    not_mut = not_mutable_features + extra_not_mut
    mut = [feat for feat in X_train.columns if feat not in not_mut]

    # Re-configure the shared action set for this setting.
    for feat in action_set:
        feat.mutable = feat.name not in not_mut
        feat.step_direction = 0
        feat.update_grid()

    mapofcem_setter = mapofcem_wrapper(
        action_set=action_set,
        criteria="Percentile",
        criteria_param=percCalc,
        outlier_detection=isolation_tree,
        estimate_outlier=True,
        total_CFs=np.inf,
        max_changes=4,
    )

    get_mapofcem_results(
        mapofcem_setter=mapofcem_setter,
        individuals=individuals,
        model=model_shap,
        outlier_detection=isolation_tree,
        output_file=f"../results/german/mapofcem_best_sol_{i}.csv",
    )

Experiment to find solutions on the Pareto front

In [None]:
# Run MAPOFCEM with the "PercentileChanges" criterion; one CSV is written per
# setting. Only the first setting is currently enabled.
for i, n_extra_not_mutable in enumerate([0]):  # , 4, 8, 12]):
    # Frozen features = base list + the first n extra ones.
    extra_not_mut = extra_not_mutable_features[:n_extra_not_mutable]
    not_mut = not_mutable_features + extra_not_mut
    mut = [feat for feat in X_train.columns if feat not in not_mut]

    # Re-configure the shared action set for this setting.
    for feat in action_set:
        feat.mutable = feat.name not in not_mut
        feat.step_direction = 0
        feat.update_grid()

    mapofcem_setter = mapofcem_wrapper(
        action_set=action_set,
        criteria="PercentileChanges",
        criteria_param=percCalc,
        outlier_detection=isolation_tree,
        estimate_outlier=True,
        total_CFs=np.inf,
        max_changes=4,
    )

    get_mapofcem_results(
        mapofcem_setter=mapofcem_setter,
        individuals=individuals,
        model=model_shap,
        outlier_detection=isolation_tree,
        output_file=f"../results/german/mapofcem_pareto_{i}.csv",
    )

## dice

In [7]:
# Wrap the trained classifier for DiCE.
dice_model = dice_ml.Model(model=model, backend="sklearn", model_type="classifier")

# DiCE expects features and outcome in one frame; add the labels as "target"
# on a copy so X_train itself is left untouched.
X_train_extended = X_train.assign(target=Y_train)

# Every feature is declared continuous here; the narrower
# `continuous_features` list is intentionally not used.
dice_data = dice_ml.Data(
    dataframe=X_train_extended,
    continuous_features=list(X_train.columns),  # continuous_features,
    outcome_name="target",
)


experiment to find the best solution

In [29]:
# Run DiCE (one counterfactual per individual) while freezing progressively
# more features; one CSV is written per setting.
for i, n_extra_not_mutable in enumerate([0, 4, 8, 12]):
    # Frozen features = base list + the first n extra ones.
    extra_not_mut = extra_not_mutable_features[:n_extra_not_mutable]
    not_mut = not_mutable_features + extra_not_mut

    # DiCE is told which features it may vary, i.e. everything not frozen.
    mut = [feat for feat in X_train.columns if feat not in not_mut]

    get_dice_results(
        dice_model=dice_model,
        dice_data=dice_data,
        individuals=individuals,
        mutable_features=mut,
        outlier_detection=isolation_tree,
        total_CFs=1,
        sparsity_weight=0.2,
        output_file=f"../results/german/dice_best_sol_{i}.csv",
    )

  0%|          | 0/78 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  6.62it/s]
100%|██████████| 1/1 [00:00<00:00, 11.46it/s]]
100%|██████████| 1/1 [00:00<00:00, 12.40it/s]
100%|██████████| 1/1 [00:00<00:00, 10.17it/s]]
100%|██████████| 1/1 [00:00<00:00,  8.73it/s]]
100%|██████████| 1/1 [00:00<00:00,  8.51it/s]]
100%|██████████| 1/1 [00:00<00:00, 10.59it/s]]
100%|██████████| 1/1 [00:00<00:00, 10.50it/s]]
100%|██████████| 1/1 [00:00<00:00,  9.38it/s]]
100%|██████████| 1/1 [00:00<00:00, 11.14it/s]]
100%|██████████| 1/1 [00:00<00:00, 11.91it/s]
100%|██████████| 1/1 [00:00<00:00,  8.92it/s]s]
100%|██████████| 1/1 [00:00<00:00,  8.72it/s]s]
100%|██████████| 1/1 [00:00<00:00, 11.18it/s]s]
100%|██████████| 1/1 [00:00<00:00, 11.55it/s]s]
100%|██████████| 1/1 [00:00<00:00, 11.36it/s]s]
100%|██████████| 1/1 [00:00<00:00, 11.71it/s]s]
100%|██████████| 1/1 [00:00<00:00, 11.24it/s]
100%|██████████| 1/1 [00:00<00:00, 11.62it/s]s]
100%|██████████| 1/1 [00:00<00:00, 11.95it/s]
100%|██████████| 1/1 [00:00<00:00, 11.37it/s]s]
100%|█████