In [1]:
import json
import os
import copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM, BruteForce, MAPOCAM
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import GeneralClassifier_Shap, GeneralClassifier
from cfmining.action_set import ActionSet
from cfmining.utils import get_data_model

import dice_ml
from nice import NICE


%load_ext autoreload
%autoreload 2

In [2]:
# Random seed used when sub-sampling the individuals to explain, so every
# run of the notebook draws the same rows.
SEED = 0

## Helper

In [3]:
class DiCE_wrapper:
    """Adapter exposing DiCE through the ``fit`` / ``solutions`` interface
    that ``get_dice_results`` expects.

    Parameters
    ----------
    dice_data, dice_model
        Forwarded unchanged to ``dice_ml.Dice``.
    total_CFs : int
        Number of counterfactuals requested per individual.
    mutable_features : list
        Passed to DiCE as ``features_to_vary``.
    sparsity_weight : float, default 0.2
        Passed to DiCE as ``sparsity_weight``.
    """

    def __init__(self, dice_data, dice_model, total_CFs, mutable_features, sparsity_weight = 0.2):
        self.total_CFs = total_CFs
        self.sparsity_weight = sparsity_weight
        self.mutable_features = mutable_features
        self.exp = dice_ml.Dice(dice_data, dice_model)

    def fit(self, individual):
        """Generate counterfactuals for ``individual`` and store them.

        After the call, ``self.solutions`` holds one list per counterfactual
        with its last entry removed (presumably the outcome column DiCE
        appends -- TODO confirm). It is an empty list when DiCE reports no
        counterfactual for the query.

        Returns
        -------
        self
        """
        dice_exp = self.exp.generate_counterfactuals(
            individual,
            total_CFs=self.total_CFs,
            desired_class="opposite",
            sparsity_weight=self.sparsity_weight,
            features_to_vary=self.mutable_features,
        )
        cfs_list = json.loads(dice_exp.to_json())["cfs_list"]
        # Only one query instance is sent, so only the first entry matters.
        # Guard against an empty list or a null entry (no counterfactual
        # serialized for the query) instead of crashing while iterating.
        rows = (cfs_list[0] if cfs_list else None) or []
        self.solutions = [row[:-1] for row in rows]
        return self


def get_dice_results(
    method,
    model,
    individuals,
    output_file = None,
):
    """Run ``method`` on every row of ``individuals`` and collect the results.

    Parameters
    ----------
    method
        Object with ``fit(individual)`` that sets a ``solutions`` attribute.
    model
        Object providing ``clear_cache()`` and ``predict_proba(x)``; it is
        used for the reported probability, not to generate the solutions.
    individuals : pandas.DataFrame
        One individual per row; each one is passed to ``method.fit`` as a
        one-row DataFrame.
    output_file : str or path-like, optional
        When given, the results are written there as CSV (parent folders are
        created if missing) and nothing is returned.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``individual``, ``prob``, ``time``, ``n_solutions`` and
        ``solutions``; ``None`` when ``output_file`` is given.
    """
    # Create the destination folder before doing any work: the CSV is only
    # written once at the end, so a missing folder would discard the whole run.
    if isinstance(output_file, (str, os.PathLike)):
        parent = os.path.dirname(output_file)
        if parent:
            os.makedirs(parent, exist_ok=True)

    results = []

    for i in tqdm(range(len(individuals))):
        individual = individuals.iloc[[i]]
        model.clear_cache()
        # perf_counter is monotonic and high resolution, unlike the wall
        # clock, so short runs are not distorted by clock adjustments.
        start = time.perf_counter()
        method.fit(individual)
        elapsed = time.perf_counter() - start

        results.append({
            "individual" : individual.values.tolist()[0],
            "prob" : model.predict_proba(individual.values[0]),
            "time" : elapsed,
            "n_solutions" : len(method.solutions),
            "solutions" : method.solutions,
        })

    results = pd.DataFrame(results)
    if output_file is not None:
        results.to_csv(output_file, index=False)
    else:
        return results

In [4]:
def get_nice_results(
    method,
    model,
    individuals,
    output_file=None,
):
    """Run a NICE-style explainer on every row of ``individuals``.

    Parameters
    ----------
    method
        Object with ``explain(x)`` returning an array-like that supports
        ``.tolist()``; it receives each individual as a one-row 2-D array.
    model
        Object providing ``predict_proba(x)``; only used for the reported
        probability of the original individual.
    individuals : pandas.DataFrame
        One individual per row.
    output_file : str or path-like, optional
        When given, the results are written there as CSV (parent folders are
        created if missing) and nothing is returned.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``individual``, ``prob``, ``time``, ``n_solutions`` and
        ``solutions``; ``None`` when ``output_file`` is given.
    """
    # Create the destination folder before doing any work: the CSV is only
    # written once at the end, so a missing folder would discard the whole run.
    if isinstance(output_file, (str, os.PathLike)):
        parent = os.path.dirname(output_file)
        if parent:
            os.makedirs(parent, exist_ok=True)

    results = []

    for i in tqdm(range(len(individuals))):
        individual = individuals.iloc[[i]]

        # perf_counter is monotonic and high resolution; these calls take
        # only a few milliseconds, where wall-clock resolution is too coarse.
        start = time.perf_counter()
        cf = method.explain(individual.values).tolist()
        elapsed = time.perf_counter() - start

        results.append({
            "individual": individual.values.tolist()[0],
            "prob": model.predict_proba(individual.values[0]),
            "time": elapsed,
            "n_solutions": len(cf),
            "solutions": cf,
        })

    results = pd.DataFrame(results)
    if output_file is not None:
        results.to_csv(output_file, index=False)
    else:
        return results


In [5]:
class MAPOCAM_wrapper:
    """Adapter exposing MAPOCAM through the ``fit`` / ``solutions`` interface
    used by ``get_mapofcem_results``.

    Parameters
    ----------
    action_set
        Action set handed to MAPOCAM (and to ``PercentileCalculator`` for the
        percentile-based criteria).
    model
        Classifier handed to MAPOCAM.
    criteria : str
        ``"percentile"``, ``"percentile_changes"`` or ``"nom_dom"``. The
        correctly spelled ``"non_dom"`` is accepted as an alias; the original
        spelling is kept so existing callers keep working.
    max_changes : int
        Forwarded to MAPOCAM as ``max_changes``.

    Raises
    ------
    ValueError
        If ``criteria`` is not one of the supported names.
    """

    def __init__(self, action_set, model, criteria, max_changes):
        self.action_set = action_set
        self.model = model
        # ``self.compare`` is a factory: given an individual it builds the
        # criterion object passed to the solver as ``compare``.
        if criteria == "percentile":
            perc_calc = PercentileCalculator(action_set = action_set)
            self.compare = lambda ind : PercentileCriterion(ind, perc_calc)
        elif criteria == "percentile_changes":
            perc_calc = PercentileCalculator(action_set = action_set)
            self.compare = lambda ind : PercentileChangesCriterion(ind, perc_calc)
        elif criteria in ("nom_dom", "non_dom"):
            self.compare = lambda ind : NonDomCriterion(ind)
        else:
            # Fail at construction time rather than with an AttributeError on
            # ``self.compare`` during the first call to ``fit``.
            raise ValueError(
                f"Unknown criteria {criteria!r}; expected 'percentile', "
                "'percentile_changes' or 'nom_dom'."
            )

        self.max_changes = max_changes

    def fit(self, individual):
        """Run MAPOCAM for ``individual`` and store its solutions.

        Returns
        -------
        self
            With ``self.solutions`` set to the solver's ``solutions``.
        """
        m = MAPOCAM(
            self.action_set,
            individual,
            self.model,
            max_changes = self.max_changes,
            compare = self.compare(individual)
        )
        m.fit()
        self.solutions = m.solutions
        return self


In [6]:
class BruteForce_wrapper:
    """Adapter exposing BruteForce through the ``fit`` / ``solutions``
    interface used by ``get_mapofcem_results``.

    Parameters
    ----------
    action_set
        Action set handed to BruteForce (and to ``PercentileCalculator`` for
        the percentile-based criteria).
    model
        Classifier handed to BruteForce.
    criteria : str
        ``"percentile"``, ``"percentile_changes"`` or ``"nom_dom"``. The
        correctly spelled ``"non_dom"`` is accepted as an alias; the original
        spelling is kept so existing callers keep working.
    max_changes : int
        Forwarded to BruteForce as ``max_changes``.

    Raises
    ------
    ValueError
        If ``criteria`` is not one of the supported names.
    """

    def __init__(self, action_set, model, criteria, max_changes):
        self.action_set = action_set
        self.model = model
        # ``self.compare`` is a factory: given an individual it builds the
        # criterion object passed to the solver as ``compare``.
        if criteria == "percentile":
            perc_calc = PercentileCalculator(action_set = action_set)
            self.compare = lambda ind : PercentileCriterion(ind, perc_calc)
        elif criteria == "percentile_changes":
            perc_calc = PercentileCalculator(action_set = action_set)
            self.compare = lambda ind : PercentileChangesCriterion(ind, perc_calc)
        elif criteria in ("nom_dom", "non_dom"):
            self.compare = lambda ind : NonDomCriterion(ind)
        else:
            # Fail at construction time rather than with an AttributeError on
            # ``self.compare`` during the first call to ``fit``.
            raise ValueError(
                f"Unknown criteria {criteria!r}; expected 'percentile', "
                "'percentile_changes' or 'nom_dom'."
            )

        self.max_changes = max_changes

    def fit(self, individual):
        """Run BruteForce for ``individual`` and store its solutions.

        Returns
        -------
        self
            With ``self.solutions`` set to the solver's ``solutions``.
        """
        m = BruteForce(
            self.action_set,
            individual,
            self.model,
            max_changes = self.max_changes,
            compare = self.compare(individual)
        )
        m.fit()
        self.solutions = m.solutions
        return self

In [7]:
def get_mapofcem_results(
        method,
        individuals, 
        model, 
        output_file = None,
    ):
    """Run ``method`` on every row of ``individuals`` and collect the results.

    Parameters
    ----------
    method
        Object with ``fit(x)`` that sets a ``solutions`` attribute whose
        items support ``.tolist()``; ``x`` is the row as a 1-D array.
    individuals : pandas.DataFrame
        One individual per row.
    model
        Object providing ``clear_cache()`` and ``predict_proba(x)``.
    output_file : str or path-like, optional
        When given, the results collected so far are written there as CSV
        after every individual (parent folders are created if missing) and
        nothing is returned.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``individual``, ``prob``, ``time``, ``n_solutions`` and
        ``solutions``; ``None`` when ``output_file`` is given.
    """
    # Make sure the destination folder exists before the first checkpoint.
    if isinstance(output_file, (str, os.PathLike)):
        parent = os.path.dirname(output_file)
        if parent:
            os.makedirs(parent, exist_ok=True)

    results = []

    for i in tqdm(range(len(individuals))):
        individual = individuals.iloc[i]
        model.clear_cache()
        # perf_counter is monotonic and high resolution, unlike the wall
        # clock, so the timings are not distorted by clock adjustments.
        start = time.perf_counter()
        method.fit(individual.values)
        elapsed = time.perf_counter() - start

        solutions = [s.tolist() for s in method.solutions]

        results.append({
            "individual" : individual.values.tolist(),
            "prob" : model.predict_proba(individual.values),
            "time" : elapsed,
            "n_solutions" : len(solutions),
            "solutions" : solutions,
        })

        if output_file is not None:
            # Checkpoint after every individual so an interrupted long run
            # keeps the rows computed so far.
            pd.DataFrame(results).to_csv(output_file, index=False)

    results = pd.DataFrame(results)
    if output_file is not None:
        # Final write; also covers the case of an empty ``individuals``.
        results.to_csv(output_file, index=False)
    else:
        return results
    
    
def run_all_mapofcem_variations(
    dataset_name,
    action_set,
    individuals,
    model_shap,
    max_changes = 3,
):
    """Run the two MAPOFCEM configurations and save one CSV per configuration.

    Both configurations use ``compare="percentile"`` and
    ``estimate_prob_max=True``; they differ only in ``estimate_outlier``:

    * ``estimate_outlier=False`` -> ``mapofcem_percentile.csv``
    * ``estimate_outlier=True``  -> ``mapofcem_v2_percentile.csv``

    Parameters
    ----------
    dataset_name : str
        Name of the sub-folder of ``../results/`` that receives the CSVs.
    action_set
        Action set passed to ``MAPOFCEM``.
    individuals : pandas.DataFrame
        Individuals to explain, one per row.
    model_shap
        Classifier passed both to ``MAPOFCEM`` and to
        ``get_mapofcem_results``.
    max_changes : int, default 3
        Forwarded to ``MAPOFCEM`` as ``max_changes``.

    Returns
    -------
    None
    """
    variants = (
        (False, "mapofcem_percentile.csv"),
        (True, "mapofcem_v2_percentile.csv"),
    )

    for estimate_outlier, file_name in variants:
        method = MAPOFCEM(
            action_set,
            model_shap,
            compare = "percentile",
            estimate_prob_max=True,
            estimate_outlier=estimate_outlier,
            max_changes = max_changes
        )

        get_mapofcem_results(
            method=method,
            individuals=individuals,
            model=model_shap,
            output_file=f"../results/{dataset_name}/{file_name}"
        )

    return

## German

In [8]:
# Load everything needed for the German experiments: training features and
# labels, the model returned for "LGBMClassifier", the outlier detector and
# the pool of individuals to explain.
X_train, Y_train, model, outlier_detection, individuals = get_data_model("german", "LGBMClassifier")
# Keep a reproducible sample of 50 individuals.
individuals = individuals.sample(n = 50, random_state=SEED)

In [10]:
# Features the counterfactual methods are not allowed to change.
not_mutable_features = ['Age', 'OwnsHouse', 'isMale', 'JobClassIsSkilled', 'Single', 'ForeignWorker', 'RentsHouse']
# Every other training column may be changed.
mutable_features = [feat for feat in X_train.columns if feat not in not_mutable_features]
# Continuous / categoric split; used further down to build the column
# indices given to NICE.
continuous_features = ['Age', 'LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome','YearsAtCurrentHome']
categoric_features = [col for col in X_train.columns if col not in continuous_features]

# Action set shared by the cfmining methods in this section.
action_set = ActionSet(X = X_train, default_step_size = 0.05, mutable_features = mutable_features)

In [11]:
# Wrap the raw model for cfmining. This wrapper is the object the result
# helpers call `clear_cache()` / `predict_proba()` on for every method below.
model_shap = GeneralClassifier_Shap(model, outlier_detection, X_train, shap_explainer="tree", threshold = 0.5)

### MAPOFCEM

In [93]:
# Run both MAPOFCEM configurations on the German sample; the CSVs are written
# under ../results/german/.
run_all_mapofcem_variations(
    dataset_name="german",
    action_set=action_set,
    individuals=individuals,
    model_shap=model_shap,
)

  0%|                                                                                                           | 0/50 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:25<00:00,  1.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:36<00:00,  1.38it/s]


### MAPOCAM

In [96]:
# Switch off `use_predict_max` on the shared model wrapper before running
# MAPOCAM / BruteForce.
# NOTE(review): this mutates `model_shap` in place, so every later cell that
# receives `model_shap` (DiCE and NICE included) also runs with the flag off.
model_shap.use_predict_max = False

In [102]:
# Adjust a private copy of the action set: every feature gets
# `flip_direction = 1` and its grid is rebuilt. The deep copy keeps the shared
# `action_set` untouched.
# NOTE(review): why MAPOCAM needs this is not visible here -- confirm against
# cfmining.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()


# MAPOCAM with the percentile criterion and at most 3 changed features.
mapocam = MAPOCAM_wrapper(
    action_set = action_set_,
    model = model_shap,
    criteria = "percentile",
    max_changes = 3
)

# Results are written to CSV; nothing is returned when `output_file` is set.
get_mapofcem_results(
    mapocam,
    individuals=individuals,
    model=model_shap,
    output_file=f"../results/german/mapocam_percentile.csv"
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:19<00:00,  2.60it/s]


### Bruteforce

In [106]:
# Adjust a private copy of the action set: every feature gets
# `flip_direction = 1` and its grid is rebuilt. The deep copy keeps the shared
# `action_set` untouched.
# NOTE(review): why BruteForce needs this is not visible here -- confirm
# against cfmining.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# Exhaustive baseline with the percentile criterion and at most 3 changes.
bruteforce = BruteForce_wrapper(
    action_set=action_set_,
    model = model_shap,
    criteria="percentile",
    max_changes=3
)

# NOTE(review): this file name ends in "percentiles" while the other outputs
# use "percentile" -- check what downstream readers expect before renaming.
get_mapofcem_results(
    bruteforce,
    individuals=individuals,
    model=model_shap,
    output_file=f"../results/german/bruteforce_percentiles.csv"
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [13:46<00:00, 16.54s/it]


### DICE

In [12]:
# DiCE receives the raw model (not the `model_shap` wrapper) through its
# sklearn backend.
dice_model = dice_ml.Model(
    model = model,
    backend = "sklearn",
    model_type = "classifier"
)
# DiCE wants features and outcome in one frame; copy so X_train is unchanged.
X_train_extended = X_train.copy()
X_train_extended["target"] = Y_train
# Every column is declared continuous to DiCE here; the continuous/categoric
# split defined earlier is not used.
dice_data = dice_ml.Data(
    dataframe = X_train_extended,
    continuous_features = X_train.columns.tolist(),
    outcome_name = "target",
)
# One counterfactual per individual, restricted to the mutable features.
# NOTE(review): the name `method` is reused for NICE further down.
method = DiCE_wrapper(
    dice_data = dice_data,
    dice_model = dice_model,
    total_CFs= 1,
    mutable_features= mutable_features
)

In [13]:
# `model_shap` is only used by the helper for cache clearing and for the
# reported probability; DiCE itself was built on `dice_model`.
get_dice_results(
    method,
    model_shap,
    individuals = individuals,
    output_file=f"../results/german/dice_1sol.csv"
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.51it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████████

### NICE

In [14]:
# NICE calls the raw model. The lambda looks `model` up at call time, so it
# follows any later rebinding of `model`.
predict_fn = lambda x: model.predict_proba(x)

# NICE takes column positions, not names.
columns_list = X_train.columns.tolist()
cat_feat = [columns_list.index(feat) for feat in categoric_features]
num_feat = [columns_list.index(feat) for feat in continuous_features]

# NOTE(review): this rebinds `method`, which previously held the DiCE wrapper.
method = NICE(
    X_train=X_train.values,
    predict_fn=predict_fn,
    y_train=Y_train,
    cat_feat=cat_feat,
    num_feat=num_feat,
    distance_metric='HEOM',
    num_normalization='minmax',
    optimization='proximity',
    justified_cf=True
)


# `model_shap` is only used by the helper for the reported probability.
get_nice_results(
    method, 
    model_shap,
    individuals=individuals,
    output_file="../results/german/nice_sol.csv"
)


  0%|                                                                                                                                                                             | 0/50 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 183.07it/s]


## Taiwan

In [30]:
# Load the Taiwan data. This rebinds X_train, Y_train, model,
# outlier_detection and individuals from the German section.
# NOTE(review): no classifier name is passed here, unlike the German call, so
# whatever default `get_data_model` uses applies -- confirm it is intended.
X_train, Y_train, model, outlier_detection, individuals = get_data_model("taiwan")
# Cast every column to int (the original note: convert bools to int).
X_train = X_train.astype(int)
individuals = individuals.astype(int)
# Keep a reproducible sample of 50 individuals.
individuals = individuals.sample(n = 50, random_state=SEED)

In [31]:
# Features the counterfactual methods are not allowed to change.
not_mutable_features = ['Single', 'Age_in_25_to_40', 'Married', 'Age_lt_25', 'Age_in_40_to_59', 'Age_geq_60', 'EducationLevel']
# Every other training column may be changed.
mutable_features = [feat for feat in X_train.columns if feat not in not_mutable_features]
# Each name must appear exactly once: this list is turned into the column
# indices given to NICE as `num_feat`, and a repeated name would put the same
# column in that list twice. ("MostRecentPaymentAmount" used to be duplicated.)
continuous_features = ["LIMIT_BAL", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
                       "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1",
                       "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5",
                       "PAY_AMT6", "MaxBillAmountOverLast6Months", "MaxPaymentAmountOverLast6Months",
                       "MostRecentBillAmount", "MostRecentPaymentAmount", "TotalMonthsOverdue"]
categoric_features = [col for col in X_train.columns if col not in continuous_features]
# Action set shared by the cfmining methods in this section.
action_set = ActionSet(X = X_train, default_step_size = 0.1, mutable_features = mutable_features)

### MAPOFCEM

In [37]:
# Wrap the Taiwan model for cfmining. This rebinds `model_shap`; it is the
# object the result helpers call `clear_cache()` / `predict_proba()` on for
# every method below.
model_shap = GeneralClassifier_Shap(model, outlier_detection, X_train, shap_explainer="tree", threshold = 0.5)

In [38]:
# Run both MAPOFCEM configurations on the Taiwan sample; the CSVs are written
# under ../results/taiwan/.
run_all_mapofcem_variations(
    dataset_name="taiwan",
    action_set=action_set,
    individuals=individuals,
    model_shap=model_shap,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:49<00:00,  1.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:55<00:00,  1.11s/it]


### MAPOCAM

In [32]:
# Switch off `use_predict_max` on the shared model wrapper before running
# MAPOCAM / BruteForce.
# NOTE(review): this mutates `model_shap` in place, so every later cell that
# receives `model_shap` (DiCE and NICE included) also runs with the flag off.
model_shap.use_predict_max = False

In [33]:
# Adjust a private copy of the action set: every feature gets
# `flip_direction = 1` and its grid is rebuilt. The deep copy keeps the shared
# `action_set` untouched.
# NOTE(review): why MAPOCAM needs this is not visible here -- confirm against
# cfmining.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# MAPOCAM with the percentile criterion and at most 3 changed features.
mapocam = MAPOCAM_wrapper(
    action_set = action_set_,
    model = model_shap,
    criteria="percentile",
    max_changes=3
)


# Results are written to CSV; nothing is returned when `output_file` is set.
get_mapofcem_results(
    mapocam,
    individuals=individuals,
    model=model_shap,
    output_file=f"../results/taiwan/mapocam_percentile.csv"
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:48<00:00,  2.17s/it]


### Bruteforce

In [None]:
# Adjust a private copy of the action set: every feature gets
# `flip_direction = 1` and its grid is rebuilt. The deep copy keeps the shared
# `action_set` untouched.
# NOTE(review): why BruteForce needs this is not visible here -- confirm
# against cfmining.
action_set_ = copy.deepcopy(action_set)
for feat in action_set_:
    feat.flip_direction = 1
    feat.update_grid()

# The wrapper is built, but the run itself is disabled below.
bruteforce = BruteForce_wrapper(
    action_set=action_set_,
    model = model_shap,
    criteria="percentile",
    max_changes=3
)

# NOTE(review): the BruteForce run is commented out for Taiwan, presumably
# because of its cost (the German run above reports close to 14 minutes) --
# confirm, and either re-enable it or drop this cell.
# get_mapofcem_results(
#     bruteforce,
#     individuals=individuals,
#     model=model_shap,
#     output_file=f"../results/taiwan/bruteforce_percentile.csv"
# )

### DICE

In [34]:
# DiCE receives the raw model (not the `model_shap` wrapper) through its
# sklearn backend.
dice_model = dice_ml.Model(
    model = model,
    backend = "sklearn",
    model_type = "classifier"
)
# DiCE wants features and outcome in one frame; copy so X_train is unchanged.
X_train_extended = X_train.copy()
X_train_extended["target"] = Y_train
dice_data = dice_ml.Data(
    dataframe = X_train_extended,
    continuous_features = X_train.columns.tolist(),  # all columns declared continuous; the `continuous_features` list is deliberately not used
    outcome_name = "target",
)

# One counterfactual per individual, restricted to the mutable features.
# NOTE(review): the name `method` is reused for NICE further down.
method = DiCE_wrapper(
    dice_data = dice_data,
    dice_model = dice_model,
    total_CFs= 1,
    mutable_features= mutable_features
)




In [35]:
# `model_shap` is only used by the helper for cache clearing and for the
# reported probability; DiCE itself was built on `dice_model`.
get_dice_results(
    method,
    model_shap,
    individuals = individuals,
    output_file=f"../results/taiwan/dice_1sol.csv"
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.89it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.18it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.26it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  

### NICE

In [36]:
# NICE calls the raw model. The lambda looks `model` up at call time, so it
# follows any later rebinding of `model`.
predict_fn = lambda x: model.predict_proba(x)

# NICE takes column positions, not names. A name repeated in
# `continuous_features` yields a repeated index in `num_feat`.
columns_list = X_train.columns.tolist()
cat_feat = [columns_list.index(feat) for feat in categoric_features]
num_feat = [columns_list.index(feat) for feat in continuous_features]

# NOTE(review): this rebinds `method`, which previously held the DiCE wrapper.
method = NICE(
    X_train=X_train.values,
    predict_fn=predict_fn,
    y_train=Y_train,
    cat_feat=cat_feat,
    num_feat=num_feat,
    distance_metric='HEOM',
    num_normalization='minmax',
    optimization='proximity',
    justified_cf=True
)


# `model_shap` is only used by the helper for the reported probability.
get_nice_results(
    method, 
    model_shap,
    individuals=individuals,
    output_file="../results/taiwan/nice_sol.csv"
)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 42.37it/s]
