In [1]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

from sklearn import preprocessing, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from isotree import IsolationForest
from sklearn import tree, ensemble
from cfmining.mip_algorithms import ForestRecourseActions
from cfmining.predictors import TreeClassifier

from cfmining.mip_algorithms import LinearRecourseActions, LinearRecourseActionsMulti
from cfmining.algorithms import MAPOCAM, BruteForce, Greedy, MAPOCAM2
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import MonotoneClassifier, GeneralClassifier, TreeClassifier, LinearClassifier, LinearRule, GeneralClassifier_Shap
from cfmining.action_set import ActionSet

from results import save_result

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

## Preparing dataset

In [48]:
data_dir = "data/"
data_name = 'german'
data_file = os.path.join(data_dir, '%s_processed.csv' % data_name)
## load and process data
german_df = pd.read_csv(data_file).reset_index(drop=True)
german_df = (german_df
             .assign(isMale=lambda df: (df['Gender']=='Male').astype(int))
             .drop(['PurposeOfLoan', 'Gender', 'OtherLoansAtStore'], axis=1)
            )

y = german_df['GoodCustomer']
# tranform y to [0, 1]
y = (y == 1).astype(int)
X = german_df.drop('GoodCustomer', axis=1)
#X = X[['ForeignWorker', 'Single', 'Age', 'LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome', 'isMale', 'YearsAtCurrentHome',
#'NumberOfOtherLoansAtBank', 'NumberOfLiableIndividuals', 'HasTelephone',]]

In [49]:
Xtr, Xts, ytr, yts = train_test_split(X, y, test_size=100)

In [50]:
#outlier_detection = IsolationForest(contamination=0.1, n_estimators=50)
outlier_detection = IsolationForest(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="fail")
outlier_detection.fit(Xtr);
print(np.unique(outlier_detection.predict(Xtr) > 0.6, return_counts=True))

(array([False,  True]), array([890,  10]))


## Models

In [51]:
stand = preprocessing.StandardScaler()
clf_base = LogisticRegression(max_iter=1000, class_weight='balanced')
clf = pipeline.Pipeline([('std', stand), ('clf', clf_base)])

grid = GridSearchCV(
  clf, param_grid={'clf__C': np.logspace(-4, 3)},
  cv=5,
  scoring='roc_auc',
  verbose=0
)

grid.fit(Xtr, ytr)
clf_lgr = grid.best_estimator_

clf_rt = ensemble.RandomForestClassifier(
  n_estimators=10, 
  max_depth=5, 
  max_leaf_nodes=31,
  class_weight='balanced_subsample'
)
clf_rt.fit(Xtr, ytr);

In [52]:
threshold=0.5
scores_lgr = pd.Series(clf_lgr.predict_proba(Xts)[:, 1])
denied_individuals_lgr = scores_lgr.loc[lambda s: s < threshold].index

scores_rt = pd.Series(clf_rt.predict_proba(Xts)[:, 1])
denied_individuals_rt = scores_rt.loc[lambda s: s < threshold].index

## Setting ActionSet base parameters

In [53]:
action_set_base = ActionSet(X = X)
for feat in action_set_base:
    if feat.name in ['Age', 'JobClassIsSkilled', 'OwnsHouse', 'isMale', 'JobClassIsSkilled']:
        feat.mutable = False
        feat.step_direction = 1
    if feat.name in ['Single', 'ForeignWorker', 'RentsHouse']:
        feat.mutable = False
        feat.step_direction = -1
    if feat.name in ['LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome', 'NumberOfOtherLoansAtBank', 'MissedPayments',
                     'CriticalAccountOrLoansElsewhere', 'OtherLoansAtBank', 'Unemployed', 'YearsAtCurrentJob_lt_1']:
        feat.mutable = True
        feat.step_direction = -1
    if feat.name in ['YearsAtCurrentHome', 'NumberOfLiableIndividuals', 'HasTelephone', 'CheckingAccountBalance_geq_0',
                     'CheckingAccountBalance_geq_200', 'SavingsAccountBalance_geq_100', 'SavingsAccountBalance_geq_500',
                     'NoCurrentLoan', 'HasCoapplicant', 'HasGuarantor', 'YearsAtCurrentJob_geq_4']:
        feat.mutable = True
        feat.step_direction = 1
    
    feat.flip_direction = 1
    feat.update_grid()

In [54]:
action_set = copy.deepcopy(action_set_base)
action_set['Age'].mutable = False
action_set['Age'].update_grid()

action_set['LoanDuration'].step_type ="absolute"
action_set['LoanDuration'].step_size = 6
action_set['LoanDuration'].update_grid()

action_set['LoanAmount'].step_size = 0.1
action_set['LoanAmount'].update_grid()

print('ActionSet stats')
print('Number of actionable features:', sum([action.actionable for action in action_set]))
print('Mean number of actions per feature:', np.nanmean([len(action._grid) if action.actionable else np.nan for action in action_set]))
print('Max number of actions per feature:', np.nanmax([len(action._grid) if action.actionable else np.nan for action in action_set]))
log_combinations = int(np.prod([len(action._grid) for action in action_set if action_set.actionable]))
print('Number of combinations:', log_combinations)

ActionSet stats
Number of actionable features: 11
Mean number of actions per feature: 2.1818181818181817
Max number of actions per feature: 4.0
Number of combinations: 542575165440


In [55]:
percCalc_lgr = PercentileCalculator(action_set=action_set)

In [56]:
action_set_tree = copy.deepcopy(action_set_base)
action_set_tree.embed_forest(clf_rt)
for action in action_set_tree:
    action.flip_direction = action.step_direction
    
print('ActionSet stats')
print('Number of actionable features:', sum([action.mutable for action in action_set_tree]))
print('Mean number of actions per feature:', np.mean([len(action._grid)-1 for action in action_set_tree]))
print('Max number of actions per feature:', np.max([len(action._grid)-1 for action in action_set_tree]))
rt_combinations = int(np.prod([len(action._grid) for action in action_set_tree if action.actionable]))
print('Number of combinations:', rt_combinations)

ActionSet stats
Number of actionable features: 20
Mean number of actions per feature: 3.3333333333333335
Max number of actions per feature: 30
Number of combinations: 926416896


In [57]:
percCalc_rt = PercentileCalculator(action_set=action_set_tree)

## Counterfactuals Logistic

In [58]:
clf_lgr_mapocam = MonotoneClassifier(clf_lgr, X=Xtr, y=ytr, threshold=threshold)

clf_lgr_shap = GeneralClassifier_Shap(clf_lgr, X=Xtr, y=ytr, threshold=threshold)

PermutationExplainer explainer: 901it [00:38, 19.90it/s]                         


In [59]:
mapocam1_solutions = []
mapocam2_solutions = []
mapocam2_solutions_outlier = []
for i in denied_individuals_lgr:
    individual = Xts.iloc[i].values
    criteria = PercentileCriterion(individual, percCalc_lgr)

    mapocam = MAPOCAM(
        action_set, 
        individual, 
        clf_lgr_mapocam,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam1_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set, 
        individual, 
        clf_lgr_shap,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set, 
        individual, 
        clf_lgr_shap, 
        outlier_detection,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions_outlier.append(mapocam.solutions)
    

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScale

In [60]:
def check_equal_solutions(solutions_1, solutions_2):
    if len(solutions_1) != len(solutions_2):
        return False
    
    for sol_1 in solutions_1:
        for sol_2 in solutions_2:
            if not np.all(sol_1 == sol_2):
                return False
            
    return True

def table_with_solutions(individual, solutions_1, solutions_2, solutions_1_name = "sol1", solutions_2_name = "sol2"):
    table = pd.DataFrame(individual)
    table.columns = ['original']

    if len(solutions_1) == 0:
        table[solutions_1_name + ' (0)'] = None
    for i, sol in enumerate(solutions_1):
        table[f"{solutions_1_name} ({i})"] = sol
    
    if len(solutions_2) == 0:
        table[solutions_2_name + ' (0)'] = None
    for i, sol in enumerate(solutions_2):
        table[f"{solutions_2_name} ({i})"] = sol

    table_T = table.T
    changed_columns = []
    fixed_columns = []
    for col in table_T.columns:
        # get unique that are not none
        unique = table_T[col].unique()
        unique = unique[~pd.isnull(unique)]
        if len(unique) == 1:
            fixed_columns.append(col)
        else:
            changed_columns.append(col)

    table_T = table_T[changed_columns + fixed_columns]
    table = table_T.T

    return table

In [61]:
for i in range(len(denied_individuals_lgr)):
    
    if check_equal_solutions(mapocam1_solutions[i], mapocam2_solutions[i]):
        continue

    individual = Xts.iloc[denied_individuals_lgr[i]]
    table = table_with_solutions(individual, mapocam1_solutions[i], mapocam2_solutions[i], "mapocam1", "mapocam2")
    print(table)
    print("---")


                                 original  mapocam1 (0)  mapocam2 (0)
LoanDuration                           18             6             6
LoanAmount                           3244          1801          3244
YearsAtCurrentJob_geq_4                 0             1             1
ForeignWorker                           0             0             0
Single                                  0             0             0
Age                                    33            33            33
LoanRateAsPercentOfIncome               1             1             1
YearsAtCurrentHome                      4             4             4
NumberOfOtherLoansAtBank                2             2             2
NumberOfLiableIndividuals               1             1             1
HasTelephone                            1             1             1
CheckingAccountBalance_geq_0            1             1             1
CheckingAccountBalance_geq_200          0             0             0
SavingsAccountBalanc

In [62]:
for i in range(len(denied_individuals_lgr)):
    
    if check_equal_solutions(mapocam2_solutions[i], mapocam2_solutions_outlier[i]):
        continue

    individual = Xts.iloc[denied_individuals_lgr[i]]
    table = table_with_solutions(individual, mapocam2_solutions[i], mapocam2_solutions_outlier[i], "mapocam2", "mapocam2_outlier")
    print(table)
    print("---")


## Counterfactuals RF

In [63]:
clf_rf_mapocam =  TreeClassifier(clf_rt, Xtr, ytr, threshold=threshold, use_predict_max=True)

In [64]:
clf_rf_shap = GeneralClassifier_Shap(clf_rt, X=Xtr, y=ytr, threshold=threshold)

PermutationExplainer explainer: 901it [00:27, 20.85it/s]                         


In [65]:
mapocam1_solutions = []
mapocam2_solutions = []
mapocam2_solutions_outlier = []
for i in denied_individuals_lgr:
    individual = Xts.iloc[i].values
    criteria = PercentileCriterion(individual, percCalc_rt)
    clf_rf_mapocam.fit(individual, action_set_tree)
    mapocam = MAPOCAM(
        action_set_tree, 
        individual, 
        clf_lgr_mapocam,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam1_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set_tree, 
        individual, 
        clf_rf_shap, 
        #outlier_detection,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set_tree, 
        individual, 
        clf_rf_shap, 
        outlier_detection,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions_outlier.append(mapocam.solutions)
    

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScale

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScale

In [66]:
for i in range(len(denied_individuals_lgr)):
    
    if check_equal_solutions(mapocam1_solutions[i], mapocam2_solutions[i]):
        continue

    individual = Xts.iloc[denied_individuals_lgr[i]]
    table = table_with_solutions(individual, mapocam1_solutions[i], mapocam2_solutions[i], "mapocam1", "mapocam2")
    print(table)
    print("---")


[autoreload of cfmining.mip_builder failed: Traceback (most recent call last):
  File "/home/giovani/anaconda3/envs/cfmining/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/home/giovani/anaconda3/envs/cfmining/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/home/giovani/anaconda3/envs/cfmining/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/home/giovani/hiaac/cfmining/cfmining/mip_builder.py", line 1259, in <module>
    _SOLVER_TYPE_CPX: _RecourseBuilderCPX,
NameError: name '_SOLVER_TYPE_CPX' is not defined. Did you mean: '_SOLVER_TYPE_CBC'?
]


                                 original  mapocam1 (0)  mapocam2 (0)
LoanDuration                           24            20            20
LoanAmount                           1282          1282          1206
HasGuarantor                            0             0             1
ForeignWorker                           0             0             0
Single                                  0             0             0
Age                                    32            32            32
LoanRateAsPercentOfIncome               4             4             4
YearsAtCurrentHome                      2             2             2
NumberOfOtherLoansAtBank                1             1             1
NumberOfLiableIndividuals               1             1             1
HasTelephone                            0             0             0
CheckingAccountBalance_geq_0            0             0             0
CheckingAccountBalance_geq_200          0             0             0
SavingsAccountBalanc

In [67]:
for i in range(len(denied_individuals_lgr)):
    
    if check_equal_solutions(mapocam2_solutions[i], mapocam2_solutions_outlier[i]):
        continue

    individual = Xts.iloc[denied_individuals_lgr[i]]
    table = table_with_solutions(individual, mapocam2_solutions[i], mapocam2_solutions_outlier[i], "mapocam2", "mapocam2_outlier")
    print(table)
    print("---")


                                 original  mapocam2 (0)  mapocam2_outlier (0)
LoanDuration                           20            20                    16
LoanAmount                           7057          4038                  3444
OtherLoansAtBank                        1             1                     0
HasGuarantor                            0             1                     0
ForeignWorker                           0             0                     0
Single                                  1             1                     1
Age                                    36            36                    36
LoanRateAsPercentOfIncome               3             3                     3
YearsAtCurrentHome                      4             4                     4
NumberOfOtherLoansAtBank                2             2                     2
NumberOfLiableIndividuals               2             2                     2
HasTelephone                            1             1         