In [1]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

from sklearn import preprocessing, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import IsolationForest
from sklearn import tree, ensemble
from cfmining.mip_algorithms import ForestRecourseActions
from cfmining.predictors import TreeClassifier

from cfmining.mip_algorithms import LinearRecourseActions, LinearRecourseActionsMulti
from cfmining.algorithms import MAPOCAM, BruteForce, Greedy, MAPOCAM2
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import MonotoneClassifier, GeneralClassifier, TreeClassifier, LinearClassifier, LinearRule, GeneralClassifier_Shap
from cfmining.action_set import ActionSet

from results import save_result

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

## Preparing dataset

In [40]:
data_dir = "data/"
data_name = 'german'
data_file = os.path.join(data_dir, '%s_processed.csv' % data_name)
## load and process data
german_df = pd.read_csv(data_file).reset_index(drop=True)
german_df = (german_df
             .assign(isMale=lambda df: (df['Gender']=='Male').astype(int))
             .drop(['PurposeOfLoan', 'Gender', 'OtherLoansAtStore'], axis=1)
            )

y = german_df['GoodCustomer']
# tranform y to [0, 1]
y = (y == 1).astype(int)
X = german_df.drop('GoodCustomer', axis=1)
X = X[['ForeignWorker', 'Single', 'Age', 'LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome', 'isMale', 'YearsAtCurrentHome',
'NumberOfOtherLoansAtBank', 'NumberOfLiableIndividuals', 'HasTelephone',]]

In [41]:
Xtr, Xts, ytr, yts = train_test_split(X, y, test_size=100)

In [42]:
outlier_detection = IsolationForest(contamination=0.1, n_estimators=50)
outlier_detection.fit(Xtr.values);
print(np.unique(outlier_detection.predict(Xtr.values), return_counts=True))

(array([-1,  1]), array([ 90, 810]))


## Models

In [43]:
stand = preprocessing.StandardScaler()
clf_base = LogisticRegression(max_iter=1000, class_weight='balanced')
clf = pipeline.Pipeline([('std', stand), ('clf', clf_base)])

grid = GridSearchCV(
  clf, param_grid={'clf__C': np.logspace(-4, 3)},
  cv=5,
  scoring='roc_auc',
  verbose=0
)

grid.fit(Xtr, ytr)
clf_lgr = grid.best_estimator_

clf_rt = ensemble.RandomForestClassifier(
  n_estimators=10, 
  max_depth=5, 
  max_leaf_nodes=31,
  class_weight='balanced_subsample'
)
clf_rt.fit(Xtr, ytr);

In [44]:
threshold=0.5
scores_lgr = pd.Series(clf_lgr.predict_proba(Xts)[:, 1])
denied_individuals_lgr = scores_lgr.loc[lambda s: s < threshold].index

scores_rt = pd.Series(clf_rt.predict_proba(Xts)[:, 1])
denied_individuals_rt = scores_rt.loc[lambda s: s < threshold].index

## Setting ActionSet base parameters

In [45]:
action_set_base = ActionSet(X = X)
for feat in action_set_base:
    if feat.name in ['Age', 'JobClassIsSkilled', 'OwnsHouse', 'isMale', 'JobClassIsSkilled']:
        feat.mutable = False
        feat.step_direction = 1
    if feat.name in ['Single', 'ForeignWorker', 'RentsHouse']:
        feat.mutable = False
        feat.step_direction = -1
    if feat.name in ['LoanDuration', 'LoanAmount', 'LoanRateAsPercentOfIncome', 'NumberOfOtherLoansAtBank', 'MissedPayments',
                     'CriticalAccountOrLoansElsewhere', 'OtherLoansAtBank', 'Unemployed', 'YearsAtCurrentJob_lt_1']:
        feat.mutable = True
        feat.step_direction = -1
    if feat.name in ['YearsAtCurrentHome', 'NumberOfLiableIndividuals', 'HasTelephone', 'CheckingAccountBalance_geq_0',
                     'CheckingAccountBalance_geq_200', 'SavingsAccountBalance_geq_100', 'SavingsAccountBalance_geq_500',
                     'NoCurrentLoan', 'HasCoapplicant', 'HasGuarantor', 'YearsAtCurrentJob_geq_4']:
        feat.mutable = True
        feat.step_direction = 1
    
    feat.flip_direction = 1
    feat.update_grid()

In [46]:
action_set = copy.deepcopy(action_set_base)
action_set['Age'].mutable = False
action_set['Age'].update_grid()

action_set['LoanDuration'].step_type ="absolute"
action_set['LoanDuration'].step_size = 6
action_set['LoanDuration'].update_grid()

action_set['LoanAmount'].step_size = 0.1
action_set['LoanAmount'].update_grid()

print('ActionSet stats')
print('Number of actionable features:', sum([action.actionable for action in action_set]))
print('Mean number of actions per feature:', np.nanmean([len(action._grid) if action.actionable else np.nan for action in action_set]))
print('Max number of actions per feature:', np.nanmax([len(action._grid) if action.actionable else np.nan for action in action_set]))
log_combinations = int(np.prod([len(action._grid) for action in action_set if action_set.actionable]))
print('Number of combinations:', log_combinations)

ActionSet stats
Number of actionable features: 3
Mean number of actions per feature: 2.6666666666666665
Max number of actions per feature: 4.0
Number of combinations: 8279040


In [47]:
percCalc_lgr = PercentileCalculator(action_set=action_set)

In [48]:
action_set_tree = copy.deepcopy(action_set_base)
action_set_tree.embed_forest(clf_rt)
for action in action_set_tree:
    action.flip_direction = action.step_direction
    
print('ActionSet stats')
print('Number of actionable features:', sum([action.mutable for action in action_set_tree]))
print('Mean number of actions per feature:', np.mean([len(action._grid)-1 for action in action_set_tree]))
print('Max number of actions per feature:', np.max([len(action._grid)-1 for action in action_set_tree]))
rt_combinations = int(np.prod([len(action._grid) for action in action_set_tree if action.actionable]))
print('Number of combinations:', rt_combinations)

ActionSet stats
Number of actionable features: 7
Mean number of actions per feature: 9.181818181818182
Max number of actions per feature: 44
Number of combinations: 190080


In [49]:
percCalc_rt = PercentileCalculator(action_set=action_set_tree)

## Counterfactuals Logistic

### Mapocam

In [57]:
clf_lgr_mapocam = MonotoneClassifier(clf_lgr, X=Xtr, y=ytr, threshold=threshold)

In [63]:
for i in denied_individuals_lgr:
    individual = Xts.iloc[i].values
    criteria = PercentileCriterion(individual, percCalc_lgr)
    mapocam = MAPOCAM(
        action_set, 
        individual, 
        clf_lgr_mapocam,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    if len(mapocam.solutions) > 0:
        print(i, mapocam.solutions[0])
        break

21 [  0   1  26  30 425   2   1   2   2   1   0]


X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScale

In [64]:
individual = Xts.iloc[denied_individuals_lgr[21]]
print(individual)
individual = individual.values

ForeignWorker                    0
Single                           1
Age                             48
LoanDuration                    48
LoanAmount                   12204
LoanRateAsPercentOfIncome        2
isMale                           1
YearsAtCurrentHome               2
NumberOfOtherLoansAtBank         1
NumberOfLiableIndividuals        1
HasTelephone                     1
Name: 615, dtype: int64


In [65]:
criteria = PercentileCriterion(individual, percCalc_lgr)
mapocam = MAPOCAM(
    action_set, 
    individual, 
    clf_lgr_mapocam,
    max_changes=float('inf'), 
    compare=criteria
)
mapocam.fit()
mapocam.solutions

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names


[array([  0,   1,  48,  48, 425,   2,   1,   2,   1,   1,   1])]

### Mapocam 2

In [66]:
clf_lgr_shap_1 = GeneralClassifier_Shap(clf_lgr, X=Xtr, y=ytr, threshold=threshold)

PermutationExplainer explainer: 901it [01:47,  8.05it/s]                         


In [68]:
criteria = PercentileCriterion(individual, percCalc_lgr)
mapocam = MAPOCAM2(
    action_set, 
    individual, 
    clf_lgr_shap_1, 
    #outlier_detection,
    max_changes=float('inf'), 
    compare=criteria
)
mapocam.fit()
mapocam.solutions

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names


[array([  0,   1,  48,  48, 425,   2,   1,   2,   1,   1,   1])]

## Mapocam 2 with outlier detection

In [73]:
criteria = PercentileCriterion(individual, percCalc_lgr)
mapocam = MAPOCAM2(
    action_set, 
    individual, 
    clf_lgr_shap_1, 
    outlier_detection,
    max_changes=float('inf'), 
    compare=criteria
)
mapocam.fit()
mapocam.solutions

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names


[array([  0,   0,  24,  15, 425,   2,   0,   3,   2,   1,   0])]

**Find a different solution**

In [74]:
mapocam1_solutions = []
mapocam2_solutions = []
mapocam2_solutions_outlier = []
for i in denied_individuals_lgr:
    individual = Xts.iloc[i].values
    criteria = PercentileCriterion(individual, percCalc_lgr)

    mapocam = MAPOCAM(
        action_set, 
        individual, 
        clf_lgr_mapocam,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam1_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set, 
        individual, 
        clf_lgr_shap_1, 
        #outlier_detection,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set, 
        individual, 
        clf_lgr_shap_1, 
        outlier_detection,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions_outlier.append(mapocam.solutions)
    

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScale

In [75]:
for i in range(len(mapocam1_solutions)):
    print(i)
    print("MAPOCAM1 solutions:", mapocam1_solutions[i])
    print("MAPOCAM2 solutions:", mapocam2_solutions[i])
    print("MAPOCAM2 solutions with outlier detection:", mapocam2_solutions_outlier[i])
    print('------------------')

0
MAPOCAM1 solutions: []
MAPOCAM2 solutions: [array([   0,    0,   29,    6, 1801,    2,    0,    4,    1,    1,    0])]
MAPOCAM2 solutions with outlier detection: [array([   0,    0,   29,    6, 1801,    2,    0,    4,    1,    1,    0])]
------------------
1
MAPOCAM1 solutions: []
MAPOCAM2 solutions: [array([  0,   0,  34,   6, 425,   4,   0,   4,   2,   1,   0])]
MAPOCAM2 solutions with outlier detection: [array([  0,   0,  34,   6, 425,   4,   0,   4,   2,   1,   0])]
------------------
2
MAPOCAM1 solutions: []
MAPOCAM2 solutions: [array([  0,   1,  31,   6, 425,   4,   1,   4,   1,   1,   0])]
MAPOCAM2 solutions with outlier detection: [array([  0,   1,  31,   6, 425,   4,   1,   4,   1,   1,   0])]
------------------
3
MAPOCAM1 solutions: []
MAPOCAM2 solutions: [array([  0,   0,  24,   6, 425,   1,   0,   4,   1,   1,   0])]
MAPOCAM2 solutions with outlier detection: [array([  0,   0,  24,   6, 425,   1,   0,   4,   1,   1,   0])]
------------------
4
MAPOCAM1 solutions: []
MAPOC

In [82]:
for i in range(len(mapocam1_solutions)):
    if len(mapocam1_solutions[i]) != len(mapocam2_solutions[i]):
        print(i)
        print("MAPOCAM1 solutions:")
        for sol in mapocam1_solutions[i]:
            print(list(zip(Xtr.columns, sol)))

        if len(mapocam1_solutions[i]) == 0:
            print("No solution")
        
        print("MAPOCAM2 solutions:")
        for sol in mapocam2_solutions[i]:
            print(list(zip(Xtr.columns, sol)))

        if len(mapocam2_solutions[i]) == 0:
            print("No solution")

        print("-------")

0
MAPOCAM1 solutions:
No solution
MAPOCAM2 solutions:
[('ForeignWorker', 0), ('Single', 0), ('Age', 29), ('LoanDuration', 6), ('LoanAmount', 1801), ('LoanRateAsPercentOfIncome', 2), ('isMale', 0), ('YearsAtCurrentHome', 4), ('NumberOfOtherLoansAtBank', 1), ('NumberOfLiableIndividuals', 1), ('HasTelephone', 0)]
-------
1
MAPOCAM1 solutions:
No solution
MAPOCAM2 solutions:
[('ForeignWorker', 0), ('Single', 0), ('Age', 34), ('LoanDuration', 6), ('LoanAmount', 425), ('LoanRateAsPercentOfIncome', 4), ('isMale', 0), ('YearsAtCurrentHome', 4), ('NumberOfOtherLoansAtBank', 2), ('NumberOfLiableIndividuals', 1), ('HasTelephone', 0)]
-------
2
MAPOCAM1 solutions:
No solution
MAPOCAM2 solutions:
[('ForeignWorker', 0), ('Single', 1), ('Age', 31), ('LoanDuration', 6), ('LoanAmount', 425), ('LoanRateAsPercentOfIncome', 4), ('isMale', 1), ('YearsAtCurrentHome', 4), ('NumberOfOtherLoansAtBank', 1), ('NumberOfLiableIndividuals', 1), ('HasTelephone', 0)]
-------
3
MAPOCAM1 solutions:
No solution
MAPOCAM2

In [83]:
for i in range(len(mapocam1_solutions)):
    if len(mapocam2_solutions[i]) != len(mapocam2_solutions_outlier[i]):
        print(i)
        print("MAPOCAM2 solutions:")
        for sol in mapocam2_solutions[i]:
            print(list(zip(Xtr.columns, sol)))

        if len(mapocam2_solutions[i]) == 0:
            print("No solution")
        
        print("MAPOCAM2 with outlier detection solutions:")
        for sol in mapocam2_solutions_outlier[i]:
            print(list(zip(Xtr.columns, sol)))

        if len(mapocam2_solutions_outlier[i]) == 0:
            print("No solution")

        print("-------")

25
MAPOCAM2 solutions:
[('ForeignWorker', 0), ('Single', 1), ('Age', 31), ('LoanDuration', 36), ('LoanAmount', 425), ('LoanRateAsPercentOfIncome', 1), ('isMale', 1), ('YearsAtCurrentHome', 3), ('NumberOfOtherLoansAtBank', 2), ('NumberOfLiableIndividuals', 2), ('HasTelephone', 1)]
MAPOCAM2 with outlier detection solutions:
No solution
-------
29
MAPOCAM2 solutions:
[('ForeignWorker', 0), ('Single', 0), ('Age', 53), ('LoanDuration', 30), ('LoanAmount', 425), ('LoanRateAsPercentOfIncome', 4), ('isMale', 0), ('YearsAtCurrentHome', 1), ('NumberOfOtherLoansAtBank', 1), ('NumberOfLiableIndividuals', 1), ('HasTelephone', 1)]
MAPOCAM2 with outlier detection solutions:
No solution
-------
33
MAPOCAM2 solutions:
[('ForeignWorker', 0), ('Single', 0), ('Age', 24), ('LoanDuration', 6), ('LoanAmount', 425), ('LoanRateAsPercentOfIncome', 1), ('isMale', 0), ('YearsAtCurrentHome', 2), ('NumberOfOtherLoansAtBank', 1), ('NumberOfLiableIndividuals', 2), ('HasTelephone', 0)]
MAPOCAM2 with outlier detection 

## Counterfactuals RF

In [86]:
clf_rf_mapocam =  TreeClassifier(clf_rt, Xtr, ytr, threshold=threshold, use_predict_max=True)

In [89]:
individual = Xts.iloc[0].values

### MAPOCAM

In [95]:
criteria = PercentileCriterion(individual, percCalc_rt)
clf_rf_mapocam.fit(individual, action_set_tree)
mapocam = MAPOCAM(
    action_set_tree, 
    individual, 
    clf_rf_mapocam,
    max_changes=float('inf'), 
    compare=criteria
)
mapocam.fit()
print(mapocam.solutions[0])

[   0    0   29   17 5308    2    0    4    1    1    0]


### MAPOCAM2

In [92]:
clf_rf_shap = GeneralClassifier_Shap(clf_rt, X=Xtr, y=ytr, threshold=threshold)

PermutationExplainer explainer: 901it [01:04, 11.61it/s]                         


In [93]:
criteria = PercentileCriterion(individual, percCalc_rt)
mapocam = MAPOCAM2(
    action_set_tree, 
    individual, 
    clf_rf_shap, 
    #outlier_detection,
    max_changes=float('inf'), 
    compare=criteria
)
mapocam.fit()
mapocam.solutions

X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

[array([   0,    0,   29,   17, 5308,    2,    0,    4,    1,    1,    0])]

### Mapocam 2 with outlier detection

In [94]:
criteria = PercentileCriterion(individual, percCalc_rt)
mapocam = MAPOCAM2(
    action_set_tree, 
    individual, 
    clf_rf_shap, 
    outlier_detection,
    max_changes=float('inf'), 
    compare=criteria
)
mapocam.fit()
mapocam.solutions

X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

[array([   0,    0,   29,   17, 5308,    2,    0,    4,    1,    1,    0])]

**Find a different solution**

In [96]:
mapocam1_solutions = []
mapocam2_solutions = []
mapocam2_solutions_outlier = []
for i in denied_individuals_lgr:
    individual = Xts.iloc[i].values
    criteria = PercentileCriterion(individual, percCalc_rt)
    clf_rf_mapocam.fit(individual, action_set_tree)
    mapocam = MAPOCAM(
        action_set_tree, 
        individual, 
        clf_lgr_mapocam,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam1_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set_tree, 
        individual, 
        clf_rf_shap, 
        #outlier_detection,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions.append(mapocam.solutions)

    mapocam = MAPOCAM2(
        action_set_tree, 
        individual, 
        clf_rf_shap, 
        outlier_detection,
        max_changes=float('inf'), 
        compare=criteria
    )
    mapocam.fit()
    mapocam2_solutions_outlier.append(mapocam.solutions)
    

X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScaler was fitted with feature names
X does not have valid feature names, but StandardScale

In [None]:
for i in range(len(mapocam1_solutions)):
    if len(mapocam1_solutions[i]) != len(mapocam2_solutions[i]):
        print(i)
        print("MAPOCAM1 solutions:")
        for sol in mapocam1_solutions[i]:
            print(list(zip(Xtr.columns, sol)))

        if len(mapocam1_solutions[i]) == 0:
            print("No solution")
        
        print("MAPOCAM2 solutions:")
        for sol in mapocam2_solutions[i]:
            print(list(zip(Xtr.columns, sol)))

        if len(mapocam2_solutions[i]) == 0:
            print("No solution")

        print("-------")