In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns
import itertools
import sklearn
import shap
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
import sys; sys.path.insert(0, '../src/') 
import necsuf_tabular_text as nec_suf
import cum_shp_plot
import recourse_experiment
from __future__ import print_function

In [2]:
# Source of this dataset version:
# https://www.kaggle.com/kabure/german-credit-data-with-risk?select=german_credit_data.csv
GERMAN_CREDIT_CSV = "../datasets/german_credit_data.csv"
german_cred_df = pd.read_csv(GERMAN_CREDIT_CSV)

In [3]:
# Integer-encode the categorical columns, following the standard pre-processing from
# https://www.kaggle.com/vigneshj6/german-credit-data-analysis-python
category_codes = {
    'Saving accounts': {"little": 0, "moderate": 1, "quite rich": 2, "rich": 3},
    'Checking account': {"little": 0, "moderate": 1, "rich": 2},
    'Sex': {"male": 0, "female": 1},
    'Housing': {"own": 0, "free": 1, "rent": 2},
    'Purpose': {'radio/TV': 0, 'education': 1, 'furniture/equipment': 2, 'car': 3,
                'business': 4, 'domestic appliances': 5, 'repairs': 6, 'vacation/others': 7},
    'Risk': {"good": 1, "bad": 0},
}
for col_name, codes in category_codes.items():
    german_cred_df[col_name] = german_cred_df[col_name].map(codes)

# Entries the mapping left as NaN in the two account columns are replaced by
# the mean of the codes that are present in that column.
for col_name in ('Saving accounts', 'Checking account'):
    observed_mean = german_cred_df[col_name].dropna().mean()
    german_cred_df[col_name] = german_cred_df[col_name].fillna(observed_mean)

# Shorten the column names, call the label 'outcome', and discard the
# 'Unnamed: 0' column.
german_cred_df = (
    german_cred_df
    .rename(columns={"Risk": "outcome", "Saving accounts": "Savings",
                     "Checking account": "Checking", "Credit amount": "Credit"})
    .drop(columns="Unnamed: 0")
)

### Problem initialization

In [4]:
# Single-row DataFrame holding the row at position 998; this is the input
# instance (`inp`) that the later cells refer to.
inp = german_cred_df.iloc[998].to_frame().T
inp

Unnamed: 0,Age,Sex,Job,Housing,Savings,Checking,Credit,Duration,Purpose,outcome
998,23.0,0.0,2.0,1.0,0.0,0.0,1845.0,45.0,0.0,0.0


In [5]:
### fraction of rows in the dataset with a positive label (outcome == 1)
german_cred_df["outcome"].sum() / german_cred_df.shape[0]

0.7

In [6]:
#### Create F model
# Select the features and the label by column name instead of by position.
# A later cell appends a 'Model_pred' column to german_cred_df; with positional
# slicing (iloc[:, :-1] / iloc[:, -1:]) re-running this cell afterwards would
# silently train on the true label as a feature and use the model's own
# predictions as the target. On a fresh run the selected columns are the same.
feature_cols = [col for col in german_cred_df.columns
                if col not in ("outcome", "Model_pred")]
X = np.array(german_cred_df[feature_cols])
y = np.array(german_cred_df["outcome"]).ravel()

# 80/20 train/test split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, test_size=.2, random_state=42)

clf = ExtraTreesClassifier(random_state=0, max_depth=15)
clf.fit(X_train, y_train)
# save F(inp) for later; use the same named feature columns as for training
f_inp = clf.predict(np.array(inp[feature_cols]))
f_inp

array([0])

In [7]:
# Store the classifier's prediction for every row next to the data.
# The model inputs are chosen by name (everything except the label and any
# previously stored predictions) so that re-running this cell stays correct:
# with positional slicing, a second run would pass 'outcome' to the model and
# hand it the wrong number of features.
model_inputs = german_cred_df.drop(columns=["outcome", "Model_pred"], errors="ignore")
german_cred_df['Model_pred'] = clf.predict(model_inputs.values)

In [8]:
# Accuracy of the fitted classifier on the held-out test split
test_predictions = clf.predict(X_test)
accuracy_score(y_test, test_predictions)

0.71

### Causal Model Fitting

In [9]:
# Fit the causal model(s) with the project helper; the result is passed to the
# pipeline below through its `causal_SCM` argument.
# NOTE(review): at this point german_cred_df also carries the 'Model_pred'
# column added above — confirm that fit_scm expects (or ignores) it.
SCM_models = nec_suf.fit_scm(german_cred_df)

D with everyone from opposite class, negative outcome, causal approach

In [12]:
# Number of feature columns: every column of `inp` except the last one
num_features = inp.iloc[:, :-1].shape[1]

# Boolean mask selecting the rows whose outcome differs from the input's outcome
opposite_outcome_mask = german_cred_df.outcome != inp.outcome.item()

# Notice this time we use the causal_SCM argument, and pass in the SCM we fitted above.
# col_con / col_cat are positional feature indices (presumably the continuous
# and categorical columns — verify against suff_nec_pipeline).
_, CF_i2r_causal, refs1_causal = nec_suf.suff_nec_pipeline(
    opposite_outcome_mask,
    inp,
    clf,
    german_cred_df,
    num_features,
    causal_SCM=SCM_models,
    n_sample=100,
    col_con=[0, 6, 7],
    col_cat=[1, 2, 3, 4, 5, 8],
)

D with everyone from opposite class, non-causal approach

In [13]:
# Same pipeline call as the causal run, but without the causal_SCM argument.
# Boolean mask selecting the rows whose outcome differs from the input's outcome
opposite_outcome_mask = german_cred_df.outcome != inp.outcome.item()

_, CF_i2r, refs = nec_suf.suff_nec_pipeline(
    opposite_outcome_mask,
    inp,
    clf,
    german_cred_df,
    num_features,
    n_sample=100,
    col_con=[0, 6, 7],
    col_cat=[1, 2, 3, 4, 5, 8],
)

Comparison of recourse options

In [14]:
# Recourse options computed from the causal pipeline output (CF_i2r_causal).
# The keyword settings are collected in one place so they are easy to compare
# with the non-causal call.
causal_recourse_options = dict(
    r2i=False,
    deg_thresh=0,
    datatype='Tabular',
    filter_supersets=True,
    filter_cost=True,
    pred_on_fly=True,
    max_output=5,
)
recourse_experiment.deg_nec_suff(
    CF_i2r_causal, inp, f_inp, clf, num_features, **causal_recourse_options
)

Unnamed: 0,index,degree,string,cardinality,cost
0,[0],0.88,Age 24.0,1,0.076923
1,[1],0.19,Sex 1.0,1,1.0
2,[2],0.37,Job 1.0,1,1.0
3,[3],0.76,Housing 2.0,1,1.0
4,[4],0.35,Savings 0.456548,1,1.0


In [15]:
# Table from the project helper for the non-causal pipeline output; kept under
# the same name in case later cells read it.
CF_df_suff_i2r = nec_suf.deg_nec_suff(
    CF_i2r, inp, f_inp, clf, num_features, r2i=False
)

# Recourse options computed from the non-causal pipeline output (CF_i2r),
# using the same settings as for the causal variant.
non_causal_recourse_options = dict(
    r2i=False,
    deg_thresh=0,
    datatype='Tabular',
    filter_supersets=True,
    filter_cost=True,
    pred_on_fly=True,
    max_output=5,
)
recourse_experiment.deg_nec_suff(
    CF_i2r, inp, f_inp, clf, num_features, **non_causal_recourse_options
)

Unnamed: 0,index,degree,string,cardinality,cost
0,[2],0.16,Job 3.0,1,1.0
1,[5],0.52,Checking 0.651815,1,1.0
2,[7],0.03,Duration 30.0,1,1.25
3,"[0, 3]",0.02,"Age 65.0, Housing 0.0",2,4.230769
4,"[0, 4]",0.02,"Age 34.000000, Savings 0.456548",2,1.846154
