In [1]:
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../')

from ract import Action, RecourseForestClassifier
from datasets import Dataset

In [2]:
# --- Experiment configuration -------------------------------------------------

# Dataset keys to run; each key is looked up in ALPHAS (and in
# DATASET_FULLNAMES by the driver cell).
DATASETS = ['c', 'f', 'b', 'p']

# Forwarded to Action(cost_type=..., cost_budget=...) in run_cv_ensemble.
COST_TYPE = 'MPS'
COST_BUDGET = 0.3

# Forest sizes swept per fold, and the depth cap shared by every forest.
N_ESTIMATORS = [100, 400]
MAX_DEPTH = 16

# Two alpha grids are in use; each dataset gets its own list copy so that
# mutating one dataset's grid never affects another's.
_ALPHAS_FINE = (0.0, 0.02, 0.04, 0.06, 0.08, 0.1)
_ALPHAS_COARSE = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
ALPHAS = {
    'c': list(_ALPHAS_FINE),
    'f': list(_ALPHAS_COARSE),
    'b': list(_ALPHAS_FINE),
    'p': list(_ALPHAS_COARSE),
}

# Column-name groups matching the result columns produced by run_cv_ensemble.
# NOTE(review): not referenced in the visible cells -- presumably used by
# later aggregation code; confirm.
KEYS_GROUPBY = ['Budget', 'Method', 'Alpha', 'N_ESTIMATORS']
KEYS_MODEL = ['Accuracy', 'AUC', 'F1', 'Time [s]']
KEYS_ACTION = [
    'Cost', 'Validity', 'Budget-Validity', 'Valid-Cost',
    'Recourse', 'Sparsity', 'Plausibility',
]

In [3]:
def _fit_and_evaluate(clf, iforest, y_target, X_tr, y_tr, X_ts, y_ts):
    """Fit ``clf`` on one training fold and evaluate it on the matching test fold.

    Parameters
    ----------
    clf : classifier
        Unfitted model exposing ``fit``, ``predict``, ``predict_proba``,
        ``score``, ``explain_action`` and ``feature_importances_``.
    iforest : IsolationForest
        Already-fitted isolation forest used to score the counterfactuals.
    y_target : int
        Desired label; test rows predicted as any other label are the ones
        passed to ``clf.explain_action``.
    X_tr, y_tr : array-like
        Training fold.
    X_ts, y_ts : array-like
        Test fold.

    Returns
    -------
    metrics : dict
        Model and action metrics keyed by result-column name.
    importances
        ``clf.feature_importances_`` of the fitted model.
    """
    # Only the fit itself is timed.
    start = time()
    clf = clf.fit(X_tr, y_tr)
    runtime = time() - start

    # Predictions are computed once and reused for target selection and F1.
    y_pred = clf.predict(X_ts)
    X_target = X_ts[y_pred != y_target]
    result = clf.explain_action(X_target)

    metrics = {
        'Accuracy': clf.score(X_ts, y_ts),
        'AUC': roc_auc_score(y_ts, clf.predict_proba(X_ts)[:, 1]),
        'F1': f1_score(y_ts, y_pred),
        'Time [s]': runtime,
        'Cost': result['cost'].mean(),
        'Validity': result['validity'].mean(),
        'Budget-Validity': result['cost-validity'].mean(),
        'Valid-Cost': result['valid-cost'].mean(),
        # Share of the WHOLE test fold (not only of X_target) that is not left
        # with a non-cost-valid action.
        # NOTE(review): `~` assumes result['cost-validity'] is a boolean array -- confirm.
        'Recourse': 1 - (~result['cost-validity']).sum() / X_ts.shape[0],
        'Sparsity': result['valid-sparsity'].mean(),
        # Negated isolation-forest score of the counterfactuals; reported as
        # 0.0 when no test row needed an action.
        'Plausibility': (
            -1 * iforest.score_samples(result['counterfactual']).mean()
            if X_target.shape[0] != 0 else 0.0
        ),
    }
    return metrics, clf.feature_importances_


def run_cv_ensemble(dataset='g', causal=False, n_split=10):
    """Cross-validate recourse forests on one dataset and write the results to CSV.

    For every stratified fold, every forest size in ``N_ESTIMATORS`` and every
    alpha in ``ALPHAS[dataset]``, a ``RecourseForestClassifier`` is fitted and
    evaluated. For ``alpha == 0.0`` an additional model built with
    ``feature_masking=True`` is evaluated and recorded as ``'Baseline (OAF)'``.

    Two files are written under ``../res/appendix/complexity/``: the per-run
    metrics (``{dataset}_forest[_causal].csv``) and the per-run feature
    importances (``fi_{dataset}_forest[_causal].csv``).

    Parameters
    ----------
    dataset : str
        Dataset key passed to ``Dataset`` and used to pick the alpha grid.
        It must be a key of ``ALPHAS``; note that the default ``'g'`` is not
        among the keys configured in this notebook.
    causal : bool
        Forwarded to ``Action``; also appends ``'_causal'`` to the file names.
    n_split : int
        Number of stratified folds.

    Raises
    ------
    KeyError
        If ``dataset`` has no alpha grid in ``ALPHAS``.
    """
    import os

    np.random.seed(0)

    D = Dataset(dataset=dataset)
    X, y = D.get_dataset()
    params = D.params

    # Fail before any model is fitted instead of deep inside the fold loop.
    if dataset not in ALPHAS:
        raise KeyError(
            'no alpha grid configured in ALPHAS for dataset {!r}; known keys: {}'.format(
                dataset, sorted(ALPHAS)
            )
        )

    # Results are only written after the whole sweep, so make sure the target
    # directory exists now rather than losing the run at the very end.
    os.makedirs('../res/appendix/complexity', exist_ok=True)

    columns = [
        'Fold', 'Method', 'Alpha', 'Budget', 'N_ESTIMATORS',
        'Accuracy', 'AUC', 'F1', 'Time [s]',
        'Cost', 'Validity', 'Budget-Validity', 'Valid-Cost',
        'Recourse', 'Sparsity', 'Plausibility',
    ]
    records = []
    importance = []

    folds = StratifiedKFold(n_splits=n_split).split(X, y)
    for k, (tr, ts) in enumerate(folds, start=1):
        X_tr, X_ts, y_tr, y_ts = D.X[tr], D.X[ts], D.y[tr], D.y[ts]

        iforest = IsolationForest(n_estimators=100).fit(X_tr)
        action = Action(X_tr, y_target=0, cost_type=COST_TYPE, cost_budget=COST_BUDGET, causal=causal, **params)

        print('- k = {}'.format(k))
        for n_estimators in N_ESTIMATORS:

            print('\t- n_estimators: {}'.format(n_estimators))
            for alpha in ALPHAS[dataset]:
                clf = RecourseForestClassifier(action, n_estimators=n_estimators, max_depth=MAX_DEPTH)
                clf.action.alpha = alpha

                metrics, fi = _fit_and_evaluate(clf, iforest, action.y_target, X_tr, y_tr, X_ts, y_ts)
                record = {
                    'Fold': k,
                    'Method': 'Baseline' if alpha == 0.0 else 'RACT',
                    'Alpha': alpha,
                    'Budget': COST_BUDGET,
                    'N_ESTIMATORS': n_estimators,
                    **metrics,
                }
                records.append(record)
                importance.append(fi)
                print('\t\t- Method: {} (alp.: {}) | AUC: {:.4} | Val.: {:.4} | Rec.: {:.4}'.format(record['Method'], alpha, record['AUC'], record['Budget-Validity'], record['Recourse']))

                if alpha == 0.0:
                    # Extra baseline with feature masking; alpha is deliberately
                    # not re-assigned here, exactly as in the unmasked run above.
                    clf = RecourseForestClassifier(action, n_estimators=n_estimators, max_depth=MAX_DEPTH, feature_masking=True)

                    metrics, fi = _fit_and_evaluate(clf, iforest, action.y_target, X_tr, y_tr, X_ts, y_ts)
                    record = {
                        'Fold': k,
                        'Method': 'Baseline (OAF)',
                        'Alpha': alpha,
                        'Budget': COST_BUDGET,
                        'N_ESTIMATORS': n_estimators,
                        **metrics,
                    }
                    records.append(record)
                    importance.append(fi)
                    print('\t\t- Method: {} | AUC: {:.4} | Val.: {:.4} | Rec.: {:.4}'.format(record['Method'], record['AUC'], record['Budget-Validity'], record['Recourse']))

    # `columns` pins the column order independently of dict ordering.
    results = pd.DataFrame(records, columns=columns)
    results.to_csv('../res/appendix/complexity/{}_forest{}.csv'.format(dataset, '_causal' if causal else ''), index=False)

    # One importance row per recorded run, prefixed with the run's identifying columns.
    importance = pd.DataFrame(np.array(importance), columns=D.feature_names)
    importance = pd.concat([results[['Fold', 'Method', 'Alpha', 'Budget', 'N_ESTIMATORS']], importance], axis=1)
    importance.to_csv('../res/appendix/complexity/fi_{}_forest{}.csv'.format(dataset, '_causal' if causal else ''), index=False)


In [4]:
from datasets import DATASET_FULLNAMES

# Run the cross-validated forest experiment once per configured dataset,
# printing the dataset's full name as a header and a blank line afterwards.
for dataset in DATASETS:
    print(f'# {DATASET_FULLNAMES[dataset]}')
    run_cv_ensemble(dataset)
    print()

# COMPAS
- k = 1
	- n_estimators: 100
		- Method: Baseline (alp.: 0.0) | AUC: 0.7257 | Val.: 0.5365 | Rec.: 0.825
		- Method: Baseline (OAF) | AUC: 0.7354 | Val.: 0.6276 | Rec.: 0.8558
		- Method: RACT (alp.: 0.02) | AUC: 0.732 | Val.: 0.6453 | Rec.: 0.8655
		- Method: RACT (alp.: 0.04) | AUC: 0.7248 | Val.: 0.7463 | Rec.: 0.9173
		- Method: RACT (alp.: 0.06) | AUC: 0.7279 | Val.: 0.7656 | Rec.: 0.9271
		- Method: RACT (alp.: 0.08) | AUC: 0.7372 | Val.: 0.8187 | Rec.: 0.9433
		- Method: RACT (alp.: 0.1) | AUC: 0.7343 | Val.: 0.8392 | Rec.: 0.9481
	- n_estimators: 400
		- Method: Baseline (alp.: 0.0) | AUC: 0.7281 | Val.: 0.6173 | Rec.: 0.8493
		- Method: Baseline (OAF) | AUC: 0.7355 | Val.: 0.6532 | Rec.: 0.8606
		- Method: RACT (alp.: 0.02) | AUC: 0.7281 | Val.: 0.6816 | Rec.: 0.8849
		- Method: RACT (alp.: 0.04) | AUC: 0.7304 | Val.: 0.7371 | Rec.: 0.9092
		- Method: RACT (alp.: 0.06) | AUC: 0.7303 | Val.: 0.7343 | Rec.: 0.9109
		- Method: RACT (alp.: 0.08) | AUC: 0.7303 | Val.: 0.82