In [3]:
import tensorflow as tf
from xgboost.testing.updater import check_get_quantile_cut_device

tf.compat.v1.disable_eager_execution()
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import alibi
print(alibi.__version__)
from alibi.explainers import CounterfactualProto
#models
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

0.9.6


In [34]:
# Load the breast-cancer dataset and keep the raw arrays and feature names.
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
feature_names = dataset.feature_names
# Class names as shipped with the dataset, indexed by target value (0, 1).
class_names = dataset.target_names

# Hold out 20% of the samples for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Report class balance using the dataset's own class names rather than
# hard-coded text (the previous "stroke" wording belonged to another dataset).
print(f"all values: {X.shape[0]}")
print(f"training values: # {class_names[1]}: {sum(y_train == 1)} and # {class_names[0]}: {sum(y_train == 0)}")
print(f"testing values: # {class_names[1]}: {sum(y_test == 1)} and # {class_names[0]}: {sum(y_test == 0)}")

all values: 569
training values: # stroke: 286 and # no stroke: 169
testing values: # stroke: 71 and # no stroke: 43


In [31]:
def get_models():
    """Return zero-argument factories for the classifiers under comparison.

    Returns:
        dict: maps a display name to a callable that builds a fresh, unfitted
        estimator. Factories (rather than instances) are returned so that each
        call site gets its own independent model object.
    """
    return {
        "RandomForest": lambda: RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
        "XGBoost": lambda: XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic'),
        # Seeded like the other scikit-learn estimators so repeated runs match.
        "AdaBoost": lambda: AdaBoostClassifier(n_estimators=100, random_state=42),
        # probability=True is required because predict() calls predict_proba.
        "SVM": lambda: svm.SVC(kernel='linear', probability=True, random_state=42),
        "MLP": lambda: MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=500, random_state=42)
    }

def apply_cfproto(instance: np.ndarray, predict_fn) -> "alibi.api.interfaces.Explanation":
    """Build, fit and run a CounterfactualProto explainer for one instance.

    A new explainer is constructed and fitted on the module-level ``X_train``
    on every call, so the cost of construction and fitting is paid per
    explained instance.

    Args:
        instance: the sample to explain; its ``shape`` is passed to the
            explainer unchanged (the caller passes a ``(1, n_features)`` row).
        predict_fn: callable handed to the explainer as the model's
            prediction function.

    Returns:
        Whatever ``CounterfactualProto.explain`` returns (an explanation
        object whose ``.data`` mapping the caller reads), not a bare array.
    """
    cf = CounterfactualProto(
        predict_fn,
        shape=instance.shape,
        kappa=0.1,
        beta=0.1,
        gamma=0,
        theta=0,
        max_iterations=1000,  # previously 500
        ae_model=None,
        enc_model=None,
        # Per-feature bounds taken from the (scaled) training data.
        feature_range=(X_train.min(axis=0), X_train.max(axis=0)),
        clip=(X_train.min(axis=0), X_train.max(axis=0)),
        c_init=1.,
        c_steps=10,  # previously 5
        learning_rate_init=1e-2,
        write_dir='./cf')

    cf.fit(X_train, d_type='abdm', w=None, disc_perc=[25, 50, 75], standardize_cat_vars=False,
       smooth=1., center=True, update_feature_range=True)
    explanation = cf.explain(instance)
    return explanation

def predict(x: np.ndarray, model) -> np.ndarray:
    """Return ``model.predict_proba(x)`` as an array with at least two dimensions.

    Args:
        x: input passed straight through to ``model.predict_proba``.
        model: any object exposing ``predict_proba``.

    Returns:
        The probabilities, promoted with ``np.atleast_2d`` so a flat result
        becomes a single-row matrix.
    """
    return np.atleast_2d(model.predict_proba(x))

def train_models():
    """Instantiate every classifier from ``get_models`` and fit it.

    Each estimator is fitted on the module-level ``X_train`` / ``y_train``.

    Returns:
        dict: display name -> fitted estimator, in the order ``get_models``
        yields them.
    """
    fitted = {}
    for label, factory in get_models().items():
        estimator = factory()
        estimator.fit(X_train, y_train)
        fitted[label] = estimator
    return fitted

def _extract_counterfactual(data, fallback):
    """Pull the counterfactual array out of an explanation's ``data`` mapping.

    Returns ``(array, found)``. When no counterfactual was produced
    (``data["cf"]`` is missing or ``None``), ``fallback`` is returned with
    ``found=False``.
    """
    cf_entry = data.get("cf")
    if cf_entry is None:
        return fallback, False
    # alibi's CounterfactualProto reports the counterfactual as a dict with the
    # instance stored under "X"; a bare array is still accepted for safety.
    cf_array = cf_entry["X"] if isinstance(cf_entry, dict) else cf_entry
    return np.asarray(cf_array), True


def run_cfproto():
    """Explain selected test instances for every model and save one CSV per model.

    For each trained model and each selected row of ``X_test`` this records
    the original prediction, whether a counterfactual was found, the
    counterfactual's prediction, and the per-feature difference between
    counterfactual and original. When no counterfactual is found the original
    instance stands in for it, so the differences are all zero and
    ``success`` is 0.

    Side effects:
        Prints a short summary for every counterfactual found and writes
        ``cfproto_<model>_breastcancer.csv`` to the working directory.
    """
    trained_models = train_models()

    for model_name, model in trained_models.items():
        res = []
        test = [10, 20]
        for i in test: # size: 114 (testing data)
            instance = X_test[i].reshape(1, -1)
            # Query the model once and reuse the probabilities for both fields.
            original_probs = predict(instance, model)
            original_pred = original_probs.argmax(axis=1)[0]
            original_prob = original_probs[0][original_pred]

            cfproto_explanation = apply_cfproto(instance, lambda x: predict(x, model))
            counterfactual, found = _extract_counterfactual(cfproto_explanation.data, instance)

            cf_probs = predict(counterfactual, model)
            cf_pred = cf_probs.argmax(axis=1)[0]
            cf_prob = cf_probs[0][cf_pred]

            if found:
                print("Original prediction:", original_pred)
                print("CF prediction      :", cf_pred)
                print("Δ x                :", counterfactual - instance)

            row = {
                "model": model_name,
                "success": 1 if found else 0,
                "original_pred": original_pred,
                "original_prob": original_prob,
                "cf_pred": cf_pred,
                "cf_prob": cf_prob,
            }
            diff = (counterfactual - instance)[0]
            for j, name in enumerate(feature_names):
                row[f"Δ_{name}"] = diff[j]

            res.append(row)

        df = pd.DataFrame(res)
        df.to_csv(f"cfproto_{model_name}_breastcancer.csv", index=False)



In [32]:
# Entry point: run the counterfactual experiment for every model. This writes
# one CSV per model to the working directory.
if __name__ == "__main__":
    run_cfproto()

No counterfactual found!
No counterfactual found!
No counterfactual found!
No counterfactual found!
No counterfactual found!
No counterfactual found!
No counterfactual found!
No counterfactual found!
No counterfactual found!
No counterfactual found!
