In [4]:
import numpy as np
import pandas as pd
import time
import sys
import os

sys.path.append("../")

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

os.environ["OMP_NUM_THREADS"] = "2"

In [6]:
def preprocessing():
    dataset = pd.read_csv("./data/german_credit_data.csv").drop("Unnamed: 0", axis=1)

    dataset.dropna(axis=0, how="any", thresh=None, subset=None, inplace=True)

    for column in dataset.columns:
        if dataset[column].dtype != np.dtype("int64"):
            dataset[column] = LabelEncoder().fit_transform(dataset[column])

    return dataset


def train_model(x, y, model):
    cv = KFold(n_splits=3, random_state=None)
    global total_cm
    total_cm = np.zeros((2, 2))
    metric_dict = {}
    split_dict = {}

    it = 0
    for train_ind, test_ind in cv.split(x):
        x_train, x_test = x[train_ind], x[test_ind]
        y_train, y_test = y[train_ind], y[test_ind]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        total_cm += confusion_matrix(y_test, y_pred)

        metrics = [
            accuracy_score(y_test, y_pred) * 100,
            precision_score(y_test, y_pred) * 100,
            recall_score(y_test, y_pred) * 100,
        ]

        metrics = np.round(metrics, decimals=2)

        metric_dict[it] = metrics
        split_dict[it] = (train_ind, test_ind)
        it += 1

    return total_cm, metric_dict, split_dict

In [7]:
dataset = preprocessing()

model = RandomForestClassifier(n_estimators=100, criterion="entropy")
# model2 = LogisticRegression(max_iter = 1000)

cols = dataset.iloc[:, :-1].columns.values
class_names = ["Bad Risk", "No Risk"]
X = dataset.drop(dataset.columns[-1], axis=1).values
Y = dataset.iloc[:, [-1]].values.reshape(-1,)

cm, metrics, splits = train_model(X, Y, model)

x_train, x_test = X[splits[0][0]], X[splits[0][1]]
y_train, y_test = Y[splits[0][0]], Y[splits[0][1]]

In [8]:
from chweimo.counterfactual import Optimizer

explainer = Optimizer(X, Y, model.predict_proba)

x_orig = x_test[0]
x_orig_y = model.predict_proba(x_orig.reshape(1, -1)).reshape(-1)
change_class = np.argmin(x_orig_y)

explainer.generate_cf(
            sample=x_orig,
            change_class=change_class,
            termination=50,
            verbose=False
        )

In [9]:
from chweimo.explain_tools.linear_model import find_weight
norm_coef, sparse_coef = find_weight(explainer, verbose=True)

R2: 0.8232737199459302
KL Divergence: 0.6119733719215966
gini: 0.4629977176046765


In [None]:
norm_coef