In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load Dataset

In [2]:
# Seed for the train/test split below, so the split is reproducible.
seed = 2022

# Keep the file contents untouched in `diabetes_raw`: features and target are
# derived from it without in-place mutation, so re-inspecting the raw data
# later never shows a silently modified frame.
diabetes_raw = pd.read_csv('diabetes/train.csv')
y = diabetes_raw['Outcome']

# `Id` and the target column are removed so only the remaining columns are used as features.
X = diabetes_raw.drop(columns=['Id', 'Outcome'])

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

In [3]:
# Preview the first five feature rows (five is also the default of `head`).
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2,138,62,35,0,33.6,0.127,47
1,0,135,68,42,250,42.3,0.365,24
2,0,173,78,32,265,46.5,1.159,58
3,4,99,72,17,0,25.6,0.294,28
4,8,194,80,0,0,26.1,0.551,67


In [4]:
# Shape (rows, columns) of the training split produced by train_test_split.
X_train.shape

(1124, 8)

# Fit Global Sufficient Rules

In [5]:
from acv_explainers import ACXplainer
from sklearn.metrics import roc_auc_score, accuracy_score

# ACXplainer takes the same hyper-parameters as a Random Forest; the values
# below should be tuned to maximize predictive performance.
acv_params = dict(
    classifier=True,
    n_estimators=20,
    max_depth=10,
    sample_fraction=0.9,
    seed=4686,
)
acv_xplainer = ACXplainer(**acv_params)
acv_xplainer.fit(X_train, y_train)

In [6]:
# Hold-out accuracy of the fitted ACXplainer on the test split.
acv_test_pred = acv_xplainer.predict(X_test)
accuracy_score(y_test, acv_test_pred)

0.9110320284697508

### 1- Compute Sufficient Explanations

In [7]:
# The labels are passed as doubles; cast once and reuse the result for both
# label arguments (the training frame is likewise passed in both data slots).
# NOTE: this call is slow -- the recorded run took roughly two minutes.
y_train_double = y_train.astype(np.double)
sdp_importance, sdp_index, size, sdp = acv_xplainer.importance_sdp_rf(
    X_train, y_train_double,
    X_train, y_train_double,
    stop=False,
    pi_level=0.9,
)

100%|███████████████████████████████████████████| 20/20 [00:00<00:00, 85.44it/s]
100%|█████████████████████████████████████████████| 8/8 [01:59<00:00, 14.91s/it]


In [8]:
from acv_explainers.utils import get_active_null_coalition_list

# Derive two lists from the SDP indices and sizes computed in the previous step.
# Presumably S_star holds the "active" coalition and N_star the "null" coalition
# for each sample -- confirm against the acv_explainers documentation.
S_star, N_star = get_active_null_coalition_list(sdp_index, size)

### 2- Compute Sufficient Rules

In [9]:
# Compute the rules for the coalitions in S_star on the training data.
# NOTE: `sdp` is reassigned here and replaces any earlier value of that name.
# NOTE: this call is slow -- the recorded run took roughly six minutes.
y_train_double = y_train.astype(np.double)
sdp, rules, sdp_all, rules_data, w = acv_xplainer.compute_sdp_maxrules(
    X_train, y_train_double,
    X_train, y_train_double,
    S_star,
    verbose=True,
)

100%|███████████████████████████████████████| 1124/1124 [06:12<00:00,  3.02it/s]


### 3- Compute Global Sufficient Rules (G-SR)

In [10]:
# Fit the global rule set on the training data, using the per-sample `rules`
# and the coalitions in `S_star` computed in the previous steps.
acv_xplainer.fit_global_rules(X_train, y_train, rules, S_star)

In [11]:
# Predict on the test features (passed as a plain array) with the global rules.
# The result is indexable; its first element is consumed by the coverage cell.
# NOTE(review): min_acc=0.9 presumably filters out rules below 90% accuracy -- confirm.
y_test_pred = acv_xplainer.predict_proba_global_rules(X_test.values, min_acc=0.9)

### 4- Compute the coverage and the precision (accuracy on covered samples) of G-SR

In [12]:
# A test sample is "covered" when the global rules produced a prediction for it;
# uncovered samples are marked with None in the first element of y_test_pred.
rule_preds = y_test_pred[0]
y_true_all = y_test.values

y_o = []  # rule predictions for covered samples
y_r = []  # matching ground-truth labels
for i, pred in enumerate(rule_preds):
    # Identity test: `is not None` is the correct way to compare with None
    # (`!= None` goes through __ne__ and can misbehave for array-like values).
    if pred is not None:
        y_o.append(pred)
        y_r.append(y_true_all[i])

y_o = np.array(y_o, dtype=int)

# Accuracy is measured on covered samples only; coverage is their share of the test set.
coverage = len(y_r) / X_test.shape[0]
print('Accuracy = {} --- Test Coverage = {}'.format(accuracy_score(y_r, y_o), coverage))

Accuracy = 0.982532751091703 --- Test Coverage = 0.8149466192170819


# Baseline models

In [13]:
from imodels import BoostedRulesClassifier, BayesianRuleListClassifier, GreedyRuleListClassifier, SkopeRulesClassifier # see more models below
from imodels import SLIMRegressor, RuleFitRegressor, RuleFitClassifier

# RuleFit

In [14]:
# RuleFit baseline with the library's default hyper-parameters.
# NOTE: `rf` is used as a generic model handle in the baseline cells; here it is not a random forest.
rf = RuleFitClassifier()  # initialize the model
rf.fit(X_train, y_train)  

RuleFitClassifier()

In [15]:
# Hold-out accuracy of the RuleFit baseline.
rulefit_test_pred = rf.predict(X_test)
accuracy_score(y_test, rulefit_test_pred)

0.7615658362989324

# Skope Rules

In [16]:
# SkopeRules baseline. `rf` is kept as the generic model handle used by the
# other baseline cells.
skope_params = dict(n_estimators=10, precision_min=0.7, recall_min=0.25)
rf = SkopeRulesClassifier(**skope_params)
rf.fit(X_train, y_train)

# Hold-out accuracy of the SkopeRules baseline.
skope_test_pred = rf.predict(X_test)
accuracy_score(y_test, skope_test_pred)

0.7188612099644128

# Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
# Decision-tree baseline.
# scikit-learn permutes candidate features at every split, so an unseeded tree
# can differ between runs; fixing random_state to the notebook's `seed` makes
# the reported accuracy reproducible under Restart & Run All.
rf = DecisionTreeClassifier(random_state=seed)
rf.fit(X_train, y_train)

# Hold-out accuracy of the decision-tree baseline.
accuracy_score(y_test, rf.predict(X_test))

0.9074733096085409

# Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Random-forest baseline with 500 trees.
# NOTE(review): random_state=202 differs from the notebook-wide seed (2022) --
# confirm whether this is intentional or a typo.
forest_params = dict(n_estimators=500, random_state=202)
rf = RandomForestClassifier(**forest_params)
rf.fit(X_train, y_train)

# Hold-out accuracy of the random-forest baseline.
forest_test_pred = rf.predict(X_test)
accuracy_score(y_test, forest_test_pred)

0.9217081850533808