# Iris Classification

This notebook is used as part of my thesis, comparing different XAI methods and libraries.
<br/>
The models created here classify which of the three iris species a sample belongs to.
<br/>
Dataset: https://archive.ics.uci.edu/ml/datasets/iris

## 1 Set up Environment and Dataset <a class="anchor" id="ch1"></a>

### 1.1 Load Libraries and Set Up Parameters <a class="anchor" id="ch1.1"></a>

In [1]:
# Random seed for reproducibility; passed as random_state / partition_random_seed
# to the train/test split and to the estimators created in this notebook.
seedNum = 23

In [2]:
# import dependencies
# standard library
import math
import statistics
import urllib.request
from datetime import datetime

# third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import catboost
import shap
import lime
import lime.lime_tabular  # submodule must be imported explicitly to use lime.lime_tabular.*
import graphviz
import tensorflow as tf

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.inspection import partial_dependence, plot_partial_dependence

from catboost import CatBoostClassifier
from alibi.explainers import AnchorTabular, CounterFactualProto, CounterFactual

# required installs:
# pip install shap
# pip install lime
# pip install alibi
# conda install python-graphviz AND install from https://graphviz.org/download/

In [3]:
# Wall-clock start of the whole notebook run.
startTimeScript = datetime.now()

# Parallel workers handed to estimators / searches that accept n_jobs.
n_jobs = 6

# Hold-out configuration: whether to split, and the test-set fraction.
splitDataset, splitPercentage = True, 0.20

# Cross-validation folds and the metric used for model selection.
n_folds, scoring = 10, 'accuracy'

In [4]:
# Map the raw CSV column names (e.g. "SepalLengthCm") to readable labels
# (e.g. "Sepal Length (cm)") for all four measured features.
col_names = {
    f"{part}{dimension}Cm": f"{part} {dimension} (cm)"
    for part in ("Sepal", "Petal")
    for dimension in ("Length", "Width")
}

In [5]:
# Load the raw iris CSV and switch to the readable feature column names
# in one chained expression, so re-running the cell always starts from the file.
dataset_path = 'data/iris.csv'
Xy_original = pd.read_csv(dataset_path).rename(columns=col_names)
Xy_original.shape

FileNotFoundError: [Errno 2] No such file or directory: 'data/iris.csv'

In [None]:
# The last column is the label; every column before it is kept as an attribute.
totCol = Xy_original.shape[1]
totAttr = totCol - 1

# Positional slicing: attributes are columns [0, totAttr), the label is column totAttr.
X_original = Xy_original.iloc[:, :totAttr]
y_original = Xy_original.iloc[:, totAttr]

print(f"Xy_original.shape: {Xy_original.shape} X_original.shape: {X_original.shape} y_original.shape: {y_original.shape}")

In [None]:
# Remove the row identifier so it is not used as a feature.
# errors="ignore" keeps the cell idempotent: re-running it after "Id" has
# already been dropped no longer raises a KeyError.
X_original = X_original.drop(columns="Id", errors="ignore")

### 1.2 Quick EDA <a class="anchor" id="ch1.2"></a>

In [None]:
# check distribution of labels: number of rows per value of the 'Species' column
Xy_original.groupby('Species').size()

In [None]:
# Number of samples per species (the variable name is kept for compatibility
# with any later cell that may still reference it).
fault_counts = Xy_original['Species'].value_counts()

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: absolute sample count per species.
fault_counts_barplot = sns.barplot(x=fault_counts.index, y=fault_counts.values, ax=ax[0])
fault_counts_barplot.set_title('Samples per species')
fault_counts_barplot.set_xlabel('Species')
# The bars show samples, not classes, so the label says so.
fault_counts_barplot.set_ylabel('Number of samples in the dataset')

# Right panel: relative share of each species.
fault_counts.plot.pie(autopct="%1.1f%%", ax=ax[1])
ax[1].set_title('Share of each species')
plt.show()

### 1.3 Data Cleaning and Preparation <a class="anchor" id="ch1.3"></a>

In [None]:
# Create the training and test frames. With splitDataset enabled a stratified
# hold-out split is drawn; otherwise both sets are the complete dataset.
if splitDataset:
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        X_original,
        y_original,
        test_size=splitPercentage,
        stratify=y_original,
        random_state=seedNum,
    )
else:
    X_train_df = X_test_df = X_original
    y_train_df = y_test_df = y_original
print(f"X_train.shape: {X_train_df.shape} y_train_df.shape: {y_train_df.shape}")
print(f"X_test_df.shape: {X_test_df.shape} y_test_df.shape: {y_test_df.shape}")

In [None]:
# Convert the split frames to plain NumPy arrays for the modeling cells.
X_train, y_train = X_train_df.to_numpy(), y_train_df.to_numpy()
X_test, y_test = X_test_df.to_numpy(), y_test_df.to_numpy()
print(f"X_train.shape: {X_train.shape} y_train.shape: {y_train.shape}")
print(f"X_test.shape: {X_test.shape} y_test.shape: {y_test.shape}")

## 2 Tree-based Modeling <a class="anchor" id="ch2"></a>

### 2.1 Try Some Untuned Models <a class="anchor" id="ch2.1"></a>

In [None]:
# Spot-check setup: empty result collectors plus the (short name, estimator)
# pairs that the next cell cross-validates with default hyperparameters.

startTimeModule = datetime.now()
train_results, train_model_names, train_metrics = [], [], []
train_models = [
    ('DT', DecisionTreeClassifier(random_state=seedNum)),
    ('BT', BaggingClassifier(random_state=seedNum, n_jobs=n_jobs)),
    ('RF', RandomForestClassifier(random_state=seedNum, n_jobs=n_jobs)),
    ('ET', ExtraTreesClassifier(random_state=seedNum, n_jobs=n_jobs)),
    ('GB', GradientBoostingClassifier(random_state=seedNum)),
]

In [None]:
# Cross-validate every spot-check model in turn and collect its scores.

# Start from empty collectors so re-running this cell does not append
# duplicate results to the lists created in the setup cell.
train_results.clear()
train_model_names.clear()
train_metrics.clear()

# Separate timer for the whole loop; the per-model timer below is reset on
# every iteration and therefore cannot be used for the total.
startTimeAllModels = datetime.now()

for name, model in train_models:
    startTimeModule = datetime.now()
    # NOTE(review): unshuffled KFold on the full dataset; if the CSV is sorted
    # by species each test fold contains a single class - confirm this is intended.
    kfold = KFold(n_splits=n_folds)
    cv_results = cross_val_score(model, X_original, y_original, cv=kfold, scoring=scoring)
    train_results.append(cv_results)
    train_model_names.append(name)
    train_metrics.append(cv_results.mean())
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
    print(model)
    print ('Model training time:', (datetime.now() - startTimeModule), '\n')
print ('Average metrics ('+scoring+') from all models:',np.mean(train_metrics))
print ('Total training time for all models:',(datetime.now() - startTimeAllModels))

### 2.2 Train and Set Up Reference Models <a class="anchor" id="ch2.2"></a>

Decision Tree:

In [None]:
# Tune a decision tree with an exhaustive grid search and keep the best model.
startTimeModule = datetime.now()

decisiontree = DecisionTreeClassifier(random_state=seedNum)
param_grid_tree = {
    "max_depth": [6,8],
    "criterion" : ["gini","entropy"],
    "min_samples_leaf": [6,10,14]
}

kfold = KFold(n_splits=n_folds)
# Single-metric search: refit only needs to be truthy. The former value
# "Recall" named a metric that is not part of this search and was misleading.
grid = GridSearchCV(decisiontree, param_grid_tree, scoring=scoring, cv=kfold, n_jobs=n_jobs, refit=True)

grid_result = grid.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print ('Computing time:',(datetime.now() - startTimeModule))

# With refit enabled, best_estimator_ is already fitted on the full training
# set, so no second fit is needed. Both names refer to the same fitted object,
# as before.
clf_dt_be = grid_result.best_estimator_
clf_dt = clf_dt_be

Random Forest:

In [None]:
# Tune a random forest with an exhaustive grid search and keep the best model.
startTimeModule = datetime.now()

tune_model = RandomForestClassifier(random_state=seedNum, n_jobs=n_jobs)

n_estimators = [100]
criterion = ["gini","entropy"]
max_features =[None, "sqrt", 0.2, 0.3, 0.4, 0.5]

paramGrid = dict(n_estimators=n_estimators, criterion=criterion, max_features=max_features)

kfold = KFold(n_splits=n_folds)
# Single-metric search: refit only needs to be truthy. The former value
# "Accuracy" was not a scorer key of this search and was misleading.
grid = GridSearchCV(estimator=tune_model, param_grid=paramGrid, scoring=scoring, cv=kfold, refit=True)
grid_result = grid.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print ('Computing time:',(datetime.now() - startTimeModule))

# With refit enabled, best_estimator_ is already fitted on the full training
# set, so no second fit is needed. Both names refer to the same fitted object,
# as before.
clf_rf_be = grid_result.best_estimator_
clf_rf = clf_rf_be

Gradient Boosting (CatBoost):

In [None]:
# Tune CatBoost's tree depth (optionally) and train the reference model.

# Depth used for the reference model when the grid search is skipped.
default_depth = 6
gridsearch_new = True

if gridsearch_new:
    param_grid_catB = {
        "depth": [6,10],
    }

    clf_catb = CatBoostClassifier(
        eval_metric='Accuracy',
    )

    startTimeModule = datetime.now()
    # grid_search returns a dict; its 'params' entry holds the best parameters found.
    grid_search = clf_catb.grid_search(param_grid_catB, X_train, y_train, partition_random_seed=seedNum, plot=False)

    print ('Computing time:',(datetime.now() - startTimeModule))
    best_depth = grid_search['params'].get('depth', default_depth)
else:
    # No search requested: fall back to the default instead of touching the
    # undefined clf_catb (which previously raised a NameError).
    best_depth = default_depth

print('CatBoost depth used for the reference model:', best_depth)

# Train the reference model with the selected depth rather than a hard-coded value.
clf_cb_be = CatBoostClassifier(eval_metric='Accuracy', depth=best_depth, verbose=False)
clf_cb = clf_cb_be.fit(X_train, y_train, verbose=False)

### 2.3 Model Evaluation <a class="anchor" id="ch2.3"></a>

In [None]:
# Hold-out predictions of the three reference models.
predictions_dt = clf_dt.predict(X_test)
predictions_rf = clf_rf.predict(X_test)
predictions_cb = clf_cb.predict(X_test)

# Cross-validated training scores, reusing the most recently defined kfold.
cv_dt = cross_val_score(clf_dt_be, X_train, y_train, cv=kfold, scoring=scoring)
cv_rf = cross_val_score(clf_rf_be, X_train, y_train, cv=kfold, scoring=scoring)
cv_cb = cross_val_score(clf_cb_be, X_train, y_train, cv=kfold, scoring=scoring)

# One report per model: (model, predictions, cv scores, heading, print separator?)
report_separator = "--------------------------------------------------------\n"
evaluation_reports = [
    (clf_dt, predictions_dt, cv_dt, "\n\nConfusion Matrix:", True),
    (clf_rf, predictions_rf, cv_rf, "\nConfusion Matrix:", True),
    (clf_cb, predictions_cb, cv_cb, "\nConfusion Matrix:", False),
]

for fitted_model, model_predictions, cv_scores, heading, with_separator in evaluation_reports:
    print(fitted_model, heading)
    print(confusion_matrix(y_test, model_predictions))
    print("\n\nClassification Report:\n\n", classification_report(y_test, model_predictions))
    print("Cross-Validation: %f (%f)" % (cv_scores.mean(), cv_scores.std()))
    if with_separator:
        print(report_separator)

## 3 Model Explainers <a class="anchor" id="ch3"></a>

In [None]:
# Model explained by the cells that follow: the CatBoost reference model.
clf = clf_cb

In [None]:
# Predictions and class probabilities of the model selected in `clf`.
# np.ravel guarantees a 1-D label array even when the model returns a column
# vector (CatBoost returns shape (n_samples, 1) for multiclass predictions).
predictions = np.ravel(clf.predict(X_test))
probabilities = clf.predict_proba(X_test)

# Derive the class list from the explained model's own predictions; this was
# previously built from the random forest predictions by mistake.
classes_df = pd.DataFrame(predictions)
class_names = classes_df[0].unique()
feature_names = X_original.columns.values

In [None]:
# Short display names for the three classes; replaces any previous class_names.
# NOTE(review): assumes this order matches the column order of predict_proba /
# the model's classes_ - verify against the clf_cb.classes_ cell below.
class_names = np.array(["Setosa","Versicolor","Virginica"])

In [None]:
# Replace the raw dataset labels in the predictions with the short display names.
for raw_label, display_label in (
    ("Iris-setosa", "Setosa"),
    ("Iris-versicolor", "Versicolor"),
    ("Iris-virginica", "Virginica"),
):
    predictions = np.where(predictions == raw_label, display_label, predictions)

In [None]:
# Ground-truth test labels, converted to the same short display names.
truth = y_test
for raw_label, display_label in (
    ("Iris-setosa", "Setosa"),
    ("Iris-versicolor", "Versicolor"),
    ("Iris-virginica", "Virginica"),
):
    truth = np.where(truth == raw_label, display_label, truth)

Select which prediction from the test set to explain:

In [None]:
pred_idx = 2 # <- you can change this to select specific prediction to explain

# predictions[pred_idx] may be a scalar or a 1-element array depending on the
# model; reduce it to a single label either way.
predicted_label = np.ravel(predictions[pred_idx])[0]
class_idx = class_names.tolist().index(predicted_label)

print("Predicted class: ", predicted_label)
print("True class: ", truth[pred_idx])
print("\nPredicted probabilities:")

# Pair each class name with its probability directly; the former manual
# counter was named `iter` and shadowed the builtin for the rest of the session.
for label, probability in zip(class_names, probabilities[pred_idx]):
    print(label,": ",probability)

In [None]:
# Display the class labels stored on the fitted CatBoost model.
clf_cb.classes_

### 3.1 LIME <a class="anchor" id="ch3.1"></a>
https://github.com/marcotcr/lime

In [None]:
# Tabular LIME explainer built on the training data, with continuous
# features discretized.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train,
    feature_names=feature_names,
    class_names=class_names,
    discretize_continuous=True,
)

Visualize local explanation of the prediction:

In [None]:
# Explain the prediction selected via pred_idx, so LIME describes the same
# sample as the other explainers; a hard-coded row (index 4) was used before.
exp = lime_explainer.explain_instance(X_test[pred_idx], clf.predict_proba, num_features=5, top_labels = 3)
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Keep the LIME explanation as a plain list (as returned by exp.as_list()) and display it.
explist=exp.as_list()
explist

### 3.2 Anchor Explanations <a class="anchor" id="ch3.2"></a>

In [None]:
def predict_fn(x):
    """Return the class probabilities of the currently selected model `clf` for `x`."""
    return clf.predict_proba(x)


# Anchor explainer over the named features, fitted on the training data.
anchor_explainer = AnchorTabular(predict_fn, feature_names)
anchor_explainer.fit(X_train)

In [None]:
# Anchor explanation for the selected sample at a 0.90 threshold.
# NOTE(review): anch_exp is reassigned with threshold=0.95 in the next cell,
# so this result is only available until that cell runs.
anch_exp = anchor_explainer.explain(X_test[pred_idx], threshold=0.90)

In [None]:
# Anchor explanation for the selected sample at a 0.95 threshold, printed as an IF/THEN rule.
anch_exp = anchor_explainer.explain(X_test[pred_idx], threshold=0.95)

anchor_rule = '\n AND '.join(anch_exp.anchor)
selected_row = X_test[pred_idx].reshape(1, -1)
anchor_class = class_names[anchor_explainer.predictor(selected_row)[0]]

print('\nANCHOR:\n\nIF %s' % anchor_rule)
print('THEN PREDICT: ', anchor_class)
print('\nWITH PRECISION: %.2f' % anch_exp.precision)
print('WITH COVERAGE: %.2f' % anch_exp.coverage)

In [None]:
# Compact summary of the current anchor explanation: predicted class, rule, precision, coverage.
summary_row = X_test[pred_idx].reshape(1, -1)
summary_class = class_names[anchor_explainer.predictor(summary_row)[0]]
summary_rule = '\n AND '.join(anch_exp.anchor)

print('Prediction: ', summary_class)
print(f'\nAnchor:\n {summary_rule}')
print(f'\nPrecision: {anch_exp.precision:.2f}')
print(f'Coverage: {anch_exp.coverage:.2f}')

In [None]:
# Show the four feature values of the selected test sample, in column order.
feature_captions = (
    "Sepal Length (cm): ",
    "Sepal Width (cm): ",
    "Petal Length (cm): ",
    "Petal Width (cm): ",
)
for column_position, caption in enumerate(feature_captions):
    print(caption, X_test[pred_idx][column_position])

### 3.3 Counterfactuals Guided by Prototypes <a class="anchor" id="ch3.3"></a>

In [None]:
# Switch the explained model to the random forest for the counterfactual cells below.
clf = clf_rf

In [None]:
# Selected sample as a single-row 2-D batch. The row's own length defines the
# shape; an unrelated hard-coded row (index 12) was consulted for it before.
X = X_test[pred_idx].reshape(1, -1)
shape = X.shape

In [None]:
def predict_fn(x):
    """Return the class probabilities of the currently selected model `clf` for `x`."""
    return clf.predict_proba(x)

In [None]:
# Turn off TensorFlow eager execution before building the counterfactual explainer.
# NOTE(review): presumably required by the alibi counterfactual explainers - confirm.
tf.compat.v1.disable_eager_execution()

In [None]:
# initialize explainer, fit and generate counterfactual
# The explainer wraps predict_fn for inputs of the single-row `shape`;
# feature_range bounds the search by the per-feature min/max of the training data.
cf = CounterFactualProto(predict_fn, shape, use_kdtree=True, theta=10., max_iterations=500,
                         feature_range=(X_train.min(axis=0), X_train.max(axis=0)), 
                         c_init=1., c_steps=10, eps=(0.05, 0.05))

In [None]:
# Fit the counterfactual explainer on the training data; the trailing ';' hides the cell output.
cf.fit(X_train);

In [None]:
# Generate the counterfactual for the selected sample and report how long it took.
startTimeModule = datetime.now()
explanation = cf.explain(X)
explain_duration = datetime.now() - startTimeModule
print('Computing time:', explain_duration)

In [None]:
# Report the counterfactual found for the selected sample: its class, its class
# probabilities and the feature values that had to change.
if explanation.cf is None:
    # No counterfactual was found; indexing into None would raise a TypeError.
    print('No counterfactual instance was found for the selected sample.')
else:
    cf_probabilities = explanation.cf['proba'][0]
    print('Nearest counterfactual instance: {}'.format(class_names[explanation.cf['class']]))
    print('Probabilities: ',round(cf_probabilities[0],2)," ",
          round(cf_probabilities[1],2)," ",
          round(cf_probabilities[2],2),)
    print('\nSmallest feature value changes necessary:\n')
    orig = X
    counterfactual = explanation.cf['X']
    delta = counterfactual - orig
    for i, f in enumerate(feature_names):
        # Only list features whose value actually changed.
        if np.abs(delta[0][i]) > 1e-4:
            print('{}: {:.2f}  -->   {:.2f}'.format(f,orig[0][i], counterfactual[0][i]))


In [None]:
# Feature-level differences between the selected sample and its counterfactual.
if explanation.cf is None:
    # No counterfactual was found; indexing into None would raise a TypeError.
    print('No counterfactual instance was found for the selected sample.')
else:
    orig = X
    counterfactual = explanation.cf['X']
    delta = counterfactual - orig
    for i, f in enumerate(feature_names):
        # Only list features whose value actually changed.
        if np.abs(delta[0][i]) > 1e-4:
            print('{}: {:.2f}  -->   {:.2f}'.format(f,orig[0][i], counterfactual[0][i]))

### 3.4 Whitebox: Decision Tree <a class="anchor" id="ch3.4"></a>

In [None]:
# Decision-tree predictions for the test set (raw dataset labels; relabelled in the next cell).
predictions_dt = clf_dt.predict(X_test)

In [None]:
# Replace the raw dataset labels in the tree predictions with the short display names.
for raw_label, display_label in (
    ("Iris-setosa", "Setosa"),
    ("Iris-versicolor", "Versicolor"),
    ("Iris-virginica", "Virginica"),
):
    predictions_dt = np.where(predictions_dt == raw_label, display_label, predictions_dt)

In [None]:
# Tree predictions as a DataFrame, plus the feature names used to label the exported tree.
tree_classes_df = pd.DataFrame(predictions_dt)
tree_feature_names = X_original.columns.values

In [None]:
# Export the fitted decision tree to DOT text (out_file=None returns a string),
# labelled with the feature and class names, then render it to a PNG named "iris_tree".
dot_data = tree.export_graphviz(clf_dt,
                     feature_names=tree_feature_names, 
                     class_names=class_names,
                     filled=False, rounded=True,
                     special_characters=True,
                     out_file=None,)
graph = graphviz.Source(dot_data)
graph.format = "png"
graph.render("iris_tree")

In [None]:
# Class probabilities of the decision tree for every test sample.
probabilities_dt = clf_dt.predict_proba(X_test)

In [None]:
# Uses the pred_idx selected earlier in the notebook; assign a different value
# to pred_idx before this cell to explain another test sample. (The former
# self-assignment `pred_idx = pred_idx` had no effect and was removed.)
class_idx = class_names.tolist().index(predictions_dt[pred_idx])

print("Predicted class: ", predictions_dt[pred_idx])
print("True class: ", truth[pred_idx])
print("\nPredicted probabilities:")

# Pair each class name with its probability directly; the former manual
# counter was named `iter` and shadowed the builtin for the rest of the session.
for label, probability in zip(class_names, probabilities_dt[pred_idx]):
    print(label,": ",probability)

In [None]:
# Measure the latency of a single-row decision-tree prediction.
# The previous `while i <= 100` loop ran 101 times; 100 runs appear intended.
n_timing_runs = 100

times = []
for _ in range(n_timing_runs):
    startTimeModule = datetime.now()
    clf_dt.predict(X_train[[1]])
    endtime = datetime.now() - startTimeModule
    # total_seconds() replaces the hand-written seconds + microseconds sum.
    times.append(endtime.total_seconds())

# Mean and sample standard deviation of the per-call latency in seconds.
print(statistics.mean(times))
print(statistics.stdev(times))