In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import colors
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from data.getdata import loaddata
from data.split3fold import split3fold

plt.style.use('ggplot')

from vars import plot_colors, color_dict, make_meshgrid, plot_contours,get_rules

In [None]:
# Dataset name: passed to loaddata, used to build the 'blackboxes/<data>A.sav' /
# 'blackboxes/<data>B.sav' paths and written into every result row.
data = 'compas'

In [None]:
# Load both data sets together with their column metadata, then split each of
# them with the same arguments (0.4, 0.2, random_state=1).
# NOTE(review): 0.4 / 0.2 are presumably split fractions - confirm in split3fold.
dataA, dataB, cols, discrete, continuous, le = loaddata(data)
(blackboxtrainA, trainA, testA), (blackboxtrainB, trainB, testB) = (
    split3fold(frame, 0.4, 0.2, random_state=1) for frame in (dataA, dataB)
)

In [None]:
# Load the two pickled black-box models that belong to the selected dataset.
# The files are opened in `with` blocks so the handles are closed right after
# unpickling instead of being left open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load .sav files that come from a trusted source.
with open('blackboxes/' + data + 'A.sav', 'rb') as fileA:
    modelA = pickle.load(fileA)
with open('blackboxes/' + data + 'B.sav', 'rb') as fileB:
    modelB = pickle.load(fileB)

In [None]:
#create difference detection dataset:
def _with_difference_label(part_a, part_b):
    """Stack two splits and swap the 'y' column for a 'difference' label.

    The label of each row is the string '<predA>|<predB>', built from the
    integer-cast predictions of modelA and modelB on the columns in ``cols``.
    The returned frame has a fresh 0..n-1 index.
    """
    stacked = pd.concat([part_a, part_b])
    features = stacked[cols].values
    stacked['difference'] = [
        str(int(a)) + '|' + str(int(b))
        for a, b in zip(modelA.predict(features), modelB.predict(features))
    ]
    stacked.drop(columns=['y'], inplace=True)
    stacked.reset_index(inplace=True, drop=True)
    return stacked


train = _with_difference_label(trainA, trainB)
test = _with_difference_label(testA, testB)

In [None]:
# Baseline global explainer: one decision tree fitted directly on the
# 'difference' labels of the training frame.
baseline = DecisionTreeClassifier(random_state=0)
baseline.fit(X=train[cols].values, y=train['difference'])

In [None]:
# Difference labels predicted by the baseline tree for the test frame.
pred = baseline.predict(test[cols].values)

In [None]:
# Fit the unrestricted baseline tree for five random seeds and append one
# space-separated result row per seed to the performance file.
performancefile = 'results/Baselineperformance.txt'
# Write the column header only when the file is new or empty; writing it
# unconditionally inserted another header line on every re-run of this cell.
needs_header = not os.path.exists(performancefile) or os.path.getsize(performancefile) == 0
with open(performancefile, 'a') as f:
    if needs_header:
        f.write('model data iteration depthexplainer leavesexplainer accuracy precisionmacro recallmacro\n')
    for j in range(5):
        baseline = DecisionTreeClassifier(random_state=j)
        baseline.fit(train[cols].values, train.difference)
        pred = baseline.predict(test[cols].values)
        # NOTE(review): the model name contains spaces, so a row has more
        # space-separated tokens than the header has columns. Left unchanged
        # because existing result files and their readers may rely on it.
        line = ' '.join(["Approach 0: Baseline",
                         data,
                         str(j),
                         str(baseline.get_depth()),
                         str(baseline.get_n_leaves()),
                         str(metrics.accuracy_score(test.difference, pred)),
                         str(metrics.precision_score(test.difference, pred, average='macro')),
                         str(metrics.recall_score(test.difference, pred, average='macro'))
                         ])
        f.write(line + '\n')

#### Varying depth of tree

In [None]:
# Fit the baseline tree once per maximum depth (None = no depth limit) and
# append one space-separated result row per setting to the performance file.
performancefile = 'results/Baselineperformance_depth.txt'
# Write the column header only when the file is new or empty; writing it
# unconditionally inserted another header line on every re-run of this cell.
needs_header = not os.path.exists(performancefile) or os.path.getsize(performancefile) == 0
with open(performancefile, 'a') as f:
    if needs_header:
        f.write('model data maxdepthexplainer depthexplainer leavesexplainer accuracy precisionmacro recallmacro\n')
    for j in [3,5,7,None]:
        baseline = DecisionTreeClassifier(random_state=0, max_depth=j)
        baseline.fit(train[cols].values, train.difference)
        pred = baseline.predict(test[cols].values)
        line = ' '.join(['Baseline',
                         data,
                         str(j),
                         str(baseline.get_depth()),
                         str(baseline.get_n_leaves()),
                         str(metrics.accuracy_score(test.difference, pred)),
                         str(metrics.precision_score(test.difference, pred, average='macro')),
                         str(metrics.recall_score(test.difference, pred, average='macro'))
                         ])
        f.write(line + '\n')

#### Reduced training data

In [None]:
# Repeat the depth sweep on five random 100-row subsamples of the training
# frame and append one space-separated result row per (subsample, depth) pair.
performancefile = 'results/Baselineperformance_reduced.txt'
# Write the column header only when the file is new or empty; writing it
# unconditionally inserted another header line on every re-run of this cell.
needs_header = not os.path.exists(performancefile) or os.path.getsize(performancefile) == 0
with open(performancefile, 'a') as f:
    if needs_header:
        f.write('model data iteration maxdepthexplainer depthexplainer leavesexplainer accuracy precisionmacro recallmacro\n')
    for j in range(5):
        # The subsample is seeded by the iteration number so each run is repeatable.
        trainsub = train.sample(n=100, random_state=j).reset_index(drop=True)
        for i in [3,5,7,None]:
            baseline = DecisionTreeClassifier(random_state=0, max_depth=i)
            baseline.fit(trainsub[cols].values, trainsub.difference)
            pred = baseline.predict(test[cols].values)
            line = ' '.join(['Baseline',
                             data,
                             str(j),
                             str(i),
                             str(baseline.get_depth()),
                             str(baseline.get_n_leaves()),
                             str(metrics.accuracy_score(test.difference, pred)),
                             str(metrics.precision_score(test.difference, pred, average='macro')),
                             str(metrics.recall_score(test.difference, pred, average='macro'))
                             ])
            f.write(line + '\n')

## Running example

In [None]:
# Switch the dataset name to the running example; the remaining cells load
# data, models and colours by this name ('running1' is reported as "Sine",
# anything else as "Spiral").
data = 'running2'

In [None]:
# For this dataset loaddata is unpacked into two values: the frame used for
# fitting and the column selection applied to it as train[cols].
train, cols = loaddata(data)

In [None]:
# Load the two pickled black-box models that belong to the running example.
# The files are opened in `with` blocks so the handles are closed right after
# unpickling instead of being left open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load .sav files that come from a trusted source.
with open('blackboxes/' + data + 'A.sav', 'rb') as fileA:
    modelA = pickle.load(fileA)
with open('blackboxes/' + data + 'B.sav', 'rb') as fileB:
    modelB = pickle.load(fileB)

In [None]:
# Label every training row with the pair of black-box predictions, formatted
# as '<yA>|<yB>' with %g, then remove the two helper columns again.
train['yA'] = modelA.predict(train[['x1', 'x2']].values)
train['yB'] = modelB.predict(train[['x1', 'x2']].values)
train['difference'] = [
    '%g' % a + '|' + '%g' % b
    for a, b in zip(train['yA'], train['yB'])
]
train.drop(columns=['yA', 'yB'], inplace=True)

In [None]:
# Baseline global explainer for the running example: an unrestricted decision
# tree fitted on the 'difference' labels.
baseline = DecisionTreeClassifier(random_state=0)
baseline.fit(X=train[cols].values, y=train['difference'])

In [None]:
#load test data:
test, cols = loaddata(data + 'test')
test['yA'] = modelA.predict(test[cols].values)
test['yB'] = modelB.predict(test[cols].values)
# Same '<yA>|<yB>' label format as used for the training frame.
test['difference'] = test.apply(lambda row: '%g' % row['yA'] + '|' + '%g' % row['yB'], axis=1)
test.drop(columns=['yA', 'yB'], inplace=True)

#Evaluation:
# Predict once, on the plain array: the tree was fitted on `.values`, and an
# earlier duplicate call that passed the DataFrame itself was overwritten
# before use anyway.
pred = baseline.predict(test[cols].values)

# Quoted display name of the dataset as it appears in the results file.
dataname = '"Sine"' if data == 'running1' else '"Spiral"'

with open('results/FromLocalToGlobalrunning.txt', 'a') as myfile:
    line = ' '.join([dataname,
                     '"Approach 0: Baseline"',
                     str(baseline.get_depth()),
                     str(baseline.get_n_leaves()),
                     str(metrics.accuracy_score(test.difference, pred)),
                     str(metrics.precision_score(test.difference, pred, average='macro')),
                     str(metrics.recall_score(test.difference, pred, average='macro'))
                     ])
    myfile.write(line + '\n')

In [None]:
# Evaluate the baseline tree on a dense grid over the two features.
X0 = train.x1
# The second grid axis has to come from x2. It was previously set to x1 as
# well, so the grid did not follow the range of the feature on the y-axis.
X1 = train.x2
xx, yy = make_meshgrid(X0, X1, h = 0.005)
z = baseline.predict(np.c_[xx.ravel(), yy.ravel()])
# Turn the string labels into integer codes and restore the grid shape so the
# result can be used as a 2-D surface; the encoder is fitted on all labels of
# the colour dictionary, not only on those the tree happens to predict.
d = LabelEncoder()
d.fit(np.array(list(color_dict[data].keys())))
z = d.transform(z)
z = z.reshape(xx.shape)

In [None]:
# For each colour-dictionary key, taken in sorted order, record its position in
# the dictionary's own key order.
keys = list(color_dict[data].keys())
ordering = [keys.index(label) for label in sorted(keys)]

In [None]:
# Plot the explainer's decision surface together with the training points and
# save the figure as a JPG.

# First entry of every colour-dictionary value, rearranged by `ordering`.
# NOTE(review): presumably this makes colour i match the i-th integer code
# produced by the label encoder - confirm against color_dict.
values = [x[0] for x in color_dict[data].values()]
orderedmap = [values[i] for i in ordering]
MyCmap=colors.ListedColormap(orderedmap)
fig, ax = plt.subplots(figsize = (10,8))

# One contour call per black box: dotted, thicker lines for modelA and solid,
# thinner lines for modelB.
# NOTE(review): plot_contours is a project helper; presumably it draws the
# model's decision boundary on the grid - confirm in vars.py.
cntr1 = plot_contours(ax, modelA, xx, yy, levels = 1,colors = 'black',linewidths = 2, linestyles = 'dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels = 1, colors = 'black',linewidths = 1)

# Filled contours of the encoded explainer predictions (shifted by 0.1).
cp = ax.contourf(xx, yy, z+0.1, alpha = 0.7, cmap=MyCmap)

# Empty line plots serve only as legend handles, one per colour-dictionary entry.
h = [plt.plot([],[], color = i[0], linewidth=10, label = j)[0] for j,i in color_dict[data].items()]
ax.legend(handles=h, loc='lower left', title='Prediction Explainer', frameon = False, bbox_to_anchor=(0,-0.15), ncol = 9)

# Training points on top of the surface.
ax.scatter(train.x1, train.x2, c='black', alpha = 0.3, s=10)

ax.set_facecolor('#FFFFFF')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.grid(True, color = '#F3F3F3')

# Clip the view to the range of the training data.
ax.set_ylim(train.x2.min(), train.x2.max())
ax.set_xlim(train.x1.min(), train.x1.max())

plt.tight_layout()
plt.savefig("docout/sections/localtoglobal/results/Baseline_decisionsurfaceexplainer_" + data + ".jpg", dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
from matplotlib.colors import to_rgb
from sklearn.tree import plot_tree
import matplotlib
import re

def replace_text(obj):
    """Shorten the label of a tree-node annotation in place.

    If ``obj`` is a ``matplotlib.text.Annotation``, everything from the word
    "samples" up to the last "class" in its text is collapsed to "class".
    Any other object is left untouched.

    Returns:
        The same ``obj`` that was passed in.
    """
    # isinstance instead of an exact type comparison, so Annotation subclasses
    # are handled too.
    if isinstance(obj, matplotlib.text.Annotation):
        txt = obj.get_text()
        txt = re.sub("samples[^$]*class", "class", txt)
        obj.set_text(txt)
    return obj

In [None]:
# Draw the fitted baseline tree, colour every node by its majority class and
# save the figure as a JPG.
fig, ax = plt.subplots(figsize=(28, 5))
class_names = baseline.classes_
# Colour (first entry of the colour-dictionary value) per class, in class order.
c = [color_dict[data][x][0] for x in class_names]
N = len(class_names)
# The names are passed as plain lists, which plot_tree accepts regardless of
# the original container type.
artists = plot_tree(baseline, fontsize=6, ax=ax,
                    impurity=False, node_ids=True,
                    feature_names=list(cols), class_names=list(class_names))
# replace_text edits the annotations in place, so visiting the children is
# enough. The previous version assigned the result into the dict returned by
# ax.properties(), which is a throwaway object and had no effect on the axes.
for child in ax.get_children():
    replace_text(child)
for artist, impurity, value in zip(artists, baseline.tree_.impurity, baseline.tree_.value):
    # let the max value decide the color; whiten the color depending on impurity (gini)
    r, g, b = to_rgb(c[np.argmax(value)])
    # Impurity rescaled by N / (N - 1); 0 keeps the pure class colour.
    f = impurity * N / (N - 1) if N > 1 else 0
    rnew = f + (1 - f) * r
    gnew = f + (1 - f) * g
    bnew = f + (1 - f) * b
    artist.get_bbox_patch().set(facecolor = (rnew, gnew, bnew),
                                edgecolor = 'black')
    # Switch to white text on dark boxes so the label stays readable.
    brightness = np.sqrt(0.299*rnew*rnew + 0.587*gnew*gnew + 0.114*bnew*bnew)
    if brightness < 0.5:
        artist.set(color = 'white')
plt.savefig('docout/sections/localtoglobal/results/baseline_'+data+'_explainer.jpg',dpi=300, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
import pyperclip
# Join the strings returned by get_rules into one newline-separated text and
# copy that text to the system clipboard.
rules = '\n'.join(
    get_rules(baseline, ['x_1', 'x_2'], class_names = baseline.classes_)
)
pyperclip.copy(rules)