In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from data.getdata import loaddata, prepare_df
from data.split3fold import split3fold
import pickle
import matplotlib.pylab as pl
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import minisom
from collections import defaultdict
from scipy.cluster.hierarchy import dendrogram, linkage, set_link_color_palette, fcluster
from matplotlib.legend import Legend
from matplotlib import colors

plt.style.use('ggplot')

from vars import plot_colors, color_dict, classes_dict, make_meshgrid, plot_contours,get_rules
from FromLocalToGlobal import *

In [None]:
# Name of the data set to analyse. It is passed to loaddata, used to build the
# black-box and output file names, and selects the thresholds/layouts below.
data = 'bankmarketing'

In [None]:
# Load data sets A and B together with the column lists (all, discrete,
# continuous) and `le`, then split each set three ways with the same arguments
# and seed. Only the 2nd and 3rd parts (trainX/testX) are used in the cells below.
# NOTE(review): the meaning of the 0.4 / 0.2 arguments is defined in data.split3fold — confirm there.
dataA, dataB, cols, discrete, continuous, le = loaddata(data)
blackboxtrainA, trainA, testA = split3fold(dataA, 0.4, 0.2, random_state=1)
blackboxtrainB, trainB, testB = split3fold(dataB, 0.4, 0.2, random_state=1)

In [None]:
# Load the pickled black-box models A and B for the selected data set.
# The context managers close the file handles once unpickling is done.
# NOTE: unpickling can execute arbitrary code — only load model files from a trusted source.
with open('blackboxes/'+data+'A.sav', 'rb') as handle:
    modelA = pickle.load(handle)
with open('blackboxes/'+data+'B.sav', 'rb') as handle:
    modelB = pickle.load(handle)

In [None]:
#create difference detection dataset:
# every instance is labelled with the pair of black-box predictions
# "<predA>|<predB>" and the original target column 'y' is dropped.
def _difference_labels(frame):
    """Return one '<predA>|<predB>' label per row of `frame`.

    Both black boxes are evaluated on the `cols` feature columns and each
    prediction is converted to int before formatting.
    """
    features = frame[cols].values
    return [str(int(a)) + '|' + str(int(b))
            for a, b in zip(modelA.predict(features), modelB.predict(features))]


train = pd.concat([trainA, trainB])
train['difference'] = _difference_labels(train)
train = train.drop(columns=['y']).reset_index(drop=True)

test = pd.concat([testA, testB])
test['difference'] = _difference_labels(test)
test = test.drop(columns=['y'])

# Guarded so that re-running this cell does not register the label column twice.
if 'difference' not in discrete:
    discrete.append('difference')

#### Approach 1.3:
##### Dendrograms:

In [None]:
# Difference classes of interest: the label pairs present in the training data
# except the agreement labels '0|0', '1|1' and '2|2', sorted ascending.
differenceclasses = train.difference.unique()
differenceclasses = differenceclasses[~np.isin(differenceclasses, ['0|0', '1|1', '2|2'])]
differenceclasses.sort()

# Discrete features without the class column. Filtering (instead of a single
# list.remove) stays correct when 'difference' is listed more than once, e.g.
# after an earlier cell was re-run, and does not raise when it is absent.
discrete_woclassname = [name for name in discrete if name != 'difference']

# One encoder per discrete feature, created on first access; the first level of
# every feature is dropped.
d = defaultdict(lambda: OneHotEncoder(drop = 'first'))
trainbinary = train.copy()
testbinary = test.copy()
colsbinary = cols.copy()
for feature in discrete_woclassname:
    # Fit on train and test values together so both frames share the same categories.
    uniquevals = np.concatenate((trainbinary[feature].values.reshape(-1,1), testbinary[feature].values.reshape(-1,1)))
    d[feature].fit(uniquevals)
    tmp = d[feature].transform(trainbinary[feature].values.reshape(-1,1)).toarray()
    # New indicator columns are named <feature>0, <feature>1, ...
    colnames = [feature + str(i) for i in range(tmp.shape[1])]
    trainbinary[colnames] = tmp
    testbinary[colnames] = d[feature].transform(testbinary[feature].values.reshape(-1,1)).toarray()
    # Replace the original feature by its indicator columns in the column list and in both frames.
    colsbinary = colsbinary + colnames
    colsbinary.remove(feature)
    trainbinary.drop(columns = feature, inplace = True)
    testbinary.drop(columns = feature, inplace = True)

# Standardise the continuous columns of the encoded training data (the scaler is fitted on train only).
trainbinarynorm = trainbinary[colsbinary].copy()
if(len(continuous)>0):
    d = StandardScaler()
    scaler = d.fit(trainbinarynorm[continuous].values)
    trainbinarynorm[continuous] = scaler.transform(trainbinarynorm[continuous].values)

In [None]:
# Hierarchical clustering of the normalised training instances, computed
# separately for every difference class; `linked` follows the order of
# `differenceclasses`.
linkagetype = 'ward'
linked = [
    linkage(trainbinarynorm.loc[trainbinary.difference == value, colsbinary].values, linkagetype)
    for value in differenceclasses
]

In [None]:
#number of instances for which a neighborhood is generated:
# each class dendrogram is cut at its distance threshold t[class]; the number
# of resulting clusters is printed per class, followed by the total.
thresholds = {
    'compas': {'2|0': 4, '0|1': 15,
               '2|1': 10, '1|2': 4.5,
               '0|2': 6, '1|0': 10
               }, #ward linkage 26
    'bankmarketing': {'0|1': 25, '1|0': 10}, #ward linkage 21
}
if data in thresholds:
    t = thresholds[data]

ninstances = 0
for value, classlinkage in zip(differenceclasses, linked):
    ncluster = np.unique(fcluster(classlinkage, t=t[value], criterion='distance')).size
    ninstances += ncluster
    print("{dclass}: {n}".format(dclass = value, n=ncluster))
print(ninstances)

In [None]:
# Draw one dendrogram per difference class, coloured below the class threshold
# t[value] and with a dotted line at the threshold, then save the figure.
if data == 'compas':
    gs = gridspec.GridSpec(2, 3)
    fig = plt.figure(figsize=(12, 6))
    t = {'2|0': 4, '0|1': 15,
         '2|1': 10, '1|2': 4.5,
         '0|2': 6, '1|0': 10
         } #26 instances
elif data == 'bankmarketing':
    # No thresholds are set in this branch: `t` must already exist from an earlier cell.
    gs = gridspec.GridSpec(1, 2)
    fig = plt.figure(figsize=(12, 3))

for index, value in enumerate(differenceclasses):
    # Grid position, filling row by row with three panels per row.
    col = int(np.mod(index, 3))
    row = int(np.floor(index/3))
    ax = pl.subplot(gs[row, col])
    set_link_color_palette(plot_colors)
    # Links above the threshold are drawn in black ('k').
    D = dendrogram(linked[index], ax=ax, no_labels=True, color_threshold=t[value],
                   above_threshold_color='k')
    ax.axhline(y=t[value], c = 'black', linestyle = 'dotted')
    ax.set_facecolor('#FFFFFF')
    ax.set_title('Region ($M_A|M_B$): ' + classes_dict[data][value], fontsize = 10)
    ax.grid(False)
plt.tight_layout()
# The file name encodes linkage type, total number of clusters (ninstances) and data set.
plt.savefig('docout/sections/localtoglobal/results/Approach3_Dendrograms_'+ linkagetype +'_'+ str(ninstances) +'_' + data + '.jpg', dpi=150)

##### SOM projection to 2-dim with colored regions:

In [None]:
# Prints sqrt(5 * sqrt(N)) for N training rows.
# NOTE(review): presumably a rule-of-thumb hint for the SOM side length; the
# side length actually used is the fixed value below.
print(np.sqrt(5 * np.sqrt(len(train))))
shapesom = 30

In [None]:
# Train a shapesom x shapesom self-organising map on the encoded, normalised
# training data (one input dimension per column, fixed seed, 100000 iterations).
som = minisom.MiniSom(shapesom, shapesom, trainbinarynorm.shape[1], sigma=2, learning_rate=1, random_seed = 0)
som.train(trainbinarynorm.values, 100000, verbose = True)

In [None]:
#U-Matrix.
# Plot the SOM distance map (transposed) in grey scale without ticks or labels
# and add a horizontal colour bar in a separate axes below the map.
fig, ax = plt.subplots(figsize = (10,11))
plt.grid(False)
im = plt.pcolor(som.distance_map().T, cmap='Greys')  # plotting the distance map as background

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_yticks([])
ax.set_xticks([])

# Make room at the bottom of the figure for the colour-bar axes.
plt.subplots_adjust(bottom=0.15)
cax = plt.axes([0.1, 0.1, 0.8, 0.017])
plt.colorbar(cax=cax, orientation = 'horizontal')

plt.savefig('docout/sections/localtoglobal/SOMprojection_UMatrix_' + data + '.jpg', dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# SOM projection coloured by difference class: every SOM node that wins at
# least one training instance is drawn as a coloured square plus a marker for
# the most frequent difference class among its instances.
# NOTE(review): Z is not used anywhere else in this cell.
Z = np.zeros((shapesom, shapesom))
fig, ax = plt.subplots(figsize = (10,10))

# Per class: the artists plotted for it (None until the class is drawn), used for the legend.
collector_color = dict.fromkeys(color_dict[data].keys())
collector_marker = dict.fromkeys(color_dict[data].keys())

# Coordinates of the winning node for every training instance.
w_x, w_y = zip(*[som.winner(d) for d in trainbinarynorm.values])
for i in np.arange(som._weights.shape[0]):
    for j in np.arange(som._weights.shape[1]):
        # Instances whose winning node is (i, j).
        idx = np.logical_and(w_x == i, w_y == j)
        tmp = train.difference[idx]
        if len(tmp) > 0:
            # Majority class of the node decides colour and marker.
            feature = tmp.value_counts().idxmax()
            collector_color[feature] = plt.plot([i + .5], [j + .5], color=color_dict[data][feature][0],
                                                marker='s', markersize=15, linewidth = 0) #22 #15
            collector_marker[feature] = plt.plot([i + .5], [j + .5], marker=color_dict[data][feature][1][0],
                                                 color=color_dict[data][feature][1][1], markersize=10, #15 #10
                                                 markerfacecolor = 'None',linewidth = 0)

# Keep only the classes that were actually drawn.
collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

# Each legend entry combines the square and the marker of a class.
leg = plt.legend([(collector_color[j][0], collector_marker[j][0]) for j in collector_color.keys()],
                 [classes_dict[data][j] for j in collector_color.keys()],#markerscale = 0.5, #0.7
                 title='$M_A$ | $M_B$', frameon=False, ncol=5, loc='lower left',bbox_to_anchor=(0, -0.11)
                 )
# Uses a private matplotlib attribute to left-align the legend title.
leg._legend_box.align = "left"

plt.xlim([0, shapesom])
plt.ylim([0, shapesom])

# White grid lines at every node boundary, no tick labels.
plt.grid(color = '#ffffff')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(range(shapesom))
ax.set_yticks(range(shapesom))
ax.set_yticklabels([])
ax.set_xticklabels([])

plt.savefig('docout/sections/localtoglobal/SOMprojection_ClassColored_' + data + '.jpg', dpi=150, bbox_inches='tight', transparent=False,pad_inches=0)

### Obtain explaining model

In [None]:
# Build the global data set with the cluster-based instance selection, using
# the dendrogram thresholds `t` defined above.
# NOTE(review): getglobal / clusterbasedinstances presumably come from the
# star import of FromLocalToGlobal; unlike the later calls in this notebook,
# modelA / modelB are not passed here — confirm this is intended.
random_state = 0
globaldata = getglobal(train, classname = 'difference', npoints = 1, populationsize=1000, method = clusterbasedinstances, random_state = random_state, discrete = discrete, continuous=continuous, t=t)

## Running example plots

In [None]:
# Text file to which the evaluation cells below append one result line each.
performancefile = 'results/FromLocalToGlobalrunning.txt'

In [None]:
# Switch to the running example; the cells below distinguish 'running1' and 'running2'.
data = 'running1'

In [None]:
# Load the running-example training data and its feature column list
# (for this data set loaddata is unpacked into two values, not six as above).
train, cols = loaddata(data)

In [None]:
# Load the pickled black-box models A and B of the running example.
# The context managers close the file handles once unpickling is done.
# NOTE: unpickling can execute arbitrary code — only load model files from a trusted source.
with open('blackboxes/'+data+'A.sav', 'rb') as handle:
    modelA = pickle.load(handle)
with open('blackboxes/'+data+'B.sav', 'rb') as handle:
    modelB = pickle.load(handle)

In [None]:
# Label every training instance with the prediction pair "<A>|<B>" of the two
# black boxes ('%g' formatting); the helper columns yA / yB are removed again.
trainfeatures = train[['x1', 'x2']].values
train['yA'] = modelA.predict(trainfeatures)
train['yB'] = modelB.predict(trainfeatures)
train['difference'] = ['%g' % a + '|' + '%g' % b for a, b in zip(train['yA'], train['yB'])]
train = train.drop(columns=['yA', 'yB'])

In [None]:
#load test data:
# and label it with the prediction pair "<A>|<B>" exactly like the training data.
test, cols = loaddata(data + 'test')
testfeatures = test[cols].values
test['yA'] = modelA.predict(testfeatures)
test['yB'] = modelB.predict(testfeatures)
test['difference'] = ['%g' % a + '|' + '%g' % b for a, b in zip(test['yA'], test['yB'])]
test = test.drop(columns=['yA', 'yB'])

In [None]:
from matplotlib.colors import to_rgb
from sklearn.tree import plot_tree
import matplotlib
import re
import pyperclip


def replace_text(obj):
    """Shorten the label of a tree-node annotation in place.

    If ``obj`` is exactly a ``matplotlib.text.Annotation``, the part of its
    text matching ``samples[^$]*class`` (from "samples" up to a following
    "class", without crossing a ``$``) is replaced by "class". Any other
    object is left untouched.

    Returns:
        The same ``obj`` that was passed in.
    """
    if type(obj) != matplotlib.text.Annotation:
        return obj
    obj.set_text(re.sub("samples[^$]*class", "class", obj.get_text()))
    return obj

### Plot for Approach 1.1

In [None]:
# Approach 1: build the global data set with the `randominstances` selection
# method (npoints=8, fixed seed) and name its two columns x1 / x2.
# NOTE(review): getglobal / randominstances presumably come from the star
# import of FromLocalToGlobal; see there for the exact meaning of npoints and populationsize.
populationsize = 1000
globaldata = getglobal(train, modelA=modelA, modelB=modelB, classname='difference', npoints=8,
                       populationsize=populationsize, method=randominstances,
                       random_state=0, discrete=['difference'],
                       continuous=['x1', 'x2'])
globaldata.columns = ['x1', 'x2']

In [None]:
# Store the generated neighbourhood data on disk.
with open('docout/sections/localtoglobal/res/approach1_neighborhood_' + data + '.pickle', 'wb') as handle:
    pickle.dump(globaldata, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the stored neighbourhood data back, so the following cells can start from the file.
# NOTE: only unpickle files you created yourself — pickle can execute arbitrary code.
with open('docout/sections/localtoglobal/res/approach1_neighborhood_' + data + '.pickle', 'rb') as handle:
    globaldata = pickle.load(handle)

In [None]:
# Approach 1: plot the generated neighbourhoods (small black dots) and the
# chosen instances (red crosses) on top of the decision boundaries of both models.
fig, ax = plt.subplots(figsize=(10, 8))

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')

# Dense grid over the range of the training data.
X0, X1 = train.x1, train.x2
xx, yy = make_meshgrid(X0, X1, h=0.005)

# NOTE(review): z1 / z2 are not used further in this cell; plot_contours is
# given the models directly.
z1 = modelA.predict(np.c_[xx.ravel(), yy.ravel()])
z1 = z1.reshape(xx.shape)
z2 = modelB.predict(np.c_[xx.ravel(), yy.ravel()])
z2 = z2.reshape(xx.shape)

cntr1 = plot_contours(ax, modelA, xx, yy, levels=1, colors='black', linewidths=2, linestyles='dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels=1, colors='black', linewidths=1)

# Empty proxy lines that serve as legend handles for the two boundaries.
h = [plt.plot([], [], ls=i, color='black')[0] for i in ['dotted', 'solid']]
ax.legend(handles=h, labels=['Decision Boundary $M_A$', 'Decision Boundary $M_B$'],
          loc='lower left', title='', frameon=False, bbox_to_anchor=(0, -0.12),
          ncol=4)

ax.scatter(globaldata.x1, globaldata.x2, c='black', alpha=0.3, s=6)
# The first index level of globaldata is used as the row position of the chosen
# instance in train. NOTE(review): assumes that level holds positional indices — confirm in getglobal.
choseninstancesindex = np.unique(globaldata.index.get_level_values(0))
choseninstances = train.iloc[choseninstancesindex]
ax.scatter(choseninstances.x1, choseninstances.x2, c='#D90429', s=150, marker='X')

ax.set_facecolor('#FFFFFF')
plt.tight_layout()
plt.grid(True, color='#F3F3F3')

# Clip the view to the range of the training data.
ax.set_xlim(X0.min(),X0.max())
ax.set_ylim(train.x2.min(),train.x2.max())

plt.savefig('docout/sections/localtoglobal/results/Approach1_generatedneighborhoods_' + data + '.jpg', dpi=150,bbox_inches='tight',transparent=True, pad_inches=0)

In [None]:
# Label the generated neighbourhood with both black boxes and fit the explainer
# tree on it. The labels are formatted with '%g' — the same formatting that is
# used for the 'difference' column of train and test — so that integer (1) and
# float (1.0) predictions yield the same class string ('1') and the explainer's
# classes match the labels it is evaluated against.
predA = modelA.predict(globaldata.values)
predB = modelB.predict(globaldata.values)
difference = np.array(['%g' % x1 + '|' + '%g' % x2 for x1, x2 in zip(predA, predB)])
explainer = DecisionTreeClassifier(random_state=0)
explainer.fit(globaldata.values, difference)

In [None]:
# Evaluate the explainer on a dense grid spanning the generated data and turn
# the predicted class strings into integer codes (encoder fitted on the keys of
# color_dict[data]) with the shape of the grid.
X0, X1 = globaldata.x1, globaldata.x2
xx, yy = make_meshgrid(X0, X1, h=0.005)
d = LabelEncoder()
d.fit(np.array(list(color_dict[data].keys())))
z = d.transform(explainer.predict(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)

In [None]:
# For the keys of color_dict[data] taken in sorted order, the position of each
# key in the dict's own (insertion) order.
keys = list(color_dict[data].keys())
ordering = [keys.index(name) for name in sorted(keys)]

In [None]:
#Evaluation:
# append one line to the performance file: data set, approach, tree depth,
# number of leaves, accuracy, macro precision and macro recall on the test data.
dataname = '"Sine"' if data == 'running1' else '"Spiral"'
pred = explainer.predict(test[cols].values)
with open(performancefile, 'a') as myfile:
    fields = [
        dataname,
        '"Approach 1: Random sampling"',
        str(explainer.get_depth()),
        str(explainer.get_n_leaves()),
    ]
    fields += [str(score) for score in (
        metrics.accuracy_score(test.difference, pred),
        metrics.precision_score(test.difference, pred, average='macro'),
        metrics.recall_score(test.difference, pred, average='macro'),
    )]
    line = ' '.join(fields)
    myfile.write(line + '\n')

In [None]:
# Approach 1: decision surface of the explainer (filled contours of the encoded
# grid predictions `z`) together with the decision boundaries of both models.
# The colour map lists the class colours in the order given by `ordering`.
values = [x[0] for x in color_dict[data].values()]
orderedmap = [values[i] for i in ordering]
MyCmap=colors.ListedColormap(orderedmap)
fig, ax = plt.subplots(figsize=(10, 8))

cntr1 = plot_contours(ax, modelA, xx, yy, levels=1, colors='black', linewidths=2, linestyles='dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels=1, colors='black', linewidths=1)

cp = ax.contourf(xx, yy, z+0.1, alpha=0.7, cmap=MyCmap)

# Empty proxy lines, one per class in color_dict[data], used as legend handles.
h = [plt.plot([], [], color=i[0], linewidth=10, label=j)[0] for j, i in color_dict[data].items()]
ax.legend(handles=h, loc='lower left', title='Prediction Explainer', frameon=False, bbox_to_anchor=(0, -0.12),
          ncol=9)

ax.set_facecolor('#FFFFFF')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.grid(True, color='#F3F3F3')

# Clip the view to the range of the training data.
ax.set_ylim(train.x2.min(), train.x2.max())
ax.set_xlim(train.x1.min(), train.x1.max())

plt.tight_layout()
plt.savefig('docout/sections/localtoglobal/results/Approach1_decisionsurfaceexplainer_' + data + '.jpg', dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# Approach 1: draw the explainer tree and colour every node by its majority class.
fig, ax = plt.subplots(figsize=(55, 10))
class_names = explainer.classes_
colorss = [color_dict[data][x][0] for x in class_names]
N = len(class_names)
artists = plot_tree(explainer, fontsize=8, ax=ax,
                    impurity=False, node_ids=True,
                    feature_names=cols, class_names=class_names)
# replace_text edits each Annotation child in place (via set_text).
# NOTE(review): the assignment target is the dict returned by ax.properties();
# the visible effect comes from the in-place edits, not from the assignment.
ax.properties()['children'] = [replace_text(i) for i in ax.properties()['children']]
for artist, impurity, value in zip(artists, explainer.tree_.impurity, explainer.tree_.value):
    # let the max value decide the color; whiten the color depending on impurity (gini)
    r, g, b = to_rgb(colorss[np.argmax(value)])
    # Impurity rescaled by N / (N - 1); f is the share of white blended into the class colour.
    f = impurity * N / (N - 1) if N > 1 else 0
    artist.get_bbox_patch().set_facecolor((f + (1 - f) * r, f + (1 - f) * g, f + (1 - f) * b))
    artist.get_bbox_patch().set_edgecolor('black')
plt.savefig('docout/sections/localtoglobal/results/Approach1_' + data + '_explainer.jpg', dpi=300, bbox_inches='tight',transparent=True, pad_inches=0)

In [None]:
# Extract the rules of the explainer tree with get_rules (feature names x_1 /
# x_2), join them line by line and copy the text to the system clipboard.
rules = get_rules(explainer, ['x_1', 'x_2'], class_names)
rules = '\n'.join(rules)
pyperclip.copy(rules)

### Plot for Approach 1.2

In [None]:
# Approach 2: build the global data set with the `stratifiedrandominstances`
# selection method (npoints=8, fixed seed) and name its two columns x1 / x2.
# NOTE(review): getglobal / stratifiedrandominstances presumably come from the
# star import of FromLocalToGlobal; see there for the exact parameter semantics.
populationsize = 1000
globaldata = getglobal(train, classname='difference', npoints=8, modelA=modelA, modelB=modelB,
                       populationsize=populationsize, method=stratifiedrandominstances,
                       random_state=23, discrete=['difference'], #5
                       continuous=['x1', 'x2'])
globaldata.columns = ['x1', 'x2']

In [None]:
# Store the generated neighbourhood data on disk.
with open('docout/sections/localtoglobal/res/approach2_neighborhood_' + data + '.pickle', 'wb') as handle:
    pickle.dump(globaldata, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the stored neighbourhood data back, so the following cells can start from the file.
# NOTE: only unpickle files you created yourself — pickle can execute arbitrary code.
with open('docout/sections/localtoglobal/res/approach2_neighborhood_' + data + '.pickle', 'rb') as handle:
    globaldata = pickle.load(handle)

In [None]:
# Approach 2: plot the generated neighbourhoods (small black dots) and the
# chosen instances (red crosses) on top of the decision boundaries of both models.
fig, ax = plt.subplots(figsize=(10, 8))

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')

# Dense grid over the range of the training data.
X0, X1 = train.x1, train.x2
xx, yy = make_meshgrid(X0, X1, h=0.005)

# NOTE(review): z1 / z2 are not used further in this cell; plot_contours is
# given the models directly.
z1 = modelA.predict(np.c_[xx.ravel(), yy.ravel()])
z1 = z1.reshape(xx.shape)
z2 = modelB.predict(np.c_[xx.ravel(), yy.ravel()])
z2 = z2.reshape(xx.shape)

cntr1 = plot_contours(ax, modelA, xx, yy, levels=1, colors='black', linewidths=2, linestyles='dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels=1, colors='black', linewidths=1)

# Empty proxy lines that serve as legend handles for the two boundaries.
h = [plt.plot([], [], ls=i, color='black')[0] for i in ['dotted', 'solid']]
ax.legend(handles=h, labels=['Decision Boundary $M_A$', 'Decision Boundary $M_B$'],
          loc='lower left', title='', frameon=False, bbox_to_anchor=(0, -0.12),
          ncol=4)

ax.scatter(globaldata.x1, globaldata.x2, c='black', alpha=0.3, s=6)
# The first index level of globaldata is used as the row position of the chosen
# instance in train. NOTE(review): assumes that level holds positional indices — confirm in getglobal.
choseninstancesindex = np.unique(globaldata.index.get_level_values(0))
choseninstances = train.iloc[choseninstancesindex]
ax.scatter(choseninstances.x1, choseninstances.x2, c='#D90429', s=150, marker='X')

ax.set_facecolor('#FFFFFF')
plt.tight_layout()
plt.grid(True, color='#F3F3F3')

# Clip the view to the range of the training data.
ax.set_xlim(X0.min(), X0.max())
ax.set_ylim(train.x2.min(), train.x2.max())

plt.savefig('docout/sections/localtoglobal/results/Approach2_generatedneighborhoods_' + data + '.jpg', dpi=150,bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# Label the generated neighbourhood with both black boxes and fit the explainer
# tree on it. The labels are formatted with '%g' — the same formatting that is
# used for the 'difference' column of train and test — so that integer (1) and
# float (1.0) predictions yield the same class string ('1') and the explainer's
# classes match the labels it is evaluated against.
predA = modelA.predict(globaldata.values)
predB = modelB.predict(globaldata.values)
difference = np.array(['%g' % x1 + '|' + '%g' % x2 for x1, x2 in zip(predA, predB)])
explainer = DecisionTreeClassifier(random_state=0)
explainer.fit(globaldata.values, difference)

In [None]:
# Predict the explainer's class on a dense grid over the generated data and
# encode the class strings as integers (encoder fitted on the keys of
# color_dict[data]); the result has the shape of the grid.
X0, X1 = globaldata.x1, globaldata.x2
xx, yy = make_meshgrid(X0, X1, h=0.005)
gridpoints = np.c_[xx.ravel(), yy.ravel()]
d = LabelEncoder()
d.fit(np.array(list(color_dict[data].keys())))
z = d.transform(explainer.predict(gridpoints))
z = z.reshape(xx.shape)

In [None]:
#Evaluation:
# append one line to the performance file: data set, approach, tree depth,
# number of leaves, accuracy, macro precision and macro recall on the test data.
dataname = '"Sine"' if data == 'running1' else '"Spiral"'
pred = explainer.predict(test[cols].values)
with open(performancefile, 'a') as myfile:
    fields = [
        dataname,
        '"Approach 2: Class-stratified sampling"',
        str(explainer.get_depth()),
        str(explainer.get_n_leaves()),
    ]
    fields += [str(score) for score in (
        metrics.accuracy_score(test.difference, pred),
        metrics.precision_score(test.difference, pred, average='macro'),
        metrics.recall_score(test.difference, pred, average='macro'),
    )]
    line = ' '.join(fields)
    myfile.write(line + '\n')

In [None]:
# For the keys of color_dict[data] taken in sorted order, the position of each
# key in the dict's own (insertion) order.
keys = list(color_dict[data].keys())
ordering = [keys.index(name) for name in sorted(keys)]

In [None]:
# Approach 2: decision surface of the explainer (filled contours of the encoded
# grid predictions `z`) together with the decision boundaries of both models.
# The colour map lists the class colours in the order given by `ordering`.
values = [x[0] for x in color_dict[data].values()]
orderedmap = [values[i] for i in ordering]
MyCmap = colors.ListedColormap(orderedmap)
fig, ax = plt.subplots(figsize=(10, 8))

cntr1 = plot_contours(ax, modelA, xx, yy, levels=1, colors='black', linewidths=2, linestyles='dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels=1, colors='black', linewidths=1)

cp = ax.contourf(xx, yy, z + 0.1, alpha=0.7, cmap=MyCmap)

# Empty proxy lines, one per class in color_dict[data], used as legend handles.
h = [plt.plot([], [], color=i[0], linewidth=10, label=j)[0] for j, i in color_dict[data].items()]
ax.legend(handles=h, loc='lower left', title='Prediction Explainer', frameon=False, bbox_to_anchor=(0, -0.12),
          ncol=9)

ax.set_facecolor('#FFFFFF')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.grid(True, color='#F3F3F3')

# Clip the view to the range of the training data.
ax.set_ylim(train.x2.min(), train.x2.max())
ax.set_xlim(train.x1.min(), train.x1.max())

plt.tight_layout()
plt.savefig('docout/sections/localtoglobal/results/Approach2_decisionsurfaceexplainer_' + data + '.jpg', dpi=150,bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# Approach 2: draw the explainer tree and colour every node by its majority class.
fig, ax = plt.subplots(figsize=(57, 10))
class_names = explainer.classes_
colorss = [color_dict[data][x][0] for x in class_names]
N = len(class_names)
artists = plot_tree(explainer, fontsize=8, ax=ax,
                    impurity=False, node_ids=True,
                    feature_names=cols, class_names=class_names)
# replace_text edits each Annotation child in place (via set_text).
# NOTE(review): the assignment target is the dict returned by ax.properties();
# the visible effect comes from the in-place edits, not from the assignment.
ax.properties()['children'] = [replace_text(i) for i in ax.properties()['children']]
for artist, impurity, value in zip(artists, explainer.tree_.impurity, explainer.tree_.value):
    # let the max value decide the color; whiten the color depending on impurity (gini)
    r, g, b = to_rgb(colorss[np.argmax(value)])
    # Impurity rescaled by N / (N - 1); f is the share of white blended into the class colour.
    f = impurity * N / (N - 1) if N > 1 else 0
    artist.get_bbox_patch().set_facecolor((f + (1 - f) * r, f + (1 - f) * g, f + (1 - f) * b))
    artist.get_bbox_patch().set_edgecolor('black')
plt.savefig('docout/sections/localtoglobal/results/Approach2_' + data + '_explainer.jpg', dpi=300, bbox_inches='tight',
            transparent=True, pad_inches=0)

In [None]:
# Extract the rules of the explainer tree with get_rules (feature names x_1 /
# x_2), join them line by line and copy the text to the system clipboard.
rules = get_rules(explainer, ['x_1', 'x_2'], class_names)
rules = '\n'.join(rules)
pyperclip.copy(rules)

### Plot for Approach 1.3

In [None]:
#create hierarchical clustering:
# standardise the feature columns of the training data ...
d = StandardScaler()
scaler = d.fit(train[cols].values)
trainnorm = scaler.transform(train[cols].values)

# ... and run single-linkage clustering on the instances of difference class '1|0'.
subtrain = trainnorm[train.difference == '1|0']
linked = linkage(subtrain, 'single')

In [None]:
#Plot for running1
# Left panel: the two curves 4*sin(x) and sin(x)/x, the shaded region where
# model A predicts 1 and model B predicts 0, and the '1|0' instances coloured
# like their dendrogram cluster. Right panel: the dendrogram itself.
x = np.arange(train.x1.min(),train.x1.max(),0.1)
bba = 4*np.sin(x)
# NOTE(review): sin(x)/x is undefined at x == 0; if the grid contains exactly 0
# this yields NaN (with a runtime warning) at that point.
bbb = np.sin(x)/x

X0, X1 = train.x1, train.x2
xx, yy = make_meshgrid(np.array([X0.min(),X0.max()]), X1, h = 0.05)
z1 = modelA.predict(np.c_[xx.ravel(), yy.ravel()])
z1 = z1.reshape(xx.shape)
z2 = modelB.predict(np.c_[xx.ravel(), yy.ravel()])
z2 = z2.reshape(xx.shape)

# Indicator of the region A == 1 and B == 0 on the grid, converted to strings of '0' / '1'.
res = (z1 == 1) & (z2 == 0)
res = res.astype(int).astype(str)

gs = gridspec.GridSpec(1, 2)
fig = plt.figure(figsize=(16, 7))


set_link_color_palette(plot_colors)
ax2 = plt.subplot(gs[0, 1])
# Links below the distance threshold 0.64 are coloured per cluster, the rest in black.
D = dendrogram(linked, ax=ax2, no_labels=True, color_threshold=0.64,
           above_threshold_color='k')
ax2.set_facecolor('#FFFFFF')
ax2.grid(False)

# Leaf order of the dendrogram together with the colour assigned to each leaf.
tmp = pd.DataFrame({'index': D['leaves'],'nodecolor': D['leaves_color_list']})

ax1 = plt.subplot(gs[0, 0])
ax1.plot(x, bba, ':', color = 'black')
ax1.plot(x, bbb, '-', color = 'black')
ax1.contourf(xx,yy,res, alpha = 0.1, cmap = plt.cm.binary)


# Empty proxy artists used as legend handles.
h = [plt.plot([],[], ls = 'dotted', color = 'black')[0],
     plt.plot([],[], ls = 'solid', color = 'black')[0],
     plt.plot([],[], color = '#BEBEBE', linewidth=10)[0]
     ]
ax1.legend(handles=h, labels=['Decision Boundary $M_A$', 'Decision Boundary $M_B$', 'Region: $M_A$: 1| $M_B$: 0'],
           loc='lower left', title='', frameon = False, bbox_to_anchor=(0,-0.2), ncol = 2)

subddata = train.loc[train.difference == '1|0', cols]

# Scatter the '1|0' instances, one colour group at a time; the leaf indices are
# row positions within the '1|0' subset that was clustered.
for col in tmp.nodecolor.unique():
    ind = tmp.loc[tmp.nodecolor == col, 'index']
    ax1.scatter(subddata.iloc[ind]['x1'],subddata.iloc[ind]['x2'],
            color = col, s=15)

# All remaining instances as faint black dots.
ax1.scatter(train.loc[train.difference != '1|0', 'x1'],
            train.loc[train.difference != '1|0', 'x2'],
            color = 'black', s=10, alpha = 0.1)

ax1.set_facecolor('#FFFFFF')
ax1.set_xlabel('$x_1$')
ax1.set_ylabel('$x_2$')
ax1.grid(True, color = '#F3F3F3')

plt.savefig('docout/sections/localtoglobal/results/Approach3_' + data + "_colored.jpg", dpi=150,bbox_inches='tight')

In [None]:
# Train a 20 x 20 self-organising map on the standardised running-example data
# (fixed seed, 100000 iterations).
som = minisom.MiniSom(20, 20, trainnorm.shape[1], sigma=1, learning_rate=1, random_seed = 1)
som.train(trainnorm, 100000, verbose = True)

In [None]:
#U-Matrix.
# Plot the SOM distance map (transposed) in grey scale without ticks or labels
# and add a horizontal colour bar in a separate axes below the map.
fig, ax = plt.subplots(figsize = (10,11))
plt.grid(False)
im = plt.pcolor(som.distance_map().T, cmap='Greys')  # plotting the distance map as background

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_yticks([])
ax.set_xticks([])

# Make room at the bottom of the figure for the colour-bar axes.
plt.subplots_adjust(bottom=0.15)
cax = plt.axes([0.1, 0.1, 0.8, 0.017])
plt.colorbar(cax=cax, orientation = 'horizontal')

plt.savefig("docout/sections/localtoglobal/results/SOMprojection_UMatrix_" + data + ".jpg", dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# SOM projection coloured by difference class: every SOM node that wins at
# least one training instance is drawn as a coloured square plus a marker for
# the most frequent difference class among its instances.
# NOTE(review): Z is not used anywhere else in this cell.
Z = np.zeros((20, 20))
fig, ax = plt.subplots(figsize=(10, 10))

# Per class: the artists plotted for it (None until the class is drawn), used for the legend.
collector_color = dict.fromkeys(color_dict[data].keys())
collector_marker = dict.fromkeys(color_dict[data].keys())

# Coordinates of the winning node for every training instance.
w_x, w_y = zip(*[som.winner(d) for d in trainnorm])
for i in np.arange(som._weights.shape[0]):
    for j in np.arange(som._weights.shape[1]):
        # Instances whose winning node is (i, j).
        idx = np.logical_and(w_x == i, w_y == j)
        tmp = train.difference[idx]
        if len(tmp) > 0:
            # Majority class of the node decides colour and marker.
            feature = tmp.value_counts().idxmax()
            collector_color[feature] = plt.plot([i + .5], [j + .5], color=color_dict[data][feature][0],
                                                marker='s', markersize=20, linewidth = 0)
            collector_marker[feature] = plt.plot([i + .5], [j + .5], marker=color_dict[data][feature][1][0],
                                                 color=color_dict[data][feature][1][1], markersize=13,
                                                 alpha = 0.7, markerfacecolor = 'None',linewidth = 0)

# Keep only the classes that were actually drawn.
collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

# Each legend entry combines the square and the marker of a class; the raw class keys are the labels.
leg = plt.legend([(collector_color[j][0], collector_marker[j][0]) for j in collector_color.keys()],
                 [j for j in collector_color.keys()],
                 title='$M_A$ | $M_B$', frameon=False, ncol=5, loc='lower left',bbox_to_anchor=(0, -0.1)
                 )
# Uses a private matplotlib attribute to left-align the legend title.
leg._legend_box.align = "left"

plt.xlim([0, 20])
plt.ylim([0, 20])

plt.grid(False)

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_yticks([])
ax.set_xticks([])

plt.savefig('docout/sections/localtoglobal/results/SOMprojection_ClassColored_' + data + '.jpg',
    dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

### Plot of chosen instances

In [None]:
#draw dendrograms to get t
# One single-linkage clustering per difference class (agreement labels
# '0|0', '1|1', '2|2' excluded) on the standardised training data.
differenceclasses = train.difference.unique()
differenceclasses = differenceclasses[~np.isin(differenceclasses, ['0|0', '1|1', '2|2'])]
linkagetype = 'single'
linked = []
for value in differenceclasses:
    subtrain = trainnorm[train.difference == value]
    linked.append(linkage(subtrain, linkagetype))

# Figure layout and per-class distance thresholds for the two running examples.
if data == 'running2':
    gs = gridspec.GridSpec(2, 3)
    fig = plt.figure(figsize=(12, 6))
    t={'1|0': 0.1,'2|0': 1,'2|1': 0.15,'0|2': 0.15, '0|1':1, '1|2': 0.15}
elif data == 'running1':
    gs = gridspec.GridSpec(1, 2)
    fig = plt.figure(figsize=(12, 3))
    t={'1|0': 0.64, '0|1':0.3}

for index, value in enumerate(differenceclasses):
    # Grid position, filling row by row with three panels per row.
    col = int(np.mod(index, 3))
    row = int(np.floor(index/3))
    ax = pl.subplot(gs[row, col])

    set_link_color_palette(plot_colors)
    # Links below the class threshold are coloured per cluster; a dashed line marks the threshold.
    D = dendrogram(linked[index], ax=ax, no_labels=True, color_threshold=t[value], above_threshold_color='k', truncate_mode=None)
    ax.axhline(y = t[value], linestyle = 'dashed', color = 'black', alpha = 0.5)
    ax.set_facecolor('#FFFFFF')
    ax.set_title('Region ($M_A|M_B$): ' + value, fontsize = 10)
    ax.grid(False)
plt.savefig('docout/sections/localtoglobal/results/Approach3_Dendrograms_' + data + '.jpg',dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# For every difference class, cut its dendrogram at t[class] and scatter the
# instances of the class coloured by cluster; all other instances are drawn as
# faint black dots. One panel per class, saved as a single figure.
if data == 'running2':
    # NOTE(review): `c` is defined here but the scatter calls in this branch
    # take their colours from plot_colors.
    c={'1|0':
           ['black','#85B832','#ce2029'],
       '2|0':
           ['#85B832','#ce2029'],
       '2|1':
           ['#85B832','#ce2029'],
       '0|2':
           ['black','#85B832'],
       '0|1':
           ['#85B832'],
       '1|2':
           ['black','#85B832','#ce2029']
       }
    gs = gridspec.GridSpec(2, 3)
    fig = plt.figure(figsize=(12, 6))

    X0, X1 = train.x1, train.x2
    xx, yy = make_meshgrid(X0, X1, h = 0.005)

    # NOTE(review): z1 / z2 are not used further in this branch.
    z1 = modelA.predict(np.c_[xx.ravel(), yy.ravel()])
    z1 = z1.reshape(xx.shape)
    z2 = modelB.predict(np.c_[xx.ravel(), yy.ravel()])
    z2 = z2.reshape(xx.shape)

    for index, value in enumerate(differenceclasses):
        col = int(np.mod(index, 3))
        row = int(np.floor(index/3))
        ax = pl.subplot(gs[row, col])

        cntr1 = plot_contours(ax, modelA, xx, yy, levels = 1,colors = 'black',linewidths = 2, linestyles = 'dotted')
        cntr2 = plot_contours(ax, modelB, xx, yy, levels = 1, colors = 'black',linewidths = 1)

        clusterassignment = fcluster(linked[index], t=t[value], criterion='distance')
        clusters = np.unique(clusterassignment).tolist()
        clusters.sort()

        traindifference = train.loc[train.difference == value]
        colindex = 0
        for i in clusters:
            tmp = traindifference.loc[clusterassignment == i]
            # Single-instance clusters are drawn in black and do not consume a palette colour.
            if len(tmp) == 1:
                ax.scatter(tmp.x1, tmp.x2, c='black',  s=6)
            else:
                ax.scatter(tmp.x1, tmp.x2, c=plot_colors[colindex],  s=6)
                colindex = colindex +1

        tmp = train.loc[~(train.difference == value)]
        ax.scatter(tmp.x1, tmp.x2, c='black', alpha = 0.1, s=6)

        ax.set_facecolor('#FFFFFF')
        ax.set_title('Region ($M_A|M_B$): ' + value, fontsize = 10)

    plt.savefig('docout/sections/localtoglobal/results/Approach3_clusteredregions_' + data + '.jpg',dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)
elif data == 'running1':
    # Hand-picked colour list per difference class.
    c={'1|0':
           ['#2f4b7c','black','#ffa600','#85B832','#ce2029'],
       '0|1':
           ['#ffa600', 'black','#85B832','#ce2029','black','#2f4b7c']}
    gs = gridspec.GridSpec(1, 2)
    fig = plt.figure(figsize=(12, 3))

    for index, value in enumerate(differenceclasses):
        col = int(np.mod(index, 3))
        row = int(np.floor(index/3))
        ax = pl.subplot(gs[row, col])

        # The two curves 4*sin(x) and sin(x)/x over the x1 range of the training data.
        x = np.arange(train.x1.min(),train.x1.max(),0.1)
        bba = 4*np.sin(x)
        bbb = np.sin(x)/x

        lines = []
        lines += ax.plot(x, bba, ':', color = 'black')
        lines += ax.plot(x, bbb, '-', color = 'black')

        clusterassignment = fcluster(linked[index], t=t[value], criterion='distance')
        # Renumber the clusters 0, 1, 2, ... in order of first appearance.
        clusterassignment = pd.factorize(clusterassignment)[0]
        clusters = np.unique(clusterassignment).tolist()

        traindifference = train.loc[train.difference == value]

        for i in clusters:
            tmp = traindifference.loc[clusterassignment == i]
            # NOTE(review): cluster ids start at 0 here, so i-1 is -1 for the
            # first cluster and picks the LAST colour of the list — confirm the
            # colour lists were arranged with this in mind.
            ax.scatter(tmp.x1, tmp.x2, c=c[value][i-1], s=6)
        tmp = train.loc[~(train.difference == value)]
        ax.scatter(tmp.x1, tmp.x2, c='black', alpha = 0.1, s=6)

        ax.set_facecolor('#FFFFFF')
        ax.set_title('Region ($M_A|M_B$): ' + value, fontsize = 10)
    plt.savefig('docout/sections/localtoglobal/results/Approach3_clusteredregions_' + data + '.jpg',dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# Dendrogram cut thresholds per difference class, passed to getglobal below
# (same values as the 'running1' thresholds used for the dendrogram plots).
t={'1|0': 0.64, '0|1':0.3}

In [None]:
# Approach 3: build the global data set with the `clusterbasedinstances`
# selection method, single linkage and the thresholds `t`, then name its two
# columns x1 / x2.
# NOTE(review): getglobal / clusterbasedinstances presumably come from the star
# import of FromLocalToGlobal; see there for how npoints=0 is interpreted.
populationsize = 1000
globaldata = getglobal(train, classname='difference', npoints=0, modelA=modelA, modelB=modelB,
                       populationsize=populationsize, method=clusterbasedinstances,
                       random_state=9, discrete = ['difference'], #2
                       continuous=['x1', 'x2'], t=t, linkagetype='single')
globaldata.columns = ['x1', 'x2']

In [None]:
# Store the generated neighbourhood data on disk.
with open('docout/sections/localtoglobal/res/approach3_neighborhood_' + data + '.pickle', 'wb') as handle:
    pickle.dump(globaldata, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the stored neighbourhood data back, so the following cells can start from the file.
# NOTE: only unpickle files you created yourself — pickle can execute arbitrary code.
with open('docout/sections/localtoglobal/res/approach3_neighborhood_' + data + '.pickle', 'rb') as handle:
    globaldata = pickle.load(handle)

In [None]:
# Approach 3: plot the generated neighbourhoods (small black dots) and the
# chosen instances (red crosses) on top of the decision boundaries of both models.
fig, ax = plt.subplots(figsize = (10,8))

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')

# Dense grid over the range of the training data.
X0, X1 = train.x1, train.x2
xx, yy = make_meshgrid(X0, X1, h = 0.005)

# NOTE(review): z1 / z2 are not used further in this cell; plot_contours is
# given the models directly.
z1 = modelA.predict(np.c_[xx.ravel(), yy.ravel()])
z1 = z1.reshape(xx.shape)
z2 = modelB.predict(np.c_[xx.ravel(), yy.ravel()])
z2 = z2.reshape(xx.shape)

cntr1 = plot_contours(ax, modelA, xx, yy, levels = 1,colors = 'black',linewidths = 2, linestyles = 'dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels = 1, colors = 'black',linewidths = 1)

# Empty proxy lines that serve as legend handles for the two boundaries.
h = [plt.plot([],[], ls = i, color = 'black')[0] for i in ['dotted', 'solid']]
ax.legend(handles=h, labels=['Decision Boundary $M_A$', 'Decision Boundary $M_B$'],
          loc='lower left', title='', frameon=False, bbox_to_anchor=(0, -0.12),
          ncol=4)

#add generated neighborhoods:
ax.scatter(globaldata.x1,globaldata.x2, c='black', alpha = 0.3, s=10)

#add selected instances
# The first index level of globaldata is used as the row position of the chosen
# instance in train. NOTE(review): assumes that level holds positional indices — confirm in getglobal.
choseninstancesindex = np.unique(globaldata.index.get_level_values(0))
choseninstances = train.iloc[choseninstancesindex]
ax.scatter(choseninstances.x1,choseninstances.x2, c='#D90429', s=150, marker = 'X')

ax.set_facecolor('#FFFFFF')
plt.tight_layout()
plt.grid(True, color = '#F3F3F3')

# Clip the view to the range of the training data.
ax.set_xlim(X0.min(),X0.max())
ax.set_ylim(train.x2.min(),train.x2.max())

plt.savefig('docout/sections/localtoglobal/results/Approach3_generatedneighborhoods_' + data + '.jpg',dpi=150, bbox_inches='tight',transparent=True,pad_inches=0)

In [None]:
# Label the generated neighbourhood with the prediction pair of both black
# boxes. The labels are formatted with '%g' — the same formatting that is used
# for the 'difference' column of train and test — so that integer (1) and float
# (1.0) predictions yield the same class string ('1') and the labels match the
# ones the explainer is evaluated against.
predA = modelA.predict(globaldata.values)
predB = modelB.predict(globaldata.values)
difference = np.array(['%g' % x1 + '|' + '%g' % x2 for x1,x2 in zip(predA,predB)])

In [None]:
# Fit the explainer: a decision tree (fixed seed, otherwise default settings)
# trained on the generated data with the prediction-pair labels.
explainer = DecisionTreeClassifier(random_state=0)
explainer.fit(globaldata.values, difference)

In [None]:
#Evaluation:
# append one line to the performance file: data set, approach, tree depth,
# number of leaves, accuracy, macro precision and macro recall on the test data.
dataname = '"Sine"' if data == 'running1' else '"Spiral"'
pred = explainer.predict(test[cols].values)
with open(performancefile, 'a') as myfile:
    fields = [
        dataname,
        '"Approach 3: Cluster-stratified sampling"',
        str(explainer.get_depth()),
        str(explainer.get_n_leaves()),
    ]
    fields += [str(score) for score in (
        metrics.accuracy_score(test.difference, pred),
        metrics.precision_score(test.difference, pred, average='macro'),
        metrics.recall_score(test.difference, pred, average='macro'),
    )]
    line = ' '.join(fields)
    myfile.write(line + '\n')

In [None]:
# Evaluate the explainer on a dense grid spanning the training data and turn
# the predicted class strings into integer codes (encoder fitted on the keys of
# color_dict[data]) with the shape of the grid.
X0, X1 = train.x1, train.x2
xx, yy = make_meshgrid(X0, X1, h = 0.005)
d = LabelEncoder()
d.fit(np.array(list(color_dict[data].keys())))
z = d.transform(explainer.predict(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)

In [None]:
# For the keys of color_dict[data] taken in sorted order, the position of each
# key in the dict's own (insertion) order.
keys = list(color_dict[data].keys())
ordering = [keys.index(name) for name in sorted(keys)]

In [None]:
# Approach 3: decision surface of the explainer (filled contours of the encoded
# grid predictions `z`) together with the decision boundaries of both models.
# The colour map lists the class colours in the order given by `ordering`.
values = [x[0] for x in color_dict[data].values()]
orderedmap = [values[i] for i in ordering]
MyCmap=colors.ListedColormap(orderedmap)
fig, ax = plt.subplots(figsize = (10,8))

cntr1 = plot_contours(ax, modelA, xx, yy, levels = 1,colors = 'black',linewidths = 2, linestyles = 'dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels = 1, colors = 'black',linewidths = 1)

cp = ax.contourf(xx, yy, z+0.1, alpha = 0.7, cmap=MyCmap)

# Empty proxy lines, one per class in color_dict[data], used as legend handles.
h = [plt.plot([],[], color = i[0], linewidth=10, label = j)[0] for j,i in color_dict[data].items()]
ax.legend(handles=h, loc='lower left', title='Prediction Explainer', frameon = False, bbox_to_anchor=(0,-0.12), ncol = 9)

ax.set_facecolor('#FFFFFF')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.grid(True, color = '#F3F3F3')

# Clip the view to the range of the training data.
ax.set_ylim(train.x2.min(), train.x2.max())
ax.set_xlim(train.x1.min(), train.x1.max())

plt.tight_layout()
# NOTE(review): unlike the Approach 1 / 2 surface plots, this figure is saved
# directly under 'docout/' with only the data set name — confirm the path is intended.
plt.savefig('docout/' + data + '.jpg', dpi=150, bbox_inches='tight',transparent=True,pad_inches=0)

In [None]:
# Approach 3: draw the explainer tree and colour every node by its majority class.
fig, ax = plt.subplots(figsize=(55, 10))
class_names = explainer.classes_
colorss = [color_dict[data][x][0] for x in class_names]
N = len(class_names)
artists = plot_tree(explainer, fontsize=8, ax=ax,
                    impurity=False, node_ids=True,
                    feature_names=cols, class_names=class_names)
# replace_text edits each Annotation child in place (via set_text).
# NOTE(review): the assignment target is the dict returned by ax.properties();
# the visible effect comes from the in-place edits, not from the assignment.
ax.properties()['children'] = [replace_text(i) for i in ax.properties()['children']]
for artist, impurity, value in zip(artists, explainer.tree_.impurity, explainer.tree_.value):
    # let the max value decide the color; whiten the color depending on impurity (gini)
    r, g, b = to_rgb(colorss[np.argmax(value)])
    # Impurity rescaled by N / (N - 1); f is the share of white blended into the class colour.
    f = impurity * N / (N - 1) if N > 1 else 0
    artist.get_bbox_patch().set_facecolor((f + (1 - f) * r, f + (1 - f) * g, f + (1 - f) * b))
    artist.get_bbox_patch().set_edgecolor('black')
plt.savefig('docout/sections/localtoglobal/results/Approach3_' + data + '_explainer.jpg', dpi=300, bbox_inches='tight',
            transparent=True, pad_inches=0)

In [None]:
# Extract the rules of the explainer tree with get_rules (feature names x_1 /
# x_2), join them line by line and copy the text to the system clipboard.
rules = get_rules(explainer, ['x_1', 'x_2'], class_names)
rules = '\n'.join(rules)
pyperclip.copy(rules)