In [None]:
import pandas as pd
import numpy as np

from data.getdata import loaddata
import pickle
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import minisom
from data.split3fold import split3fold
from collections import defaultdict
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, set_link_color_palette
from matplotlib.colors import to_rgb
from sklearn.tree import plot_tree
import matplotlib
import re
from sklearn.tree import DecisionTreeClassifier
from matplotlib import colors as co
import matplotlib.gridspec as gridspec
import matplotlib.pylab as pl

from vars import plot_colors, color_dict, classes_dict, make_meshgrid, plot_contours, get_rules

def getclusterid(x, som, clusterarr):
    """Look up the cluster of the SOM node that wins for instance ``x``.

    Parameters
    ----------
    x : normalized instance passed on to ``getwinnerid``.
    som : object forwarded to ``getwinnerid``.
    clusterarr : table with a ``node`` and a ``cluster`` column.

    Returns
    -------
    The ``cluster`` entries of every row whose ``node`` equals the winning
    node. This is a pandas selection, not a scalar.
    """
    winning_node = getwinnerid(x, som)
    is_winner = clusterarr.node == winning_node
    return clusterarr.loc[is_winner, 'cluster']

def getwinnerid(x, som):
    """Return the second coordinate of the winning SOM position for ``x``.

    ``x`` is a normalized instance; ``som`` must expose ``winner(x)``.
    """
    position = som.winner(x)
    return position[1]

def plot_dendrogram(model, **kwargs):
    """Build a linkage matrix from a fitted clustering model and draw it.

    Adapted from the scikit-learn example
    https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html

    ``model`` must expose ``children_``, ``distances_`` and ``labels_``.
    Extra keyword arguments are forwarded to ``dendrogram``; its return value
    is passed back to the caller.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples below each merge. An index below n_leaves is
    # a single sample; anything else refers to an earlier merge.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if member < n_leaves else subtree_sizes[member - n_leaves]
            for member in pair
        )

    linkage = np.column_stack([merges, model.distances_, subtree_sizes]).astype(float)
    return dendrogram(linkage, **kwargs)

def replace_text(obj):
    """Shorten the text of a matplotlib ``Annotation`` in place.

    Everything from "samples" up to the following "class" is collapsed to
    "class". Objects of any other type are returned untouched.
    """
    if type(obj) != matplotlib.text.Annotation:
        return obj
    shortened = re.sub("samples[^$]*class", "class", obj.get_text())
    obj.set_text(shortened)
    return obj

def predict(x, explainers):
    """Predict with the explainer selected by the last entry of ``x``.

    The final element of ``x`` is the key into ``explainers``; the remaining
    elements are reshaped to a single row and handed to that explainer's
    ``predict``.
    """
    features, winner = x[:-1], x[-1]
    return explainers[winner].predict(features.reshape(1, -1))

# Running example

In [None]:
# Load the running example and label every instance with the pair of
# predictions made by the two black-box models.
data = 'running2'
train, cols = loaddata(data)

# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
# The files are opened with a context manager so the handles are closed.
with open('blackboxes/' + data + 'A.sav', 'rb') as handle:
    modelA = pickle.load(handle)
with open('blackboxes/' + data + 'B.sav', 'rb') as handle:
    modelB = pickle.load(handle)

train['yA'] = modelA.predict(train[['x1', 'x2']].values)
train['yB'] = modelB.predict(train[['x1', 'x2']].values)

# Vectorised inequality: always yields a boolean column. The previous
# row-wise `~(a == b)` only worked for numpy bools (`~True` is -2 in Python).
train['difference_bin'] = train['yA'] != train['yB']
# Textual label "<prediction A>|<prediction B>".
train['difference'] = train.apply(lambda row: '%g' % row['yA'] + '|' + '%g' % row['yB'], axis=1)
train.drop(columns=['yA', 'yB'], inplace=True)

In [None]:
# Scatter plot of the running example, split by whether the two models
# disagree on an instance.
fig, ax = plt.subplots(figsize=(10, 10))

ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')

differs = train.difference_bin
# (row mask, colour, marker, legend entry); the disagreeing group is drawn first
for mask, colour, marker_symbol, legend_label in (
    (differs, plot_colors[1], 'o', 'Decision Difference'),
    (~differs, plot_colors[0], 'x', 'No Decision Difference'),
):
    ax.scatter(train.loc[mask, 'x1'], train.loc[mask, 'x2'],
               color=colour, marker=marker_symbol, label=legend_label)

fig.legend(frameon=False, ncol=10, loc='lower left', bbox_to_anchor=(0, -0.01))

ax.set_facecolor('#FFFFFF')
fig.tight_layout()
ax.grid(True, color='#F3F3F3')

ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)

fig.savefig('docout/sections/communication/Step1' + data + '.jpg',
            dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

### Step 2

In [None]:
# Standardise the feature columns; the fitted scaler is reused for the grids.
trainsom = train.loc[:, cols].copy()
raw_values = trainsom.values
d = StandardScaler()
scaler = d.fit(raw_values)
trainsomnorm = scaler.transform(raw_values)

In [None]:
# Keep only the instances whose label is not one of the agreeing pairs.
same_decision = train.difference.isin(['0|0', '1|1', '2|2'])
trainsomnormdifference = trainsomnorm[~same_decision]
trainsomdifference = trainsom[~same_decision]

In [None]:
# 1 x n_nodes SOM trained on the normalised disagreeing instances.
n_nodes = int(np.floor(np.sqrt(len(trainsomdifference)) * 5))
som = minisom.MiniSom(
    x=1,
    y=n_nodes,
    input_len=trainsomdifference.shape[1],
    sigma=4,
    learning_rate=0.6,
    random_seed=1,
)
som.train(trainsomnormdifference, num_iteration=100000, verbose=True)

In [None]:
# Node i is connected to node i + 1 only: ones on the first super-diagonal.
connectivity_matrix = np.eye(n_nodes, k=1)
weights = som.get_weights()[0]

In [None]:
# Single-linkage agglomerative clustering of the SOM node weights, restricted
# to the connectivity matrix, shown as a dendrogram.
weights = som.get_weights()[0]
# distance_threshold=0 with n_clusters=None builds the full merge tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage='single',
                                connectivity=connectivity_matrix)
model = model.fit(weights)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
plt.subplots_adjust(hspace=0.5)
set_link_color_palette(plot_colors)
threshold = 0.2  #0.225
# Links below the threshold take palette colours; links above are black ('k').
den = plot_dendrogram(model, no_labels=True, color_threshold=threshold, above_threshold_color='k')
ax.set_facecolor('#FFFFFF')
# Mark the cut height. This was hard-coded to y=3, which does not match the
# threshold used for colouring.
ax.axhline(y=threshold, c='black', linestyle='dotted')

In [None]:
# Cut the single-linkage tree at the same height used to colour the
# dendrogram. `threshold` comes from the dendrogram cell, so the plot and the
# actual clustering cannot drift apart.
ward = AgglomerativeClustering(connectivity=connectivity_matrix, linkage="single",
                               distance_threshold=threshold, n_clusters=None).fit(weights)
label = ward.labels_
# One row per SOM node with its cluster label.
clusterarr = pd.DataFrame({'node': range(n_nodes), 'cluster': label})
# Renumber clusters in order of first appearance along the node axis.
clusterarr['cluster'] = pd.factorize(clusterarr.cluster)[0]
ncluster = len(np.unique(label))
print(ncluster)

In [None]:
#determine cluster ID for each instance of the training set:
clusterwinnerspos = np.apply_along_axis(getclusterid, 1, som.get_weights()[0], som, clusterarr)
clusterwinners = np.apply_along_axis(getclusterid, 1, trainsomnormdifference, som, clusterarr)
nodeswinners = np.apply_along_axis(getwinnerid, 1, trainsomnormdifference, som)
clusterdendrogram = [clusterwinnerspos[x, 0] for x in den['leaves']]
clustercolor = [[x, y] for x, y in zip(clusterdendrogram, den['leaves_color_list'])]
clustercolor = np.unique(clustercolor, axis=0)
clustercolor = clustercolor[clustercolor[:, 0].astype(int).argsort()]
alreadyused = clustercolor[~(clustercolor[:, 1] == 'k'), 1]
available = [x for x in plot_colors if x not in alreadyused]
clustercolor[clustercolor[:, 1] == 'k', 1] = available[:(ncluster - len(alreadyused))]
clustercolor = pd.DataFrame(clustercolor, columns=['cluster', 'color'])
clusterswithoutdata = [x for x in range(ncluster) if x not in list(np.unique(clusterwinners))]
nodesofclusterwithoutdata = clusterarr.loc[clusterarr.cluster.isin(clusterswithoutdata), 'node'].tolist()
#for each cluster, for each node determine nearest node in cluster with data:
for node in nodesofclusterwithoutdata:
    weightnode = weights[node]
    nextnode = node
    i = 1
    while nextnode in nodesofclusterwithoutdata:
        map = som._activation_distance(weightnode, som._weights)[0, [node - i, node + i]].argsort()
        nextnode = node + i if map[0] > 0 else node - i
        i = i + 1
    oldcluster = clusterarr.loc[clusterarr.node == node, 'cluster'].item()
    newcluster = clusterarr.loc[clusterarr.node == nextnode, 'cluster'].item()
    clusterarr.loc[clusterarr.node == node, 'cluster'] = newcluster
    clustercolor.loc[clustercolor.cluster == str(oldcluster), 'color'] = clustercolor.loc[
        clustercolor.cluster == str(newcluster), 'color'].item()

In [None]:
# Assign a cluster to every point of a regular grid over the input space.
X0, X1 = train.x1, train.x2
xx, yy = make_meshgrid(X0, X1, h=0.005)

# Grid points as rows, transformed with the scaler fitted on the training data.
grid = scaler.transform(np.column_stack((xx.ravel(), yy.ravel())))

# Shift by 0.5 so each cluster id falls strictly between two integer levels.
z = np.apply_along_axis(getclusterid, 1, grid, som, clusterarr).reshape(xx.shape) + 0.5

In [None]:
# Draw the cluster of every grid point as filled contours and overlay the
# instances on which the two models disagree.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
plt.subplots_adjust(hspace=0.5)

# One colour per row of clustercolor; integer levels 0..ncluster bracket the
# values stored in z.
collist = list(clustercolor.color)
MyCmap=co.ListedColormap(collist)
cf = ax.contourf(xx,yy,z, alpha = 0.7, cmap=MyCmap, levels = list(range(ncluster+1)))

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')

ax.scatter(train.loc[train.difference_bin, 'x1'], train.loc[train.difference_bin, 'x2'], color='black', label = 'Decision Difference', alpha = 1, s = 15)

# Empty line plots serve as legend handles; dict.fromkeys removes repeated
# colours while keeping their order, and the handles are numbered from 0.
h = [plt.plot([],[], color = i, linewidth=10, label = j)[0] for j,i in enumerate(list(dict.fromkeys(collist)))]
leg = fig.legend(handles=h, title='Cluster', frameon = False, ncol = 10, loc='lower left',
           bbox_to_anchor=(0.125,-0.05))
# NOTE(review): _legend_box is a private matplotlib attribute.
leg._legend_box.align = "left"

ax.set_facecolor('#FFFFFF')
plt.tight_layout()
plt.grid(True, color='#F3F3F3')

ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)

plt.savefig('docout/sections/communication/Step2_Clustering_' + data + '.jpg', dpi=150, bbox_inches='tight', transparent=True,pad_inches=0)

In [None]:
# Load the stored explainers for the current data set.
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
explainer_path = 'Approach2Explainer_' + data + '.pickle'
with open(explainer_path, 'rb') as explainer_file:
    explainers = pickle.load(explainer_file)

In [None]:
# Plot the explainer of one region as a tree and colour every node by its
# majority class.
region = 0

fig, ax = plt.subplots(figsize=(15, 8))
class_names = explainers[region].classes_
colors = [color_dict[data][name][0] for name in class_names]
N = len(class_names)
artists = plot_tree(
    explainers[region],
    fontsize=9,
    ax=ax,
    impurity=False,
    node_ids=True,
    feature_names=cols,
    class_names=class_names,
)
# replace_text edits the annotation text in place.
ax.properties()['children'] = [replace_text(child) for child in ax.properties()['children']]

fitted_tree = explainers[region].tree_
for artist, impurity, value in zip(artists, fitted_tree.impurity, fitted_tree.value):
    # f = 0 keeps the class colour, f = 1 is white; f grows with the impurity.
    f = impurity * N / (N - 1) if N > 1 else 0
    # The class with the largest value decides the base colour.
    base_rgb = to_rgb(colors[np.argmax(value)])
    box = artist.get_bbox_patch()
    box.set_facecolor(tuple(f + (1 - f) * channel for channel in base_rgb))
    box.set_edgecolor('black')

fig.savefig('docout/sections/communication/Step3_Region' + str(region) + '_Explainer_' + data + ".jpg",
            dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# pyperclip is imported here because this cell runs before the notebook's
# only other `import pyperclip`; without it a fresh kernel raises NameError.
import pyperclip

# Turn the region's explainer into textual rules and put them on the clipboard.
rule_lines = get_rules(explainers[region], cols, class_names)
rules = '\n'.join(rule_lines)
pyperclip.copy(rules)

Rule 1: if ($x1 \leq -0.428$) and ($x2 \leq 0.613$) and ($x1 \leq -0.679$) then class: 2|2
Rule 2: if ($x1 \leq -0.428$) and ($x2 \leq 0.613$) and ($x1 > -0.679$) then class: 1|2
Rule 3: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 \leq -0.763$) and ($x2 \leq 0.687$) then class: 2|2
Rule 4: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 \leq -0.763$) and ($x2 > 0.687$) and ($x1 \leq -1.144$) then class: 2|2
Rule 5: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 \leq -0.763$) and ($x2 > 0.687$) and ($x1 > -1.144$) and ($x2 \leq 0.745$) and ($x1 \leq -0.816$) then class: 2|2
Rule 6: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 \leq -0.763$) and ($x2 > 0.687$) and ($x1 > -1.144$) and ($x2 \leq 0.745$) and ($x1 > -0.816$) then class: 1|2
Rule 7: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 \leq -0.763$) and ($x2 > 0.687$) and ($x1 > -1.144$) and ($x2 > 0.745$) and ($x2 \leq 0.8$) and ($x1 \leq -0.915$) then class: 2|2
Rule 8: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 \leq -0.763$) and ($x2 > 0.687$) and ($x1 > -1.144$) and ($x2 > 0.745$) and ($x2 \leq 0.8$) and ($x1 > -0.915$) then class: 1|2
Rule 9: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 \leq -0.763$) and ($x2 > 0.687$) and ($x1 > -1.144$) and ($x2 > 0.745$) and ($x2 > 0.8$) then class: 1|2
Rule 10: if ($x1 \leq -0.428$) and ($x2 > 0.613$) and ($x1 > -0.763$) then class: 1|2
Rule 11: if ($x1 > -0.428$) and ($x1 \leq -0.372$) and ($x2 \leq 1.014$) then class: 1|1
Rule 12: if ($x1 > -0.428$) and ($x1 \leq -0.372$) and ($x2 > 1.014$) then class: 1|2
Rule 13: if ($x1 > -0.428$) and ($x1 > -0.372$) then class: 1|1

In [None]:
# Finer grid over part of the input space, with the cluster of each point.
X0, X1 = train.x1, train.x2
x_range = np.array([train.x1.min(), 0.3])
y_range = np.array([train.x2.max(), 0.55])
xx, yy = make_meshgrid(x_range, y_range, h=0.001)

# Keep both versions: the raw grid and the grid transformed by the scaler.
grid = np.column_stack((xx.ravel(), yy.ravel()))
gridtransformed = scaler.transform(grid)

zcluster = np.apply_along_axis(getclusterid, 1, gridtransformed, som, clusterarr).reshape(xx.shape)

In [None]:
# Leaf reached by every grid point, renumbered from 0 in order of appearance.
leaf_of_point = explainers[region].apply(grid)
zrule = pd.factorize(leaf_of_point)[0].reshape(xx.shape)
rulenames = np.unique(zrule)
# Contour levels: every rule number plus a closing upper bound of 11.
levelsrule = list(rulenames) + [11]
# Points outside the selected region get a value below every level.
zrule[zcluster != region] = -100

In [None]:
# Show which rule applies where inside the selected region, together with the
# cluster outlines and the contours drawn for both models.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
plt.subplots_adjust(hspace=0.5)
# The first 11 palette colours; levelsrule ends with the matching bound of 11.
collist = plot_colors[:11]
MyCmap=co.ListedColormap(collist)

cntr1 = plot_contours(ax, modelA, xx, yy, levels=1, colors='black', linewidths=2, linestyles='dotted')
cntr2 = plot_contours(ax, modelB, xx, yy, levels=1, colors='black', linewidths=1, linestyles='dashed')

# NOTE(review): the levels come from `z`, which is computed on the coarser
# grid of an earlier cell, not from zcluster — confirm this is intended.
cf = ax.contour(xx, yy, zcluster+0.1, colors = 'k', levels = np.unique(z))
cfrule = ax.contourf(xx,yy,zrule+0.1, alpha = 0.7, cmap=MyCmap, levels = levelsrule)

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')

# Empty line plots used only as legend handles.
h = [plt.plot([],[], ls = 'solid', color = 'black')[0],
     plt.plot([],[], ls = 'dotted', color = 'black')[0],
     plt.plot([],[], ls = 'dashed', color = 'black')[0]]
h1 = [plt.plot([],[], color = i, linewidth=10)[0] for i in collist]
legend2 = ax.legend(handles=h, labels=['Cluster Boundaries', 'Decision Boundary $M_A$','Decision Boundary $M_B$'],
          loc='lower left', title='', frameon=False, bbox_to_anchor=(0, -0.12),
          ncol=4)

# Rule labels are 1-based. h1 holds one handle per palette colour, which may
# be more than there are labels.
legend1 = ax.legend(h1, ['Rule ' + str(j) for j in np.unique(rulenames)+1], loc='lower left', title='', frameon=False, bbox_to_anchor=(0, -0.2), ncol=7)
# Both legends are added as artists so that both are shown on the axes.
ax.add_artist(legend1)
ax.add_artist(legend2)

ax.set_facecolor('#FFFFFF')
plt.tight_layout()
plt.grid(True, color='#F3F3F3')

plt.savefig('docout/sections/communication/Step3_Region' + str(region) + '_RegionsRules_' + data + ".jpg", dpi=150, bbox_inches='tight',transparent=True, pad_inches=0)

### Step 3

In [None]:
# Load the stored neighborhoods for the current data set.
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
neighborhood_path = 'Approach2Neighborhood_' + data + '.pickle'
with open(neighborhood_path, 'rb') as neighborhood_file:
    neighborhoods = pickle.load(neighborhood_file)

In [None]:
# Fit one depth-4 decision tree per region on that region's neighborhood,
# with the pair of model predictions ("A|B") as the target.
explainers = {key: None for key in neighborhoods.keys()}
for region in neighborhoods.keys():
    subgeneticneighborhood = pd.DataFrame(neighborhoods[region])
    subgeneticneighborhood.columns = cols

    neighborhood_values = subgeneticneighborhood.values
    predA = modelA.predict(neighborhood_values).astype(str)
    predB = modelB.predict(neighborhood_values).astype(str)
    with_separator = np.char.add(predA, '|')
    difference = pd.Series(np.char.add(with_separator, predB))

    clf = DecisionTreeClassifier(random_state=0, max_depth=4)
    clf.fit(subgeneticneighborhood, difference)
    explainers[region] = clf

In [None]:
# Train one 2-D SOM per sliding window of `windowsize` consecutive clusters.
# Per window this cell stores the trained SOM, the encoded and scaled data,
# the explainer predictions used as targets, and the fitted scaler.
# NOTE(review): this cell reads `discrete_woclassname`, `continuous` and
# `d[feature]` (encoders with a `transform` method), which are only assigned
# in later cells of this notebook. On a fresh kernel run from top to bottom
# it cannot work — confirm the intended position of this section.
somvislist = list()
slidingtrainsomnormlist = list()
targetlist = list()
scalerswindow = list()
shapesomregion = 15
windowsize = 3
# NOTE(review): this is the largest cluster id, not the number of clusters —
# confirm that `ncluster - 2` below yields the intended number of windows.
ncluster = clusterarr.cluster.max()

for currwindow in range(ncluster - 2):
    print('window: ', currwindow)

    # Stack the neighborhoods of the clusters in the current window.
    slidingtrainsom = pd.concat([pd.DataFrame(neighborhoods[x]) for x in range(currwindow, currwindow + windowsize)])
    slidingtrainsom.columns = cols

    # Target: each cluster's own explainer predicts its own neighborhood.
    target = np.concatenate(
        [explainers[x].predict(neighborhoods[x]) for x in range(currwindow, currwindow + windowsize)])
    targetlist.append(pd.Series(target))

    # Replace every discrete feature by its encoded indicator columns.
    for feature in discrete_woclassname:
        tmp = d[feature].transform(slidingtrainsom[feature].values.reshape(-1, 1)).toarray()
        colnames = [feature + str(i) for i in range(tmp.shape[1])]
        slidingtrainsom[colnames] = tmp
        slidingtrainsom.drop(columns=feature, inplace=True)

    # The continuous columns are standardised separately for every window.
    scalerwindow = StandardScaler()
    slidingtrainsom[continuous] = scalerwindow.fit_transform(slidingtrainsom[continuous])
    scalerswindow.append(scalerwindow)
    slidingtrainsomnormlist.append(slidingtrainsom)

    somvisregion = minisom.MiniSom(shapesomregion, shapesomregion, slidingtrainsom.shape[1], sigma=2.0,
                                   learning_rate=1.0, random_seed=0)
    somvisregion.train(slidingtrainsom.values, 100000, verbose=False)
    somvislist.append(somvisregion)

In [None]:
# One SOM projection per sliding window, side by side. Every SOM node that
# wins at least one instance is marked with the colour and marker of the most
# frequent explainer prediction among its instances.
rows = 1
columns = 5

gs = gridspec.GridSpec(rows, columns)
fig = plt.figure(figsize=(35, 7))
gs.update(wspace=0.07)

# All agreeing predictions are shown as one class that reuses the '0|0' style.
color_dict[data]['No decision difference'] = color_dict[data]['0|0']
classes_dict[data]['No decision difference'] = 'No decision difference'

# Last plotted artist per class, kept to build the legend afterwards.
collector_color = dict.fromkeys(color_dict[data].keys())
collector_marker = dict.fromkeys(color_dict[data].keys())

#plot of iterations training:
for index in range(ncluster-2):
    col = int(np.mod(index, columns))
    row = int(np.floor(index/columns))
    ax = pl.subplot(gs[row, col])

    Z = np.zeros((shapesomregion, shapesomregion))
    somvisregion = somvislist[index]
    subgeneticneighborhoodbinary = slidingtrainsomnormlist[index]
    target = targetlist[index]
    # This relabels the Series stored in targetlist in place.
    target[target.isin(['0|0', '1|1', '2|2'])] = 'No decision difference'

    # Winning SOM coordinates of every instance in the window.
    w_x, w_y = zip(*[somvisregion.winner(d) for d in subgeneticneighborhoodbinary.values])
    for i in np.arange(somvisregion._weights.shape[0]):
        for j in np.arange(somvisregion._weights.shape[1]):
            idx = [x == i and y == j for x, y in zip(w_x, w_y)]
            tmp = target[idx]
            if len(tmp) > 0:
                feature = tmp.value_counts().idxmax()
                collector_color[feature] = plt.plot([i + .5], [j + .5], color=color_dict[data][feature][0],
                                                    marker='s', markersize=19, linewidth = 0) #15
                collector_marker[feature] = plt.plot([i + .5], [j + .5], marker=color_dict[data][feature][1][0],
                                                     color=color_dict[data][feature][1][1], markersize=13, markerfacecolor = 'None',linewidth = 0) #10

    plt.xlim([0, shapesomregion])
    plt.ylim([0, shapesomregion])

    plt.grid(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.set_yticks([])
    ax.set_xticks([])
    ax.set_title('Clusters: ' + ', '.join([str(x) for x in range(index, index + windowsize)]))

# Classes that were never plotted have no handle and are left out of the legend.
collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

leg = fig.legend([(collector_color[j][0], collector_marker[j][0]) for j in collector_color.keys()],
                 [classes_dict[data][j] for j in collector_color.keys()],
                 title='Prediction explainer', frameon=False, ncol = 10, loc='lower left',bbox_to_anchor=(0.125, -0.05)
                 )
# NOTE(review): _legend_box is a private matplotlib attribute.
leg._legend_box.align = "left"
# `figurepath` was never defined in this notebook; the directory used by every
# other savefig call is used instead.
plt.savefig('docout/sections/communication/' + 'Step3_combined_' + data + '.jpg', dpi=150,bbox_inches='tight', transparent=True, pad_inches=0)

# Compas

In [None]:
# Load the compas data, split it, load both black-box models and build the
# train/test sets for difference detection.
data = 'compas'
dataA, dataB, cols, discrete, continuous, le = loaddata(data)
blackboxtrainA, trainA, testA = split3fold(dataA, 0.4, 0.2, random_state=1)
blackboxtrainB, trainB, testB = split3fold(dataB, 0.4, 0.2, random_state=1)

# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
# The files are opened with a context manager so the handles are closed.
with open('blackboxes/' + data + 'A.sav', 'rb') as handle:
    modelA = pickle.load(handle)
with open('blackboxes/' + data + 'B.sav', 'rb') as handle:
    modelB = pickle.load(handle)


def _difference_dataset(parts):
    """Stack ``parts`` and label every row with both models' predictions.

    Adds ``difference`` ("<prediction A>|<prediction B>") and
    ``difference_binary`` (True where the predictions differ), then drops the
    helper columns and ``y`` and resets the index.
    """
    frame = pd.concat(parts)
    frame['predA'] = modelA.predict(frame[cols].values)
    frame['predB'] = modelB.predict(frame[cols].values)
    frame['difference'] = frame.apply(lambda row: str(int(row['predA'])) + '|' + str(int(row['predB'])), axis=1)
    frame['difference_binary'] = frame.apply(lambda row: not (row['predA']==row['predB']), axis=1)
    return frame.drop(columns=['predA', 'predB', 'y']).reset_index(drop=True)


#create difference detection dataset:
train = _difference_dataset([trainA, trainB])
test = _difference_dataset([testA, testB])

# The difference label is treated as a discrete column from here on.
discrete.append('difference')

In [None]:
# One-hot encode the discrete features (except the difference label) and
# standardise the continuous ones for both train and test.
discrete_woclassname = discrete.copy()
discrete_woclassname.remove('difference')

# One encoder per feature, created on first access; drop='first' omits the
# first category of every feature.
# NOTE(review): `d` is also the name of the StandardScaler in the
# running-example section; this assignment replaces it.
d = defaultdict(lambda: OneHotEncoder(drop = 'first'))
trainbinary = train.copy()
testbinary = test.copy()
colsbinary = cols.copy()
for feature in discrete_woclassname:
    # Fit on the values of train and test together.
    uniquevals = np.concatenate((trainbinary[feature].values.reshape(-1,1), testbinary[feature].values.reshape(-1,1)))
    d[feature].fit(uniquevals)
    tmp = d[feature].transform(trainbinary[feature].values.reshape(-1,1)).toarray()
    # Encoded columns are named <feature>0, <feature>1, ...
    colnames = [feature + str(i) for i in range(tmp.shape[1])]
    trainbinary[colnames] = tmp
    testbinary[colnames] = d[feature].transform(testbinary[feature].values.reshape(-1,1)).toarray()
    # Replace the original feature by its encoded columns in the column list
    # and in both frames.
    colsbinary = colsbinary + colnames
    colsbinary.remove(feature)
    trainbinary.drop(columns = feature, inplace = True)
    testbinary.drop(columns = feature, inplace = True)

trainbinarynorm = trainbinary[colsbinary].copy()
testbinarynorm = testbinary[colsbinary].copy()
# The scaler is fitted on the training data only and applied to both sets.
scaler = StandardScaler().fit(trainbinarynorm[continuous].values)
trainbinarynorm[continuous] = scaler.transform(trainbinarynorm[continuous].values)
testbinarynorm[continuous] = scaler.transform(testbinarynorm[continuous].values)

In [None]:
# Square SOM over the encoded, scaled training data, used for visualisation.
shapesom = 20
somvis = minisom.MiniSom(
    x=shapesom,
    y=shapesom,
    input_len=trainbinarynorm.shape[1],
    sigma=2.0,
    learning_rate=1.0,
    random_seed=0,
)
somvis.train(trainbinarynorm.values, num_iteration=100000, verbose=True)

In [None]:
# SOM projection of the training data: every node that wins at least one
# instance is marked according to whether most of its instances show a
# decision difference.
Z = np.zeros((shapesom, shapesom))
fig, ax = plt.subplots(figsize=(10, 10))

# key -> [square colour, marker symbol, marker colour]
colors = {'True': [plot_colors[1], 'o', 'black'], 'False': [plot_colors[0], 'x', 'white']}
# Legend text per key, so labels stay matched to handles even when one class
# is never drawn.
legend_labels = {'True': 'Decision differences', 'False': 'No decision differences'}

# Last plotted artist per class, kept to build the legend afterwards.
collector_color = dict.fromkeys(colors.keys())
collector_marker = dict.fromkeys(colors.keys())

# Winning SOM coordinates of every training instance.
w_x, w_y = zip(*[somvis.winner(d) for d in trainbinarynorm.values])
for i in np.arange(somvis._weights.shape[0]):
    for j in np.arange(somvis._weights.shape[1]):
        idx = [x == i and y == j for x, y in zip(w_x, w_y)]
        tmp = train.difference_binary[idx]
        if len(tmp) > 0:
            feature = tmp.value_counts().idxmax()
            collector_color[str(feature)] = plt.plot([i + .5], [j + .5], color=colors[str(feature)][0],marker='s',
                                                     markersize=22, linewidth = 0)
            collector_marker[str(feature)] = plt.plot([i + .5], [j + .5], color=colors[str(feature)][2], linewidth = 0,
                                        marker=colors[str(feature)][1], markersize=15, markerfacecolor = 'None')

# Only classes that were actually drawn have a handle; indexing the
# placeholder None of an undrawn class would raise a TypeError.
drawn = [key for key in colors if collector_color[key] is not None]
leg = plt.legend([(collector_color[key][0], collector_marker[key][0]) for key in drawn],
                 [legend_labels[key] for key in drawn], markerscale = 0.5,
                  frameon=False, ncol=5, loc='lower left',bbox_to_anchor=(0, -0.1)
                 )

plt.xlim([0, shapesom])
plt.ylim([0, shapesom])

plt.grid(False)

# White grid lines on the integer ticks separate the node squares.
plt.grid(color = '#ffffff')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(range(shapesom))
ax.set_yticks(range(shapesom))
ax.set_yticklabels([])
ax.set_xticklabels([])

plt.savefig('docout/sections/communication/Step1_SOMprojection_' + data + '.jpg', dpi=150, bbox_inches='tight',
            transparent=False, pad_inches=0)

In [None]:
# Load the stored SOM and its node-to-cluster table for the current data set.
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
som_path = 'Approach2SOM_' + data + '.pickle'
with open(som_path, 'rb') as som_file:
    som = pickle.load(som_file)

clusterarr_path = 'Approach2Clusterarray_' + data + '.pickle'
with open(clusterarr_path, 'rb') as clusterarr_file:
    clusterarr = pickle.load(clusterarr_file)

In [None]:
# Load the stored neighborhoods for the current data set.
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
neighborhood_path = 'Approach2Neighborhood_' + data + '.pickle'
with open(neighborhood_path, 'rb') as neighborhood_file:
    neighborhoods = pickle.load(neighborhood_file)

In [None]:
# Share of each cluster among the training instances whose winning node on
# the visual SOM is one of the listed positions.
cluster_of_instance = np.apply_along_axis(getclusterid, 1, trainbinarynorm.values, som, clusterarr)
clusterwinners = pd.Series(cluster_of_instance.ravel())
ofinterest = [(0, 10), (1, 10), (2, 9)]
w = [somvis.winner(sample) for sample in trainbinarynorm.values]
ind = [position in ofinterest for position in w]
clusterwinners[ind].value_counts().div(np.sum(ind))

In [None]:
# Number of rows of clusterarr per cluster.
clusterarr.groupby(by='cluster').count()

In [None]:
#plot of decision differences for specific region based on generated neighborhood:
region = 2
subgeneticneighborhood = pd.DataFrame(neighborhoods[region])
subgeneticneighborhood.columns = cols

subgeneticneighborhoodbinary = subgeneticneighborhood.copy()
for feature in discrete_woclassname:
    tmp = d[feature].transform(subgeneticneighborhoodbinary[feature].values.reshape(-1, 1)).toarray()
    colnames = [feature + str(i) for i in range(tmp.shape[1])]
    subgeneticneighborhoodbinary[colnames] = tmp
    subgeneticneighborhoodbinary.drop(columns = feature, inplace = True)

subgeneticneighborhoodbinary[continuous] = scaler.transform(subgeneticneighborhoodbinary[continuous].values)

predA = modelA.predict(subgeneticneighborhood.values).astype(str)
predB = modelB.predict(subgeneticneighborhood.values).astype(str)
difference = pd.Series(np.char.add(np.char.add(predA, '|'), predB))
clf = DecisionTreeClassifier(random_state=0, max_depth=3)
clf.fit(subgeneticneighborhood, difference)

target = pd.Series(clf.predict(subgeneticneighborhood))

In [None]:
# Square root of 5 * sqrt(number of rows in the neighborhood).
np.sqrt(np.sqrt(len(subgeneticneighborhood)) * 5)

In [None]:
#For visualisation: square SOM over the encoded, scaled neighborhood.
shapesomregion = 15
somvisregion = minisom.MiniSom(
    x=shapesomregion,
    y=shapesomregion,
    input_len=subgeneticneighborhoodbinary.shape[1],
    sigma=2.0,
    learning_rate=1.0,
    random_seed=0,
)
somvisregion.train(subgeneticneighborhoodbinary.values, num_iteration=100000, verbose=True)

In [None]:
# SOM projection of the region's neighborhood: every node that wins at least
# one instance is marked with the colour and marker of the most frequent
# explainer prediction among its instances.
Z = np.zeros((shapesomregion, shapesomregion))
fig, ax = plt.subplots(figsize=(10, 10))

# Last plotted artist per class, kept to build the legend afterwards.
collector_color = dict.fromkeys(color_dict[data].keys())
collector_marker = dict.fromkeys(color_dict[data].keys())

# Winning SOM coordinates of every neighborhood instance.
w_x, w_y = zip(*[somvisregion.winner(d) for d in subgeneticneighborhoodbinary.values])
for i in np.arange(somvisregion._weights.shape[0]):
    for j in np.arange(somvisregion._weights.shape[1]):
        idx = [x == i and y == j for x, y in zip(w_x, w_y)]
        tmp = target[idx]
        if len(tmp) > 0:
            feature = tmp.value_counts().idxmax()
            # Filled square in the class colour, with the class marker on top.
            collector_color[feature] = plt.plot([i + .5], [j + .5], color=color_dict[data][feature][0],
                     marker='s', markersize=32, linewidth = 0)
            collector_marker[feature] = plt.plot([i + .5], [j + .5], marker=color_dict[data][feature][1][0],
                     color=color_dict[data][feature][1][1], markersize=20, markerfacecolor = 'None',linewidth = 0)

# Classes that were never plotted have no handle and are left out of the legend.
collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

leg = plt.legend([(collector_color[j][0], collector_marker[j][0]) for j in collector_color.keys()],
    [classes_dict[data][j] for j in collector_color.keys()],markerscale = 0.45,
    title='Prediction explainer', frameon=False, ncol=5, loc='lower left',bbox_to_anchor=(0, -0.125)
)
# NOTE(review): _legend_box is a private matplotlib attribute.
leg._legend_box.align = "left"

plt.xlim([0, shapesomregion])
plt.ylim([0, shapesomregion])

plt.grid(False)

# White grid lines on the integer ticks separate the node squares.
plt.grid(color = '#ffffff')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(range(shapesomregion))
ax.set_yticks(range(shapesomregion))
ax.set_yticklabels([])
ax.set_xticklabels([])

plt.savefig('docout/sections/communication/Step3_Region' + str(region) + '_SOMprojection_ClassColored_' + data + '.jpg', dpi=150,bbox_inches='tight', transparent=False, pad_inches=0)

In [None]:
# One pie chart per SOM node showing which rules (tree leaves) the instances
# mapped to that node fall into.
fig, ax = plt.subplots(figsize=(10, 10))
rules = clf.apply(subgeneticneighborhood.values)
# Number the leaves that actually occur, in ascending leaf id. For a full
# depth-3 tree this gives 3 -> Rule 1, 4 -> Rule 2, ..., 14 -> Rule 8, the
# mapping that used to be hard-coded; other tree shapes no longer raise a
# KeyError.
label_names = {leaf: 'Rule ' + str(number)
               for number, leaf in enumerate(np.unique(rules), start=1)}

labels_map = somvisregion.labels_map(subgeneticneighborhoodbinary.values, [label_names[t] for t in rules])

the_grid = gridspec.GridSpec(shapesomregion, shapesomregion, fig)
for position in labels_map.keys():
    label_fracs = [labels_map[position][l] for l in label_names.values()]
    # Grid rows count from the top, so the second coordinate is flipped.
    plt.subplot(the_grid[shapesomregion-1-position[1],
                         position[0]], aspect=1)
    patches, texts = plt.pie(label_fracs, colors=plot_colors)
#region 1(-12.1, -14.7)
#region2(-6.1, -2.7)
# The legend reuses the wedges of the last pie drawn.
plt.legend(patches, label_names.values(), loc='lower left',bbox_to_anchor=(-11.1, -18.5), frameon=False, ncol = 6)
plt.savefig('docout/sections/communication/Step2_Region' + str(region) + '_SOMprojection_PieRules_' + data + '.jpg', dpi=150,bbox_inches='tight', transparent=False, pad_inches=0)

In [None]:
#U-Matrix: distance map of the region SOM in grey scale.
fig, ax = plt.subplots(figsize=(10, 11))
ax.grid(False)
im = plt.pcolor(somvisregion.distance_map().T, cmap='Greys')

# No axis labels and no ticks.
ax.set_xlabel('')
ax.set_ylabel('')
for clear in (ax.set_yticklabels, ax.set_xticklabels, ax.set_yticks, ax.set_xticks):
    clear([])

# Leave room at the bottom for a horizontal colour bar on its own axes.
fig.subplots_adjust(bottom=0.15)
cax = plt.axes([0.1, 0.1, 0.8, 0.017])
fig.colorbar(im, cax=cax, orientation='horizontal')

fig.savefig('docout/sections/communication/Step3_Region'+str(region)+'_SOMprojection_UMatrix_' + data + '.jpg',
            dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# Plot the region's decision tree and colour every node by its majority class.
fig, ax = plt.subplots(figsize=(17, 5))
class_names = clf.classes_
# Display names for the classes; colours are looked up by the raw class label.
class_names_decoded = [classes_dict[data][i] for i in class_names]
colors = [color_dict[data][x][0] for x in class_names]
N = len(class_names)
artists = plot_tree(clf, fontsize=9, ax=ax,
                    impurity=False, node_ids=True,
                    feature_names=cols, class_names=class_names_decoded)
# replace_text edits the annotation text in place.
ax.properties()['children'] = [replace_text(i) for i in ax.properties()['children']]
for artist, impurity, value in zip(artists, clf.tree_.impurity, clf.tree_.value):
    # let the max value decide the color; whiten the color depending on impurity (gini)
    r, g, b = to_rgb(colors[np.argmax(value)])
    # f = 0 keeps the class colour, f = 1 is white.
    f = impurity * N / (N - 1) if N > 1 else 0
    rnew = f + (1 - f) * r
    gnew = f + (1 - f) * g
    bnew = f + (1 - f) * b
    artist.get_bbox_patch().set(facecolor = (rnew, gnew, bnew),
                                edgecolor = 'black')
    # Weighted brightness of the fill colour; dark boxes get white text.
    brightness = np.sqrt(0.299*rnew*rnew + 0.587*gnew*gnew + 0.114*bnew*bnew)
    if brightness < 0.5:
        artist.set(color = 'white')

plt.savefig('docout/sections/communication/Step3_Region' + str(region) + '_Explainer_' + data + ".jpg", dpi=300, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
import pyperclip
from sklearn.tree import _tree

In [None]:
# Feature names with escaped underscores. Raw strings keep the backslashes
# literally; in a normal string literal `\_` is an invalid escape sequence
# that Python warns about.
escaped_feature_names = [
    'sex',
    'age',
    'race',
    r'juv\_fel\_count',
    r'juv\_misd\_count',
    r'juv\_other\_count',
    r'priors\_count',
    r'c\_charge\_degree',
]
# Turn the tree into textual rules and put them on the clipboard.
rules = get_rules(clf, escaped_feature_names, class_names_decoded)
rules = '\n'.join(rules)
pyperclip.copy(rules)

#### Description of cluster:
##### Genetic neighborhood:

In [None]:
# Copy of the neighborhood with the encoded categorical columns mapped back
# through the encoders in `le`.
regionencoded = subgeneticneighborhood.copy()
for encoded_column in ('sex', 'race', 'c_charge_degree'):
    integer_codes = regionencoded[encoded_column].astype(int)
    regionencoded[encoded_column] = le[encoded_column].inverse_transform(integer_codes)

In [None]:
# Summary statistics of the continuous columns of the region.
regionencoded.loc[:, continuous].describe()

In [None]:
# Count of each value of 'sex' divided by the number of rows in the region.
regionencoded['sex'].value_counts().div(len(regionencoded))

In [None]:
# Count of each value of 'race' divided by the number of rows in the region.
regionencoded['race'].value_counts().div(len(regionencoded))

In [None]:
# Count of each value of 'c_charge_degree' divided by the number of rows in the region.
regionencoded['c_charge_degree'].value_counts().div(len(regionencoded))

### Step 3

In [None]:
# One pie chart per node of the visual SOM showing which clusters (and hence
# explainers) the training instances mapped to that node belong to.
fig, ax = plt.subplots(figsize=(10, 10))
# NOTE(review): cluster ids 0-7 are hard-coded; any other id in
# clusterassignmentfull raises a KeyError below — confirm against clusterarr.
label_names = {0: 'Explainer 0', 1: 'Explainer 1', 2: 'Explainer 2',
               3: 'Explainer 3', 4: 'Explainer 4', 5: 'Explainer 5',
               6: 'Explainer 6', 7: 'Explainer 7'}

# Cluster of every training instance, flattened to one dimension.
clusterassignmentfull = np.apply_along_axis(getclusterid, 1, trainbinarynorm.values, som, clusterarr).flatten()
labels_map = somvis.labels_map(trainbinarynorm.values, [label_names[t] for t in clusterassignmentfull])

the_grid = gridspec.GridSpec(shapesom, shapesom, fig)
for position in labels_map.keys():
    label_fracs = [labels_map[position][l] for l in label_names.values()]
    # Grid rows count from the top, so the second coordinate is flipped.
    plt.subplot(the_grid[shapesom - 1 - position[1],
                         position[0]], aspect=1)
    patches, texts = plt.pie(label_fracs, colors=plot_colors)
#region 1(-12.1, -14.7)
#region2(-6.1, -2.7)
# The legend reuses the wedges of the last pie drawn.
plt.legend(patches, label_names.values(), loc='lower left', bbox_to_anchor=(-7.5, -15.5), frameon=False, ncol=5)
# NOTE(review): the file name contains `region`, although this plot is built
# from the whole training set — confirm the intended file name.
plt.savefig('docout/sections/communication/Step2_' + str(region) + '_SOMprojection_PieExplainer_' + data + '.jpg',dpi=150, bbox_inches='tight', transparent=False, pad_inches=0)

#### Component planes

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# Component planes: one panel per feature in `cols`, drawn over the SOM grid.
# Continuous features show the per-node mean, discrete features the per-node
# majority value.
rows = 2
columns = 4

gs = gridspec.GridSpec(rows, columns)
fig = plt.figure(figsize=(35, 20))
gs.update(wspace=0.1, hspace = 0.15)

# Winning node of every training instance. It does not depend on the feature,
# so it is computed once and kept as arrays for vectorised per-node masks
# (instead of rebuilding a Python list of booleans for every node and feature).
w_x, w_y = zip(*[somvis.winner(d) for d in trainbinarynorm.values])
bmu_x = np.asarray(w_x)
bmu_y = np.asarray(w_y)

for index, value in enumerate(cols):
    row, col = divmod(index, columns)
    ax = pl.subplot(gs[row, col])

    target = train[value]

    if value in continuous:
        # Mean feature value of the instances mapped to each node.
        Z = np.zeros((shapesom, shapesom))

        for i in range(somvis._weights.shape[0]):
            for j in range(somvis._weights.shape[1]):
                Z[i, j] = np.mean(target[(bmu_x == i) & (bmu_y == j)])

        m = plt.pcolor(Z.transpose(), cmap='Greys')
        # Hatched white axes background; presumably meant to remain visible
        # for nodes that received no instances (verify).
        ax.patch.set(hatch='..', edgecolor='red')
        ax.set_facecolor('#ffffff')
        #cbar = fig.colorbar(m, ax=ax, pad=0.03, orientation='horizontal')
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("bottom", size="5%", pad=0.1)
        plt.colorbar(m, cax=cax,orientation='horizontal')
    elif value in discrete:
        # One square per occupied node, coloured by the most frequent encoded value.
        for i in range(somvis._weights.shape[0]):
            for j in range(somvis._weights.shape[1]):
                tmp = target[(bmu_x == i) & (bmu_y == j)]
                if len(tmp) > 0:
                    feature = tmp.value_counts().idxmax()
                    plt.plot([i + .5], [j + .5], color=plot_colors[feature], marker='s', markersize=18, linewidth=0)

        # Legend colours are looked up by the encoded value itself, exactly as
        # for the node squares above. Indexing by enumeration position would
        # mislabel the colours whenever an encoded class is absent from `target`.
        presentcodes = np.unique(target)
        uniquevals = le[value].inverse_transform(presentcodes)
        h = [plt.plot([], [], color=plot_colors[code], linewidth=10, label=label)[0]
             for code, label in zip(presentcodes, uniquevals)]
        ax.legend(handles=h, loc='lower left', title='', frameon = False, bbox_to_anchor=(0,-0.1), ncol = 4, handlelength = 1)

    # Strip axis labels and ticks; only the feature name is kept as title.
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.set_yticks([])
    ax.set_xticks([])

    ax.set_title(value)

plt.savefig('docout/sections/communication/Step2_ComponentPlanes_' + data + '.jpg',dpi=150, bbox_inches='tight', transparent=False, pad_inches=0)

In [None]:
# Fit one depth-4 decision tree per region. Each tree predicts the joint
# "<model A prediction>|<model B prediction>" label of the region's samples.
explainers = {}
for region in neighborhoods.keys():
    subgeneticneighborhood = pd.DataFrame(neighborhoods[region])
    subgeneticneighborhood.columns = cols

    # Both models predict on the raw value matrix; predictions are cast to str
    # so they can be concatenated into a single label.
    predA, predB = (model.predict(subgeneticneighborhood.values).astype(str) for model in (modelA, modelB))
    difference = pd.Series(np.char.add(np.char.add(predA, '|'), predB))

    clf = DecisionTreeClassifier(random_state=0, max_depth=4).fit(subgeneticneighborhood, difference)
    explainers[region] = clf

In [None]:
# Rebuild the frame and the joint "A|B" prediction labels for region 0
# (the loop above leaves these names bound to the last region it visited).
subgeneticneighborhood = pd.DataFrame(neighborhoods[0])
subgeneticneighborhood.columns = cols

predA, predB = (model.predict(subgeneticneighborhood.values).astype(str) for model in (modelA, modelB))
difference = pd.Series(np.char.add(np.char.add(predA, '|'), predB))

In [None]:
def flatten(l):
    """Return one flat list with the items of every sub-iterable of ``l``, one level deep."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Per-window results, all indexed by window number.
somvislist = []               # trained SOM
slidingtrainsomnormlist = []  # one-hot encoded + standardised training frame
targetlist = []               # explainer predictions for the window's samples
scalerswindow = []            # StandardScaler fitted on the window
clusterassignmentfull = []    # cluster id of every sample in the window

shapesomregion = 15
windowsize = 3
ncluster = clusterarr.cluster.max()

# NOTE(review): the "- 2" equals windowsize - 1 for windowsize = 3 but is
# hard-coded, and ncluster is the maximum cluster id rather than a count.
for currwindow in range(ncluster-2):
    print('window: ', currwindow)

    # Cluster ids covered by this window.
    windowclusters = list(range(currwindow, currwindow + windowsize))

    slidingtrainsom = pd.concat([pd.DataFrame(neighborhoods[x]) for x in windowclusters])
    slidingtrainsom.columns = cols

    # Each sample is labelled by the explainer of its own cluster.
    target = np.concatenate([explainers[x].predict(neighborhoods[x]) for x in windowclusters])
    targetlist.append(pd.Series(target))

    # Cluster id repeated once per sample, in the same order as the concatenated frame.
    clusterassignmentfull.append([x for x in windowclusters for _ in range(len(neighborhoods[x]))])

    # Replace every discrete feature by its one-hot columns "<feature><k>".
    for feature in discrete_woclassname:
        rawcolumn = slidingtrainsom[feature].values.reshape(-1, 1)
        tmp = d[feature].transform(rawcolumn).toarray()
        colnames = ['{}{}'.format(feature, i) for i in range(tmp.shape[1])]
        slidingtrainsom[colnames] = tmp
        slidingtrainsom = slidingtrainsom.drop(columns=feature)

    # Standardise the continuous features with a scaler fitted on this window only.
    scalerwindow = StandardScaler()
    slidingtrainsom[continuous] = scalerwindow.fit_transform(slidingtrainsom[continuous])
    scalerswindow.append(scalerwindow)
    slidingtrainsomnormlist.append(slidingtrainsom)

    # Square SOM with a fixed seed, trained for 100000 iterations.
    somvisregion = minisom.MiniSom(shapesomregion, shapesomregion, slidingtrainsom.shape[1],
                                   sigma=2.0, learning_rate=1.0, random_seed=0)
    somvisregion.train(slidingtrainsom.values, 100000, verbose=False)
    somvislist.append(somvisregion)

In [None]:
rows = 1
columns = 5

gs = gridspec.GridSpec(rows, columns)
fig = plt.figure(figsize=(35, 7))
gs.update(wspace=0.07)

#color_dict[data]['No decision difference'] = color_dict[data]['0|0']
#classes_dict[data]['No decision difference'] = 'No decision difference'

# Last artist drawn per class; entries stay None for classes that never occur
# and are filtered out before the legend is built.
collector_color = dict.fromkeys(color_dict[data].keys())
collector_marker = dict.fromkeys(color_dict[data].keys())

# One panel per sliding window: every occupied SOM node is drawn as a coloured
# square plus an overlaid marker for the majority explainer prediction.
for index in range(ncluster-2):
    row, col = divmod(index, columns)
    ax = pl.subplot(gs[row, col])

    Z = np.zeros((shapesomregion, shapesomregion))
    somvisregion = somvislist[index]
    subgeneticneighborhoodbinary = slidingtrainsomnormlist[index]
    target = targetlist[index]
    #target[target.isin(['0|0', '1|1', '2|2'])] = 'No decision difference'

    # Winning node coordinates of every sample in the window.
    w_x, w_y = zip(*[somvisregion.winner(d) for d in subgeneticneighborhoodbinary.values])
    winner_x = np.asarray(w_x)
    winner_y = np.asarray(w_y)

    for i in range(somvisregion._weights.shape[0]):
        for j in range(somvisregion._weights.shape[1]):
            idx = (winner_x == i) & (winner_y == j)
            tmp = target[idx]
            if len(tmp) == 0:
                continue
            feature = tmp.value_counts().idxmax()
            # color_dict entry layout: [square colour, [marker symbol, marker colour]]
            squarecolor = color_dict[data][feature][0]
            markersymbol, markercolor = color_dict[data][feature][1][0], color_dict[data][feature][1][1]
            collector_color[feature] = ax.plot([i + .5], [j + .5], color=squarecolor,
                                               marker='s', markersize=19, linewidth=0) #15
            collector_marker[feature] = ax.plot([i + .5], [j + .5], marker=markersymbol, color=markercolor,
                                                markersize=13, markerfacecolor='None', linewidth=0) #10

    ax.set_xlim([0, shapesomregion])
    ax.set_ylim([0, shapesomregion])

    # White grid lines on the node boundaries, no tick labels.
    ax.grid(color='#ffffff')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks(range(shapesomregion))
    ax.set_yticks(range(shapesomregion))
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.set_title('Clusters: ' + ', '.join(map(str, range(index, index + windowsize))))

collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

# Each legend entry combines the square and the marker artist of one class.
legendhandles = [(collector_color[j][0], collector_marker[j][0]) for j in collector_color.keys()]
legendlabels = [classes_dict[data][j] for j in collector_color.keys()]
leg = fig.legend(legendhandles, legendlabels,
                 title='Prediction explainer', frameon=False, ncol=10, loc='lower left',
                 bbox_to_anchor=(0.125, -0.05))
leg._legend_box.align = "left"
plt.savefig('docout/sections/communication/Step2_' + data + '.jpg', dpi=150,bbox_inches='tight', transparent=False, pad_inches=0)

In [None]:
rows = 1
columns = 5

gs = gridspec.GridSpec(rows, columns)
fig = plt.figure(figsize=(35, 7))
gs.update(wspace=0.07)

# [marker symbol, marker colour] pairs, indexed by cluster id (ten entries).
markers = [
    ['o', 'black'], ['v', 'black'], ['^', 'white'], ['s', 'black'], ['P', 'white'],
    ['D', 'black'], ['_', 'black'], ['|', 'black'], ['1', 'black'], ['*', 'white'],
]

# Initialised with the prediction-class keys, but filled below under cluster-id
# keys; the None placeholders are dropped after the loop.
collector_color = dict.fromkeys(color_dict[data].keys())
collector_marker = dict.fromkeys(color_dict[data].keys())

# One panel per sliding window: every occupied SOM node is drawn as a coloured
# square plus an overlaid marker for the majority cluster of its samples.
for index in range(ncluster - 2):
    row, col = divmod(index, columns)
    ax = pl.subplot(gs[row, col])

    Z = np.zeros((shapesomregion, shapesomregion))
    somvisregion = somvislist[index]
    subgeneticneighborhoodbinary = slidingtrainsomnormlist[index]
    target = clusterassignmentfull[index]
    targetseries = pd.Series(target)

    # Winning node coordinates of every sample in the window.
    w_x, w_y = zip(*[somvisregion.winner(d) for d in subgeneticneighborhoodbinary.values])
    winner_x = np.asarray(w_x)
    winner_y = np.asarray(w_y)

    for i in range(somvisregion._weights.shape[0]):
        for j in range(somvisregion._weights.shape[1]):
            idx = (winner_x == i) & (winner_y == j)
            tmp = targetseries[idx]
            if len(tmp) == 0:
                continue
            feature = tmp.value_counts().idxmax()
            markersymbol, markercolor = markers[feature][0], markers[feature][1]
            collector_color[feature] = ax.plot([i + .5], [j + .5], color=plot_colors[feature],
                                               marker='s', markersize=19, linewidth=0)  #15
            collector_marker[feature] = ax.plot([i + .5], [j + .5], marker=markersymbol, color=markercolor,
                                                markersize=13, markerfacecolor='None', linewidth=0)  #10

    ax.set_xlim([0, shapesomregion])
    ax.set_ylim([0, shapesomregion])

    # White grid lines on the node boundaries, no tick labels.
    ax.grid(color='#ffffff')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks(range(shapesomregion))
    ax.set_yticks(range(shapesomregion))
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.set_title('Clusters: ' + ', '.join(map(str, range(index, index + windowsize))))

collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

In [None]:

# Legend and export for the cluster-membership figure: one combined
# (square, marker) handle per key collected while plotting.
def _legend_label(key):
    """Return the display label for ``key``, falling back to a generic cluster label.

    The collectors filled by the cluster plot are keyed by cluster id, which
    need not be a key of ``classes_dict[data]``; the fallback keeps the legend
    from failing with a KeyError in that case and leaves existing keys untouched.
    """
    try:
        return classes_dict[data][key]
    except KeyError:
        return 'Cluster ' + str(key)


legendkeys = list(collector_color.keys())
leg = fig.legend([(collector_color[j][0], collector_marker[j][0]) for j in legendkeys],
                 [_legend_label(j) for j in legendkeys],
                 title='Prediction explainer', frameon=False, ncol=10, loc='lower left', bbox_to_anchor=(0.125, -0.05)
                 )
leg._legend_box.align = "left"
plt.savefig('docout/sections/communication/Step2_cluster_' + data + '.jpg', dpi=150, bbox_inches='tight', transparent=False,
            pad_inches=0)

In [None]:
# Cluster id of every training instance, shaped as a single column so it can
# be appended to the feature matrix.
clusterstrain = np.apply_along_axis(getclusterid, 1, trainbinarynorm.values, som=som, clusterarr=clusterarr)
clusterstrain = clusterstrain.reshape((len(clusterstrain), 1))

# Feature values with the cluster id as last column; `predict` is applied to
# each such row together with the per-cluster explainers.
res = np.concatenate((train[cols].values, clusterstrain), axis=1)
rowpredictions = np.apply_along_axis(predict, 1, res, explainers)
targettrain = pd.Series(rowpredictions.flatten())

# Keep the cluster ids as a flat Series for later masking.
clusterstrain = pd.Series(clusterstrain.flatten())

In [None]:
rows = 1
columns = 5

gs = gridspec.GridSpec(rows, columns)
fig = plt.figure(figsize=(35, 7))
gs.update(wspace=0.07)

# Last artist drawn per class; entries stay None for classes that never occur
# and are filtered out before the legend is built.
collector_color = dict.fromkeys(color_dict[data].keys())
collector_marker = dict.fromkeys(color_dict[data].keys())

# Plot of the training data: one panel per sliding window, projecting the
# training instances of the window's clusters onto that window's SOM.
for index in range(ncluster-2):
    row, col = divmod(index, columns)
    ax = pl.subplot(gs[row, col])

    somvisregion = somvislist[index]

    # Select the instances whose cluster lies in THIS window. The window must be
    # derived from `index`; the stale `currwindow` left over from the SOM-training
    # loop would select the last window's data for every panel.
    windowclusters = list(range(index, index + windowsize))
    # Positional mask: clusterstrain was built row by row from the training
    # matrix, so it is applied by position rather than by index label.
    inwindow = clusterstrain.isin(windowclusters).values

    # Copy the slice so the encoding below does not write into a view of `train`.
    subgeneticneighborhoodbinary = train.loc[inwindow, cols].copy()

    # Same preprocessing as for SOM training: one-hot encode the discrete
    # features, then scale the continuous ones with the window's fitted scaler.
    for feature in discrete_woclassname:
        tmp = d[feature].transform(subgeneticneighborhoodbinary[feature].values.reshape(-1, 1)).toarray()
        colnames = [feature + str(i) for i in range(tmp.shape[1])]
        subgeneticneighborhoodbinary[colnames] = tmp
        subgeneticneighborhoodbinary.drop(columns = feature, inplace = True)

    subgeneticneighborhoodbinary[continuous] = scalerswindow[index].transform(subgeneticneighborhoodbinary[continuous])
    target = targettrain[inwindow]

    # Winning node coordinates of every selected training instance.
    w_x, w_y = zip(*[somvisregion.winner(d) for d in subgeneticneighborhoodbinary.values])
    winner_x = np.asarray(w_x)
    winner_y = np.asarray(w_y)

    for i in range(somvisregion._weights.shape[0]):
        for j in range(somvisregion._weights.shape[1]):
            idx = (winner_x == i) & (winner_y == j)
            tmp = target[idx]
            if len(tmp) > 0:
                # Majority explainer prediction among the instances on this node.
                feature = tmp.value_counts().idxmax()
                collector_color[feature] = plt.plot([i + .5], [j + .5], color=color_dict[data][feature][0],
                                                    marker='s', markersize=19, linewidth = 0) #15
                collector_marker[feature] = plt.plot([i + .5], [j + .5], marker=color_dict[data][feature][1][0],
                                                     color=color_dict[data][feature][1][1], markersize=13, markerfacecolor = 'None',linewidth = 0) #10

    plt.xlim([0, shapesomregion])
    plt.ylim([0, shapesomregion])

    # White grid lines on the node boundaries, no tick labels.
    plt.grid(color = '#ffffff')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks(range(shapesomregion))
    ax.set_yticks(range(shapesomregion))
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.set_title('Clusters: ' + ', '.join([str(x) for x in windowclusters]))

collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

# Each legend entry combines the square and the marker artist of one class.
leg = fig.legend([(collector_color[j][0], collector_marker[j][0]) for j in collector_color.keys()],
                 [classes_dict[data][j] for j in collector_color.keys()],
                 title='Prediction explainer', frameon=False, ncol = 10, loc='lower left',bbox_to_anchor=(0.125, -0.05)
                 )
leg._legend_box.align = "left"
plt.savefig('docout/sections/communication/Step2_train_' + data + '.jpg', dpi=150,bbox_inches='tight', transparent=False, pad_inches=0)