In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from diro2c.data_generation.helper import *
import minisom
import matplotlib.pyplot as plt
import pickle
from diro2c.data_generation.neighborhood_generation import modified_gpdatagenerator
from diro2c.data_generation.distance_functions import simple_match_distance, normalized_euclidean_distance, mixed_distance
from diro2c.data_generation.helper import *
from diro2c.enums.diff_classifier_method_type import diff_classifier_method_type
from diro2c.data_generation.neighborhood_generation.gpdatagenerator import calculate_feature_values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.cluster import AgglomerativeClustering

from collections import defaultdict
from scipy.cluster.hierarchy import dendrogram, set_link_color_palette
import matplotlib.gridspec as gridspec
import matplotlib.pylab as pl
from data.getdata import loaddata, prepare_df
from data.split3fold import split3fold
# Apply the 'ggplot' matplotlib style sheet to every figure created in this notebook.
plt.style.use('ggplot')

from vars import plot_colors, color_dict, classes_dict

from sklearn.tree import plot_tree
import re
import matplotlib
def replace_text(obj):
    """Shorten the text of a matplotlib annotation and hand the object back.

    Only objects whose type is exactly ``matplotlib.text.Annotation`` are
    touched; anything else is returned unchanged. In the annotation text,
    the span from the first "samples" up to the last "class" (the pattern is
    greedy and stops only at a '$') is collapsed to just "class".

    Presumably meant for the node labels produced by ``plot_tree`` -- confirm
    against the caller.
    """
    if type(obj) != matplotlib.text.Annotation:
        return obj
    shortened = re.sub("samples[^$]*class", "class", obj.get_text())
    obj.set_text(shortened)
    return obj

In [None]:
def getclusterid(x, som, clusterarr):
    """Look up the cluster of the SOM node that wins for instance ``x``.

    x          ... normalized instance
    som        ... trained SOM, forwarded to ``getwinnerid``
    clusterarr ... DataFrame with the columns 'node' and 'cluster'

    Returns the matching 'cluster' entries as a pandas Series (not a scalar);
    callers flatten / index the result accordingly.
    """
    winner_node = getwinnerid(x, som)
    is_winner = clusterarr.node == winner_node
    return clusterarr.loc[is_winner, 'cluster']

def getwinnerid(x, som):
    """Return the second coordinate of the winning SOM unit for ``x``.

    x   ... normalized instance
    som ... object exposing ``winner(x)``; only element [1] of its result is
            used, which identifies the node on a map with a single row.
    """
    return som.winner(x)[1]

def distance_function(x0, x1, discrete, continuous, class_name):
    """Distance between two instances with mixed feature types.

    Delegates to ``mixed_distance`` with ``simple_match_distance`` as the
    metric for the discrete part and ``normalized_euclidean_distance`` for
    the continuous part. All arguments are forwarded unchanged.
    """
    metrics = {
        'ddist': simple_match_distance,
        'cdist': normalized_euclidean_distance,
    }
    return mixed_distance(x0, x1, discrete, continuous, class_name, **metrics)

In [None]:
# Name of the dataset to analyse. It is used to load the data and the pickled
# black boxes, to select the dendrogram cut from `distance_threshold`
# (keys there: 'compas', 'bankmarketing') and as a suffix of all output files.
data = 'compas'

In [None]:
# Load both dataset variants plus the feature metadata (feature column names,
# discrete / continuous feature lists) and a sixth value `le` that is not used
# in the cells visible here.
dataA, dataB, cols, discrete, continuous, le = loaddata(data)
# Split each variant into three parts with the same seed.
# NOTE(review): the meaning of 0.4 and 0.2 is defined by split3fold
# (presumably the fractions of two of the folds) -- confirm there.
blackboxtrainA, trainA, testA = split3fold(dataA, 0.4, 0.2, random_state=1)
blackboxtrainB, trainB, testB = split3fold(dataB, 0.4, 0.2, random_state=1)

In [None]:
# Load the two pickled black-box models for the selected dataset.
# The files are opened in `with` blocks so the handles are closed right after
# loading (the previous `pickle.load(open(...))` left them open).
# NOTE: unpickling executes arbitrary code -- only load files you trust.
with open('blackboxes/' + data + 'A.sav', 'rb') as handle:
    modelA = pickle.load(handle)
with open('blackboxes/' + data + 'B.sav', 'rb') as handle:
    modelB = pickle.load(handle)

In [None]:
def _label_prediction_differences(frame, model_a, model_b, feature_cols):
    """Return a copy of ``frame`` labelled with the two models' predictions.

    A string column 'difference' of the form '<predA>|<predB>' (predictions
    cast to int) is appended, the original target column 'y' is removed if it
    is present, and the index is reset to 0..n-1.

    frame        ... DataFrame containing at least ``feature_cols``
    model_a/_b   ... fitted models exposing ``predict``
    feature_cols ... list of feature column names fed to both models
    """
    features = frame[feature_cols].values
    pred_a = np.asarray(model_a.predict(features)).astype(int).astype(str)
    pred_b = np.asarray(model_b.predict(features)).astype(int).astype(str)
    labelled = frame.copy()
    # Vectorised replacement for the former row-wise apply.
    labelled['difference'] = np.char.add(np.char.add(pred_a, '|'), pred_b)
    # errors='ignore' is now used for train AND test, so both splits tolerate
    # a missing 'y' column in the same way.
    labelled = labelled.drop(columns=['predA', 'predB', 'y'], errors='ignore')
    return labelled.reset_index(drop=True)


# Pool the instances of both dataset variants and label them identically.
train = _label_prediction_differences(pd.concat([trainA, trainB]), modelA, modelB, cols)
test = _label_prediction_differences(pd.concat([testA, testB]), modelA, modelB, cols)

Prepare data:

In [None]:
# Discrete features without the class label. Built by filtering instead of
# copying so that re-running this cell (after 'difference' has already been
# appended to `discrete`) does not try to one-hot encode the label.
discrete_woclassname = [name for name in discrete if name != 'difference']
# Register the class label as a discrete column exactly once (idempotent).
if 'difference' not in discrete:
    discrete.append('difference')

# One encoder per discrete feature, created lazily on first access.
d = defaultdict(lambda: OneHotEncoder(drop = 'first'))
trainbinary = train.copy()
testbinary = test.copy()
colsbinary = cols.copy()

for feature in discrete_woclassname:
    train_column = trainbinary[feature].values.reshape(-1, 1)
    test_column = testbinary[feature].values.reshape(-1, 1)
    # Fit on train and test together so categories that occur only in the
    # test split are known to the encoder.
    uniquevals = np.concatenate((train_column, test_column))
    d[feature].fit(uniquevals)
    tmp = d[feature].transform(train_column).toarray()
    # Encoded columns are named <feature>0, <feature>1, ...
    colnames = [feature + str(i) for i in range(tmp.shape[1])]
    trainbinary[colnames] = tmp
    testbinary[colnames] = d[feature].transform(test_column).toarray()
    # Encoded columns are appended at the end; the raw feature is removed.
    colsbinary = colsbinary + colnames
    colsbinary.remove(feature)
    trainbinary.drop(columns = feature, inplace = True)
    testbinary.drop(columns = feature, inplace = True)

### 1-dimensional SOM to structure the dataset

In [None]:
# Encoded feature matrices: raw numpy versions and DataFrame copies that will
# hold the standardised values.
trainsomnormfull = trainbinary[colsbinary].copy()
testsomnorm = testbinary[colsbinary].copy()
trainsom = trainbinary[colsbinary].values
testsom = testbinary[colsbinary].values

# Standardise the continuous features; the scaler is fitted on the training
# data only and then applied to both splits.
scaler = StandardScaler().fit(trainsomnormfull[continuous].values)
trainsomnormfull[continuous] = scaler.transform(trainsomnormfull[continuous].values)
testsomnorm[continuous] = scaler.transform(testsomnorm[continuous].values)

# Keep only the instances on which the two models disagree, i.e. drop the
# labels where both predictions are equal.
disagreement_mask = ~train.difference.isin(['0|0', '1|1', '2|2'])
trainsomnorm = trainsomnormfull[disagreement_mask]
trainsom = trainsom[disagreement_mask]

In [None]:
# Results of an earlier SOM hyperparameter search (space-separated file).
hyperparameter = pd.read_csv('results/SOMhyperparameterperformance2022-10-17.txt', sep=' ')
hyperparameter = hyperparameter.loc[hyperparameter.data == data]
# Average the numeric metrics per (sigma, learningrate) setting.
# numeric_only=True keeps the string column 'data' out of the mean; without it
# pandas >= 2.0 raises a TypeError, while older versions silently dropped it.
hyperparameter = hyperparameter.groupby(['sigma', 'learningrate']).mean(numeric_only=True).reset_index()
# Show the setting(s) with the lowest mean quantization value.
print(hyperparameter.loc[hyperparameter.quantization == hyperparameter.quantization.min()])

In [None]:
# Number of SOM nodes: floor(5 * sqrt(N)), with N the number of training
# instances on which the two models disagree.
n_nodes = int(np.floor(5 * np.sqrt(trainsom.shape[0])))
# Map with grid shape 1 x n_nodes and one weight per encoded feature.
som = minisom.MiniSom(
    1, n_nodes, trainsomnorm.shape[1],
    sigma=0.9, learning_rate=1.0, random_seed=0,
)
som.train(trainsomnorm.values, 100000, verbose=True)

In [None]:
# Persist the trained SOM next to the notebook.
som_path = 'Approach2SOM_' + data + '.pickle'
with open(som_path, 'wb') as som_file:
    pickle.dump(som, som_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Clustering the SOM nodes using hierarchical (agglomerative) clustering with the single-linkage criterion

In [None]:
# Chain-shaped connectivity: node i is linked to node i + 1 only, i.e. ones on
# the first super-diagonal of an n_nodes x n_nodes float matrix.
connectivity_matrix = np.eye(n_nodes, k=1)

In [None]:
def plot_dendrogram(model, **kwargs):
    """Convert a fitted agglomerative model to a linkage matrix and plot it.

    Adapted from the scikit-learn documentation:
    https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html

    model  ... fitted clustering model exposing ``children_``, ``distances_``
               and ``labels_``
    kwargs ... forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``

    Returns the dictionary produced by ``dendrogram``.
    """
    n_leaves = len(model.labels_)
    subtree_sizes = np.zeros(model.children_.shape[0])

    # Number of original samples below each merge.
    for merge_idx, children in enumerate(model.children_):
        size = 0
        for child in children:
            # Ids below n_leaves are samples; larger ids point to earlier merges.
            size += 1 if child < n_leaves else subtree_sizes[child - n_leaves]
        subtree_sizes[merge_idx] = size

    linkage_matrix = np.column_stack(
        (model.children_, model.distances_, subtree_sizes)
    ).astype(float)
    return dendrogram(linkage_matrix, **kwargs)

In [None]:
# Weight vectors of the single SOM row, one row per node.
weights = som.get_weights()[0]
# distance_threshold=0 with n_clusters=None builds the complete merge tree
# (needed for the dendrogram); merges are restricted to neighbouring nodes by
# the connectivity matrix and use single linkage.
model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0,
    linkage='single',
    connectivity=connectivity_matrix,
).fit(weights)

In [None]:
# Dendrogram cut height per dataset (used for colouring and for clustering).
distance_threshold = dict(compas=3, bankmarketing=4.5)

In [None]:
# Dendrogram of the SOM nodes, coloured per cluster below the cut height.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
plt.subplots_adjust(hspace=0.5)
# Use the project colour list for the sub-trees below the threshold.
set_link_color_palette(plot_colors)
# `den` is kept: its 'leaves' and 'leaves_color_list' entries are needed later
# to map clusters to colours. Links above the threshold are drawn in black ('k').
den = plot_dendrogram(model, no_labels=True, color_threshold=distance_threshold[data],
                above_threshold_color='k'
                )
# Dotted horizontal line marking the cut height.
ax.axhline(y=distance_threshold[data], c = 'black', linestyle = 'dotted')
ax.set_facecolor('#FFFFFF')
plt.savefig('docout/sections/localtoglobal/results/approach4_Dendrogram_SOMNodes_' + data + '.jpg',dpi=300, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# Cut the merge tree at the dataset-specific threshold to obtain the clusters.
# NOTE: despite the variable name `ward`, the linkage used here is 'single'.
ward = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=distance_threshold[data],
    linkage="single",
    connectivity=connectivity_matrix,
).fit(weights)
label = ward.labels_

# Node -> cluster table; cluster ids are renumbered 0, 1, 2, ... in the order
# in which they first occur along the node axis.
clusterarr = pd.DataFrame({'node': range(n_nodes), 'cluster': pd.factorize(label)[0]})
ncluster = np.unique(label).size
print(ncluster)

In [None]:
#determine cluster ID for each instance of the training set:
# Cluster id of every SOM node's own weight vector (one row per node).
# getclusterid returns a one-element Series, so the results have shape (n, 1).
clusterwinnerspos = np.apply_along_axis(getclusterid, 1, weights, som, clusterarr)
# Cluster id of every training instance the SOM was trained on.
clusterwinners = np.apply_along_axis(getclusterid, 1, trainsomnorm, som, clusterarr)
# Winning node id of every training instance the SOM was trained on.
nodeswinners = np.apply_along_axis(getwinnerid, 1, trainsomnorm, som)

In [None]:
# Map every cluster to the colour its leaves received in the dendrogram.
# Cluster id of each dendrogram leaf, in leaf order.
clusterdendrogram = [clusterwinnerspos[x, 0] for x in den['leaves']]
clustercolor = [[x, y] for x, y in zip(clusterdendrogram, den['leaves_color_list'])]
# Distinct (cluster, colour) pairs. The mixed int/str pairs end up in one
# string array, which is why the cluster column is cast back with astype(int)
# below and why later lookups compare against str(cluster).
clustercolor = np.unique(clustercolor, axis=0)
# Order the rows by numeric cluster id.
clustercolor = clustercolor[clustercolor[:, 0].astype(int).argsort()]
# 'k' is the above-threshold colour of the dendrogram; clusters whose leaves
# carry it still need a colour of their own.
alreadyused = clustercolor[~(clustercolor[:, 1] == 'k'), 1]
available = [x for x in plot_colors if x not in alreadyused]
# NOTE(review): assumes exactly one row per cluster, so that the number of 'k'
# rows equals ncluster - len(alreadyused), and that plot_colors holds enough
# unused colours -- otherwise this assignment fails on a length mismatch.
clustercolor[clustercolor[:, 1] == 'k', 1] = available[:(ncluster - len(alreadyused))]
clustercolor = pd.DataFrame(clustercolor, columns=['cluster', 'color'])

In [None]:
# Clusters to which no training instance was mapped, and all of their nodes.
clusters_with_data = set(np.unique(clusterwinners).tolist())
clusterswithoutdata = [x for x in range(ncluster) if x not in clusters_with_data]
nodesofclusterwithoutdata = clusterarr.loc[clusterarr.cluster.isin(clusterswithoutdata), 'node'].tolist()
empty_nodes = set(nodesofclusterwithoutdata)

# For every node of such a cluster, walk outwards along the 1-d map (distance
# i = 1, 2, ...) and, at each step, look at the nearer (in weight space) of
# the two nodes node - i / node + i. The first one that belongs to a cluster
# with data donates its cluster id and colour.
for node in nodesofclusterwithoutdata:
    weightnode = weights[node]
    # Distances from this node's weights to all nodes; computed once per node.
    node_distances = som._activation_distance(weightnode, som._weights)[0]
    nextnode = node
    i = 1
    while nextnode in empty_nodes:
        # Only consider neighbours that exist on the map. Previously node - i
        # could wrap around to the other end (negative index) and node + i
        # could raise an IndexError before any bounds check was reached.
        candidates = [c for c in (node - i, node + i) if 0 <= c < n_nodes]
        if not candidates:
            raise ValueError('no SOM node of a cluster with data found for node ' + str(node))
        if len(candidates) == 2 and node_distances[candidates[1]] < node_distances[candidates[0]]:
            nextnode = candidates[1]
        else:
            # Single candidate, or the left neighbour is at least as close.
            nextnode = candidates[0]
        i = i + 1
    oldcluster = clusterarr.loc[clusterarr.node == node, 'cluster'].item()
    newcluster = clusterarr.loc[clusterarr.node == nextnode, 'cluster'].item()
    clusterarr.loc[clusterarr.node == node, 'cluster'] = newcluster
    # Cluster ids are stored as strings in clustercolor.
    clustercolor.loc[clustercolor.cluster == str(oldcluster), 'color'] = clustercolor.loc[
        clustercolor.cluster == str(newcluster), 'color'].item()

In [None]:
#determine cluster ID for each instance of the training set:
# Recompute the assignments because clusterarr was modified in the previous
# cell (nodes of clusters without data were moved to another cluster).
# getclusterid returns a one-element Series, so the first two results have
# shape (n, 1).
clusterwinnerspos = np.apply_along_axis(getclusterid, 1, weights, som, clusterarr)
clusterwinners = np.apply_along_axis(getclusterid, 1, trainsomnorm, som, clusterarr)
# Winning node id per training instance (does not depend on clusterarr).
nodeswinners = np.apply_along_axis(getwinnerid, 1, trainsomnorm, som)

In [None]:
# Persist the final node -> cluster table.
clusterarr_path = 'Approach2Clusterarray_' + data + '.pickle'
with open(clusterarr_path, 'wb') as clusterarr_file:
    pickle.dump(clusterarr, clusterarr_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Local Explanations

In [None]:
# Build one decision tree (explainer) per cluster. For each cluster a few
# representative SOM nodes are chosen, one training instance is drawn per
# node, a synthetic neighborhood is generated around it, and the tree is
# fitted on the part of the neighborhood that falls into the cluster.

# Preparation for the diro2c data generator.
train['difference'] = train['difference'].astype(str)
# prepare_df is a project helper; the result is used below as a mapping with
# the keys 'columns', 'discrete', 'continuous' and 'idx_features'.
dataset = prepare_df(train, 'train', 'difference', discrete=discrete, continuous=continuous)
features = dataset['columns'].copy()
features.remove('difference')
X = np.array(train[features])
# NOTE(review): presumably the per-feature value distributions the generator
# samples from -- confirm in calculate_feature_values.
feature_values = calculate_feature_values(
    X, dataset['columns'], 'difference', dataset['discrete'], dataset['continuous'], len(train)
)
# Discrete features without the class label.
discrete_no_class = list(dataset['discrete'])
discrete_no_class.remove('difference')

# Results, keyed by cluster id.
neighborhoods = dict()
explainers = dict.fromkeys(list(range(ncluster)))

# Cluster id per training instance as a flat array.
clusterassignment = clusterwinners.flatten()

# Same row filter as used for trainsomnorm, so the rows line up positionally
# with nodeswinners / clusterwinners.
traindifferences = train.loc[~train.difference.isin(['0|0', '1|1', '2|2'])]

# Index labels of the instances that were used as neighborhood seeds.
indexinstances = []
# Restrict the node -> cluster table to nodes that won at least one instance.
nodeswithoutdata = [x for x in range(n_nodes) if x not in list(np.unique(nodeswinners))]
subclusterarr = clusterarr.loc[~clusterarr.node.isin(nodeswithoutdata)]

for clusterid in np.unique(clusterassignment):
    print('processing cluster ' + str(clusterid))
    # Representative nodes: first, last and median node for clusters with more
    # than 4 populated nodes, first and last for more than 2, otherwise one
    # node drawn with a fixed seed.
    if len(subclusterarr.loc[subclusterarr.cluster==clusterid])>4:
        start = subclusterarr.loc[subclusterarr.cluster == clusterid,'node'].min()
        end = subclusterarr.loc[subclusterarr.cluster == clusterid,'node'].max()
        # int(median) is not guaranteed to be a populated node id (e.g. for an
        # even number of nodes); such a node is skipped by the check below.
        middle = int(subclusterarr.loc[subclusterarr.cluster == clusterid,'node'].median())
        nodes = [start, end, middle]
    elif len(subclusterarr.loc[subclusterarr.cluster==clusterid])>2:
        start = subclusterarr.loc[subclusterarr.cluster == clusterid,'node'].min()
        end = subclusterarr.loc[subclusterarr.cluster == clusterid,'node'].max()
        nodes = [start, end]
    else:
        nodes = list(subclusterarr.loc[subclusterarr.cluster == clusterid,'node'].sample(n=1, random_state = 0))

    # Accumulated neighborhood of this cluster (raw, un-encoded feature space).
    Z3 = np.empty((0, train[cols].shape[1]))

    for x in nodes:
        # Training instances whose winning node is x.
        indx = (nodeswinners == x)
        if indx.sum() >0:
            # Draw one seed instance for this node (fixed seed).
            instance = traindifferences.loc[indx].sample(n=1, random_state=0)
            instanceindex = instance.index[0]
            indexinstances.append(instanceindex)
            # Drop the last value of the row -- presumably the 'difference'
            # label, which is expected to be the last column of train.
            instance = instance.values.reshape(-1, )[:-1]
            # Generate the synthetic neighborhood around the seed instance
            # (project generator; parameters are passed through as configured).
            Z = modified_gpdatagenerator.generate_modified_data(instance, feature_values, modelA, modelB,
                                                                diff_classifier_method_type.multiclass_diff_classifier,
                                                                discrete_no_class, dataset['continuous'], 'difference',
                                                                dataset['idx_features'],
                                                                distance_function, neigtype={'ss': 0.5, 'sd': 0.5},
                                                                population_size=1000, halloffame_ratio=None,
                                                                alpha1=0.5, alpha2=0.5, eta1=1, eta2=0.0,
                                                                tournsize=3, cxpb=0.2, mutpb=0.3, ngen=100,
                                                                return_logbook=False, max_steps=10, is_unique=True)
            Z3 = np.concatenate([Z3, Z])

            #restrict neighborhood to current cluster
            # Encode and scale the accumulated neighborhood with the encoders
            # and the scaler fitted on the real data, so it can be projected
            # onto the SOM.
            # NOTE(review): the resulting column order must match colsbinary
            # for the projection to be meaningful -- confirm that
            # discrete_no_class has the same order as discrete_woclassname.
            Z3df = pd.DataFrame(Z3, columns = cols)
            for feature in discrete_no_class:
                tmp = d[feature].transform(Z3df[feature].values.reshape(-1,1)).toarray()
                colnames = [feature + str(i) for i in range(tmp.shape[1])]
                Z3df[colnames] = tmp
                Z3df.drop(columns = feature, inplace = True)
            Z3df[continuous] = scaler.transform(Z3df[continuous].values)
            Z3df = Z3df.values
            # Keep only the generated rows that the SOM maps to this cluster.
            neighborhoodwinners = np.apply_along_axis(getclusterid, 1, Z3df, som, clusterarr)
            ind = (neighborhoodwinners == clusterid).flatten()
            Z3 = Z3[ind]

    neighborhoods[clusterid] = Z3
    # Label the neighborhood with both models and fit the tree on the
    # '<predA>|<predB>' difference label.
    # NOTE(review): if Z3 is empty at this point (no populated node, or every
    # generated row filtered out), predict/fit will presumably fail.
    predA = modelA.predict(Z3).astype(str)
    predB = modelB.predict(Z3).astype(str)
    difference = pd.Series(np.char.add(np.char.add(predA, '|'), predB))
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(Z3, difference)
    explainers[clusterid] = clf
    print('finished processing cluster ' + str(clusterid))

In [None]:
# Persist the per-cluster explainers and their neighborhoods.
for prefix, artifact in (('Approach2Explainer_', explainers),
                         ('Approach2Neighborhood_', neighborhoods)):
    with open(prefix+data+'.pickle', 'wb') as out_file:
        pickle.dump(artifact, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the stored explainers and neighborhoods, so the cells below can be
# run without regenerating them.
explainer_path = 'Approach2Explainer_'+data+'.pickle'
with open(explainer_path, 'rb') as explainer_file:
    explainers = pickle.load(explainer_file)

neighborhood_path = 'Approach2Neighborhood_'+data+'.pickle'
with open(neighborhood_path, 'rb') as neighborhood_file:
    neighborhoods = pickle.load(neighborhood_file)

In [None]:
# Displayed for reference only: square root of 5 * sqrt(N) for the full
# training set, i.e. the side length of a square grid with 5 * sqrt(N) nodes.
# NOTE(review): presumably the basis for choosing `shapesom` below -- confirm.
np.sqrt(5 * np.sqrt(len(train)))

In [None]:
# Side length of the square SOM (shapesom x shapesom) used for visualisation.
shapesom = 30

In [None]:
#For visualisation:
somvis = minisom.MiniSom(shapesom, shapesom, trainsomnormfull.shape[1], sigma=2.0, learning_rate=1.0, random_seed = 0)
somvis.train(trainsomnormfull.values, 100000, verbose = True)

In [None]:
# Region plot: every cell of the visualisation SOM that wins at least one
# training instance is drawn as a coloured square plus a marker, both encoding
# the majority cluster of the instances mapped to that cell.
# NOTE(review): Z is not used in this cell; the assignment only overwrites the
# variable of the same name from the neighborhood generation.
Z = np.zeros((shapesom, shapesom))
fig, ax = plt.subplots(figsize=(10, 10))

# Cluster id (from the 1-d SOM) for every instance of the full training set.
clusterassignmentfull = np.apply_along_axis(getclusterid, 1, trainsomnormfull, som, clusterarr).flatten()
# One plot handle per cluster, kept for the legend.
collector_color = dict.fromkeys(range(ncluster))
collector_marker = dict.fromkeys(range(ncluster))

#markers = [['o', 'black'], ['v', 'white'], ['^', 'black'],['s', 'black'], ['P', 'black'], ['D', 'white'],['_', 'black'], ['|', 'white'], ['1', 'black'], ['*', 'white']]

# [marker symbol, marker colour] per cluster id.
# NOTE(review): only 10 entries -- a cluster id >= 10 raises an IndexError below.
markers = [['o', 'black'], ['v', 'black'], ['^', 'white'],['s', 'black'], ['P', 'white'], ['D', 'black'],['_', 'black'], ['|', 'black'], ['1', 'black'], ['*', 'white']]

# Winning cell (x, y) on the visualisation SOM for every training instance.
w_x, w_y = zip(*[somvis.winner(d) for d in trainsomnormfull.values])
for i in np.arange(somvis._weights.shape[0]):
    for j in np.arange(somvis._weights.shape[1]):
        # Instances mapped to cell (i, j).
        # NOTE(review): this scans all instances once per cell.
        idx = [x == i and y == j for x,y in zip(w_x, w_y)]
        tmp = pd.Series(clusterassignmentfull[idx])
        if len(tmp) > 0:
            # Majority cluster of the cell.
            feature = tmp.value_counts().idxmax()
            collector_color[feature] = plt.plot([i + .5], [j + .5], marker='s', markersize=15, #22 #15
                                                color=clustercolor.loc[clustercolor.cluster == str(feature), 'color'].item(),
                                                linewidth = 0)
            collector_marker[feature] = plt.plot([i + .5], [j + .5], marker=markers[feature][0],
                                                 color=markers[feature][1], markersize=10, #12 #10
                                                 markerfacecolor = 'None',linewidth = 0)

# Drop the clusters that never were the majority of any cell.
collector_color = {k: v for k, v in collector_color.items() if v is not None}
collector_marker = {k: v for k, v in collector_marker.items() if v is not None}

# Legend entries combine the coloured square and the marker of each cluster.
leg = plt.legend([(collector_color[j][0], collector_marker[j][0]) for j in collector_color.keys()],
                 [j for j in collector_color.keys()], #markerscale = 0.8,
                 title='Cluster', frameon=False, ncol=10, loc='lower left',
                 bbox_to_anchor=(0, -0.125)
                 )
leg._legend_box.align = "left"

plt.xlim([0, shapesom])
plt.ylim([0, shapesom])

plt.grid(False)

# Remove axis labels and ticks; the map coordinates carry no meaning.
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.set_yticks([])
ax.set_xticks([])

plt.savefig('docout/sections/localtoglobal/results/approach4_Regions_SOMprojection_ClassColored_' + data + '.jpg',dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)

In [None]:
# Pie-chart view of the visualisation SOM: each cell that wins at least one
# training instance shows the distribution of cluster ids among its instances.
fig, ax = plt.subplots(figsize=(10, 10))

# Cluster id (from the 1-d SOM) for every instance of the full training set.
clusterassignmentfull = np.apply_along_axis(getclusterid, 1, trainsomnormfull, som, clusterarr).flatten()

# Maps each winning cell (x, y) to the label counts of its instances.
labels_map = somvis.labels_map(trainsomnormfull.values, clusterassignmentfull)
the_grid = gridspec.GridSpec(shapesom,shapesom, fig)

# One wedge per distinct cluster id, in the same order for every cell.
# Previously the fractions were collected once per training INSTANCE, which
# produced one wedge per instance and distorted the proportions.
cluster_ids = np.unique(clusterassignmentfull)

for position in labels_map.keys():
    # Clusters that do not occur in this cell contribute a zero-sized wedge.
    label_fracs = [labels_map[position][l] for l in cluster_ids]
    # Flip the row index so that y grows upwards, as in the region plot.
    plt.subplot(the_grid[shapesom-1-position[1],
                         position[0]], aspect=1)
    patches, texts = plt.pie(label_fracs)

plt.savefig('docout/sections/localtoglobal/results/approach4_Regions_SOMprojection_PieClass_' + data + '.jpg',dpi=150, bbox_inches='tight', transparent=True, pad_inches=0)