## HoC DistilBERT Sentence
In this notebook, we examine the performance of interpretability techniques on the HoC dataset using DistilBERT at the sentence level.

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, average_precision_score
from dataset import Dataset
from myModel import MyModel, MyDataset
from myExplainers import MyExplainer
from myEvaluation import MyEvaluation
from sklearn.preprocessing import maxabs_scale
import pickle
from tqdm import tqdm
import datetime
import csv
import warnings
import torch
import tensorflow as tf
from scipy.special import softmax
from helper import print_results, print_results_ap

We load the model and the dataset, define the transformer model, and specify whether rationales are available in the dataset.

In [3]:
# Filesystem locations used by the rest of the notebook.
data_path = ''  # handed to Dataset(path=...) when the HoC data is loaded
model_path = 'Trained Models/'  # first argument given to MyModel(...)
# Prefix of every result pickle written by the experiment cells.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
save_path = '/home/myloniko/ethos/Results/HoC/'

In [4]:
model_name = 'distilbert'  # forwarded to MyModel(...) when the model is built
existing_rationales = True  # when True, the test-set rationales are gathered right after the split

Load MyModel and its corresponding tokenizer

In [5]:
# Experiment-wide settings: the task type, whether interpretations are handled
# per sentence, and the number of labels the model predicts.
task, sentence_level, labels = 'multi_label', True, 10

# Build the model wrapper; its tokenizer is reused by later cells.
model = MyModel(model_path, 'distilbert_hoc', model_name, task, labels, False)
tokenizer = model.tokenizer
max_sequence_len = tokenizer.max_len_single_sentence

Loading Dataset

In [7]:
# Load the HoC data through the project Dataset helper.
hoc = Dataset(path = data_path)
# load_hoc() yields the inputs, the targets, the label names and the rationales;
# the rationales are indexed by sample position in the split cell.
x, y, label_names, rationales = hoc.load_hoc()

Splitting dataset to train/val/test sets (70/10/20%)

In [9]:



# Carry the sample positions through the split so that the rationales of the
# held-out instances can be looked up afterwards.
indices = np.arange(len(y))
(train_texts, test_texts,
 train_labels, test_labels,
 _, test_indexes) = train_test_split(x, y, indices, test_size=.2, random_state=42)
if existing_rationales:
    test_rationales = [rationales[sample_idx] for sample_idx in test_indexes]

# The validation set must be 10% of the FULL dataset, so the fraction is
# expressed relative to the remaining training portion.
size = (0.1 * len(y)) / len(train_labels)
train_texts, validation_texts, train_labels, validation_labels = train_test_split(
    list(train_texts), train_labels, test_size=size, random_state=42
)

Preparing the rationales for the dataset

In [10]:
# For every test instance build one 0/1 list per label, with one entry per
# sentence: 1 when the label name occurs in that sentence's rationale entry.
test_label_rationales = []
for test_rational in test_rationales:
    label_rationales = [[] for label in range(labels)]
    for sentence in test_rational:
        for label, sentence_flags in enumerate(label_rationales):
            sentence_flags.append(1 if label_names[label] in sentence else 0)
    test_label_rationales.append(label_rationales)

Then, we measure the performance of the model using average precision score and f1 score (both macro)

In [10]:


# Keep only the first element returned by my_predict for every test text
# (the later cells treat these values as raw scores and apply a sigmoid).
predictions = [model.my_predict(test_text)[0] for test_text in test_texts]

In [None]:

# Turn the raw model outputs into per-label probabilities with a sigmoid.
# The result is stored under a new name so that `predictions` keeps the raw
# outputs and re-running this cell does not apply the sigmoid twice.
pred_probs = tf.keras.activations.sigmoid(
    tf.constant(predictions, dtype = tf.float32)
).numpy()

#Multi
# Threshold the probabilities at 0.5 to obtain the hard label assignment.
pred_labels = [[1 if p >= 0.5 else 0 for p in row] for row in pred_probs]

def average_precision_wrapper(y, y_pred, view):
    """Average precision for predictions stored in an object exposing ``toarray()``.

    ``view`` is passed to scikit-learn as the ``average`` argument.
    """
    return average_precision_score(y, y_pred.toarray(), average=view)

# Average precision is a ranking metric, so it is computed from the
# probabilities; feeding it thresholded 0/1 labels would collapse the
# precision-recall curve. F1 is computed from the hard labels.
print(average_precision_score(test_labels, pred_probs, average='macro'), f1_score(test_labels, pred_labels, average='macro'))

We initialize the explainers and the evaluation module, and we define the metrics we want to use. In this case, we use F = Faithfulness, FTP = RFT (Ranked Faithful Truthfulness), NZW = Complexity, and AUPRC = area under the precision-recall curve with respect to the rationales.

In [284]:
# Explainer wrapper around the model.
# NOTE(review): the meaning of the positional True and '‡' arguments is defined
# in myExplainers and is not visible here — confirm before changing them.
my_explainers = MyExplainer(label_names, model, True, '‡')

# Two evaluators that differ only in their last flag. Their results are stored
# in `metrics` / `metricsP` and written to the "(A)" / "(P)" result files.
# NOTE(review): presumably the flag toggles evaluating all labels vs. only the
# predicted ones — verify against myEvaluation.
my_evaluators = MyEvaluation(label_names, model.my_predict, True, True)
my_evaluatorsP = MyEvaluation(label_names, model.my_predict, True, False)

# Metric name -> bound evaluation method, one table per evaluator.
evaluation = {
    'F': my_evaluators.faithfulness,
    'FTP': my_evaluators.faithful_truthfulness_penalty,
    'NZW': my_evaluators.nzw,
    'AUPRC': my_evaluators.auprc,
}
# Every entry of the "P" table is bound to my_evaluatorsP. AUPRC used to point
# at my_evaluators, which made the (P) AUPRC results a copy of the (A) ones.
evaluationP = {
    'F': my_evaluatorsP.faithfulness,
    'FTP': my_evaluatorsP.faithful_truthfulness_penalty,
    'NZW': my_evaluatorsP.nzw,
    'AUPRC': my_evaluatorsP.auprc,
}

We start the experiment measuring the performance of LIME and IG.

In [None]:
import time

# Runs LIME and IG on every test instance, scores the interpretations with both
# evaluators, and re-writes the result pickles after each processed instance.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    now = datetime.datetime.now()
    date_tag = '_'.join(str(part) for part in (now.day, now.month, now.year))
    file_name = save_path + 'HOC_DISTILBERT_SENTENCE_LIME_IG_' + date_tag
    metric_order = ('F', 'FTP', 'NZW', 'AUPRC')
    metrics = {name: [] for name in metric_order}
    metricsP = {name: [] for name in metric_order}
    my_explainers.neighbours = 200
    techniques = [my_explainers.lime, my_explainers.ig]
    # One list of per-instance run times for each technique.
    time_r = [[] for technique in techniques]

    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache()
        test_label_rational = test_label_rationales[ind].copy()
        instance = test_texts[ind]

        # Trim the per-label rationales when the text yields fewer '.'-separated
        # sentences than the rationale lists contain.
        n_sentences = len(instance.split('.')) - 1
        if n_sentences < len(test_label_rational[0]):
            for label in range(labels):
                test_label_rational[label] = test_label_rational[label][:n_sentences]

        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        # `_` keeps the last value returned by my_predict and is forwarded below
        # wherever the explainers/evaluators expect an argument this experiment
        # does not supply.
        prediction, _, _ = model.my_predict(instance)
        enc = model.tokenizer([instance, instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        # Only instances whose token list contains at least two '.' are evaluated.
        if tokens.count('.') < 2:
            continue

        interpretations = []
        for technique_idx, technique in enumerate(techniques):
            started = time.time()
            outcome = technique(instance, prediction, tokens, mask, _, _)
            temp_tokens = tokens.copy()
            if sentence_level:
                temp_tokens = outcome[0].copy()[0]
                outcome = outcome[1].copy()
            # Scale every per-label score vector by its largest absolute value.
            scaled = []
            for label_scores in outcome:
                label_scores = np.array(label_scores)
                scaled.append(label_scores / np.max(abs(label_scores)))
            interpretations.append(scaled)
            time_r[technique_idx].append(time.time() - started)

        for metric in metrics.keys():
            metrics[metric].append([
                evaluation[metric](interpretation, _, instance, prediction, temp_tokens, _, _, test_label_rational)
                for interpretation in interpretations
            ])

        # Hand the first evaluator's saved state to the second one, then reset the first.
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()
        my_evaluators.clear_states()

        for metric in metrics.keys():
            metricsP[metric].append([
                evaluationP[metric](interpretation, _, instance, prediction, temp_tokens, _, _, test_label_rational)
                for interpretation in interpretations
            ])

        # Checkpoint everything gathered so far.
        for suffix, payload in (('(A).pickle', metrics), ('(P).pickle', metricsP), ('_TIME.pickle', time_r)):
            with open(file_name + suffix, 'wb') as handle:
                pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Mean run time per technique (row order follows `techniques`).
time_r = np.array(time_r)
time_r.mean(axis=1)

We present the results for LIME and IG

In [None]:
# Report the LIME / IG scores gathered with the first evaluator ("(A)" results).
print_results(file_name+'(A)', [' LIME', ' IG  '], metrics, label_names)

In [None]:
# Report the LIME / IG scores gathered with the second evaluator ("(P)" results).
print_results(file_name+'(P)', [' LIME', ' IG  '], metricsP, label_names)

Then, we perform the experiments for the different attention setups.

In [None]:
# Every attention setup is a list [ci, ce, cp, cl], enumerated in nested order
# (ci varies slowest, cl fastest).
#   cp - matrix: From, To, MeanColumns, MeanRows, MaxColumns, MaxRows
#   cl - selection: True: select layers per head, False: do not
conf = [
    [ci, ce, cp, cl]
    for ci in ['Mean', 'Multi'] + list(range(6))
    for ce in ['Mean'] + list(range(12))
    for cp in ['From', 'To', 'MeanColumns', 'MaxColumns']
    for cl in [False]
]
len(conf)

In [None]:
import time

# Evaluates every attention setup in `conf` on every test instance with both
# evaluators, timing the interpretation step per setup and the FTP evaluation
# per instance, and re-writes the result pickles after each processed instance.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    now = datetime.datetime.now()
    date_tag = '_'.join(str(part) for part in (now.day, now.month, now.year))
    file_name = save_path + 'HoC_DISTILBERT_ATTENTION_SENTENCE' + date_tag
    metric_order = ('FTP', 'F', 'NZW', 'AUPRC')
    metrics = {name: [] for name in metric_order}
    metricsP = {name: [] for name in metric_order}
    time_r = [[] for setup in conf]  # interpretation time, one list per setup
    time_b = []                      # total FTP evaluation time per instance (first evaluator)
    time_b2 = []                     # total FTP evaluation time per instance (second evaluator)

    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache()
        test_label_rational = test_label_rationales[ind].copy()
        instance = test_texts[ind]

        # Trim the per-label rationales when the text yields fewer '.'-separated
        # sentences than the rationale lists contain.
        n_sentences = len(instance.split('.')) - 1
        if n_sentences < len(test_label_rational[0]):
            for label in range(labels):
                test_label_rational[label] = test_label_rational[label][:n_sentences]

        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        my_explainers.save_states = {}
        # `_` keeps the third value returned by my_predict and is forwarded below
        # wherever an argument is not supplied by this experiment.
        prediction, attention, _ = model.my_predict(instance)
        enc = model.tokenizer([instance, instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        # Only instances whose token list contains at least two '.' are evaluated.
        if tokens.count('.') < 2:
            continue

        interpretations = []
        for setup_idx, con in enumerate(conf):
            started = time.time()
            my_explainers.config = con
            outcome = my_explainers.my_attention(instance, prediction, tokens, mask, attention, _)
            temp_tokens = tokens.copy()
            if sentence_level:
                temp_tokens = outcome[0].copy()[0]
                outcome = outcome[1].copy()
            # Scale every per-label score vector by its largest absolute value.
            scaled = []
            for label_scores in outcome:
                label_scores = np.array(label_scores)
                scaled.append(label_scores / np.max(abs(label_scores)))
            interpretations.append(scaled)
            time_r[setup_idx].append(time.time() - started)

        for metric in metrics.keys():
            evaluated = []
            elapsed = 0
            for interpretation in interpretations:
                started = time.time()
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, temp_tokens, _, _, test_label_rational))
                elapsed += time.time() - started
            if metric == 'FTP':
                time_b.append(elapsed)
            metrics[metric].append(evaluated)

        # Hand the first evaluator's saved state to the second one.
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()

        for metric in metrics.keys():
            evaluated = []
            elapsed = 0
            for interpretation in interpretations:
                started = time.time()
                evaluated.append(evaluationP[metric](interpretation, _, instance, prediction, temp_tokens, _, _, test_label_rational))
                elapsed += time.time() - started
            if metric == 'FTP':
                time_b2.append(elapsed)
            metricsP[metric].append(evaluated)

        # Checkpoint everything gathered so far.
        for suffix, payload in ((' (A).pickle', metrics), (' (P).pickle', metricsP), ('_TIME.pickle', time_r)):
            with open(file_name + suffix, 'wb') as handle:
                pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Timing summary: min / max / mean of the per-setup mean interpretation time,
# mean over setups of the total interpretation time, and the mean FTP
# evaluation time for each evaluator.
time_r = np.array(time_r)
per_setup_mean = time_r.mean(axis=1)
(per_setup_mean.min(), per_setup_mean.max(), per_setup_mean.mean(),
 time_r.sum(axis=1).mean(), np.mean(time_b), np.mean(time_b2))

We present the results of the different attention setups

In [None]:
# Report the scores of every attention setup gathered with the first evaluator ("(A)" results).
print_results(file_name+' (A)', conf, metrics, label_names)

In [None]:
# Report the scores of every attention setup gathered with the second evaluator ("(P)" results).
print_results(file_name+' (P)', conf, metricsP, label_names)

We calculate the best attention setup using Optimus variations (we do not use the Optimus implementation at this step)

In [None]:
# Summarise the attention setups from the first evaluator's scores via the helper's print_results_ap.
print_results_ap(metrics, label_names, conf)

In [None]:
# Same summary, computed from the second evaluator's scores.
print_results_ap(metricsP, label_names, conf)

We repeat the process with attention scores that include negative values (A*), i.e., by skipping the Softmax function. In the attention setups, we exclude the multiplication option for heads and layers, as a few combinations reach +/-inf.

In [None]:
# Every attention setup is a list [ci, ce, cp, cl], enumerated in nested order
# (ci varies slowest, cl fastest). The 'Multi' option is not part of this grid.
#   cp - matrix: From, To, MeanColumns, MeanRows, MaxColumns, MaxRows
#   cl - selection: True: select layers per head, False: do not
conf = [
    [ci, ce, cp, cl]
    for ci in ['Mean'] + list(range(6))
    for ce in ['Mean'] + list(range(12))
    for cp in ['From', 'To', 'MeanColumns', 'MaxColumns']
    for cl in [False]
]
len(conf)

In [None]:
import time
import math

# Same experiment as the attention cell, but the attention matrices are rebuilt
# from the hidden states as raw query-key scores (no softmax) before they are
# handed to the attention explainer. Results are pickled after every processed
# instance.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    now = datetime.datetime.now()

    file_name = save_path + 'HoC_DISTILBERT_SENTENCE_ATTENTION_NO_SOFTMAX_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    metrics = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC': []}
    metricsP = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC': []}
    time_r = [[] for con in conf]  # interpretation time, one list per setup
    time_b = []                    # total FTP evaluation time per instance (first evaluator)
    time_b2 = []                   # total FTP evaluation time per instance (second evaluator)

    # Model geometry used when slicing the projections into heads.
    n_layers, n_heads, head_dim = 6, 12, 64

    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache()
        test_label_rational = test_label_rationales[ind].copy()
        instance = test_texts[ind]
        # Trim the per-label rationales when the text yields fewer '.'-separated
        # sentences than the rationale lists contain.
        n_sentences = len(instance.split('.')) - 1
        if n_sentences < len(test_label_rational[0]):
            for label in range(labels):
                test_label_rational[label] = test_label_rational[label][:n_sentences]
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        my_explainers.save_states = {}
        # `_` keeps the second value returned by my_predict and is forwarded below
        # wherever an argument is not supplied by this experiment.
        prediction, _, hidden_states = model.my_predict(instance)
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        # Only instances whose token list contains at least two '.' are evaluated.
        if tokens.count('.') >= 2:
            # Recompute, for each layer and head, the scaled query-key product
            # from that layer's input hidden state.
            attention = []
            for la in range(n_layers):
                our_new_layer = []
                layer_attention = model.trainer.model.base_model.transformer.layer[la].attention
                layer_input = torch.tensor(hidden_states[la]).to('cuda')
                keys = layer_attention.k_lin(layer_input)
                # Scale the queries ONCE per layer. The scaling previously sat
                # inside the head loop and compounded, so head h was divided by
                # sqrt(head_dim) ** (h + 1) instead of sqrt(head_dim).
                queries = layer_attention.q_lin(layer_input) / math.sqrt(head_dim)
                for he in range(n_heads):
                    head_slice = slice(he * head_dim, (he + 1) * head_dim)
                    attention_scores = torch.matmul(queries[:, head_slice], keys[:, head_slice].transpose(-1, -2))
                    our_new_layer.append(attention_scores.cpu().detach().numpy())
                attention.append(our_new_layer)
            attention = np.array(attention)

            interpretations = []
            for kk, con in enumerate(conf):
                ts = time.time()
                my_explainers.config = con
                temp = my_explainers.my_attention(instance, prediction, tokens, mask, attention, _)
                temp_tokens = tokens.copy()
                if sentence_level:
                    temp_tokens = temp[0].copy()[0]
                    temp = temp[1].copy()
                # Scale every per-label score vector by its largest absolute value.
                interpretations.append([np.array(i)/np.max(abs(np.array(i))) for i in temp])
                time_r[kk].append(time.time()-ts)

            for metric in metrics.keys():
                evaluated = []
                k = 0
                for interpretation in interpretations:
                    tt = time.time()
                    evaluated.append(evaluation[metric](interpretation, _, instance, prediction, temp_tokens, _, _, test_label_rational))
                    k = k + (time.time()-tt)
                if metric == 'FTP':
                    time_b.append(k)
                metrics[metric].append(evaluated)

            # Hand the first evaluator's saved state to the second one.
            my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()

            for metric in metrics.keys():
                evaluated = []
                k = 0
                for interpretation in interpretations:
                    tt = time.time()
                    evaluated.append(evaluationP[metric](interpretation, _, instance, prediction, temp_tokens, _, _, test_label_rational))
                    k = k + (time.time()-tt)
                if metric == 'FTP':
                    time_b2.append(k)
                metricsP[metric].append(evaluated)

            # Checkpoint everything gathered so far.
            with open(file_name+' (A).pickle', 'wb') as handle:
                pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(file_name+' (P).pickle', 'wb') as handle:
                pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(file_name+'_TIME.pickle', 'wb') as handle:
                pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Timing summary: min / max / mean of the per-setup mean interpretation time,
# mean over setups of the total interpretation time, and the mean FTP
# evaluation time for each evaluator.
time_r = np.array(time_r)
time_r.mean(axis=1).min(),time_r.mean(axis=1).max(), time_r.mean(axis=1).mean(), time_r.sum(axis=1).mean(), np.mean(time_b), np.mean(time_b2)

We present the results for the different attention setups

In [None]:
# Report the scores of every no-softmax attention setup gathered with the first evaluator ("(A)" results).
print_results(file_name+' (A)', conf, metrics, label_names)

In [None]:
# Report the scores of every no-softmax attention setup gathered with the second evaluator ("(P)" results).
print_results(file_name+' (P)', conf, metricsP, label_names)

We calculate the best attention setup using Optimus variations (we do not use the Optimus implementation script at this step)

In [None]:
# Summarise the no-softmax attention setups from the first evaluator's scores via print_results_ap.
print_results_ap(metrics, label_names, conf)

In [None]:
# Same summary, computed from the second evaluator's scores.
print_results_ap(metricsP, label_names, conf)

## Qualitative Experiments
In this part of the notebook, we present the qualitative experiments, and we use the ready-to-use tool Optimus

In [None]:
from optimus import Optimus, plot_sentence_heatmap

In [47]:
# Instantiate the Optimus tool on top of the already-loaded model and tokenizer.
# NOTE(review): the purpose of the trailing empty list is defined by Optimus and is not visible here.
ionbot = Optimus(model, tokenizer, label_names, task, [])

We select an arbitrary test instance and make a prediction for it.

In [571]:
# Index of the test instance inspected in the qualitative experiments.
inddd = 211
instance = test_texts[inddd]
# Keep all three values returned by my_predict for this instance.
prediction, attention, hidden_states = model.my_predict(instance)
# The text is tokenized as a two-element batch and only the first encoding is
# kept; its attention mask and token strings are passed to the explainers.
enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
mask = enc.attention_mask
tokens = enc.tokens

We get the baseline interpretation as well as the one from Optimus_label

In [617]:
# Sentence-level Optimus explanations of the same instance in two modes.
# The weight-broadcast cell reads element [0] as per-label, per-sentence weights
# and element [1] of `explanation` as the list of sentences.
baseline = ionbot.explain(instance, mode='baseline', level='sentence', raw_attention='A')
explanation = ionbot.explain(instance, mode='max_per_instance_per_label', level='sentence', raw_attention='A')

We get the interpretation from IG

In [573]:
# Integrated Gradients interpretation of the selected instance.
# The two trailing arguments used to be `_`, i.e. whatever value an earlier
# cell happened to leave behind (hidden kernel state from another instance).
# They now receive the attention and hidden states computed for THIS instance.
ig = my_explainers.ig(instance, prediction, tokens, mask, attention, hidden_states)
if sentence_level:
    # Sentence-level output: element [0] carries the sentence units and
    # element [1] the per-label weights.
    temp_tokens = ig[0].copy()[0]
    ig = ig[1].copy()
else:
    temp_tokens = tokens.copy()
# Scale every per-label weight vector by its largest absolute value.
ig = [np.array(i)/np.max(abs(np.array(i))) for i in ig]

We get the interpretation from LIME

In [574]:
# LIME interpretation of the selected instance, using 200 neighbours.
my_explainers.neighbours = 200
# The two trailing arguments used to be `_`, i.e. whatever value an earlier
# cell happened to leave behind (hidden kernel state from another instance).
# They now receive the attention and hidden states computed for THIS instance.
lime = my_explainers.lime(instance, prediction, tokens, mask, attention, hidden_states)
if sentence_level:
    # Sentence-level output: element [0] carries the sentence units and
    # element [1] the per-label weights.
    temp_tokens = lime[0].copy()[0]
    lime = lime[1].copy()
else:
    temp_tokens = tokens.copy()
# Scale every per-label weight vector by its largest absolute value.
lime = [np.array(i)/np.max(abs(np.array(i))) for i in lime]

For each technique, we assign the feature importance score of each sentence to every word it contains.

In [576]:
# Label whose interpretation is visualised.
label = 7

# Word-aligned series: every word inherits the value of the sentence it belongs to.
n_tokens = []
rationalee = []
baseline_weights = []
weights = []
ig_weights = []
lime_weights = []

for sentence_idx, sentence in enumerate(explanation[1]):
    words = sentence.split(' ')
    n_words = len(words)
    n_tokens.extend(words)
    baseline_weights.extend([baseline[0][label][sentence_idx]] * n_words)
    weights.extend([explanation[0][label][sentence_idx]] * n_words)
    ig_weights.extend([ig[label][sentence_idx]] * n_words)
    lime_weights.extend([lime[label][sentence_idx]] * n_words)
    rationalee.extend([test_label_rationales[inddd][label][sentence_idx]] * n_words)

Finally, we plot the feature importance weights produced by each technique, along with the ground-truth rationales.

In [None]:
# One heatmap per word-aligned series, in this order: ground-truth rationales,
# Optimus baseline, Optimus explanation, IG, LIME.
heatmap_series = [rationalee, baseline_weights, weights, ig_weights, lime_weights]
for word_scores in heatmap_series[:-1]:
    plot_sentence_heatmap(n_tokens, np.array(word_scores))
# Kept as the final bare expression so the cell still shows whatever the last call returns.
plot_sentence_heatmap(n_tokens, np.array(heatmap_series[-1]))