## HateXplain RoBERTa multiclass
In this notebook we examine the performance of interpretability techniques on the HateXplain dataset using RoBERTa at the token level.

In [1]:
import csv
import datetime
import math
import os
import pickle
import re
import sys
import time
import warnings

import numpy as np
import pandas as pd
import torch
import tensorflow as tf
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, average_precision_score
from sklearn.preprocessing import maxabs_scale
from tqdm import tqdm

# The project-local modules below are resolved through the parent directory,
# so the path must be extended before they are imported.
sys.path.append('../')

from dataset import Dataset
from myModel import MyModel, MyDataset
from myExplainers import MyExplainer
from myEvaluation import MyEvaluation
from helper import print_results, print_results_ap

We define the dataset, model, and results paths, select the transformer model, and state whether rationales are available in the dataset

In [2]:
# Filesystem locations used by the notebook (all relative to the working directory).
# NOTE(review): data_path is not referenced by any visible cell -- the Dataset
# helper is constructed with path='../' instead; confirm whether it is still needed.
data_path = '../datasets/hatexplain.json'
# Passed to MyModel as its first argument.
model_path = 'Trained Models/'
# Prefix of every results pickle written by the experiment cells.
save_path = 'Results/hx_multiclass/'

In [3]:
# Identifiers handed to MyModel when the wrappers are created.
model_name = 'roberta'
dataset_name='hx_roberta_uncased_multiclass'
# When True, the per-instance rationales of the test split are extracted after splitting.
existing_rationales = True

Load MyModel and its associated tokenizer

In [5]:
# Task configuration handed to MyModel.
task = 'single_label'
sentence_level = False
labels = 3  # number of classes passed to MyModel

# Two wrappers are created: one with attention=True (used by the attention
# experiments) and one with attention=False (used by IG and by the evaluators).
model = MyModel(model_path, dataset_name, model_name, task, labels, cased=False, attention=True)
# NOTE(review): this wrapper is built with cased=True while `model` uses
# cased=False and dataset_name says "uncased" -- confirm the mismatch is intended.
model_no_attention = MyModel(model_path, dataset_name, model_name, task, labels, cased=True, attention=False)
max_sequence_len = model.tokenizer.max_len_single_sentence
tokenizer = model.tokenizer

# torch is already imported in the imports cell, so it is not re-imported here.
print(torch.cuda.is_available())
# Fall back to the CPU instead of raising when no CUDA device is present.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Last expression of the cell: moves the model and displays its architecture.
model.trainer.model.to(device)

True


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [6]:
# Dataset helper rooted at the parent directory (data_path is not used here).
hx = Dataset(path='../') #data_path
# Loads texts (x), labels (y), the label names and the rationales.
# The tokenizer is passed to the loader -- presumably so the returned texts are
# aligned with RoBERTa tokenization; TODO confirm against Dataset.
x, y, label_names, rationales = hx.load_hatexplain_multiclass_roberta(tokenizer)

In [7]:
# Stratified 8000 / 2000 train/test split. An index array is split alongside
# the data so the rationales of the test instances can be looked up afterwards.
indices = np.arange(len(y))
(train_texts, test_texts,
 train_labels, test_labels,
 _, test_indexes) = train_test_split(
    x, y, indices,
    stratify=y,
    train_size=8000,
    test_size=2000,
    random_state=42,
)
if existing_rationales:
    test_rationales = [rationales[sample_id] for sample_id in test_indexes]

# Carve a stratified validation set of 1000 instances out of the training part.
(train_texts, validation_texts,
 train_labels, validation_labels) = train_test_split(
    list(train_texts),
    train_labels,
    stratify=train_labels,
    test_size=1000,
    random_state=42,
)

In [8]:
# Bring every test rationale into a fixed "one row per class" layout:
# three word-level lists, where only the row of the gold class may be non-zero.
for pos, gold in enumerate(test_labels):
    if gold == 0:
        # Class 0: all three rows are zero, one entry per space-separated word.
        n_words = len(test_texts[pos].split(' '))
        test_rationales[pos] = [[0] * n_words for _row in range(3)]
        continue

    # Other classes: merge the stored rationale rows into one binary vector
    # (sum over rows, clipped to [0, 1]).
    merged = np.clip(np.array(test_rationales[pos]).sum(axis=0), a_min=0, a_max=1).tolist()
    per_class = [[0] * len(merged) for _row in range(3)]
    # Label 1 fills the middle row; every remaining label fills the last row.
    per_class[1 if gold == 1 else 2] = merged
    test_rationales[pos] = per_class

In [9]:
# Alias, not a copy: both names refer to the same list object.
test_test_rationales = test_rationales

In [10]:
# Expand the word-level rationales to token level: every word's 0/1 flag is
# repeated once per sub-word token the tokenizer produces for that word.
new_rationale = []
len_test = len(test_labels) # 2000
num_labels = len(np.unique(test_labels)) #3

for i in range(len_test):
    words = test_texts[i].split(' ')

    # The number of sub-word tokens of a word does not depend on the label,
    # so it is computed once per word instead of once per (label, word).
    pieces_per_word = []
    for k, word in enumerate(words):
        if k == 0:
            # First word: tokenize it followed by a dummy word and drop the
            # dummy's token, so the word is tokenized without a preceding space.
            n_pieces = len(tokenizer.tokenize(word + ' a')[:-1])
        else:
            # Later words: tokenize them after a dummy word and drop the
            # dummy's token, so the word is tokenized with its preceding space.
            n_pieces = len(tokenizer.tokenize('a ' + word)[1:])
        pieces_per_word.append(n_pieces)

    rationale = []
    for j in range(num_labels):
        label_rational = []
        for k, n_pieces in enumerate(pieces_per_word):
            # Only look up the flag when the word yields tokens, so the same
            # entries are indexed as when iterating over the tokens themselves.
            if n_pieces:
                flag = 1 if test_test_rationales[i][j][k] > 0 else 0
                label_rational.extend([flag] * n_pieces)
        rationale.append(label_rational)
    new_rationale.append(rationale)

Then, we measure the performance of the model using accuracy and the F1 score (both macro and micro)

In [12]:
# Run the model on each test text; the first element of my_predict's result is
# kept as that instance's prediction.
predictions = [model.my_predict(text)[0] for text in test_texts]

# Predicted class = index of the highest softmax value.
pred_labels = [np.argmax(softmax(scores)) for scores in predictions]

# Displayed as (accuracy, macro F1, micro F1).
(
    accuracy_score(test_labels, pred_labels),
    f1_score(test_labels, pred_labels, average='macro'),
    f1_score(test_labels, pred_labels, average='micro'),
)

2000it [01:59, 16.41it/s]            

(0.666, 0.6328231427781484, 0.666)

2000it [02:10, 16.41it/s]

In [10]:
# Explainer and evaluators all work on the wrapper created with attention=False.
my_explainers = MyExplainer(label_names, model_no_attention) #model 2

# Two evaluators that differ only in evaluation_level_all (True vs. False).
my_evaluators = MyEvaluation(label_names, model_no_attention.my_predict, sentence_level = False, task = 'multi-class', evaluation_level_all = True, tokenizer=model.tokenizer) #model 2
my_evaluatorsP = MyEvaluation(label_names, model_no_attention.my_predict, sentence_level = False, task = 'multi-class', evaluation_level_all = False, tokenizer=model.tokenizer) #model 2

# Metric name -> bound evaluator method, one table per evaluator.
evaluation =  {'F':my_evaluators.faithfulness, 'FTP': my_evaluators.faithful_truthfulness_penalty, 
          'NZW': my_evaluators.nzw, 'AUPRC': my_evaluators.auprc}
# Every entry of the P table is bound to my_evaluatorsP; AUPRC previously
# pointed at my_evaluators (copy-paste slip), so it ran on the other evaluator.
evaluationP = {'F':my_evaluatorsP.faithfulness, 'FTP': my_evaluatorsP.faithful_truthfulness_penalty, 
          'NZW': my_evaluatorsP.nzw, 'AUPRC': my_evaluatorsP.auprc}

In [11]:
# new_rationale = test_rationales
len_test = len(test_labels) # 2000
num_labels = len(np.unique(test_labels)) #3

In [21]:
# Integrated Gradients experiment: explain every test instance, score the
# explanation with both evaluators and pickle the collected metrics.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    # Create the output directory before the long run so the final dumps
    # cannot fail on a missing folder after all the work is done.
    os.makedirs(save_path, exist_ok=True)

    now = datetime.datetime.now()
    file_name = save_path + 'HX_ROBERTA_IG_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    metrics = {'F':[], 'FTP':[], 'NZW':[], 'AUPRC' : []}
    metricsP = {'F':[], 'FTP':[], 'NZW':[], 'AUPRC' : []}
    time_r = []  # one wall-clock duration (seconds) per technique call
    # NOTE(review): presumably a sampling budget for perturbation-based
    # explainers such as LIME; confirm whether IG reads it at all.
    my_explainers.neighbours = 2000
    techniques = [my_explainers.ig] #my_explainers.lime 
    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache() 
        test_rational = new_rationale[ind]
        instance = test_texts[ind]
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        # `_` keeps the last discarded return value and is passed below purely
        # as a positional placeholder.
        prediction, _, _ = model_no_attention.my_predict(instance)
        # The instance is passed twice; only the first encoding is used.
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        interpretations = []
        for technique in techniques:
            ts = time.time()
            temp = technique(instance, prediction, tokens, mask, _, _)
            # Scale each per-label attribution by its max absolute value.
            # maxabs_scale (also used by the attention experiments) leaves an
            # all-zero vector as zeros instead of producing NaN via 0/0.
            interpretations.append([maxabs_scale(i) for i in temp])
            time_r.append(time.time()-ts)
        for metric in metrics.keys():
            evaluated = []
            for interpretation in interpretations:
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
            metrics[metric].append(evaluated)
        # Hand the state gathered by the first evaluator to the second one,
        # then reset the first.
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()
        my_evaluators.clear_states()
        for metric in metrics.keys():
            evaluatedP = []
            for interpretation in interpretations:
                evaluatedP.append(evaluationP[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
            metricsP[metric].append(evaluatedP)
with open(file_name+'(A).pickle', 'wb') as handle:
    pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'(P).pickle', 'wb') as handle:
    pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'_TIME.pickle', 'wb') as handle:
    pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)
time_r = np.array(time_r)

100%|██████████| 2000/2000 [45:49<00:00,  1.37s/it]


We present the results for IG

In [22]:
# Wall-clock seconds of each IG call (one entry per test instance, since a
# single technique is timed).
print(time_r)

[1.0187254  0.59709406 0.61371589 ... 0.72015929 0.62733841 0.69963264]


In [23]:
# Report the IG metrics collected through `evaluation` (the evaluator created
# with evaluation_level_all=True); ' IG ' names the single technique.
print_results(file_name+'(A)', [' IG '], metrics, label_names) #[' LIME', ' IG  ']

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


F
 IG   0.07057999819517136 | 0.01126 0.12351 0.07698
FTP
 IG   0.17594 | 0.23095 0.18664 0.11024
NZW
 IG   1.0 | 1.0 1.0 1.0
AUPRC
 IG   0.47736 | 0.0 0.76406 0.668


In [24]:
# Report the IG metrics collected through `evaluationP` (the evaluator created
# with evaluation_level_all=False).
print_results(file_name+'(P)', [' IG '], metricsP, label_names) #[' LIME', ' IG  ']

F
 IG   0.28144 | 0.02536 0.37763 0.44133
FTP
 IG   0.35003 | 0.05105 0.52688 0.47218
NZW
 IG   1.0 | 1.0 1.0 1.0
AUPRC
 IG   0.47736 | 0.0 0.76406 0.668


Then, we perform the experiments for the different attention setups!

In [12]:
# Every attention setup is a list [ci, ce, cp, cl]:
#   ci - 'Mean', 'Multi' or an index 0..11
#   ce - 'Mean' or an index 0..11
#   cp - matrix reduction (From, To, MeanColumns, MaxColumns are used;
#        MeanRows / MaxRows exist as options but are left out)
#   cl - selection flag (True: select layers per head, False: do not); only False
# NOTE(review): ci / ce presumably address heads and layers -- confirm which is which.
conf = [
    [ci, ce, cp, cl]
    for ci in ['Mean', 'Multi'] + list(range(12))
    for ce in ['Mean'] + list(range(12))
    for cp in ['From', 'To', 'MeanColumns', 'MaxColumns']
    for cl in [False]
]
len(conf)

728

In [16]:
# Attention experiment: for every test instance, build one interpretation per
# attention setup in `conf`, score all of them with both evaluators and pickle
# the collected metrics and timings.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    # Create the output directory before the multi-hour run so the final dumps
    # cannot fail on a missing folder.
    os.makedirs(save_path, exist_ok=True)

    now = datetime.datetime.now()

    file_name = save_path + 'HX_ROBERTA_ATTENTION_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    metrics = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    metricsP = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    time_r = [[] for unused in conf]  # per setup: seconds spent building the interpretation
    time_b = []   # per instance: seconds spent on FTP with `evaluation`
    time_b2 = []  # per instance: seconds spent on FTP with `evaluationP`
    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache() 
        test_rational = new_rationale[ind]
        instance = test_texts[ind]
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        my_explainers.save_states = {}
        # `_` keeps the third return value and is passed below purely as a
        # positional placeholder.
        prediction, attention, _ = model.my_predict(instance)
        # The instance is passed twice; only the first encoding is used.
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        interpretations = []
        for kk, con in enumerate(conf):
            ts = time.time()
            my_explainers.config = con
            temp = my_explainers.my_attention(instance, prediction, tokens, mask, attention, _)
            interpretations.append([maxabs_scale(i) for i in temp])
            time_r[kk].append(time.time()-ts)
        for metric in metrics.keys():
            evaluated = []
            k = 0  # accumulated evaluation time for this metric
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                k = k + (time.time()-tt)
            if metric == 'FTP':
                time_b.append(k)
            metrics[metric].append(evaluated)
        # Hand the state gathered by the first evaluator to the second one.
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()
        for metric in metrics.keys():
            evaluated = []
            k = 0
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluationP[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                k = k + (time.time()-tt)
            if metric == 'FTP':
                time_b2.append(k)
            metricsP[metric].append(evaluated)
with open(file_name+' (A).pickle', 'wb') as handle:
    pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+' (P).pickle', 'wb') as handle:
    pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'_TIME.pickle', 'wb') as handle:
    pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)
time_r = np.array(time_r)
# time_r.mean(axis=1).min(),time_r.mean(axis=1).max(), time_r.mean(axis=1).mean(), time_r.sum(axis=1).mean(), np.mean(time_b), np.mean(time_b2)

100%|██████████| 2000/2000 [3:34:48<00:00,  6.44s/it]  


We present the results of the different attention setups

In [None]:
# Report, per attention setup in `conf`, the metrics collected through
# `evaluation` (the evaluator created with evaluation_level_all=True).
print_results(file_name+' (A)', conf, metrics, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.0 | -0.21308 0.16343 0.04965
['Mean', 'Mean', 'To', False]  0.0 | -0.16071 0.11942 0.04129
['Mean', 'Mean', 'MeanColumns', False]  0.0 | -0.18395 0.13929 0.04465


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


['Mean', 'Mean', 'MaxColumns', False]  0.0 | -0.12175 0.10918 0.01257
['Mean', 0, 'From', False]  0.0 | -0.16388 0.14354 0.02034
['Mean', 0, 'To', False]  0.0 | -0.12937 0.10461 0.02476
['Mean', 0, 'MeanColumns', False]  0.0 | -0.14641 0.12978 0.01663
['Mean', 0, 'MaxColumns', False]  0.0 | -0.10758 0.0978 0.00979
['Mean', 1, 'From', False]  0.0 | -0.20137 0.16359 0.03778
['Mean', 1, 'To', False]  0.0 | -0.12619 0.08917 0.03702
['Mean', 1, 'MeanColumns', False]  0.0 | -0.16574 0.13601 0.02973
['Mean', 1, 'MaxColumns', False]  0.0 | -0.11173 0.09996 0.01177
['Mean', 2, 'From', False]  0.0 | -0.16584 0.14647 0.01937
['Mean', 2, 'To', False]  0.0 | -0.14561 0.11094 0.03466
['Mean', 2, 'MeanColumns', False]  0.0 | -0.06392 0.06166 0.00226
['Mean', 2, 'MaxColumns', False]  0.0 | -0.05002 0.04502 0.005
['Mean', 3, 'From', False]  0.0 | -0.19514 0.14941 0.04572
['Mean', 3, 'To', False]  0.0 | -0.15722 0.11964 0.03758
['Mean', 3, 'MeanColumns', False]  0.0 | -0.13502 0.10538 0.02964
['Mean', 3

In [18]:
# Report, per attention setup in `conf`, the metrics collected through
# `evaluationP` (the evaluator created with evaluation_level_all=False).
print_results(file_name+' (P)', conf, metricsP, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.34997 | 0.00836 0.54127 0.50028
['Mean', 'Mean', 'To', False]  0.29788 | 0.03212 0.42395 0.43759
['Mean', 'Mean', 'MeanColumns', False]  0.32379 | 0.01881 0.47862 0.47395


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


['Mean', 'Mean', 'MaxColumns', False]  0.20701 | 0.02402 0.37941 0.2176
['Mean', 0, 'From', False]  0.29251 | 0.02899 0.49569 0.35286
['Mean', 0, 'To', False]  0.25156 | 0.03726 0.36771 0.34971
['Mean', 0, 'MeanColumns', False]  0.26329 | 0.02871 0.45442 0.30675
['Mean', 0, 'MaxColumns', False]  0.20958 | 0.03284 0.35287 0.24303
['Mean', 1, 'From', False]  0.33495 | 0.01765 0.54598 0.44123
['Mean', 1, 'To', False]  0.25789 | 0.0409 0.33729 0.39549
['Mean', 1, 'MeanColumns', False]  0.30482 | 0.03568 0.4769 0.40187
['Mean', 1, 'MaxColumns', False]  0.19727 | 0.02843 0.34932 0.21406
['Mean', 2, 'From', False]  0.26691 | 0.01726 0.48364 0.29981
['Mean', 2, 'To', False]  0.28964 | 0.04093 0.4058 0.4222
['Mean', 2, 'MeanColumns', False]  0.12496 | 0.03397 0.22364 0.11726
['Mean', 2, 'MaxColumns', False]  0.11191 | 0.03679 0.17484 0.12411
['Mean', 3, 'From', False]  0.32916 | 0.01327 0.49264 0.48157
['Mean', 3, 'To', False]  0.30546 | 0.03935 0.43272 0.44431
['Mean', 3, 'MeanColumns', False]

We calculate the best attention setup using Optimus variations (we do not use the Optimus implementation at this step)

In [19]:
# Summarise the `metrics` results across all setups; the helper prints the
# Baseline, Max Across, Per Label Per Instance and Per Instance rows.
print_results_ap(metrics, label_names, conf)

Baseline: 1.7544014296877464e-09  and NZW: 1.0 and AUPRC: 0.5230480725075674
Max Across: 2.841713355947184e-09  and NZW: 1.0 and AUPRC: 0.3302723078634748


  out=out, **kwargs)


Per Label Per Instance: 0.12210653584863607  and NZW:  0.9962263640394727 and AUPRC: 0.48074061828538245
Per Instance: 4.972818860686512e-08  and NZW:  0.9968681535913834 and AUPRC: 0.3300230825003802


In [20]:
# Same summary for the `metricsP` results.
print_results_ap(metricsP, label_names, conf)

Baseline: 0.3499721582979958  and NZW: 1.0 and AUPRC: 0.5230480725075674
Max Across: 0.3551529967270079  and NZW: 1.0 and AUPRC: 0.514411515352223
Per Label Per Instance: 0.4215107192106684  and NZW:  0.9976558454549584 and AUPRC: 0.5150572838839945
Per Instance: 0.4215107192106684  and NZW:  0.9976558454549584 and AUPRC: 0.48888244381752805


We repeat the process with raw attention scores, which can take negative values (A*), i.e., by skipping the Softmax function. In the attention setups, we exclude the multiplication option in heads and layers, as a few combinations reach +/-inf

In [21]:
# Attention setups [ci, ce, cp, cl] for the raw-score run. Compared with the
# softmax run, the 'Multi' option is left out of ci.
#   cp - matrix reduction (From, To, MeanColumns, MaxColumns are used;
#        MeanRows / MaxRows exist as options but are left out)
#   cl - selection flag (True: select layers per head, False: do not); only False
conf = [
    [ci, ce, cp, cl]
    for ci in ['Mean'] + list(range(12))
    for ce in ['Mean'] + list(range(12))
    for cp in ['From', 'To', 'MeanColumns', 'MaxColumns']
    for cl in [False]
]
len(conf)

676

In [22]:
# Raw-score attention experiment: rebuild the scaled query-key scores of every
# layer and head without the softmax, then evaluate each setup in `conf`
# exactly as in the softmax-based run.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    # Create the output directory before the multi-hour run so the final dumps
    # cannot fail on a missing folder.
    os.makedirs(save_path, exist_ok=True)

    now = datetime.datetime.now()

    file_name = save_path + 'HX_ROBERTA_A_ATTENTION_NO_SOFTMAX_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    metrics = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    metricsP = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    time_r = [[] for unused in conf]  # per setup: seconds spent building the interpretation
    time_b = []   # per instance: seconds spent on FTP with `evaluation`
    time_b2 = []  # per instance: seconds spent on FTP with `evaluationP`

    # 12 layers, 12 heads, 64 features per head (12 * 64 = 768 hidden features).
    n_layers, n_heads, head_dim = 12, 12, 64
    # Run the projections on whatever device the model lives on instead of
    # assuming 'cuda'.
    model_device = next(model.trainer.model.parameters()).device

    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache() 
        test_rational = new_rationale[ind]
        instance = test_texts[ind]
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        my_explainers.save_states = {}
        # `_` keeps the second return value and is passed below purely as a
        # positional placeholder.
        prediction, _, hidden_states = model.my_predict(instance)
        # The instance is passed twice; only the first encoding is used.
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        attention = []
        # Inference only: no autograd graph is needed for the projections.
        with torch.no_grad():
            for la in range(n_layers):
                our_new_layer = []
                self_attention = model.trainer.model.base_model.encoder.layer[la].attention.self
                # hidden_states[la] is fed to layer `la`'s key/query projections;
                # it is converted and moved once for both of them.
                # assumes hidden_states[la] is 2-D (tokens x 768) -- TODO confirm
                layer_input = torch.tensor(hidden_states[la]).to(model_device)
                keys = self_attention.key(layer_input)
                queries = self_attention.query(layer_input)
                for he in range(n_heads):
                    lo, hi = he * head_dim, (he + 1) * head_dim
                    attention_scores = torch.matmul(queries[:, lo:hi], keys[:, lo:hi].transpose(-1, -2))
                    attention_scores = attention_scores / math.sqrt(head_dim)
                    our_new_layer.append(attention_scores.cpu().detach().numpy())
                attention.append(our_new_layer)
        attention = np.array(attention)

        interpretations = []
        for kk, con in enumerate(conf):
            ts = time.time()
            my_explainers.config = con
            temp = my_explainers.my_attention(instance, prediction, tokens, mask, attention, _)
            interpretations.append([maxabs_scale(i) for i in temp])
            time_r[kk].append(time.time()-ts)
        for metric in metrics.keys():
            evaluated = []
            k = 0  # accumulated evaluation time for this metric
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                k = k + (time.time()-tt)
            if metric == 'FTP':
                time_b.append(k)
            metrics[metric].append(evaluated)
        # Hand the state gathered by the first evaluator to the second one.
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()
        for metric in metrics.keys():
            evaluated = []
            k = 0
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluationP[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                k = k + (time.time()-tt)
            if metric == 'FTP':
                time_b2.append(k)
            metricsP[metric].append(evaluated)
with open(file_name+' (A).pickle', 'wb') as handle:
    pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+' (P).pickle', 'wb') as handle:
    pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'_TIME.pickle', 'wb') as handle:
    pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)
time_r = np.array(time_r)
# time_r.mean(axis=1).min(),time_r.mean(axis=1).max(), time_r.mean(axis=1).mean(), time_r.sum(axis=1).mean(), np.mean(time_b), np.mean(time_b2)

100%|██████████| 2000/2000 [3:21:21<00:00,  6.04s/it]  


We present the results for the different attention setups

In [24]:
# Report, per raw-score attention setup in `conf`, the metrics collected
# through `evaluation` (the evaluator created with evaluation_level_all=True).
print_results(file_name+' (A)', conf, metrics, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.0 | -0.15285 0.12945 0.0234
['Mean', 'Mean', 'To', False]  0.0 | -0.1944 0.13837 0.05603
['Mean', 'Mean', 'MeanColumns', False]  0.0 | -0.11117 0.10639 0.00478


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


['Mean', 'Mean', 'MaxColumns', False]  0.0 | -0.07519 0.07725 -0.00207
['Mean', 0, 'From', False]  -0.0 | -0.13736 0.12191 0.01545
['Mean', 0, 'To', False]  0.0 | -0.17898 0.11907 0.05991
['Mean', 0, 'MeanColumns', False]  0.0 | -0.12786 0.12085 0.00702
['Mean', 0, 'MaxColumns', False]  0.0 | -0.0778 0.07744 0.00037
['Mean', 1, 'From', False]  0.0 | -0.1735 0.14227 0.03122
['Mean', 1, 'To', False]  0.0 | -0.1889 0.12985 0.05905
['Mean', 1, 'MeanColumns', False]  0.0 | -0.13549 0.13574 -0.00026
['Mean', 1, 'MaxColumns', False]  0.0 | -0.09501 0.09776 -0.00275
['Mean', 2, 'From', False]  -0.0 | -0.10922 0.11895 -0.00973
['Mean', 2, 'To', False]  0.0 | -0.16873 0.12466 0.04407
['Mean', 2, 'MeanColumns', False]  0.0 | -0.07321 0.06864 0.00457
['Mean', 2, 'MaxColumns', False]  0.0 | -0.05642 0.05581 0.00061
['Mean', 3, 'From', False]  -0.0 | -0.0363 0.03465 0.00165
['Mean', 3, 'To', False]  0.0 | -0.18383 0.13917 0.04465
['Mean', 3, 'MeanColumns', False]  0.0 | -0.12083 0.10867 0.01216
['Me

In [25]:
# Report, per raw-score attention setup in `conf`, the metrics collected
# through `evaluationP` (the evaluator created with evaluation_level_all=False).
print_results(file_name+' (P)', conf, metricsP, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.24573 | -0.00734 0.4294 0.31513
['Mean', 'Mean', 'To', False]  0.32844 | 0.01381 0.47042 0.5011
['Mean', 'Mean', 'MeanColumns', False]  0.22146 | 0.04711 0.3962 0.22108
['Mean', 'Mean', 'MaxColumns', False]  0.14764 | 0.03861 0.28648 0.11782
['Mean', 0, 'From', False]  0.22276 | 0.00073 0.40897 0.25858
['Mean', 0, 'To', False]  0.31247 | 0.01687 0.41471 0.50583
['Mean', 0, 'MeanColumns', False]  0.23638 | 0.03821 0.42789 0.24306
['Mean', 0, 'MaxColumns', False]  0.15484 | 0.03776 0.2871 0.13964
['Mean', 1, 'From', False]  0.28394 | -0.00584 0.47458 0.38308
['Mean', 1, 'To', False]  0.32568 | 0.01588 0.45353 0.50763
['Mean', 1, 'MeanColumns', False]  0.25019 | 0.04459 0.48426 0.2217
['Mean', 1, 'MaxColumns', False]  0.17113 | 0.03456 0.34888 0.12994
['Mean', 2, 'From', False]  0.12446 | -0.03154 0.36261 0.04233
['Mean', 2, 'To', False]  0.29359 | 0.01766 0.42385 0.43926
['Mean', 2, 'MeanColumns', False]  0.16385 | 0.04873 0.26531 0.17751
['Mean', 2

We calculate the best attention setup using Optimus variations (we do not use the Optimus implementation script at this step)

In [26]:
# Summarise the raw-score `metrics` results across all setups; the helper prints
# the Baseline, Max Across, Per Label Per Instance and Per Instance rows.
print_results_ap(metrics, label_names, conf)

Baseline: 1.6713944348915413e-10  and NZW: 1.0 and AUPRC: 0.48271108390913753
Max Across: 2.7573005158034367e-09  and NZW: 1.0 and AUPRC: 0.36765689294535475


  out=out, **kwargs)


Per Label Per Instance: 0.23726878004831597  and NZW:  1.0 and AUPRC: 0.471900178566996
Per Instance: 1.0146648886478922e-07  and NZW:  1.0 and AUPRC: 0.346345213467173


In [27]:
# Same summary for the raw-score `metricsP` results.
print_results_ap(metricsP, label_names, conf)

Baseline: 0.24573240053763992  and NZW: 1.0 and AUPRC: 0.48271108390913753
Max Across: 0.3463847875089523  and NZW: 1.0 and AUPRC: 0.5432329825733309
Per Label Per Instance: 0.4376824211968728  and NZW:  1.0 and AUPRC: 0.4923238352416271
Per Instance: 0.4376824211968728  and NZW:  1.0 and AUPRC: 0.48237473915757806
