## ESNLI RoBERTa multiclass
In this notebook we examine the performance of interpretability techniques on the ESNLI dataset, using RoBERTa at the token level.

In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, average_precision_score
from dataset import Dataset
from myModel import MyModel, MyDataset
from myExplainers import MyExplainer
from myEvaluation import MyEvaluation
from sklearn.preprocessing import maxabs_scale
import pickle
from tqdm import tqdm
import datetime
import csv
import warnings
import torch
import tensorflow as tf
from scipy.special import softmax
from helper import print_results, print_results_ap

We set the dataset, model and result paths, choose the transformer model, and state whether rationales are available in the dataset.

In [2]:
# Relative locations used throughout the notebook:
#   data_path  - pickled dataset handed to Dataset(...)
#   model_path - directory handed to MyModel(...)
#   save_path  - prefix of every result pickle written by the experiment cells
data_path = '../datasets/esnli_multiclass.pickle'
model_path = 'Trained Models/'
save_path = 'Results/esnli_multiclass/'

In [3]:
# model_name and dataset_name are forwarded to MyModel(...) when the model is loaded.
model_name = 'roberta'
dataset_name='esnli_roberta_uncased_multiclass'
# Flag stating that the dataset ships with rationales (no visible cell reads it).
existing_rationales = True

Load MyModel and its corresponding tokenizer

In [5]:
# Task description forwarded to MyModel.
task = 'single_label'
sentence_level = False
labels = 3

# Two wrappers built from identical arguments except for the `attention` flag:
# `model` is used when attention matrices / hidden states are needed,
# `model_no_attention` is handed to the explainers and evaluators.
model = MyModel(model_path, dataset_name, model_name, task, labels, cased=False, attention=True)
model_no_attention = MyModel(model_path, dataset_name, model_name, task, labels, cased=False, attention=False)
max_sequence_len = model.tokenizer.max_len_single_sentence
tokenizer = model.tokenizer

# torch is already imported in the first cell, so it is not re-imported here.
# Fall back to the CPU instead of crashing when no GPU is visible.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(torch.cuda.is_available())
model.trainer.model.to(device)

True


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [6]:
# Wrap the dataset file and load it; the loader returns the data splits
# (unpacked in the next cell) together with the label names.
esnli = Dataset(path = data_path)
dataset, label_names = esnli.load_esnli_multiclass()

In [7]:
# `dataset` unpacks into seven parts: texts and labels for the train, test and
# validation splits, plus rationales for the test split only.
train_texts, test_texts, test_rationales, validation_texts, train_labels, test_labels, validation_labels = dataset

In [8]:
# Normalise every test rationale IN PLACE so that its three per-label slots are
# plain Python lists of the same length: the slot that carries the rationale is
# kept (converted to a list) and the two other slots are filled with zeros.
#
# Which slot is kept:
#   slots 0 and 1 empty -> slot 2
#   slots 0 and 2 empty -> slot 1
#   anything else       -> slot 0
#
# NOTE: this cell is not idempotent. After one run no slot is empty any more, so
# a second run takes the "anything else" branch and overwrites slots 1 and 2
# with zeros. Reload the dataset before re-running it.
for entry in test_rationales:
    # len() is well defined for lists and numpy arrays alike, whereas
    # `array == []` is an elementwise comparison between incompatible shapes
    # (deprecated, and an error in recent NumPy releases).
    first_is_empty = len(entry[0]) == 0
    if first_is_empty and len(entry[1]) == 0:
        keep = 2
    elif first_is_empty and len(entry[2]) == 0:
        keep = 1
    else:
        keep = 0

    kept = list(entry[keep])
    for slot in range(3):
        entry[slot] = kept if slot == keep else [0] * len(kept)

  
  import sys


In [9]:
# Second name for the same object (an alias, not a copy): any in-place change to
# `test_rationales` is visible through `test_test_rationales` and vice versa.
test_test_rationales = test_rationales

In [10]:
# Expand the word-level rationales to sub-word (tokenizer) level.
# new_rationale[i][j] is a flat 0/1 list for test instance i and label j that
# repeats each word's flag once per sub-word token produced for that word.
new_rationale = []
len_test = len(test_labels) # 2000
num_labels = len(np.unique(test_labels)) #3

for i in range(len_test):
    words = test_texts[i].split(' ')

    # The tokenisation does not depend on the label, so count the sub-word
    # pieces of every word once per instance instead of once per label.
    pieces_per_word = []
    for k, word in enumerate(words):
        if k == 0:
            # First word: tokenise it followed by a dummy ' a' and drop the
            # last token (assumes the dummy maps to exactly one token).
            pieces = tokenizer.tokenize(word + ' a')[:-1]
        else:
            # Later words: tokenise them preceded by a dummy 'a ' so the word
            # is seen with a leading space, then drop the first token.
            pieces = tokenizer.tokenize('a ' + word)[1:]
        pieces_per_word.append(len(pieces))

    rationale = []
    for j in range(num_labels):
        label_rational = []
        for k, n_pieces in enumerate(pieces_per_word):
            # Only look the flag up when the word produced at least one piece,
            # mirroring the original loop, which never indexed the rationale
            # for a word without tokens.
            if n_pieces:
                flag = 1 if test_test_rationales[i][j][k] > 0 else 0
                label_rational.extend([flag] * n_pieces)
        rationale.append(label_rational)
    new_rationale.append(rationale)

Then, we measure the performance of the model using accuracy and F1 score (both macro and micro)

In [11]:
# First element returned by my_predict for every test text (the class scores).
predictions = [model.my_predict(test_text)[0] for test_text in test_texts]

# Predicted class = index of the largest softmax probability.
pred_labels = [np.argmax(softmax(prediction)) for prediction in predictions]

# Displayed as the cell output: (accuracy, macro F1, micro F1).
accuracy_score(test_labels, pred_labels), f1_score(test_labels, pred_labels, average='macro'), f1_score(test_labels, pred_labels, average='micro')

2000it [02:04, 14.59it/s]            

(0.83, 0.8304768716032468, 0.83)

2000it [02:19, 14.59it/s]

In [10]:
# Explainer and evaluators all work on the wrapper built with attention=False.
my_explainers = MyExplainer(label_names, model_no_attention) #model 2

# Arguments shared by both evaluators; they differ only in `evaluation_level_all`.
# NOTE(review): presumably True scores every label and False only the predicted
# one (results are stored as "(A)" and "(P)") - confirm in myEvaluation.
shared_eval_args = dict(sentence_level=False, task='multi-class', tokenizer=model.tokenizer)
my_evaluators = MyEvaluation(label_names, model_no_attention.my_predict, evaluation_level_all=True, **shared_eval_args) #model 2
my_evaluatorsP = MyEvaluation(label_names, model_no_attention.my_predict, evaluation_level_all=False, **shared_eval_args) #model 2

# Metric name -> bound evaluation method, one table per evaluator.
evaluation = {
    'F': my_evaluators.faithfulness,
    'FTP': my_evaluators.faithful_truthfulness_penalty,
    'NZW': my_evaluators.nzw,
    'AUPRC': my_evaluators.auprc,
}
evaluationP = {
    'F': my_evaluatorsP.faithfulness,
    'FTP': my_evaluatorsP.faithful_truthfulness_penalty,
    'NZW': my_evaluatorsP.nzw,
    # NOTE(review): AUPRC is taken from `my_evaluators`, not `my_evaluatorsP`,
    # so the (A) and (P) AUPRC values are identical - confirm this is intended.
    'AUPRC': my_evaluators.auprc,
}

In [11]:
import os
import time

# Create the output directory BEFORE the long-running loop: the pickles are only
# written at the very end, so a missing directory would otherwise lose the run.
os.makedirs(save_path, exist_ok=True)

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    now = datetime.datetime.now()
    file_name = save_path + 'ESNLI_ROBERTA_IG_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    # One list per metric; each entry holds the scores of all techniques for one instance.
    metrics = {'F':[], 'FTP':[], 'NZW':[], 'AUPRC' : []}
    metricsP = {'F':[], 'FTP':[], 'NZW':[], 'AUPRC' : []}
    time_r = []  # seconds spent producing each interpretation
    my_explainers.neighbours = 2000
    techniques = [my_explainers.ig] #my_explainers.lime
    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache()
        test_rational = new_rationale[ind]
        instance = test_texts[ind]
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        # Prediction from the wrapper built with attention=False. The last
        # returned value stays bound to `_` and is forwarded below wherever an
        # argument is only a placeholder.
        prediction, _, _ = model_no_attention.my_predict(instance)
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        interpretations = []
        for technique in techniques:
            ts = time.time()
            temp = technique(instance, prediction, tokens, mask, _, _)
            # Scale every per-label weight vector by its largest absolute value.
            # An all-zero vector yields NaN (0/0); the RuntimeWarning is
            # silenced by the filter above.
            scaled = []
            for weights in temp:
                weights = np.array(weights)
                scaled.append(weights / np.max(abs(weights)))
            interpretations.append(scaled)
            time_r.append(time.time()-ts)

        # (A) evaluation.
        for metric in metrics:
            evaluated = []
            for interpretation in interpretations:
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
            metrics[metric].append(evaluated)

        # Hand the state gathered by the (A) evaluator to the (P) evaluator
        # (presumably so cached results are reused - confirm in MyEvaluation),
        # then reset the (A) evaluator.
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()
        my_evaluators.clear_states()

        # (P) evaluation.
        for metric in metrics:
            evaluatedP = []
            for interpretation in interpretations:
                evaluatedP.append(evaluationP[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
            metricsP[metric].append(evaluatedP)

# Persist both metric sets and the timings.
with open(file_name+'(A).pickle', 'wb') as handle:
    pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'(P).pickle', 'wb') as handle:
    pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'_TIME.pickle', 'wb') as handle:
    pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)
time_r = np.array(time_r)

100%|██████████| 2000/2000 [46:28<00:00,  1.39s/it] 


We present the results for IG

In [12]:
# Seconds spent computing each IG interpretation (one entry per test instance,
# because a single technique was timed).
print(time_r)

[1.21452975 0.7353301  0.78451133 ... 1.17576408 0.61950946 0.60852265]


In [13]:
# Report the IG metrics gathered with `my_evaluators`; file_name+'(A)' is the
# stem of the "(A)" pickle written by the IG experiment cell.
print_results(file_name+'(A)', [' IG '], metrics, label_names) #[' LIME', ' IG  ']

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


F
 IG   0.06876000016927719 | 0.07385 0.16597 -0.03353
FTP
 IG   0.26021 | 0.18868 0.32036 0.2716
NZW
 IG   1.0 | 1.0 1.0 1.0
AUPRC
 IG   0.31576 | 0.36912 0.25126 0.3269


In [14]:
# Same report for the metrics gathered with `my_evaluatorsP` ("(P)" pickle).
print_results(file_name+'(P)', [' IG '], metricsP, label_names) #[' LIME', ' IG  ']

F
 IG   0.29211 | 0.27918 0.50539 0.09176
FTP
 IG   0.44162 | 0.31776 0.89863 0.10848
NZW
 IG   1.0 | 1.0 1.0 1.0
AUPRC
 IG   0.31576 | 0.36912 0.25126 0.3269


Then, we perform the experiments for the different attention setups!

In [15]:
# Grid of attention setups. Each configuration is a 4-item list
# [first option, second option, matrix reduction, per-head layer selection].
# The first two options are aggregation choices ('Mean', 'Multi' or an index
# 0-11); how they map to heads/layers is defined by my_explainers.my_attention.
first_options = ['Mean', 'Multi'] + list(range(12))
second_options = ['Mean'] + list(range(12))
matrix_options = ['From', 'To', 'MeanColumns', 'MaxColumns'] # Matrix: From, To, MeanColumns, MeanRows, MaxColumns, MaxRows
selection_options = [False] # Selection: True: select layers per head, False: do not

conf = [
    [ci, ce, cp, cl]
    for ci in first_options
    for ce in second_options
    for cp in matrix_options
    for cl in selection_options
]
len(conf)

728

In [16]:
import os
import time

# Create the output directory BEFORE the multi-hour loop: the pickles are only
# written at the very end, so a missing directory would otherwise lose the run.
os.makedirs(save_path, exist_ok=True)

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    now = datetime.datetime.now()

    file_name = save_path + 'ESNLI_ROBERTA_ATTENTION_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    # One list per metric; each entry holds the scores of all configurations for one instance.
    metrics = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    metricsP = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    time_r = [[] for con in conf]  # time_r[c][i]: seconds to build configuration c for instance i
    time_b = []                    # per instance: total seconds of the (A) 'FTP' evaluation
    time_b2 = []                   # per instance: total seconds of the (P) 'FTP' evaluation
    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache()
        test_rational = new_rationale[ind]
        instance = test_texts[ind]
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        my_explainers.save_states = {}
        # The last returned value stays bound to `_` and is forwarded below
        # wherever an argument is only a placeholder.
        prediction, attention, _ = model.my_predict(instance)
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        # One interpretation (list of per-label weight vectors) per configuration.
        interpretations = []
        for conf_idx, con in enumerate(conf):
            ts = time.time()
            my_explainers.config = con
            temp = my_explainers.my_attention(instance, prediction, tokens, mask, attention, _)
            interpretations.append([maxabs_scale(i) for i in temp])
            time_r[conf_idx].append(time.time()-ts)

        # (A) evaluation.
        for metric in metrics:
            evaluated = []
            elapsed = 0
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                elapsed = elapsed + (time.time()-tt)
            if metric == 'FTP':
                time_b.append(elapsed)
            metrics[metric].append(evaluated)

        # Hand the state gathered by the (A) evaluator to the (P) evaluator
        # (presumably so cached results are reused - confirm in MyEvaluation).
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()

        # (P) evaluation.
        for metric in metrics:
            evaluated = []
            elapsed = 0
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluationP[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                elapsed = elapsed + (time.time()-tt)
            if metric == 'FTP':
                time_b2.append(elapsed)
            metricsP[metric].append(evaluated)

# Persist both metric sets and the per-configuration timings.
with open(file_name+' (A).pickle', 'wb') as handle:
    pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+' (P).pickle', 'wb') as handle:
    pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'_TIME.pickle', 'wb') as handle:
    pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)
time_r = np.array(time_r)

  0%|          | 0/2000 [00:00<?, ?it/s]

100%|██████████| 2000/2000 [3:10:04<00:00,  5.70s/it]  


We present the results of the different attention setups

In [17]:
# Timing summary of the attention run, in seconds:
# (fastest mean per configuration, slowest mean per configuration,
#  mean over configurations, mean total time per configuration,
#  mean (A) FTP evaluation time, mean (P) FTP evaluation time).
# An expression inside `try` is never shown by the notebook, so the tuple is
# stored and displayed as the last expression of the cell.
try:
    timing_summary = (
        time_r.mean(axis=1).min(),
        time_r.mean(axis=1).max(),
        time_r.mean(axis=1).mean(),
        time_r.sum(axis=1).mean(),
        np.mean(time_b),
        np.mean(time_b2),
    )
except Exception as err:  # keep the notebook running, but show what went wrong
    timing_summary = None
    print('Failure', repr(err))
timing_summary

In [18]:
# Report every attention configuration on the metrics gathered with
# `my_evaluators`; file_name+' (A)' is the stem of the " (A)" pickle written above.
print_results(file_name+' (A)', conf, metrics, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.0 | 0.01181 0.13682 -0.14863
['Mean', 'Mean', 'To', False]  -0.0 | 0.02161 0.18753 -0.20915
['Mean', 'Mean', 'MeanColumns', False]  0.0 | 0.04649 0.16369 -0.21018


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


['Mean', 'Mean', 'MaxColumns', False]  0.0 | 0.07249 0.15436 -0.22685
['Mean', 0, 'From', False]  0.0 | 0.01366 0.1509 -0.16456
['Mean', 0, 'To', False]  0.0 | -0.00665 0.21425 -0.2076
['Mean', 0, 'MeanColumns', False]  0.0 | 0.038 0.17427 -0.21227
['Mean', 0, 'MaxColumns', False]  0.0 | 0.06645 0.18493 -0.25138
['Mean', 1, 'From', False]  0.0 | 0.00233 0.15149 -0.15382
['Mean', 1, 'To', False]  -0.0 | 0.00838 0.18517 -0.19355
['Mean', 1, 'MeanColumns', False]  0.0 | 0.04151 0.15623 -0.19773
['Mean', 1, 'MaxColumns', False]  0.0 | 0.05177 0.15445 -0.20622
['Mean', 2, 'From', False]  0.0 | 0.01043 0.13833 -0.14875
['Mean', 2, 'To', False]  0.0 | 0.02294 0.18429 -0.20723
['Mean', 2, 'MeanColumns', False]  0.0 | 0.04592 0.15152 -0.19744
['Mean', 2, 'MaxColumns', False]  0.0 | 0.05739 0.15426 -0.21165
['Mean', 3, 'From', False]  0.0 | -0.00433 0.13559 -0.13125
['Mean', 3, 'To', False]  0.0 | -0.00477 0.19392 -0.18915
['Mean', 3, 'MeanColumns', False]  0.0 | 0.03548 0.14906 -0.18454
['Mean'

In [19]:
# Same report for the metrics gathered with `my_evaluatorsP` (" (P)" pickle).
print_results(file_name+' (P)', conf, metricsP, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.26633 | 0.21539 0.47654 0.10704
['Mean', 'Mean', 'To', False]  0.34166 | 0.27798 0.63324 0.11375
['Mean', 'Mean', 'MeanColumns', False]  0.3596 | 0.35765 0.59491 0.12624
['Mean', 'Mean', 'MaxColumns', False]  0.37231 | 0.42393 0.57609 0.11691
['Mean', 0, 'From', False]  0.28462 | 0.22426 0.52001 0.1096
['Mean', 0, 'To', False]  0.33844 | 0.20836 0.69195 0.115
['Mean', 0, 'MeanColumns', False]  0.36147 | 0.3353 0.62306 0.12605
['Mean', 0, 'MaxColumns', False]  0.40561 | 0.42967 0.6663 0.12085
['Mean', 1, 'From', False]  0.26892 | 0.19606 0.50616 0.10453
['Mean', 1, 'To', False]  0.3244 | 0.24532 0.61511 0.11278
['Mean', 1, 'MeanColumns', False]  0.34055 | 0.3361 0.56574 0.1198
['Mean', 1, 'MaxColumns', False]  0.34865 | 0.36259 0.56837 0.11498
['Mean', 2, 'From', False]  0.26699 | 0.21266 0.4795 0.1088
['Mean', 2, 'To', False]  0.34134 | 0.2839 0.62369 0.11643
['Mean', 2, 'MeanColumns', False]  0.34053 | 0.3379 0.55958 0.12411
['Mean', 2, 'MaxColum

We calculate the best attention setup using Optimus variations (we do not use the Optimus implementation at this step)

In [20]:
# Compare the selection strategies printed by the helper (Baseline, Max Across,
# Per Label Per Instance, Per Instance) on the (A) metrics.
print_results_ap(metrics, label_names, conf)

Baseline: 6.355458011716072e-10  and NZW: 1.0 and AUPRC: 0.27370922283549076
Max Across: 2.238238659228268e-09  and NZW: 1.0 and AUPRC: 0.3460633685303492
Per Label Per Instance: 0.13580163759051486  and NZW:  0.9994424747549747 and AUPRC: 0.295972036173079
Per Instance: 6.92581818654651e-08  and NZW:  0.9967447603121457 and AUPRC: 0.24632081131425285


In [21]:
# Same comparison of selection strategies on the (P) metrics.
print_results_ap(metricsP, label_names, conf)

Baseline: 0.2663250315500095  and NZW: 1.0 and AUPRC: 0.27370922283549076
Max Across: 0.5972881369953119  and NZW: 1.0 and AUPRC: 0.593336399232493
Per Label Per Instance: 0.21569956076050212  and NZW:  0.9990124583548975 and AUPRC: 0.34942476761025104


  out=out, **kwargs)


Per Instance: 0.8763609159462781  and NZW:  0.9990124583548975 and AUPRC: 0.4084736283608806


We repeat the process with attention scores that can take negative values (A*), i.e., by skipping the Softmax function. In the attention setups, we exclude the multiplication option in heads and layers, as a few combinations reach +/-inf

In [22]:
# Grid of attention setups for the raw (pre-softmax) scores. Same layout as
# before, [first option, second option, matrix reduction, per-head layer
# selection], but without the 'Multi' choice in the first option.
first_options = ['Mean'] + list(range(12))
second_options = ['Mean'] + list(range(12))
matrix_options = ['From', 'To', 'MeanColumns', 'MaxColumns'] # Matrix: From, To, MeanColumns, MeanRows, MaxColumns, MaxRows
selection_options = [False] # Selection: True: select layers per head, False: do not

conf = [
    [ci, ce, cp, cl]
    for ci in first_options
    for ce in second_options
    for cp in matrix_options
    for cl in selection_options
]
len(conf)

676

In [23]:
import math
import os
import time

# Create the output directory BEFORE the multi-hour loop: the pickles are only
# written at the very end, so a missing directory would otherwise lose the run.
os.makedirs(save_path, exist_ok=True)

# Geometry hard-coded by this notebook: 12 encoder layers and 12 heads of width
# 64 (12 * 64 = 768, the width of the query/key projections).
n_layers, n_heads, head_dim = 12, 12, 64
encoder_model = model.trainer.model
# Run the projections on whatever device the model lives on instead of
# assuming 'cuda'.
model_device = next(encoder_model.parameters()).device

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

    now = datetime.datetime.now()

    file_name = save_path + 'ESNLI_ROBERTA_A_ATTENTION_NO_SOFTMAX_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)
    # One list per metric; each entry holds the scores of all configurations for one instance.
    metrics = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    metricsP = {'FTP':[], 'F':[], 'NZW':[], 'AUPRC' : []}
    time_r = [[] for con in conf]  # time_r[c][i]: seconds to build configuration c for instance i
    time_b = []                    # per instance: total seconds of the (A) 'FTP' evaluation
    time_b2 = []                   # per instance: total seconds of the (P) 'FTP' evaluation
    for ind in tqdm(range(len(test_texts))):
        torch.cuda.empty_cache()
        test_rational = new_rationale[ind]
        instance = test_texts[ind]
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()
        my_explainers.save_states = {}
        # The second returned value stays bound to `_` and is forwarded below
        # wherever an argument is only a placeholder.
        prediction, _, hidden_states = model.my_predict(instance)
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]
        mask = enc.attention_mask
        tokens = enc.tokens

        # Rebuild the raw attention scores Q.K^T / sqrt(head_dim) of every layer
        # and head from the hidden states, i.e. the scores before the softmax.
        # assumes hidden_states[la] is the (tokens, 768) input of layer `la` - TODO confirm
        attention = []
        with torch.no_grad():  # inference only; no autograd graph is needed
            for la in range(n_layers):
                self_attention = encoder_model.base_model.encoder.layer[la].attention.self
                layer_input = torch.tensor(hidden_states[la]).to(model_device)
                keys = self_attention.key(layer_input)
                queries = self_attention.query(layer_input)
                layer_scores = []
                for he in range(n_heads):
                    # Feature slice that belongs to head `he`.
                    lo, hi = he * head_dim, (he + 1) * head_dim
                    attention_scores = torch.matmul(queries[..., lo:hi], keys[..., lo:hi].transpose(-1, -2))
                    attention_scores = attention_scores / math.sqrt(head_dim)
                    layer_scores.append(attention_scores.cpu().numpy())
                attention.append(layer_scores)
        attention = np.array(attention)

        # One interpretation (list of per-label weight vectors) per configuration.
        interpretations = []
        for conf_idx, con in enumerate(conf):
            ts = time.time()
            my_explainers.config = con
            temp = my_explainers.my_attention(instance, prediction, tokens, mask, attention, _)
            interpretations.append([maxabs_scale(i) for i in temp])
            time_r[conf_idx].append(time.time()-ts)

        # (A) evaluation.
        for metric in metrics:
            evaluated = []
            elapsed = 0
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                elapsed = elapsed + (time.time()-tt)
            if metric == 'FTP':
                time_b.append(elapsed)
            metrics[metric].append(evaluated)

        # Hand the state gathered by the (A) evaluator to the (P) evaluator
        # (presumably so cached results are reused - confirm in MyEvaluation).
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()

        # (P) evaluation.
        for metric in metrics:
            evaluated = []
            elapsed = 0
            for interpretation in interpretations:
                tt = time.time()
                evaluated.append(evaluationP[metric](interpretation, _, instance, prediction, tokens, _, _, test_rational))
                elapsed = elapsed + (time.time()-tt)
            if metric == 'FTP':
                time_b2.append(elapsed)
            metricsP[metric].append(evaluated)

# Persist both metric sets and the per-configuration timings.
with open(file_name+' (A).pickle', 'wb') as handle:
    pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+' (P).pickle', 'wb') as handle:
    pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name+'_TIME.pickle', 'wb') as handle:
    pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)
time_r = np.array(time_r)
# Cell output: timing summary in seconds (min / max / mean of the per-configuration
# means, mean total per configuration, mean (A) and (P) FTP evaluation time).
time_r.mean(axis=1).min(),time_r.mean(axis=1).max(), time_r.mean(axis=1).mean(), time_r.sum(axis=1).mean(), np.mean(time_b), np.mean(time_b2)

100%|██████████| 2000/2000 [2:58:59<00:00,  5.37s/it]  


We present the results for the different attention setups

In [25]:
# Report every configuration of the no-softmax run on the metrics gathered with
# `my_evaluators`; file_name+' (A)' is the stem of the " (A)" pickle written above.
print_results(file_name+' (A)', conf, metrics, label_names)

FTP
['Mean', 'Mean', 'From', False]  -0.0 | 0.06015 -0.01987 -0.04029
['Mean', 'Mean', 'To', False]  -0.0 | 0.03847 0.23426 -0.27273


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


['Mean', 'Mean', 'MeanColumns', False]  0.0 | 0.00513 0.14233 -0.14746
['Mean', 'Mean', 'MaxColumns', False]  0.0 | 0.0544 0.13069 -0.18509
['Mean', 0, 'From', False]  -0.0 | 0.04986 0.01316 -0.06302
['Mean', 0, 'To', False]  -0.0 | 0.02327 0.19773 -0.221
['Mean', 0, 'MeanColumns', False]  0.0 | -0.0008 0.1525 -0.1517
['Mean', 0, 'MaxColumns', False]  -0.0 | 0.04284 0.14683 -0.18966
['Mean', 1, 'From', False]  -0.0 | 0.03564 0.04963 -0.08527
['Mean', 1, 'To', False]  -0.0 | 0.02022 0.21748 -0.2377
['Mean', 1, 'MeanColumns', False]  0.0 | 0.01393 0.10004 -0.11397
['Mean', 1, 'MaxColumns', False]  -0.0 | 0.05323 0.12777 -0.181
['Mean', 2, 'From', False]  -0.0 | 0.07478 -0.0425 -0.03228
['Mean', 2, 'To', False]  -0.0 | 0.01604 0.27494 -0.29097
['Mean', 2, 'MeanColumns', False]  0.0 | 0.01569 0.15164 -0.16733
['Mean', 2, 'MaxColumns', False]  0.0 | 0.02422 0.14666 -0.17087
['Mean', 3, 'From', False]  -0.0 | 0.04774 -0.10132 0.05358
['Mean', 3, 'To', False]  -0.0 | 0.00261 0.22732 -0.22992


In [26]:
# Same report for the metrics gathered with `my_evaluatorsP` (" (P)" pickle).
print_results(file_name+' (P)', conf, metricsP, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.02142 | 0.15852 -0.05311 -0.04115
['Mean', 'Mean', 'To', False]  0.44939 | 0.39427 0.80796 0.14593
['Mean', 'Mean', 'MeanColumns', False]  0.27596 | 0.2212 0.49282 0.11386
['Mean', 'Mean', 'MaxColumns', False]  0.31848 | 0.35707 0.48555 0.11284
['Mean', 0, 'From', False]  0.02856 | 0.11262 0.0361 -0.06304
['Mean', 0, 'To', False]  0.38241 | 0.33166 0.67626 0.13932
['Mean', 0, 'MeanColumns', False]  0.27757 | 0.21093 0.51509 0.10669
['Mean', 0, 'MaxColumns', False]  0.3216 | 0.33724 0.5201 0.10746
['Mean', 1, 'From', False]  0.11203 | 0.16111 0.16884 0.00614
['Mean', 1, 'To', False]  0.4135 | 0.3458 0.74063 0.15407
['Mean', 1, 'MeanColumns', False]  0.23571 | 0.22775 0.36609 0.11328
['Mean', 1, 'MaxColumns', False]  0.31869 | 0.35736 0.48018 0.11853
['Mean', 2, 'From', False]  0.00415 | 0.18323 -0.1229 -0.04788
['Mean', 2, 'To', False]  0.47132 | 0.36124 0.90777 0.14494
['Mean', 2, 'MeanColumns', False]  0.3044 | 0.26776 0.52773 0.11771
['Mean', 2,

We calculate the best attention setup using Optimus variations (we do not use the Optimus implementation script at this step)

In [27]:
# Compare the selection strategies printed by the helper (Baseline, Max Across,
# Per Label Per Instance, Per Instance) on the (A) metrics of the no-softmax run.
print_results_ap(metrics, label_names, conf)

Baseline: -8.430266924356703e-10  and NZW: 1.0 and AUPRC: 0.3007993244184506
Max Across: 3.1107445878456965e-09  and NZW: 1.0 and AUPRC: 0.37818009498387956
Per Label Per Instance: 0.2858818703880026  and NZW:  1.0 and AUPRC: 0.2993247448324681
Per Instance: 1.4940706986472005e-07  and NZW:  1.0 and AUPRC: 0.24557778620093895


In [28]:
# Same comparison of selection strategies on the (P) metrics of the no-softmax run.
print_results_ap(metricsP, label_names, conf)

Baseline: 0.021419146638864277  and NZW: 1.0 and AUPRC: 0.3007993244184506
Max Across: 0.5953242977878842  and NZW: 1.0 and AUPRC: 0.5908744240714098
Per Label Per Instance: 0.22061073223594355  and NZW:  1.0 and AUPRC: 0.36930883130574754


  out=out, **kwargs)


Per Instance: 0.8956977855085383  and NZW:  1.0 and AUPRC: 0.41354336714479706
