In [None]:
# Mount Google Drive so the project files can be read and the results stored on it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
# Add the project folder to the import path so that modules stored there can be imported
# (presumably the local helper / myModel / myEvaluation / myExplainers / dataset modules used below).
sys.path.append('/content/drive/MyDrive/Thesis')

In [None]:
# Install the libraries needed for fine-tuning and for the explainers.
!pip install transformers-interpret==0.5.2
!pip install transformers==4.15.0
!pip install lime==0.2.0.1 #this line is included in order for 'myExplainers.py' to load properly
# NOTE(review): the next two commands remove the transformers version pinned above and
# reinstall transformers/accelerate without a version pin, so the versions actually used
# depend on the day the cell is run — confirm whether the 4.15.0 pin is still intended.
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Collecting transformers-interpret==0.5.2
  Downloading transformers-interpret-0.5.2.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=3.0.0 (from transformers-interpret==0.5.2)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting captum>=0.3.1 (from transformers-interpret==0.5.2)
  Downloading captum-0.6.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers>=3.0.0->transformers-interpret==0.5.2)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers>=3.0.0->transfo

In [None]:
#Imports of libraries required for finetuning and explaining ALBERT
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, average_precision_score
from sklearn.model_selection import train_test_split
from helper import print_results, print_results_ap
from sklearn.preprocessing import maxabs_scale
from myModel import MyModel, MyDataset
from myEvaluation import MyEvaluation
from myExplainers import MyExplainer
from dataset import Dataset
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings
import datetime
import pickle
import torch
import time
import csv
import re

In [None]:
# Locations on the mounted drive: the data and the fine-tuned model share the
# project folder, while results are written to its 'Results' sub-folder.
data_path = model_path = '/content/drive/MyDrive/Thesis/'
save_path = '/content/drive/MyDrive/Thesis/Results/'

Now, it is time to name the model and to define the parameters of 'MyModel' class that loads transformer models.

In [None]:
# Model identifier and task: ALBERT on the multi-label version of Ethos
model_name, task = 'albert', 'multi_label'
# No rationales (ground-truth explanations) are used in this notebook
existing_rationales = False
# Number of labels: violence, directed_vs_generalized, gender, race,
# national_origin, disability, religion, sexual_orientation
labels = 8

Now, let us load the Ethos dataset through the 'load_ethos' function of the 'Dataset' class in 'dataset.py'. It returns x (the instances), y (the labels) and label_names (the names of the labels, including 'hate speech').

In [None]:
# Dataset lives in 'dataset.py'; its parameters are
# (path, x=None, y=None, rationales=None, label_names=None)
hs = Dataset(path=data_path)

# load_ethos returns the instances, their labels and the label names
x, y, all_label_names = hs.load_ethos()

# Skip the first name ('hate speech') and keep only the multi-label targets
label_names = all_label_names[1:]

In [None]:
# Positional indices are split alongside the data so test rows can be traced back
indices = np.arange(len(y))  # len(y) -> 433

# First split: 80% train / 20% test, with a fixed seed for reproducible results
(train_texts, test_texts,
 train_labels, test_labels,
 _, test_indexes) = train_test_split(x, y, indices, test_size=.2, random_state=26)

# No rationales exist for ALBERT, so this branch is not taken in this notebook.
# NOTE(review): `rationales` is not defined in any cell above — this line would
# raise NameError if `existing_rationales` were ever set to True.
if existing_rationales:
    test_rationales = [rationales[idx] for idx in test_indexes]

# Second split: take a validation set worth 10% of the full data out of the
# training part (43.3 / 346 -> roughly 0.125 of the training instances)
size = (0.1 * len(y)) / len(train_labels)  # len(train_labels) -> 346
(train_texts, validation_texts,
 train_labels, validation_labels) = train_test_split(
    train_texts, train_labels, test_size=size, random_state=42)

The dataset is not yet in a form that the transformer can process. We first need to define the tokenizer of the model, which is required by the 'MyDataset' class in 'myModel.py'.

In [None]:
from transformers import AlbertTokenizerFast

# Fast tokenizer of the pretrained 'albert-base-v2' checkpoint.
# Original note: unlike BERT and DistilBERT, ALBERT does not contain 'cs'
# (presumably the cased/uncased distinction — TODO confirm).
tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')

Now, it is time to transform the train, test and validation sets to the
appropriate form. We will use 'MyDataset' class from 'myModel.py'.

In [None]:
# Wrap the raw texts and labels, together with the tokenizer, in the project's MyDataset class.
train_dataset = MyDataset(train_texts, train_labels, tokenizer)
validation_dataset = MyDataset(validation_texts, validation_labels, tokenizer)
# The test set is left unwrapped: later cells pass the raw test texts to the model one at a time.
#test_dataset = MyDataset(test_texts, test_labels, tokenizer)

But before using 'MyModel' class from 'myModel.py', ALBERT should be finetuned!

In [None]:
from transformers import Trainer, TrainingArguments
from myTransformer import AlbertForMultilabelSequenceClassification as transformer_model


# Start from the pretrained ALBERT base checkpoint with one output per label;
# attentions and hidden states are switched on in the model outputs.
model = transformer_model.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_names),
    output_attentions=True,
    output_hidden_states=True,
)

# Fine-tuning configuration: 15 epochs, with evaluation, checkpointing and
# logging all performed once per epoch.
training_arguments = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    log_level='critical',
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=200,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
)

# The trainer receives the model, the arguments and the train/validation sets
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Run the fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMultilabelSequenceClassification: ['predictions.decoder.weight', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForMultilabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMultilabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForMultilabelSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classi

Epoch,Training Loss,Validation Loss
1,0.6435,0.546277
2,0.5057,0.489561
3,0.4638,0.457803
4,0.4116,0.434529
5,0.3358,0.341019
6,0.2624,0.272878
7,0.1853,0.300792
8,0.1311,0.254375
9,0.0958,0.263803
10,0.0703,0.265581


TrainOutput(global_step=570, training_loss=0.2189256955657089, metrics={'train_runtime': 18470.9933, 'train_samples_per_second': 0.245, 'train_steps_per_second': 0.031, 'total_flos': 108322610135040.0, 'train_loss': 0.2189256955657089, 'epoch': 15.0})

Now, an 'albert_hs' folder will be created, containing the trained model. Now, it is time to make predictions. We will use 'MyModel' with the suitable parameters. It is worth noting that ALBERT does not contain 'cased' or 'uncased' argument, but this parameter is passed in 'MyModel' anyway, because other transformers use it.

Now, it is time to save the model in the 'albert_hs' folder.

In [None]:
# Save the fine-tuned model as 'albert_hs' under model_path, i.e. the same
# (model_path, 'albert_hs') pair that is handed to 'MyModel' when the model is
# loaded again, instead of repeating the absolute path as a separate literal.
trainer.model.save_pretrained(model_path + 'albert_hs')

Now, we can use 'MyModel' and then make predictions.

In [None]:
# Load the fine-tuned model through the project's MyModel wrapper.
# 'cased' is passed only because other transformers use that argument.
model = MyModel(model_path,'albert_hs', model_name, task, labels, 'cased')

# The maximum number of tokens a single sentence can have, e.g. 512
max_sequence_len = model.tokenizer.max_len_single_sentence

# From here on, use the tokenizer that MyModel loaded for this model
tokenizer = model.tokenizer

# Run on the GPU when one is available; otherwise stay on the CPU instead of
# failing on an unconditional .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.trainer.model.to(device)

AlbertForMultilabelSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_fea

It is time to make predictions for the test instances.

In [None]:
# Start timing the prediction pass over the test set
starting_prediction_time = time.time()

# Keep the first element of my_predict's output for every test instance
predictions = [model.my_predict(test_instance)[0] for test_instance in test_texts]

# Apply an element-wise sigmoid to the collected outputs and return to numpy
logits = tf.constant(predictions, dtype = tf.float32)
predictions = tf.keras.activations.sigmoid(logits).numpy()

# Report the total time that the predictions took
ending_prediction_time = time.time()
total_time = ending_prediction_time - starting_prediction_time
print('The total time for predictions is:' ,round(total_time,3),' seconds')

The total time for predictions is: 4.348  seconds


Let's print the precision and f1 score of ALBERT's performance!

In [None]:
# Binarise the sigmoid scores: a label counts as predicted when its score is >= 0.5
pred_labels = [[1 if score >= 0.5 else 0 for score in prediction] for prediction in predictions]

def average_precision_wrapper(y, y_pred, view):
    """Return sklearn's average precision of y against y_pred.toarray().

    y_pred must provide a .toarray() method; view is forwarded as the
    `average` argument of average_precision_score.
    """
    return average_precision_score(y, y_pred.toarray(), average=view)

# Macro-averaged scores. Both values are fractions in [0, 1], so they are
# reported without a percent sign.
# NOTE(review): average precision is computed on the thresholded 0/1 labels,
# not on the sigmoid scores in `predictions` — confirm that this is intended.
p_s = f"Average precision score: {round(average_precision_score(test_labels, pred_labels, average='macro'),4)}"
f1 = f"f1 score: {round(f1_score(test_labels, pred_labels, average='macro'),4)}"

#printing results
print(p_s)
print(f1)

Average precision score: 0.6897 %
f1 score score: 0.7822 %


We can also change the hyperparameters for training, but we notice that the performance of ALBERT is already satisfactory and the focus should be shifted on the interpretations. Let's store the results in the 'Results' file.

In [None]:
# Both score strings are stored together as one tuple
data = (p_s, f1)

# Result files are prefixed with 'ALBERT_<day>_<month>_<year>' inside save_path
now = datetime.datetime.now()
file_name = f"{save_path}ALBERT_{now.day}_{now.month}_{now.year}"

# One pickle for the scores and one for the prediction time
for suffix, payload in (('PERFORMANCE.pickle', data), ('TIME.pickle', total_time)):
    with open(file_name + suffix, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

Let's ensure that the results are properly loaded from the file that we stored them.

In [None]:
# Read the stored scores back and print them one per line
with open(file_name+'PERFORMANCE.pickle', 'rb') as handle:
    performance = pickle.load(handle)
for score in performance:
    print(score)

# The loaded value is bound to `stored_time`, not `time`: rebinding `time`
# would shadow the `time` module and break every later `time.time()` call.
with open(file_name+'TIME.pickle', 'rb') as handle:
    stored_time = pickle.load(handle)
print('The total time for predictions is:' ,round(stored_time,3),' seconds')

Average precision score: 0.6897 %
f1 score score: 0.7822 %
The total time for predictions is: 4.348  seconds


Now, let us initialize the explainers and the evaluation module, as well as define the metrics that will be utilized. In this case, the following is true:

* F=Faithfulness
* FTP=RFT (Ranked Faithful Truthfulness)
* NZW=Complexity
* AUPRC=For the rationales.

In [None]:
#layers are 12 this time
my_explainers = MyExplainer(label_names, model, layers=12)

#non zero weights, faithfulness, RFT
my_evaluators = MyEvaluation(label_names, model.my_predict, False, True, tokenizer=tokenizer) #parameters: (label_names, predict, sentence_level, evaluation_level_all=True)
my_evaluatorsP = MyEvaluation(label_names, model.my_predict, False, False, tokenizer=tokenizer)

evaluation =  {'F':my_evaluators.faithfulness, 'FTP': my_evaluators.faithful_truthfulness_penalty,
          'NZW': my_evaluators.nzw}
evaluationP = {'F':my_evaluatorsP.faithfulness, 'FTP': my_evaluatorsP.faithful_truthfulness_penalty,
          'NZW': my_evaluatorsP.nzw}

We will now measure the performance of IG.

In [None]:
# Evaluate Integrated Gradients (IG) on every test instance: compute the
# interpretations, score them with both evaluators ('F', 'FTP', 'NZW') and
# re-write the result pickles after each instance.
# `time` is imported again so the name refers to the module even if an earlier
# cell rebound it.
import time
with warnings.catch_warnings():

    #ignore the warnings
    warnings.simplefilter("ignore", category=RuntimeWarning)

    #date
    now = datetime.datetime.now()

    #saving results: file prefix 'ETHOS_ALBERT_IG_<day>_<month>_<year>' inside save_path
    file_name = save_path + 'ETHOS_ALBERT_IG_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)

    #metrics: one list per metric, filled with one entry per test instance
    metrics = {'F':[], 'FTP':[], 'NZW':[]}
    metricsP = {'F':[], 'FTP':[], 'NZW':[]}

    #time_r = [[],[]]: sublists for each technique
    time_r = [ [] ] #now only ig is present

    #neighbours
    #my_explainers.neighbours = 2000

    #ig
    techniques = [my_explainers.ig]

    #for each test instance
    for ind in tqdm(range(0,len(test_texts))): #progress bar

        #to not run out of memory
        torch.cuda.empty_cache()

        #the instance of test set
        instance = test_texts[ind]

        #resetting the state memory
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()

        #prediction, attention matrix and hidden states. Here we care about predictions
        prediction, _, _ = model.my_predict(instance)

        #tokenizer loaded by MyModel (presumably AlbertTokenizerFast for this model — TODO confirm)
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0] #entry 0 of the batch: the encoding of the first of the two identical sequences

        #real tokens or padding: extracting the mask (not used further in this cell)
        mask = enc.attention_mask

        #token strings of the encoded instance
        tokens = enc.tokens

        interpretations = []
        kk = 0

        #ig now. The loop is kept generic because other techniques will be included later
        for technique in techniques:
            ts = time.time()

            #returns interpretations
            temp = technique(instance, prediction, tokens, enc.ids, _, _) #`_` only fills the attention / hidden-state positions

            #normalization in interpretations: divide each one by its maximum absolute value
            interpretations.append([np.array(i)/np.max(abs(np.array(i))) for i in temp])

            #append the time it took (seconds)
            time_r[kk].append(time.time()-ts)
            kk = kk + 1

        #'F','FTP','NZW'
        for metric in metrics.keys():
            evaluated = []
            for interpretation in interpretations:

                #all parameters: interpretation, tweaked_interpretation, instance, prediction, tokens, hidden_states, t_hidden_states, rationales
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, _))

            #save evaluations in dict
            metrics[metric].append(evaluated)

        #hand a copy of the first evaluator's saved state to the second evaluator
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()

        #clear again all states
        my_evaluators.clear_states()

        for metric in metrics.keys():
            evaluatedP = []
            for interpretation in interpretations:

                #in a similar way as 'evaluation'
                evaluatedP.append(evaluationP[metric](interpretation, _, instance, prediction, tokens, _, _, _))

            #save evaluations
            metricsP[metric].append(evaluatedP)

        #write results to files (overwritten after every instance with everything collected so far)
        with open(file_name+'(A).pickle', 'wb') as handle:
            pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(file_name+'(P).pickle', 'wb') as handle:
            pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(file_name+'_TIME.pickle', 'wb') as handle:
            pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)

#mean interpretation time per technique, displayed as the cell result
time_r = np.array(time_r)
time_r.mean(axis=1)

  0%|          | 0/87 [00:00<?, ?it/s]

  1%|          | 1/87 [00:04<06:46,  4.73s/it]

  2%|▏         | 2/87 [00:08<06:08,  4.33s/it]

  3%|▎         | 3/87 [00:13<06:16,  4.48s/it]

  5%|▍         | 4/87 [00:18<06:28,  4.68s/it]

  6%|▌         | 5/87 [00:22<05:56,  4.35s/it]

  7%|▋         | 6/87 [00:26<05:42,  4.23s/it]

  8%|▊         | 7/87 [00:32<06:34,  4.93s/it]

  9%|▉         | 8/87 [00:37<06:27,  4.90s/it]

 10%|█         | 9/87 [00:39<05:07,  3.94s/it]

 11%|█▏        | 10/87 [00:41<04:35,  3.57s/it]

 13%|█▎        | 11/87 [00:44<04:04,  3.21s/it]

 14%|█▍        | 12/87 [00:47<04:05,  3.27s/it]

 15%|█▍        | 13/87 [00:50<03:55,  3.19s/it]

 16%|█▌        | 14/87 [00:52<03:29,  2.87s/it]

 17%|█▋        | 15/87 [00:55<03:16,  2.73s/it]

 18%|█▊        | 16/87 [00:57<02:55,  2.47s/it]

 20%|█▉        | 17/87 [00:59<02:55,  2.51s/it]

 21%|██        | 18/87 [01:02<02:52,  2.50s/it]

 22%|██▏       | 19/87 [01:06<03:34,  3.15s/it]

 23%|██▎       | 20/87 [01:11<03:53,  3.49s/it]

 24%|██▍       | 21/87 [01:13<03:20,  3.04s/it]

 25%|██▌       | 22/87 [01:15<03:09,  2.91s/it]

 26%|██▋       | 23/87 [01:18<03:11,  3.00s/it]

 28%|██▊       | 24/87 [01:22<03:08,  3.00s/it]

 29%|██▊       | 25/87 [01:24<02:56,  2.84s/it]

 30%|██▉       | 26/87 [01:26<02:38,  2.59s/it]

 31%|███       | 27/87 [01:28<02:27,  2.46s/it]

 32%|███▏      | 28/87 [01:30<02:12,  2.25s/it]

 33%|███▎      | 29/87 [01:33<02:32,  2.63s/it]

 34%|███▍      | 30/87 [01:37<02:48,  2.95s/it]

 36%|███▌      | 31/87 [01:39<02:27,  2.63s/it]

 37%|███▋      | 32/87 [01:44<03:11,  3.48s/it]

 38%|███▊      | 33/87 [01:47<02:48,  3.13s/it]

 39%|███▉      | 34/87 [01:49<02:37,  2.96s/it]

 40%|████      | 35/87 [01:53<02:43,  3.14s/it]

 41%|████▏     | 36/87 [01:56<02:38,  3.10s/it]

 43%|████▎     | 37/87 [01:58<02:17,  2.76s/it]

 44%|████▎     | 38/87 [02:01<02:14,  2.75s/it]

 45%|████▍     | 39/87 [02:11<03:58,  4.97s/it]

 46%|████▌     | 40/87 [02:13<03:20,  4.26s/it]

 47%|████▋     | 41/87 [02:15<02:46,  3.61s/it]

 48%|████▊     | 42/87 [02:17<02:16,  3.03s/it]

 49%|████▉     | 43/87 [02:22<02:39,  3.62s/it]

 51%|█████     | 44/87 [02:25<02:28,  3.46s/it]

 52%|█████▏    | 45/87 [02:28<02:15,  3.23s/it]

 53%|█████▎    | 46/87 [02:30<01:55,  2.82s/it]

 54%|█████▍    | 47/87 [02:31<01:39,  2.49s/it]

 55%|█████▌    | 48/87 [02:35<01:46,  2.73s/it]

 56%|█████▋    | 49/87 [02:38<01:48,  2.86s/it]

 57%|█████▋    | 50/87 [02:41<01:45,  2.86s/it]

 59%|█████▊    | 51/87 [02:43<01:33,  2.61s/it]

 60%|█████▉    | 52/87 [02:45<01:27,  2.50s/it]

 61%|██████    | 53/87 [02:47<01:22,  2.42s/it]

 62%|██████▏   | 54/87 [02:50<01:17,  2.36s/it]

 63%|██████▎   | 55/87 [02:55<01:43,  3.22s/it]

 64%|██████▍   | 56/87 [02:58<01:37,  3.16s/it]

 66%|██████▌   | 57/87 [03:02<01:40,  3.36s/it]

 67%|██████▋   | 58/87 [03:04<01:27,  3.03s/it]

 68%|██████▊   | 59/87 [03:06<01:17,  2.78s/it]

 69%|██████▉   | 60/87 [03:09<01:17,  2.87s/it]

 70%|███████   | 61/87 [03:12<01:17,  2.99s/it]

 71%|███████▏  | 62/87 [03:15<01:14,  3.00s/it]

 72%|███████▏  | 63/87 [03:23<01:41,  4.24s/it]

 74%|███████▎  | 64/87 [03:25<01:28,  3.84s/it]

 75%|███████▍  | 65/87 [03:29<01:21,  3.70s/it]

 76%|███████▌  | 66/87 [03:31<01:09,  3.29s/it]

 77%|███████▋  | 67/87 [03:33<00:56,  2.83s/it]

 78%|███████▊  | 68/87 [03:35<00:48,  2.55s/it]

 79%|███████▉  | 69/87 [03:38<00:50,  2.80s/it]

 80%|████████  | 70/87 [03:42<00:54,  3.22s/it]

 82%|████████▏ | 71/87 [03:47<00:57,  3.61s/it]

 83%|████████▎ | 72/87 [03:50<00:52,  3.49s/it]

 84%|████████▍ | 73/87 [03:54<00:49,  3.53s/it]

 85%|████████▌ | 74/87 [03:58<00:46,  3.61s/it]

 86%|████████▌ | 75/87 [04:01<00:44,  3.70s/it]

 87%|████████▋ | 76/87 [04:08<00:49,  4.54s/it]

 89%|████████▊ | 77/87 [04:13<00:46,  4.64s/it]

 90%|████████▉ | 78/87 [04:15<00:36,  4.01s/it]

 91%|█████████ | 79/87 [04:18<00:30,  3.75s/it]

 92%|█████████▏| 80/87 [04:24<00:29,  4.16s/it]

 93%|█████████▎| 81/87 [04:27<00:23,  3.87s/it]

 94%|█████████▍| 82/87 [04:29<00:16,  3.30s/it]

 95%|█████████▌| 83/87 [04:31<00:12,  3.04s/it]

 97%|█████████▋| 84/87 [04:35<00:09,  3.17s/it]

 98%|█████████▊| 85/87 [04:54<00:16,  8.02s/it]

 99%|█████████▉| 86/87 [04:56<00:06,  6.33s/it]

100%|██████████| 87/87 [05:00<00:00,  3.45s/it]


array([2.33821695])

In [None]:
# Per-instance IG interpretation times in seconds (one row per technique) and their mean per row
print(time_r)
print(time_r.mean(axis=1))

[[4.2008462  2.99340439 3.4092381  4.09793687 2.93011785 3.24460316
  4.45658207 3.65851998 1.4735539  1.92377877 1.69717383 2.22895646
  2.3105793  1.55564594 1.70537925 1.50068283 1.76056409 1.85805845
  2.89363694 2.69129109 1.55168796 1.76659846 2.13104844 2.42292261
  1.71866536 1.5323894  1.55830455 1.5120461  2.21454191 2.42069912
  1.50606179 3.27811146 1.60091352 2.05311203 2.35726333 2.00895834
  1.50937772 1.79924345 5.13466215 2.03292155 1.53977942 1.49004745
  2.83256984 2.44976163 1.78812575 1.47696376 1.53324986 2.09543324
  2.16507936 2.49622655 1.5313735  1.57698774 1.58938313 1.5559051
  2.98705673 2.4513278  2.35616374 1.58155918 1.56794691 1.95257902
  2.20580292 2.07012701 4.08930039 1.9768486  2.26162267 1.84353042
  1.48475528 1.4740994  2.16418076 2.43393683 2.70805502 2.09453344
  2.30115914 2.33612657 3.25800085 3.74314427 3.17704773 2.09631062
  2.26322412 3.08229923 2.11767817 1.57418108 1.97634506 2.36584234
  8.42566586 2.06144547 2.12201285]]
[2.33821695]

In [None]:
# Report the IG metrics gathered with `my_evaluators` (the evaluator created with evaluation_level_all=True)
print_results(file_name+'(A)', [' IG  '], metrics, label_names)

F
 IG    0.04297000169754028 | 0.0477 0.05172 0.03493 0.05733 0.02088 0.01599 0.04116 0.07403
FTP
 IG    0.06574 | 0.04106 0.18587 0.06591 0.03148 0.04383 0.03518 0.06468 0.05792
NZW
 IG    1.0 | 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [None]:
# Report the IG metrics gathered with `my_evaluatorsP` (the evaluator created with evaluation_level_all=False)
print_results(file_name+'(P)', [' IG  '], metricsP, label_names)

F
 IG    0.21211 | 0.12896 0.03997 0.26159 0.32117 0.1991 0.11042 0.15645 0.4792
FTP
 IG    0.152 | 0.0546 0.12613 0.22669 0.19521 0.17807 0.12812 0.16005 0.1471
NZW
 IG    1.0 | 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0


We will now experiment on various attention setups.

In [None]:
# Every attention setup is a list [ci, ce, cp, cl]:
#   ci : 'Mean', 'Multi' or an index 0..11
#   ce : 'Mean' or an index 0..11
#   cp : matrix option — 'From', 'To', 'MeanColumns' or 'MaxColumns'
#        (MeanRows / MaxRows are not used)
#   cl : selection — True: select layers per head, False: do not (only False here)
conf = [
    [ci, ce, cp, cl]
    for ci in ['Mean', 'Multi', *range(12)]
    for ce in ['Mean', *range(12)]
    for cp in ('From', 'To', 'MeanColumns', 'MaxColumns')
    for cl in (False,)
]

len(conf)  # 14 * 13 * 4 * 1 = 728

728

In [None]:
# Evaluate every attention setup in `conf` on every test instance: build the
# interpretations, score them with both evaluators and append the scores of
# the current instance to the result pickles.
# `time` is imported again so the name refers to the module even if an earlier
# cell rebound it.
import time
with warnings.catch_warnings():

    #ignore the warnings
    warnings.simplefilter("ignore", category=RuntimeWarning)

    #date
    now = datetime.datetime.now()

    #saving results: file prefix 'ETHOS_ALBERT_ATTENTION_<day>_<month>_<year>' inside save_path
    file_name = save_path + 'ETHOS_ALBERT_ATTENTION_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)

    #metrics: hold the scores of the current instance only (reset at the end of each iteration)
    metrics = {'FTP':[], 'F':[], 'NZW':[]}
    metricsP = {'FTP':[], 'F':[], 'NZW':[]}

    #times: time_r per setup and instance, time_b / time_b2 total 'FTP' time per instance for each evaluator
    time_r = []
    time_b = []
    time_b2 = []

    #attention setups: one timing list per configuration
    for con in conf:
        time_r.append([])

    for ind in tqdm(range(0,len(test_texts))):

        #to not run out of memory
        torch.cuda.empty_cache()

        #one instance
        instance = test_texts[ind]

        #clear states of evaluators
        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()

        #reset the explainer's saved states for the new instance
        my_explainers.save_states = {}

        #prediction, attention matrix and hidden states. Here we care about predictions and attention.
        prediction, attention, _ = model.my_predict(instance)

        #tokenizer loaded by MyModel (presumably AlbertTokenizerFast for this model — TODO confirm)
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]

        #real tokens or padding: extracting the mask
        mask = enc.attention_mask

        #token strings of the encoded instance
        tokens = enc.tokens

        interpretations = []
        kk = 0
        for con in conf:

            #time
            ts = time.time()

            #set configuration
            my_explainers.config = con

            #returns interpretations
            temp = my_explainers.my_attention(instance, prediction, tokens, mask, attention, _) #`_` only fills the hidden-states position

            #scaling interpretations
            interpretations.append([maxabs_scale(i) for i in temp])

            #append time (seconds) for this setup
            time_r[kk].append(time.time()-ts)
            kk = kk + 1

        #'F','FTP','NZW'
        for metric in metrics.keys():
            evaluated = []
            k = 0

            for interpretation in interpretations:
                tt = time.time()

                #all parameters: interpretation, tweaked_interpretation, instance, prediction, tokens, hidden_states, t_hidden_states, rationales
                evaluated.append(evaluation[metric](interpretation, _, instance, prediction, tokens, _, _, _))
                k = k + (time.time()-tt) #accumulated evaluation time for this metric
            if metric == 'FTP':
                time_b.append(k)
            metrics[metric].append(evaluated)

        #hand a copy of the first evaluator's saved state to the second evaluator
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()

        for metricP in metricsP.keys():
            evaluated = []
            k = 0

            for interpretation in interpretations:
                tt = time.time()

                #all parameters: interpretation, tweaked_interpretation, instance, prediction, tokens, hidden_states, t_hidden_states, rationales
                evaluated.append(evaluationP[metricP](interpretation, _, instance, prediction, tokens, _, _, _))
                k = k + (time.time()-tt)

            if metricP == 'FTP':
                time_b2.append(k)
            metricsP[metricP].append(evaluated)

        #from the second instance on, re-read what was saved so far and append
        #this instance's scores (index 0, since metrics holds one instance only)
        if(ind != 0):
            with open(file_name+' (A).pickle', 'rb') as handle:
                old_metrics = pickle.load(handle)
            with open(file_name+' (P).pickle', 'rb') as handle:
                old_metricsP = pickle.load(handle)

            #append new results
            for key in metrics.keys():
                old_metrics[key].append(metrics[key][0])
                old_metricsP[key].append(metricsP[key][0])
        else:
            #first instance: start the files from the current scores
            old_metrics = metrics
            old_metricsP = metricsP

        #save metrics as below
        with open(file_name+' (A).pickle', 'wb') as handle:
            pickle.dump(old_metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(file_name+' (P).pickle', 'wb') as handle:
            pickle.dump(old_metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(file_name+'_TIME.pickle', 'wb') as handle:
            pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)

        #drop the accumulated results from memory and start empty for the next instance;
        #after the loop the full results exist only in the pickle files
        del old_metrics,old_metricsP
        metrics = {'FTP':[], 'F':[], 'NZW':[]}
        metricsP = {'FTP':[], 'F':[], 'NZW':[]}

#times: min / max / mean of the per-setup mean time, mean over setups of the summed time, mean 'FTP' evaluation time per evaluator
time_r = np.array(time_r)
time_r.mean(axis=1).min(),time_r.mean(axis=1).max(), time_r.mean(axis=1).mean(), time_r.sum(axis=1).mean(), np.mean(time_b), np.mean(time_b2)

  0%|          | 0/87 [00:00<?, ?it/s]

  1%|          | 1/87 [00:34<49:09, 34.30s/it]

  2%|▏         | 2/87 [01:46<1:20:21, 56.72s/it]

  3%|▎         | 3/87 [02:55<1:27:16, 62.34s/it]

  5%|▍         | 4/87 [03:44<1:18:53, 57.03s/it]

  6%|▌         | 5/87 [04:45<1:19:37, 58.26s/it]

  7%|▋         | 6/87 [05:20<1:08:03, 50.42s/it]

  8%|▊         | 7/87 [06:17<1:10:13, 52.67s/it]

  9%|▉         | 8/87 [08:25<1:40:45, 76.52s/it]

 10%|█         | 9/87 [09:05<1:24:35, 65.07s/it]

 11%|█▏        | 10/87 [10:35<1:33:26, 72.81s/it]

 13%|█▎        | 11/87 [11:51<1:33:27, 73.79s/it]

 14%|█▍        | 12/87 [13:32<1:42:37, 82.10s/it]

 15%|█▍        | 13/87 [14:28<1:31:39, 74.31s/it]

 16%|█▌        | 14/87 [15:27<1:24:51, 69.75s/it]

 17%|█▋        | 15/87 [16:48<1:27:40, 73.06s/it]

 18%|█▊        | 16/87 [17:31<1:15:51, 64.10s/it]

 20%|█▉        | 17/87 [19:08<1:26:03, 73.76s/it]

 21%|██        | 18/87 [19:55<1:15:46, 65.89s/it]

 22%|██▏       | 19/87 [22:40<1:48:24, 95.65s/it]

 23%|██▎       | 20/87 [25:13<2:06:00, 112.84s/it]

 24%|██▍       | 21/87 [25:59<1:42:02, 92.77s/it] 

 25%|██▌       | 22/87 [27:33<1:40:50, 93.08s/it]

 26%|██▋       | 23/87 [29:05<1:38:54, 92.72s/it]

 28%|██▊       | 24/87 [30:07<1:27:48, 83.62s/it]

 29%|██▊       | 25/87 [31:36<1:27:52, 85.04s/it]

 30%|██▉       | 26/87 [32:26<1:15:57, 74.72s/it]

 31%|███       | 27/87 [33:32<1:11:59, 71.99s/it]

 32%|███▏      | 28/87 [34:03<58:44, 59.73s/it]  

 33%|███▎      | 29/87 [36:05<1:15:51, 78.47s/it]

 34%|███▍      | 30/87 [37:47<1:21:06, 85.38s/it]

 36%|███▌      | 31/87 [38:31<1:08:14, 73.11s/it]

 37%|███▋      | 32/87 [41:33<1:36:56, 105.75s/it]

 38%|███▊      | 33/87 [42:48<1:26:44, 96.37s/it] 

 39%|███▉      | 34/87 [43:37<1:12:42, 82.30s/it]

 40%|████      | 35/87 [45:11<1:14:23, 85.83s/it]

 41%|████▏     | 36/87 [46:57<1:17:59, 91.76s/it]

 43%|████▎     | 37/87 [47:46<1:05:52, 79.05s/it]

 44%|████▎     | 38/87 [49:15<1:06:57, 81.99s/it]

 45%|████▍     | 39/87 [54:52<2:06:44, 158.42s/it]

 46%|████▌     | 40/87 [55:48<1:40:02, 127.71s/it]

 47%|████▋     | 41/87 [56:46<1:22:02, 107.01s/it]

 48%|████▊     | 42/87 [57:08<1:01:00, 81.35s/it] 

 49%|████▉     | 43/87 [59:43<1:15:54, 103.52s/it]

 51%|█████     | 44/87 [1:00:35<1:03:09, 88.12s/it]

 52%|█████▏    | 45/87 [1:02:11<1:03:13, 90.32s/it]

 53%|█████▎    | 46/87 [1:02:50<51:15, 75.01s/it]  

 54%|█████▍    | 47/87 [1:03:15<40:01, 60.03s/it]

 55%|█████▌    | 48/87 [1:05:16<50:57, 78.39s/it]

 56%|█████▋    | 49/87 [1:06:33<49:17, 77.83s/it]

 57%|█████▋    | 50/87 [1:07:04<39:26, 63.95s/it]

 59%|█████▊    | 51/87 [1:07:53<35:40, 59.47s/it]

 60%|█████▉    | 52/87 [1:09:00<35:59, 61.70s/it]

 61%|██████    | 53/87 [1:10:06<35:38, 62.90s/it]

 62%|██████▏   | 54/87 [1:11:13<35:19, 64.21s/it]

 63%|██████▎   | 55/87 [1:13:59<50:28, 94.64s/it]

 64%|██████▍   | 56/87 [1:14:53<42:35, 82.43s/it]

 66%|██████▌   | 57/87 [1:17:03<48:24, 96.83s/it]

 67%|██████▋   | 58/87 [1:18:10<42:22, 87.67s/it]

 68%|██████▊   | 59/87 [1:19:11<37:17, 79.89s/it]

 69%|██████▉   | 60/87 [1:20:42<37:26, 83.19s/it]

 70%|███████   | 61/87 [1:22:14<37:12, 85.88s/it]

 71%|███████▏  | 62/87 [1:23:52<37:10, 89.24s/it]

 72%|███████▏  | 63/87 [1:28:25<57:45, 144.38s/it]

 74%|███████▎  | 64/87 [1:29:41<47:32, 124.02s/it]

 75%|███████▍  | 65/87 [1:31:02<40:40, 110.95s/it]

 76%|███████▌  | 66/87 [1:31:55<32:45, 93.59s/it] 

 77%|███████▋  | 67/87 [1:32:27<25:07, 75.38s/it]

 78%|███████▊  | 68/87 [1:33:14<21:08, 66.74s/it]

 79%|███████▉  | 69/87 [1:35:11<24:34, 81.90s/it]

 80%|████████  | 70/87 [1:37:47<29:27, 104.00s/it]

 82%|████████▏ | 71/87 [1:40:19<31:36, 118.56s/it]

 83%|████████▎ | 72/87 [1:42:11<29:06, 116.43s/it]

 84%|████████▍ | 73/87 [1:44:39<29:22, 125.92s/it]

 85%|████████▌ | 74/87 [1:46:32<26:28, 122.18s/it]

 86%|████████▌ | 75/87 [1:47:35<20:52, 104.36s/it]

 87%|████████▋ | 76/87 [1:51:26<26:03, 142.15s/it]

 89%|████████▊ | 77/87 [1:53:55<24:02, 144.25s/it]

 90%|████████▉ | 78/87 [1:54:42<17:16, 115.21s/it]

 91%|█████████ | 79/87 [1:55:56<13:41, 102.72s/it]

 92%|█████████▏| 80/87 [1:58:57<14:43, 126.24s/it]

 93%|█████████▎| 81/87 [2:00:36<11:48, 118.00s/it]

 94%|█████████▍| 82/87 [2:01:25<08:06, 97.31s/it] 

 95%|█████████▌| 83/87 [2:02:03<05:18, 79.61s/it]

 97%|█████████▋| 84/87 [2:03:35<04:10, 83.49s/it]

 98%|█████████▊| 85/87 [2:14:14<08:20, 250.09s/it]

 99%|█████████▉| 86/87 [2:14:44<03:03, 183.90s/it]

100%|██████████| 87/87 [2:16:42<00:00, 94.29s/it] 


(0.0022199236113449624,
 0.004733987238215304,
 0.0024909273955020035,
 0.2167106834086743,
 69.36918258941037,
 13.276981910069784)

In [None]:
# Show the attention timings. Only the last expression of a cell is displayed
# automatically, so the intermediate summary values are printed explicitly
# instead of being computed and discarded.
print(time_r)
setup_means = time_r.mean(axis=1)  # mean time per attention setup
print(setup_means.min())
print(setup_means.max())
print(time_r.sum(axis=1).mean())
print(time_b)
print(np.mean(time_b))
print(time_b2)
np.mean(time_b2)

[[0.00477672 0.00272202 0.00271893 ... 0.00717759 0.00230956 0.00377965]
 [0.00400567 0.00288415 0.00224257 ... 0.0054462  0.00282812 0.00362611]
 [0.00412488 0.00549674 0.00188589 ... 0.00608397 0.00326872 0.00366354]
 ...
 [0.00547028 0.00266099 0.00194359 ... 0.00308752 0.00261092 0.00298095]
 [0.00385761 0.00199795 0.00187397 ... 0.00317287 0.00279474 0.00322962]
 [0.00396538 0.00175023 0.00183535 ... 0.00302577 0.00255895 0.00371671]]
0.0022199236113449624
[21.976471662521362, 52.21499967575073, 54.82641816139221, 30.416115522384644, 42.225149393081665, 21.382168292999268, 38.70888423919678, 94.36554145812988, 28.469467401504517, 73.09067678451538, 60.14785885810852, 82.35438513755798, 42.950785398483276, 44.97806429862976, 57.243204832077026, 27.28883457183838, 69.03838610649109, 39.18368172645569, 123.33758211135864, 128.04475951194763, 33.17163324356079, 66.68647122383118, 66.02033567428589, 42.61258316040039, 56.70116925239563, 36.162602186203, 44.849853515625, 18.551288366317

13.276981910069784

In [None]:
# Read `metrics` back from the "<file_name> (A).pickle" file.
# NOTE: pickle.load executes arbitrary code; only load files produced by this notebook.
# The detailed report call is kept disabled:
#print_results(file_name+' (A)', conf, metrics, label_names)

metrics_a_path = file_name + ' (A).pickle'
with open(metrics_a_path, 'rb') as pickle_file:
    metrics = pickle.load(pickle_file)

In [None]:
# Read `metricsP` back from the "<file_name> (P).pickle" file.
# NOTE: pickle.load executes arbitrary code; only load files produced by this notebook.
# The detailed report call is kept disabled:
#print_results(file_name+' (P)', conf, metricsP, label_names)

metrics_p_path = file_name + ' (P).pickle'
with open(metrics_p_path, 'rb') as pickle_file:
    metricsP = pickle.load(pickle_file)

We calculate the best attention setup using Optimus variations (we do not use the Optimus implementation at this step).

In [None]:
# Summarise the reloaded (A) metrics with helper.print_results_ap. The recorded output
# reports Baseline, Max Across, Per Label Per Instance and Per Instance scores with their NZW.
print_results_ap(metrics, label_names, conf)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Baseline: -0.007889153683207972  and NZW: 1.0
Max Across: 0.004105331586395504  and NZW: 1.0
Per Label Per Instance: 0.09744070213219137  and NZW:  1.0
Per Instance: 0.038024286580071495  and NZW:  1.0


In [None]:
# Same summary as for `metrics`, applied to the reloaded (P) metrics (`metricsP`).
print_results_ap(metricsP, label_names, conf)

Baseline: 0.43920283264613524  and NZW: 1.0
Max Across: 0.4532741760125956  and NZW: 1.0
Per Label Per Instance: 0.5614195545491769  and NZW:  1.0


  return _methods._mean(a, axis=axis, dtype=dtype,


Per Instance: 0.5142787023781719  and NZW:  1.0


We repeat the process with raw attention scores, which may take negative values (A*), i.e. the scores obtained by skipping the Softmax function. In the attention setups, we exclude the multiplication option for heads and layers, as a few combinations reach +/-inf.

In [None]:
# Enumerate every attention setup as a 4-item list [ci, ce, cp, cl]:
#   ci, ce - 'Mean' or a fixed index 0..11
#            (presumably the head / layer selectors — TODO confirm the order in myExplainers)
#   cp     - Matrix: From, To, MeanColumns, MeanRows, MaxColumns, MaxRows (only four are explored)
#   cl     - Selection: True: select layers per head, False: do not (only False is explored)
index_choices = ['Mean'] + list(range(12))
matrix_choices = ['From', 'To', 'MeanColumns', 'MaxColumns']
selection_choices = [False]

conf = [
    [ci, ce, cp, cl]
    for ci in index_choices
    for ce in index_choices
    for cp in matrix_choices
    for cl in selection_choices
]
len(conf)

676

In [None]:
import time
import math

# Model geometry used when rebuilding the attention scores by hand.
# These were unnamed literals (12, 12, 64) in the original cell.
NUM_LAYERS = 12
NUM_HEADS = 12
HEAD_DIM = 64


def _evaluate_interpretations(evaluators, results, timings, interpretations,
                              instance, prediction, tokens, predict_aux):
    """Score all interpretations of one instance with every metric in `results`.

    Args:
        evaluators: mapping metric name -> callable (`evaluation` or `evaluationP`).
        results: dict of lists (`metrics` or `metricsP`); one list of scores per metric
            is appended in place.
        timings: list that receives the total wall-clock seconds spent on the 'FTP' metric.
        interpretations: one scaled interpretation per configuration in `conf`.
        instance, prediction, tokens: forwarded unchanged to each evaluator.
        predict_aux: second value returned by `model.my_predict`; forwarded in the same
            argument slots the original cell filled with `_`.
    """
    for metric in results.keys():
        evaluated = []
        elapsed = 0
        for interpretation in interpretations:
            started = time.time()
            evaluated.append(evaluators[metric](interpretation, predict_aux, instance, prediction,
                                                tokens, predict_aux, predict_aux, predict_aux))
            elapsed = elapsed + (time.time() - started)
        # Only the 'FTP' metric's evaluation time is recorded.
        if metric == 'FTP':
            timings.append(elapsed)
        results[metric].append(evaluated)


with warnings.catch_warnings():

    warnings.simplefilter("ignore", category=RuntimeWarning)

    now = datetime.datetime.now()

    # Path stem for the result pickles; carries the current day_month_year.
    file_name = save_path + 'ETHOS_ALBERT_A_ATTENTION_NO_SOFTMAX_'+str(now.day) + '_' + str(now.month) + '_' + str(now.year)

    metrics = {'FTP':[], 'F':[], 'NZW':[]}
    metricsP = {'FTP':[], 'F':[], 'NZW':[]}

    # time_r[c][i]: seconds needed to build the interpretation of instance i with conf[c].
    time_r = [[] for _cfg in conf]
    # Per-instance total seconds spent on the 'FTP' metric with evaluation / evaluationP.
    time_b = []
    time_b2 = []

    for ind in tqdm(range(0,len(test_texts))):
        torch.cuda.empty_cache()

        instance = test_texts[ind]

        my_evaluators.clear_states()
        my_evaluatorsP.clear_states()

        my_explainers.save_states = {}

        prediction, predict_aux, hidden_states = model.my_predict(instance)

        # The instance is tokenized as a two-item batch and the first encoding is kept.
        enc = model.tokenizer([instance,instance], truncation=True, padding=True)[0]

        mask = enc.attention_mask

        tokens = enc.tokens

        # The same attention module (group 0, layer 0) is used for every layer below —
        # presumably because ALBERT shares its layer weights — so look it up once.
        shared_attention = model.trainer.model.albert.encoder.albert_layer_groups[0].albert_layers[0].attention

        attention = []
        # The scores are only exported to NumPy, so no autograd graph is needed.
        with torch.no_grad():
            for la in range(NUM_LAYERS):
                # Convert / move the layer input once and reuse it for both projections.
                layer_input = torch.tensor(hidden_states[la]).to('cuda')
                keys = shared_attention.key(layer_input)
                queries = shared_attention.query(layer_input)

                layer_scores = []
                for he in range(NUM_HEADS):
                    lo, hi = he * HEAD_DIM, (he + 1) * HEAD_DIM
                    # Scaled dot-product scores of one head, WITHOUT the softmax.
                    # NOTE(review): the [:, lo:hi] slice assumes 2-D (tokens, hidden)
                    # projections — confirm the shape returned by my_predict.
                    attention_scores = torch.matmul(queries[:, lo:hi], keys[:, lo:hi].transpose(-1, -2))
                    attention_scores = attention_scores / math.sqrt(HEAD_DIM)
                    layer_scores.append(attention_scores.cpu().detach().numpy())
                attention.append(layer_scores)
        attention = np.array(attention)

        # One interpretation per configuration, each scaled with maxabs_scale.
        interpretations = []
        for kk, con in enumerate(conf):
            ts = time.time()
            my_explainers.config = con
            temp = my_explainers.my_attention(instance, prediction, tokens, mask, attention, predict_aux)
            interpretations.append([maxabs_scale(i) for i in temp])
            time_r[kk].append(time.time()-ts)

        _evaluate_interpretations(evaluation, metrics, time_b, interpretations,
                                  instance, prediction, tokens, predict_aux)
        # Hand the state saved by the first evaluator to the second before it runs.
        my_evaluatorsP.saved_state = my_evaluators.saved_state.copy()
        _evaluate_interpretations(evaluationP, metricsP, time_b2, interpretations,
                                  instance, prediction, tokens, predict_aux)

        # Rewritten after every instance — presumably so partial results survive an
        # interrupted session.
        with open(file_name+' (A).pickle', 'wb') as handle:
            pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(file_name+' (P).pickle', 'wb') as handle:
            pickle.dump(metricsP, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(file_name+'_TIME.pickle', 'wb') as handle:
            pickle.dump(time_r, handle, protocol=pickle.HIGHEST_PROTOCOL)
time_r = np.array(time_r)
time_r.mean(axis=1).min(),time_r.mean(axis=1).max(), time_r.mean(axis=1).mean(), time_r.sum(axis=1).mean(), np.mean(time_b), np.mean(time_b2)

  0%|          | 0/87 [00:00<?, ?it/s]

  1%|          | 1/87 [00:48<1:09:52, 48.75s/it]

  2%|▏         | 2/87 [02:00<1:28:35, 62.53s/it]

  3%|▎         | 3/87 [03:05<1:28:55, 63.52s/it]

  5%|▍         | 4/87 [03:48<1:16:36, 55.38s/it]

  6%|▌         | 5/87 [04:48<1:17:53, 57.00s/it]

  7%|▋         | 6/87 [05:20<1:05:42, 48.67s/it]

  8%|▊         | 7/87 [06:12<1:06:15, 49.69s/it]

  9%|▉         | 8/87 [08:10<1:33:51, 71.28s/it]

 10%|█         | 9/87 [08:46<1:18:19, 60.25s/it]

 11%|█▏        | 10/87 [10:08<1:26:03, 67.06s/it]

 13%|█▎        | 11/87 [11:19<1:26:16, 68.11s/it]

 14%|█▍        | 12/87 [12:52<1:34:43, 75.79s/it]

 15%|█▍        | 13/87 [13:43<1:24:14, 68.31s/it]

 16%|█▌        | 14/87 [14:37<1:17:55, 64.05s/it]

 17%|█▋        | 15/87 [15:51<1:20:21, 66.97s/it]

 18%|█▊        | 16/87 [16:31<1:09:38, 58.85s/it]

 20%|█▉        | 17/87 [18:00<1:19:09, 67.85s/it]

 21%|██        | 18/87 [18:43<1:09:42, 60.62s/it]

 22%|██▏       | 19/87 [21:15<1:39:30, 87.80s/it]

 23%|██▎       | 20/87 [23:37<1:56:19, 104.17s/it]

 24%|██▍       | 21/87 [24:20<1:34:16, 85.71s/it] 

 25%|██▌       | 22/87 [25:45<1:32:54, 85.76s/it]

 26%|██▋       | 23/87 [27:11<1:31:32, 85.83s/it]

 28%|██▊       | 24/87 [28:09<1:21:21, 77.49s/it]

 29%|██▊       | 25/87 [29:32<1:21:34, 78.95s/it]

 30%|██▉       | 26/87 [30:18<1:10:10, 69.02s/it]

 31%|███       | 27/87 [31:18<1:06:27, 66.46s/it]

 32%|███▏      | 28/87 [31:46<54:00, 54.92s/it]  

 33%|███▎      | 29/87 [33:37<1:09:23, 71.79s/it]

 34%|███▍      | 30/87 [35:10<1:14:09, 78.05s/it]

 36%|███▌      | 31/87 [35:51<1:02:26, 66.90s/it]

 37%|███▋      | 32/87 [38:41<1:29:37, 97.77s/it]

 38%|███▊      | 33/87 [39:49<1:20:08, 89.04s/it]

 39%|███▉      | 34/87 [40:33<1:06:41, 75.51s/it]

 40%|████      | 35/87 [42:00<1:08:22, 78.90s/it]

 41%|████▏     | 36/87 [43:38<1:12:02, 84.75s/it]

 43%|████▎     | 37/87 [44:23<1:00:27, 72.55s/it]

 44%|████▎     | 38/87 [45:45<1:01:42, 75.57s/it]

 45%|████▍     | 39/87 [50:59<1:57:35, 146.99s/it]

 46%|████▌     | 40/87 [51:50<1:32:38, 118.27s/it]

 47%|████▋     | 41/87 [52:44<1:15:50, 98.93s/it] 

 48%|████▊     | 42/87 [53:05<56:48, 75.74s/it]  

 49%|████▉     | 43/87 [55:29<1:10:20, 95.93s/it]

 51%|█████     | 44/87 [56:17<58:33, 81.72s/it]  

 52%|█████▏    | 45/87 [57:45<58:26, 83.48s/it]

 53%|█████▎    | 46/87 [58:20<47:08, 68.99s/it]

 54%|█████▍    | 47/87 [58:43<36:54, 55.36s/it]

 55%|█████▌    | 48/87 [1:00:33<46:35, 71.67s/it]

 56%|█████▋    | 49/87 [1:01:42<44:56, 70.97s/it]

 57%|█████▋    | 50/87 [1:02:12<36:02, 58.45s/it]

 59%|█████▊    | 51/87 [1:02:57<32:41, 54.48s/it]

 60%|█████▉    | 52/87 [1:03:58<32:54, 56.41s/it]

 61%|██████    | 53/87 [1:04:59<32:50, 57.97s/it]

 62%|██████▏   | 54/87 [1:06:02<32:37, 59.32s/it]

 63%|██████▎   | 55/87 [1:08:36<46:51, 87.85s/it]

 64%|██████▍   | 56/87 [1:09:27<39:35, 76.62s/it]

 66%|██████▌   | 57/87 [1:11:26<44:43, 89.46s/it]

 67%|██████▋   | 58/87 [1:12:27<39:00, 80.72s/it]

 68%|██████▊   | 59/87 [1:13:22<34:06, 73.10s/it]

 69%|██████▉   | 60/87 [1:14:48<34:38, 76.96s/it]

 70%|███████   | 61/87 [1:16:13<34:24, 79.41s/it]

 71%|███████▏  | 62/87 [1:17:43<34:25, 82.60s/it]

 72%|███████▏  | 63/87 [1:21:55<53:20, 133.35s/it]

 74%|███████▎  | 64/87 [1:23:04<43:42, 114.01s/it]

 75%|███████▍  | 65/87 [1:24:20<37:40, 102.74s/it]

 76%|███████▌  | 66/87 [1:25:06<30:02, 85.82s/it] 

 77%|███████▋  | 67/87 [1:25:36<23:00, 69.04s/it]

 78%|███████▊  | 68/87 [1:26:19<19:21, 61.12s/it]

 79%|███████▉  | 69/87 [1:28:06<22:28, 74.91s/it]

 80%|████████  | 70/87 [1:30:29<26:59, 95.27s/it]

 82%|████████▏ | 71/87 [1:32:51<29:11, 109.45s/it]

 83%|████████▎ | 72/87 [1:34:33<26:47, 107.14s/it]

 84%|████████▍ | 73/87 [1:36:48<26:55, 115.43s/it]

 85%|████████▌ | 74/87 [1:38:30<24:10, 111.57s/it]

 86%|████████▌ | 75/87 [1:39:29<19:06, 95.53s/it] 

 87%|████████▋ | 76/87 [1:43:04<24:07, 131.64s/it]

 89%|████████▊ | 77/87 [1:45:22<22:14, 133.46s/it]

 90%|████████▉ | 78/87 [1:46:05<15:57, 106.42s/it]

 91%|█████████ | 79/87 [1:47:14<12:41, 95.18s/it] 

 92%|█████████▏| 80/87 [1:50:01<13:36, 116.62s/it]

 93%|█████████▎| 81/87 [1:51:34<10:57, 109.51s/it]

 94%|█████████▍| 82/87 [1:52:15<07:24, 88.96s/it] 

 95%|█████████▌| 83/87 [1:52:52<04:53, 73.40s/it]

 97%|█████████▋| 84/87 [1:54:15<03:49, 76.34s/it]

 98%|█████████▊| 85/87 [2:04:20<07:49, 234.77s/it]

 99%|█████████▉| 86/87 [2:04:46<02:52, 172.12s/it]

100%|██████████| 87/87 [2:06:36<00:00, 87.32s/it] 


(0.0022355413984978334,
 0.003647165736932864,
 0.0024486161499358127,
 0.2130296050444157,
 64.65809125735842,
 12.344256792945423)

In [None]:
# Report the (A) metrics of the no-softmax run with helper.print_results. The recorded
# output lists, per metric, one row of scores for each configuration in `conf`.
# NOTE(review): the first argument is the result path stem plus ' (A)'; how the helper
# uses it is not visible here — confirm in helper.py.
print_results(file_name+' (A)', conf, metrics, label_names)

FTP
['Mean', 'Mean', 'From', False]  0.00719 | 0.01572 0.08674 0.03691 -0.04833 0.02949 -0.03454 -0.00022 -0.02828
['Mean', 'Mean', 'To', False]  -0.00693 | -0.02939 -0.13666 -0.0526 0.05448 -0.02045 0.02822 0.05098 0.04994
['Mean', 'Mean', 'MeanColumns', False]  0.00458 | 0.03278 0.01368 0.02271 -0.04995 0.02869 -0.03367 0.05192 -0.02956
['Mean', 'Mean', 'MaxColumns', False]  -0.00648 | 0.0175 -0.16733 -0.04935 0.05446 -0.02672 0.0259 0.03989 0.0538
['Mean', 0, 'From', False]  0.005 | 0.01711 0.01947 0.02062 -0.04324 0.02647 -0.02651 0.03409 -0.00798
['Mean', 0, 'To', False]  0.01165 | 0.0658 0.0916 0.04023 -0.04865 0.02691 -0.02257 -0.04436 -0.01578
['Mean', 0, 'MeanColumns', False]  -0.00024 | 0.02144 -0.02101 -0.00143 -0.04243 0.03401 -0.02774 0.05277 -0.01751
['Mean', 0, 'MaxColumns', False]  -0.00451 | 0.00564 -0.20564 -0.09494 0.07981 -0.0065 0.01434 0.08164 0.08957
['Mean', 1, 'From', False]  -0.00902 | -0.00687 -0.20152 -0.05419 0.00372 0.03168 -0.02019 0.10703 0.06819
['Mean'

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


['Mean', 5, 'From', False]  0.01001 | 0.02209 0.14608 0.04955 -0.05509 0.02821 -0.02532 -0.02966 -0.05575
['Mean', 5, 'To', False]  0.00511 | 0.01573 0.18817 0.04474 -0.08505 0.03775 -0.02334 -0.06511 -0.07197
['Mean', 5, 'MeanColumns', False]  0.008 | 0.01098 0.14455 0.05031 -0.05338 0.0264 -0.02686 -0.03087 -0.05712
['Mean', 5, 'MaxColumns', False]  0.0069 | 0.01038 0.12115 0.05393 -0.05619 0.0308 -0.02733 -0.01928 -0.05829
['Mean', 6, 'From', False]  0.00867 | 0.02496 0.11337 0.02875 -0.05571 0.03285 -0.0232 0.00213 -0.05375
['Mean', 6, 'To', False]  0.00577 | -0.01004 0.13947 0.0327 -0.05802 0.03673 -0.02552 -0.03873 -0.03043
['Mean', 6, 'MeanColumns', False]  0.00329 | 0.02861 0.0983 0.04133 -0.06072 0.01661 -0.03479 -0.01898 -0.04406
['Mean', 6, 'MaxColumns', False]  -0.0078 | 0.00051 -0.16147 -0.05425 0.05075 -0.02956 0.03139 0.0326 0.06766
['Mean', 7, 'From', False]  -0.00117 | 0.01702 -0.14516 -0.02953 0.01115 0.0126 -0.00973 0.07347 0.06084
['Mean', 7, 'To', False]  -0.00941 

In [None]:
# Report the (P) metrics (`metricsP`) of the no-softmax run with helper.print_results;
# the recorded output has one row of scores per configuration in `conf`.
# NOTE(review): how the helper uses the ' (P)' path stem is not visible here — confirm in helper.py.
print_results(file_name+' (P)', conf, metricsP, label_names)

FTP
['Mean', 'Mean', 'From', False]  -0.06011 | -0.14725 -0.01824 0.02889 -0.24263 0.04776 -0.1065 0.08112 -0.12402
['Mean', 'Mean', 'To', False]  0.20059 | 0.15171 0.09674 0.16385 0.31593 0.09876 0.19375 0.14118 0.4428
['Mean', 'Mean', 'MeanColumns', False]  0.01232 | -0.07162 -0.08028 0.11419 -0.24047 0.13628 -0.07574 0.34078 -0.02456
['Mean', 'Mean', 'MaxColumns', False]  0.17427 | 0.2345 0.02198 0.07637 0.33407 0.03856 0.17655 0.09856 0.41361
['Mean', 0, 'From', False]  0.03038 | -0.09244 -0.06222 0.10395 -0.19614 0.12971 0.01648 0.24261 0.1011
['Mean', 0, 'To', False]  -0.10258 | -0.01219 -0.02497 -0.03921 -0.31949 -0.01801 -0.15441 -0.13765 -0.1147
['Mean', 0, 'MeanColumns', False]  0.04295 | -0.0434 -0.06175 0.06688 -0.21972 0.16226 0.01572 0.34732 0.07626
['Mean', 0, 'MaxColumns', False]  0.33188 | 0.19928 0.04698 0.1581 0.50671 0.29469 0.303 0.34855 0.79774
['Mean', 1, 'From', False]  0.33287 | 0.07188 -0.04435 0.41429 0.12671 0.40411 0.28607 0.58483 0.81942
['Mean', 1, 'To', 

We calculate the best attention setup using Optimus variations.

In [None]:
# Summarise the (A) metrics of the no-softmax run with helper.print_results_ap. The recorded
# output reports Baseline, Max Across, Per Label Per Instance and Per Instance scores with their NZW.
print_results_ap(metrics, label_names, conf)

Baseline: 0.007186108700844222  and NZW: 1.0
Max Across: 0.017651358768390416  and NZW: 1.0
Per Label Per Instance: 0.24208423274290375  and NZW:  1.0
Per Instance: 0.09411942202397976  and NZW:  1.0


In [None]:
# Same summary for the (P) metrics (`metricsP`) of the no-softmax run.
print_results_ap(metricsP, label_names, conf)

Baseline: -0.06010905810267556  and NZW: 1.0
Max Across: 0.3990316517689547  and NZW: 1.0
Per Label Per Instance: 0.60628275265561  and NZW:  1.0


  return _methods._mean(a, axis=axis, dtype=dtype,


Per Instance: 0.537123295228136  and NZW:  1.0
