# Utils for attention annotation

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

## Load model

Change tokenizer and encoder to the desired models, e.g., by loading a checkpoint.

In [2]:
# Checkpoint to load and the device the encoder runs on.
model_name = "roberta-base"
device = "cpu"

# The tokenizer and the encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder = AutoModel.from_pretrained(model_name)
encoder = encoder.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Feed a text through model

- Tokenize the text
- Feed through encoder
- Get the attention scores from the first token (`<s>`) to every token, taken from the last layer and averaged over all heads

In [3]:
sentence = """This is an example sentence, which you will replace later."""

# Tokenize and move the resulting tensors to the same device as the encoder
inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")
inputs = inputs.to(device)
# String form of the token ids of the (only) sequence in the batch
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Encode. This is inference only, so autograd is disabled to avoid
# building (and keeping alive) a computation graph for the forward pass.
with torch.no_grad():
    outputs = encoder(**inputs, output_attentions=True)

# Encoding of text = encoding of the first token
encoding = outputs[0][:, 0, :]

# Get the attention scores for the first token:
#  - attentions[-1]: attention weights of the last layer
#  - mean(dim=1):    average over dim 1 (the heads axis, per the
#                    transformers model-output docs)
#  - [0][0]:         first sequence in the batch, first token as the query
attns = outputs.attentions[-1].mean(dim=1)[0][0].detach().cpu().numpy()

pd.Series(attns, index=tokens)

<s>          0.323200
This         0.015502
Ġis          0.019581
Ġan          0.017148
Ġexample     0.018047
Ġsentence    0.034983
,            0.025972
Ġwhich       0.026652
Ġyou         0.030326
Ġwill        0.025305
Ġreplace     0.023227
Ġlater       0.021675
.            0.310481
</s>         0.107901
dtype: float32

## Reconstruct from list of tokens

Since a word may be split into several sub-word tokens, we merge the tokens back into words and average the attention scores of each word's sub-word tokens.

In [4]:
def token_strip(token, marker="Ġ", marker_starts_word=True):
    """Strip the tokenizer's word-boundary marker from a token.

    The defaults match RoBERTa, where a token that starts with "Ġ" is the
    beginning of a new word. For tokenizers that instead mark *continuation*
    pieces (e.g. BERT WordPiece, which prefixes them with "##"), pass
    ``marker="##", marker_starts_word=False`` -- for instance via
    ``functools.partial`` when handing this function to a caller that
    invokes it with the token only.

    Args:
        token: A single token string as produced by the tokenizer.
        marker: Prefix that the tokenizer uses as its word-boundary marker.
        marker_starts_word: If True, a token carrying the marker starts a
            new word. If False, a token carrying the marker continues the
            previous word, and a token without it starts a new word.

    Returns:
        A tuple ``(stripped_token, is_new_token)``:
          - ``stripped_token``: the token with a leading marker removed
            (unchanged if the token does not start with the marker)
          - ``is_new_token``: whether the token is the start of a new word
    """
    has_marker = token.startswith(marker)
    stripped_token = token[len(marker):] if has_marker else token
    # The marker either flags a word start or a continuation piece.
    is_new_token = has_marker if marker_starts_word else not has_marker
    return stripped_token, is_new_token

In [5]:
def tokens_to_string_with_attn(tokens, scores,
                               strip_left_special=True,
                               strip_right_special=True,
                               strip_fn=token_strip):
    """Merge sub-word tokens into words and average their attention scores.

    The first and last positions are treated as the special BOS / EOS tokens
    (``<s>`` and ``</s>`` for RoBERTa; other models use different tokens but
    the same positions are assumed).

    Args:
        tokens: Sequence of token strings, including the special tokens.
        scores: Sequence of per-token scores, aligned with ``tokens``.
        strip_left_special: If True, drop the first (BOS) token. If False,
            keep it as its own entry at the start of the result.
        strip_right_special: If True, drop the last (EOS) token. If False,
            keep it as its own entry at the end of the result.
        strip_fn: Callable mapping a token to
            ``(stripped_token, is_new_token)``.

    Returns:
        A tuple ``(token_list, score_list)`` of equal length: the
        reconstructed words and, for each word, the mean of the scores of
        its sub-word tokens.
    """
    # Always set the special tokens aside before merging. Kept special tokens
    # are emitted as stand-alone entries; feeding them through the merge loop
    # would glue them onto the neighbouring word (e.g. "<s>This") and mix
    # their attention into that word's average.
    leading, trailing = [], []
    if len(tokens) > 0 and len(scores) > 0:
        if not strip_left_special:
            leading.append((tokens[0], scores[0]))
        tokens, scores = tokens[1:], scores[1:]
    if len(tokens) > 0 and len(scores) > 0:
        if not strip_right_special:
            trailing.append((tokens[-1], scores[-1]))
        tokens, scores = tokens[:-1], scores[:-1]

    token_list = []
    score_list = []

    def emit(pieces, piece_scores):
        # One output entry: concatenated pieces and the average of the
        # attention scores of all sub-word tokens of the word.
        token_list.append("".join(pieces))
        score_list.append(np.mean(piece_scores))

    for special, score in leading:
        emit([special], [score])

    pieces, piece_scores = [], []
    for token, score in zip(tokens, scores):
        piece, is_new_token = strip_fn(token)
        # A new word closes the one being accumulated (if there is one).
        if is_new_token and piece_scores:
            emit(pieces, piece_scores)
            pieces, piece_scores = [], []
        pieces.append(piece)
        piece_scores.append(score)
    if piece_scores:
        emit(pieces, piece_scores)

    for special, score in trailing:
        emit([special], [score])

    return token_list, score_list

In [6]:
# Rebuild whole words from the sub-word tokens of the example sentence,
# dropping the leading and trailing special tokens.
token_list, score_list = tokens_to_string_with_attn(
    tokens=tokens,
    scores=attns,
    strip_left_special=True,
    strip_right_special=True,
)
pd.Series(score_list, index=token_list)

This         0.015502
is           0.019581
an           0.017148
example      0.018047
sentence,    0.030478
which        0.026652
you          0.030326
will         0.025305
replace      0.023227
later.       0.166078
dtype: float32