In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q transformers datasets torch

In [73]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

In [83]:
# Earlier SQuAD experiment, left disabled. NOTE(review): `example` is only
# assigned here, in commented-out code, yet a later cell still reads it.
# dataset = load_dataset("squad")

# example = dataset["train"][0]
# example

from datasets import load_dataset
# Load only the test split of the "en-US" configuration of AmazonScience/massive.
dataset = load_dataset("AmazonScience/massive", "en-US", split='test')

Downloading data: 100%|██████████| 676k/676k [00:00<00:00, 979kB/s]
Downloading data: 100%|██████████| 140k/140k [00:00<00:00, 509kB/s]
Downloading data: 100%|██████████| 191k/191k [00:00<00:00, 357kB/s]
Generating train split: 100%|██████████| 11514/11514 [00:00<00:00, 383657.06 examples/s]
Generating validation split: 100%|██████████| 2033/2033 [00:00<00:00, 227375.07 examples/s]
Generating test split: 100%|██████████| 2974/2974 [00:00<00:00, 203492.07 examples/s]


In [76]:
# Earlier question-answering setup, left disabled. NOTE(review): the visible
# import cell does not import AutoModelForQuestionAnswering, so these lines
# would not run as-is if re-enabled.
# model_name = "deepset/roberta-base-squad2"

# model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# Checkpoint identifier shared by the tokenizer and the classification model.
name = "qanastek/XLMRoberta-Alexa-Intents-Classification"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

In [None]:
# Display the id -> label mapping stored on the loaded model's config.
model.config.id2label

In [5]:
# NOTE(review): `example` is only assigned in commented-out code in the
# dataset cell, so this cell raises NameError on a fresh kernel. The name
# `inputs` is also reassigned later by the attribution cell.
inputs = tokenizer(example['question'], example['context'], return_tensors='pt')

In [6]:
import requests

url = "http://localhost:4000"

# myobj = {
#     'name': self.name,
#     'model': self.model.config.model_type,
#     'text': e.text,
#     'tokens': e.tokens,
#     'attributions': scores
# }
    

def send_body(body, timeout=10):
    """POST ``body`` as JSON to the ``/runs`` endpoint under the notebook-level ``url``.

    :param body: JSON-serialisable payload to send.
    :param timeout: seconds to wait before giving up, forwarded to
        ``requests.post``. Without a timeout the call can block the kernel
        indefinitely if the server does not answer.
    :return: the response object returned by ``requests.post``, so the
        caller can inspect the status; no exception is raised here for
        non-2xx statuses.
    """
    # posting it to the xai-platform
    endpoint = f"{url}/runs"
    return requests.post(endpoint, json=body, timeout=timeout)

In [8]:
# NOTE(review): `model` is loaded with AutoModelForSequenceClassification in
# the model cell; confirm its output actually exposes start_logits /
# end_logits. This cell looks like a leftover of the disabled QA setup.
with torch.no_grad():
    outputs = model(**inputs)

# Positions with the highest start / end scores.
start_idx = outputs.start_logits.argmax()
end_idx = outputs.end_logits.argmax()

# Token ids of the first sequence between the two positions (end inclusive).
r = inputs['input_ids'][0][start_idx:end_idx + 1]
predicted_answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(r)
)

In [10]:
# NOTE(review): the saved output under this cell starts with "Answer 1:",
# which this f-string cannot produce; the stored output looks stale.
print(f"Answer: {predicted_answer}")

Answer 1:  Saint Bernadette Soubirous


In [12]:
def predict(inputs, token_type_ids=None, position_ids=None, attention_mask=None):
    """Call the notebook-level ``model`` and return its start and end logits.

    :param inputs: passed to ``model`` as its first positional argument.
    :param token_type_ids: forwarded to ``model`` unchanged.
    :param position_ids: forwarded to ``model`` unchanged.
    :param attention_mask: forwarded to ``model`` unchanged.
    :return: tuple ``(start_logits, end_logits)`` read from the model output;
        the output object must therefore expose both attributes.
    """
    forwarded = {
        "token_type_ids": token_type_ids,
        "position_ids": position_ids,
        "attention_mask": attention_mask,
    }
    result = model(inputs, **forwarded)
    return result.start_logits, result.end_logits

In [13]:
def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None, position=0):
    """Return the per-row maximum of one of the two outputs of ``predict``.

    :param inputs: forwarded to ``predict`` as its first argument.
    :param token_type_ids: forwarded to ``predict`` unchanged.
    :param position_ids: forwarded to ``predict`` unchanged.
    :param attention_mask: forwarded to ``predict`` unchanged.
    :param position: index into the tuple returned by ``predict``
        (0 selects its first element, 1 its second).
    :return: ``.values`` of ``max(1)`` applied to the selected element.
    """
    logits_pair = predict(
        inputs,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        attention_mask=attention_mask,
    )
    selected = logits_pair[position]
    row_max = selected.max(1)
    return row_max.values

In [14]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q captum

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [84]:
from transformers.tokenization_utils_base import BatchEncoding
from captum.attr import InputXGradient

class ModelHelper:
    """
    Wrapper class to interface with HuggingFace models.

    Bundles a model with its tokenizer and exposes tokenization plus access
    to the model's input-embedding layer.

    :param model: object exposing ``device`` and ``get_input_embeddings()``.
    :param tokenizer: callable used to tokenize text for ``model``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def _tokenize(self, text: str, **tok_kwargs) -> BatchEncoding:
        """Tokenize ``text`` with the stored tokenizer.

        ``return_tensors="pt"`` and ``truncation=True`` are defaults only:
        a caller may override either through ``tok_kwargs`` instead of
        hitting a duplicate-keyword ``TypeError``.

        :param text str: the string to tokenize.
        :param tok_kwargs: extra keyword arguments forwarded to the tokenizer.
        """
        options = {"return_tensors": "pt", "truncation": True, **tok_kwargs}
        return self.tokenizer(text, **options)

    def get_input_embeds(self, text: str, **tok_kwargs) -> torch.Tensor:
        """Extract input embeddings

        :param text str: the string to extract embeddings from.
        :param tok_kwargs: extra keyword arguments forwarded to ``_tokenize``.
        :return: embeddings of the first tokenized sequence, with a leading
            dimension of size 1 added back.
        """
        item = self._tokenize(text, **tok_kwargs)
        # Only the input ids are needed here, so only they are moved to the
        # model's device.
        input_ids = item["input_ids"].to(self.model.device)
        embeddings = self._get_input_embeds_from_ids(input_ids[0])
        return embeddings.unsqueeze(0)

    def _get_input_embeds_from_ids(self, ids) -> torch.Tensor:
        """Apply the model's input-embedding layer to ``ids``."""
        return self.model.get_input_embeddings()(ids)

In [85]:
# Inspect record 11 of the loaded split; its "utt" field is the text used in
# the attribution cell further down.
dataset[11]

{'id': '59',
 'locale': 'en-US',
 'partition': 'test',
 'scenario': 16,
 'intent': 52,
 'utt': 'cancel my seven am alarm',
 'annot_utt': 'cancel my [time : seven am] alarm',
 'worker_id': '70',
 'slot_method': {'slot': [], 'method': []},
 'judgments': {'worker_id': [],
  'intent_score': [],
  'slots_score': [],
  'grammar_score': [],
  'spelling_score': [],
  'language_identification': []}}

In [97]:
text = dataset[11]["utt"]
# Index of the output logit to attribute (see model.config.id2label).
# NOTE(review): hard-coded; confirm this is the intended class for this
# sample (the dataset record shows 'intent': 52).
target = 1
model_helper = ModelHelper(model, tokenizer)

# Tokenize with truncation, as ModelHelper._tokenize does, so the attention
# mask has the same length as the embeddings built below. Move it to the
# model's device so the forward pass does not mix devices.
item = tokenizer(text=text, return_tensors='pt', truncation=True).to(model.device)
input_len = item["attention_mask"].sum().item()

def func(input_embeds):
    """Run the model from input embeddings and return its logits."""
    outputs = model_helper.model(
        inputs_embeds=input_embeds, attention_mask=item["attention_mask"]
    )
    return outputs.logits

dl = InputXGradient(func)
inputs = model_helper.get_input_embeds(text)

attr = dl.attribute(inputs, target=target)

In [98]:
# Raw result of dl.attribute(...), before it is sliced and summed in the next cell.
attr

tensor([[[-2.3997e-04, -1.9094e-04,  2.2474e-04,  ...,  1.3069e-03,
           6.3588e-05, -2.2603e-04],
         [-7.6990e-04,  7.4859e-05, -2.5630e-04,  ...,  1.0900e-03,
          -1.0825e-02,  9.6698e-03],
         [-1.8734e-05,  2.8926e-05, -5.6548e-05,  ...,  1.2224e-04,
          -2.0386e-03,  2.3753e-03],
         ...,
         [-1.8494e-05, -2.5548e-04, -9.9333e-04,  ..., -6.2607e-04,
          -2.1831e-03, -4.7613e-03],
         [ 3.0155e-05,  1.2161e-03, -3.6249e-04,  ..., -5.1833e-03,
           1.8906e-03,  1.0435e-03],
         [-1.5737e-03, -2.9266e-04, -7.4562e-04,  ..., -1.0935e-03,
          -3.4146e-04,  3.5755e-03]]], grad_fn=<MulBackward0>)

In [99]:
# Take the first batch entry, keep the first `input_len` positions, then
# pool over hidden size to get one score per position.
# NOTE(review): this overwrites `attr`, so re-running this cell without
# re-running the attribution cell fails on the already-pooled array.
attr = attr[0, :input_len, :].detach().cpu().sum(-1).numpy()
attr

array([-0.00679844,  0.13200764, -0.01119853,  0.00851695, -0.00574999,
        0.03258043, -0.00890212], dtype=float32)

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the install log out of the saved notebook.
%pip install -q inseq

Looking in indexes: https://pypi.org/simple, https://pypi.org/simple
Collecting inseq
  Downloading inseq-0.5.0-py3-none-any.whl.metadata (18 kB)
Collecting jaxtyping<0.3.0,>=0.2.23 (from inseq)
  Downloading jaxtyping-0.2.25-py3-none-any.whl.metadata (6.4 kB)
Collecting poethepoet>=0.13.1 (from inseq)
  Downloading poethepoet-0.24.4-py3-none-any.whl.metadata (5.0 kB)
Collecting rich>=10.13.0 (from inseq)
  Downloading rich-13.7.0-py3-none-any.whl.metadata (18 kB)
Collecting torch==2.0.0 (from inseq)
  Downloading torch-2.0.0-cp39-none-macosx_10_9_x86_64.whl (139.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 MB[0m [31m633.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting typeguard<=2.13.3 (from inseq)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting pastel<0.3.0,>=0.2.1 (from poethepoet>=0.13.1->inseq)
  Downloading pastel-0.2.1-py2.py3-none-any.whl (6.0 kB)
Collecting tomli>=1.2.2 (from poethepoet>=0.13.1->inseq)
  Using 