In [1]:
question = "Which deep learning libraries back 🤗 Transformers?"

context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""


## Get start/end output tensors

In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

In [3]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)

torch.Size([1, 67]) torch.Size([1, 67])


## Mask output 

In [19]:
import torch

sequence_ids = inputs.sequence_ids()
# Mask everything apart from the tokens of the context
mask = [i != 1 for i in sequence_ids]
# Unmask the [CLS] token
mask[0] = False
#mask

mask = torch.tensor(mask)[None]  #End None gives it a column orientation i beleive.
mask

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False,  True]])

In [7]:
start_logits[mask]

tensor([-6.4454, -4.7115, -7.0968, -7.0726, -7.4981, -5.5397, -4.1368, -5.9199,
        -5.4193, -5.4193], grad_fn=<IndexBackward0>)

In [20]:
#mask them!!

start_logits[mask] = -10000
end_logits[mask] = -10000

#start_logits

## Softmaxes

In [10]:
start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)[0]
end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)[0]

## Scores Matrix

In [25]:
#colum vector matrix multiplication of start/end probabilities
scores = start_probabilities[:, None] * end_probabilities[None, :]
scores.shape

torch.Size([67, 67])

## Upper triangulate as lower half is duplicate of top half

In [26]:
#matrix is duplicated over i=j and is symmetric.  get rid of the duplicate half
scores = torch.triu(scores)
scores.shape

torch.Size([67, 67])

## Topk scores - note that scores will be flattened here.

In [52]:
scores.view(-1)#.shape

tensor([9.4340e-13, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00], grad_fn=<ViewBackward0>)

In [53]:
# Find the top 5 largest values and their indices in the `scores` tensor
top_k_values, top_k_indices = torch.topk(scores.view(-1), k=5)
# 5 element vals tensor
# 5 element tuple tensor
top_k_values.shape, top_k_indices.shape

(torch.Size([5]), torch.Size([5]))

In [55]:
top_k_indices, top_k_values

(tensor([1576, 1577, 1107, 1570, 1710]),
 tensor([9.8026e-01, 8.2478e-03, 6.8415e-03, 1.3677e-03, 3.8109e-04],
        grad_fn=<TopkBackward0>))

In [56]:

# Extract the row and column indices of the top 5 largest values
row_indices = top_k_indices // scores.shape[1]
column_indices = top_k_indices % scores.shape[1]

row_indices, column_indices

(tensor([23, 23, 16, 23, 25]), tensor([35, 36, 35, 29, 35]))

### How does topk work?

In [48]:
x = [[1,2,93,],[4,5,6,],[3,4,4,]]
x = torch.tensor(x)
print(x.shape)
torch.topk(x,k=1)

torch.Size([3, 3])


torch.return_types.topk(
values=tensor([[93],
        [ 6],
        [ 4]]),
indices=tensor([[2],
        [2],
        [1]]))

## Top K predicted answers

In [66]:
def map_index_to_offsets(start_index, end_index):
    inputs_with_offsets = tokenizer(question, context, return_offsets_mapping=True)
    offsets = inputs_with_offsets["offset_mapping"]

    start_char, _ = offsets[start_index]
    _, end_char = offsets[end_index]
    answer = context[start_char:end_char]
    return answer



# Flatten the scores tensor and find the top 5 largest values and their indices
top_k_values, top_k_indices = torch.topk(scores.view(-1), k=5)

# Calculate the row and column indices of the top 5 largest values
row_indices = top_k_indices // scores.shape[1]
column_indices = top_k_indices % scores.shape[1]

# Print the top 5 largest values and their indices
for i in range(5):
    start_index = row_indices[i].item()
    end_index = column_indices[i].item()
    answer = map_index_to_offsets(start_index, end_index)
    #context[start_char:end_char]
    
    value = top_k_values[i].item()
    print(f"index {i }: {value} (start_index: {start_index}, end_index: {end_index}), answer: {answer}")


index 0: 0.9802601933479309 (start_index: 23, end_index: 35), answer: Jax, PyTorch, and TensorFlow
index 1: 0.008247788064181805 (start_index: 23, end_index: 36), answer: Jax, PyTorch, and TensorFlow —
index 2: 0.00684146536514163 (start_index: 16, end_index: 35), answer: three most popular deep learning libraries — Jax, PyTorch, and TensorFlow
index 3: 0.0013676981907337904 (start_index: 23, end_index: 29), answer: Jax, PyTorch
index 4: 0.0003810854977928102 (start_index: 25, end_index: 35), answer: PyTorch, and TensorFlow


## Check topk method against argmax method

In [57]:
max_index = scores.argmax().item()
max_index

1576

In [23]:
max_index // scores.shape[1],max_index % scores.shape[1]


(23, 35)

In [58]:
start_index = max_index // scores.shape[1]
end_index = max_index % scores.shape[1]
print(scores[start_index, end_index])

tensor(0.9803, grad_fn=<SelectBackward0>)


In [59]:
inputs_with_offsets = tokenizer(question, context, return_offsets_mapping=True)
offsets = inputs_with_offsets["offset_mapping"]

start_char, _ = offsets[start_index]
_, end_char = offsets[end_index]
answer = context[start_char:end_char]
answer

'Jax, PyTorch, and TensorFlow'

In [63]:
offsets[start_index]
offsets[end_index]

(103, 106)

In [60]:
start_char, end_char

(78, 106)