#### Transformers on Multiple Sequences
- Batching
- Padding
- Attention Masks
- Handling longer sequences

#### Batched Input

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One checkpoint name feeds both loaders, so the tokenizer and the
# sequence-classification model always come from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [2]:
sequence = "I've been waiting for a HuggingFace course my whole life."

# Manual pipeline: text -> tokens -> vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Deliberately a flat, 1-D tensor (no batch dimension); the next cell shows
# what happens when it is handed to the model as-is.
input_ids = torch.tensor(ids)

In [4]:
# Can't just do this: `input_ids` is a 1-D tensor, and the model needs input
# whose dimensions match what it expects (a batch of sequences).
# The failure is the point of this cell, so catch it and print it instead of
# letting the exception abort a top-to-bottom run of the notebook.
try:
    model(input_ids)
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: too many indices for tensor of dimension 1

In [9]:
# Let the tokenizer build the model input directly, as PyTorch tensors.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
reference_ids = tokenized_inputs["input_ids"]

print(reference_ids)
# Shape produced by the tokenizer: 2-D, with a leading batch dimension.
print(reference_ids.shape)
# Shape of the hand-built tensor: 1-D, so it is missing the batch dimension.
print(input_ids.shape)

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])
torch.Size([1, 16])
torch.Size([14])


In [10]:
# Make the shape right: add a leading batch dimension so the single sequence
# becomes a batch of one.

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids).unsqueeze(0)
print(input_ids)

# With a 2-D (batch, sequence) input the forward pass succeeds.
output = model(input_ids)
print(output.logits)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


#### Padding

In [12]:
# Reload the classification model from the same checkpoint used earlier.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two toy sequences of different lengths, each wrapped as a batch of one.
seq1_ids = [[200, 200, 200]]
seq2_ids = [[200, 200]]

# The same two sequences in a single batch; the shorter one is right-padded
# with the tokenizer's pad id so both rows have equal length.
batched_ids = [
    list(seq1_ids[0]),
    list(seq2_ids[0]) + [tokenizer.pad_token_id],
]

# No attention mask is passed here on purpose: compare the logits of the
# second sequence run alone with its logits inside the padded batch.
for id_batch in (seq1_ids, seq2_ids, batched_ids):
    print(model(torch.tensor(id_batch)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


#### Attention Mask
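- An attention mask marks which positions are real tokens (1) and which are padding (0), so that the model does not attend to the padded positions.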

In [15]:
# One row per sequence in `batched_ids`: 1 = real token, 0 = padding.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

mask_tensor = torch.tensor(attention_mask)
outputs = model(torch.tensor(batched_ids), attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [41]:
# Back to the start, this time with two real sentences.

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "Here is another sentence I hate a lot",
]

# Tokenize every sentence, then map each token list to vocabulary ids.
tokens = list(map(tokenizer.tokenize, sequences))
ids = list(map(tokenizer.convert_tokens_to_ids, tokens))
# The id lists are not turned into a tensor yet: the two sentences produce
# different numbers of ids, so they are padded to a common length first.

In [42]:
# Pad every id list in `ids` to a common length and build the matching
# attention mask (1 = real token, 0 = padding).
#
# `ids` itself is left untouched, so this cell can be re-run safely: padding
# a row in place would make a second run see an already-padded row and
# produce an all-ones mask for it.
pad_id = tokenizer.pad_token_id

# Use the longest sequence in the batch instead of assuming it is the first.
max_len = max(map(len, ids), default=0)

attention_mask = [
    [1] * len(seq_ids) + [0] * (max_len - len(seq_ids))
    for seq_ids in ids
]

# Right-pad each row with the pad id; rows already at max_len get no padding.
padded_ids = [
    seq_ids + [pad_id] * (max_len - len(seq_ids))
    for seq_ids in ids
]
input_ids = torch.tensor(padded_ids)

In [43]:
# Display the padded batch of ids built in the previous cell.
input_ids

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 2182,  2003,  2178,  6251,  1045,  5223,  1037,  2843,     0,     0,
             0,     0,     0,     0]])

In [44]:
# Display the mask that goes with the padded batch: 1 = real token, 0 = padding.
attention_mask

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]

In [46]:
# Run the padded batch through the model, passing the mask so the padded
# positions are not attended to.
mask_tensor = torch.tensor(attention_mask)
outputs = model(input_ids, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[-2.7276,  2.8789],
        [ 3.0176, -2.6385]], grad_fn=<AddmmBackward0>)


#### Longer Sequences
For sequences longer than the model's maximum input length, either truncate them or use a model that supports longer sequences.