# Handling multiple sequences (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The same checkpoint name is used to load both the tokenizer and the model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize by hand: text -> tokens -> vocabulary IDs.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# `ids` is a flat list, so this tensor is 1D: it has no batch dimension.
input_ids = torch.tensor(ids)

# This call is expected to fail. The error is caught and displayed so the
# demonstration is preserved without halting a "Restart & Run All".
# Only IndexError is caught; any other failure still surfaces normally.
try:
    model(input_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: too many indices for tensor of dimension 1

The call above fails because we passed a single sequence (a 1D tensor of token IDs) to the model, whereas 🤗 Transformers models expect a batch of sequences by default, i.e. a 2D tensor with a leading batch dimension.

In [3]:
# Calling the tokenizer directly returns ready-to-use PyTorch tensors
# (return_tensors="pt"); note the extra outer brackets in the printed IDs.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs.input_ids)

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [4]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The same checkpoint name is used to load both the model and its tokenizer.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Manual pipeline: split the text into tokens, then look up their IDs.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a leading batch dimension so the model receives a 2D tensor
# holding a single sequence.
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [5]:
# Two sequences of different lengths (3 and 2 IDs): the rows of this
# nested list do not line up into a rectangle.
batched_ids = [[200] * 3, [200] * 2]

In [6]:
# Placeholder padding value used to fill the shorter row.
padding_id = 100

# Pad the second sequence so both rows have the same length.
batched_ids = [
    [200] * 3,
    [200] * 2 + [padding_id],
]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Each sequence on its own, wrapped in a batch of size one.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# The same two sequences in one batch; the shorter one is padded with the
# tokenizer's padding token ID. No attention mask is passed here.
batched_ids = [
    list(sequence1_ids[0]),
    sequence2_ids[0] + [tokenizer.pad_token_id],
]

# Print the logits for each individual sequence, then for the padded batch.
for example_ids in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(example_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


The second row of the batched output does not match the logits we got for the second sequence on its own. This is because the key feature of Transformer models is attention layers that contextualize each token. These take the padding tokens into account, since they attend to all of the tokens of a sequence. To get the same result when passing individual sentences of different lengths through the model as when passing a batch of the same sentences with padding applied, we need to tell those attention layers to ignore the padding tokens. This is done by using an attention mask.

In [8]:
# Same padded batch as before: the second row ends with the padding token.
batched_ids = [[200] * 3, [200, 200, tokenizer.pad_token_id]]

# 1 marks a position to attend to, 0 marks the padded position.
attention_mask = [[1] * 3, [1, 1, 0]]

outputs = model(
    torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [11]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The same checkpoint name is used to load both the tokenizer and the model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Sentences from the course
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# ----- 1. Reference: let the tokenizer handle everything -----
encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    ref_outputs = model(**encoded)
print("Reference logits (high-level tokenizer):")
print(ref_outputs.logits)
print()

# ----- 2. Manual tokenization, one sentence at a time -----
# Text -> tokens -> vocabulary IDs.
tokens_list = list(map(tokenizer.tokenize, sentences))
ids_list = list(map(tokenizer.convert_tokens_to_ids, tokens_list))

# Wrap every sequence in the special tokens: [CLS] first, [SEP] last.
cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
ids_with_special = [[cls_id, *sentence_ids, sep_id] for sentence_ids in ids_list]

# One batch of size one per sentence.
input_ids_1 = torch.tensor(ids_with_special[0]).unsqueeze(0)
input_ids_2 = torch.tensor(ids_with_special[1]).unsqueeze(0)

with torch.no_grad():
    out1 = model(input_ids_1).logits
    out2 = model(input_ids_2).logits

print("Logits for sentence 1 (manual, single):", out1)
print("Logits for sentence 2 (manual, single):", out2)
print()

# ----- 3. Single batch: pad to a common length and mask the padding -----
pad_id = tokenizer.pad_token_id
max_len = max(map(len, ids_with_special))

# Right-pad every sequence up to max_len with the padding token ID.
batched_ids = torch.tensor(
    [row + [pad_id] * (max_len - len(row)) for row in ids_with_special]
)
# 1 for real tokens, 0 for the padded positions.
attention_masks = torch.tensor(
    [[1] * len(row) + [0] * (max_len - len(row)) for row in ids_with_special]
)

with torch.no_grad():
    batched_outputs = model(batched_ids, attention_mask=attention_masks).logits

print("Logits for batched sentences (manual + padding + attention_mask):")
print(batched_outputs)
print()

# ----- 4. The batched logits should match the per-sentence logits -----
print("Sentence 1 equal? ", torch.allclose(out1, batched_outputs[0]))
print("Sentence 2 equal? ", torch.allclose(out2, batched_outputs[1]))

Reference logits (high-level tokenizer):
tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]])

Logits for sentence 1 (manual, single): tensor([[-1.5607,  1.6123]])
Logits for sentence 2 (manual, single): tensor([[ 4.1692, -3.3464]])

Logits for batched sentences (manual + padding + attention_mask):
tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]])

Sentence 1 equal?  True
Sentence 2 equal?  True
