In [1]:
!pip install transformers torch



In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from one checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two example sentences; the second one tokenizes to one more token than the first.
sequence1 = "I have been waiting for a vacation in France my whole life."
sequence2 = "I've been waiting for a HuggingFace course my whole life."

# Manual pipeline for sentence 1: text -> tokens -> vocabulary ids.
tokens1 = tokenizer.tokenize(sequence1)
print(tokens1)
ids1 = tokenizer.convert_tokens_to_ids(tokens1)
print(ids1)

# Same manual pipeline for sentence 2.
tokens2 = tokenizer.tokenize(sequence2)
print(tokens2)
ids2 = tokenizer.convert_tokens_to_ids(tokens2)
print(ids2)

# Id used to fill the shorter row up to the batch length.
pad_id = tokenizer.pad_token_id
print(pad_id)

# Right-pad every row to the longest length so the batch is rectangular,
# and build a matching mask: 1 for real tokens, 0 for padding positions.
max_len = max(map(len, (ids1, ids2)))
batch_ids = [row + [pad_id] * (max_len - len(row)) for row in (ids1, ids2)]
attention_mask = [
    [1] * len(row) + [0] * (max_len - len(row)) for row in (ids1, ids2)
]

input_ids = torch.tensor(batch_ids)
print("Input ids:", input_ids)

# Forward pass on the padded batch; the mask is converted to a tensor at call time.
output = model(input_ids, attention_mask=torch.tensor(attention_mask))
print("Logits:", output.logits)

['i', 'have', 'been', 'waiting', 'for', 'a', 'vacation', 'in', 'france', 'my', 'whole', 'life', '.']
[1045, 2031, 2042, 3403, 2005, 1037, 10885, 1999, 2605, 2026, 2878, 2166, 1012]
['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
0
Input ids: tensor([[ 1045,  2031,  2042,  3403,  2005,  1037, 10885,  1999,  2605,  2026,
          2878,  2166,  1012,     0],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-0.5355,  0.6644],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [4]:
# Tokenize by calling the tokenizer directly and request PyTorch tensors.
# The input is `sequence1`: the notebook never defines a bare `sequence`,
# so the previous reference only worked through leftover kernel state and
# raised NameError on a fresh Restart & Run All.
tokenized_inputs = tokenizer(sequence1, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tensor([[  101,  1045,  2031,  2042,  3403,  2005,  1037, 10885,  1999,  2605,
          2026,  2878,  2166,  1012,   102]])


In [23]:
# Reload the classifier from the checkpoint before running the comparison.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two toy inputs of different lengths, each shaped as a batch of one.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# The same two inputs stacked into one batch: the shorter row gets a single
# pad id appended so both rows have length 3.
batched_ids = [
    list(sequence1_ids[0]),
    sequence2_ids[0] + [tokenizer.pad_token_id],
]

# 1 marks a real token, 0 marks the padded position in the second row.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

# Logits for each input on its own ...
for single_ids in (sequence1_ids, sequence2_ids):
    print(model(torch.tensor(single_ids)).logits)

# ... and for the padded batch with the mask applied, for side-by-side comparison.
batched_output = model(
    torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print(batched_output.logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
