#### Putting the whole process together

In [5]:
from transformers import AutoTokenizer
from pprint import pprint

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# Single Sequence

sequence = "I've been waiting for a HuggingFace course my whole life"
model_inputs = tokenizer(sequence)
model_inputs

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# Multi Sequence
sequences = [
    "I've been waiting for a HuggingFace course my whole life.", 
    "So have I!"
    ]

model_inputs = tokenizer(sequences)
pprint(model_inputs)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1]],
 'input_ids': [[101,
                1045,
                1005,
                2310,
                2042,
                3403,
                2005,
                1037,
                17662,
                12172,
                2607,
                2026,
                2878,
                2166,
                1012,
                102],
               [101, 2061, 2031, 1045, 999, 102]]}


In [8]:
# Padding

# Pad to max seq_length
model_inputs = tokenizer(sequences, padding="longest")

# Pad to max model length
model_inputs = tokenizer(sequences, padding="max_length")

# Pad to specified length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [9]:
# Truncate

# Truncate to max model length
model_inputs = tokenizer(sequences, truncation=True)

# Truncate to specified length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

#### Special Tokens

In [11]:
# There's some extra tokens here
sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [12]:
# And here we see why, start and end tokens
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


#### Final Model

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.", 
    "So have I!"
    ]

tokens = tokenizer(sequences,
                   padding=True,
                   truncation=True,
                   return_tensors='pt')
output = model(**tokens)

In [17]:
output.logits

tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>)