In [1]:
!pip install transformers



**Handling out-of-vocab words**

In [2]:
from transformers import AutoTokenizer

In [3]:
# The sentence contains a long word that is not a single vocabulary entry;
# the output below shows it broken into '##'-prefixed sub-word pieces.
sentence = "Transformers are supercalifragilisticexpialidocious."

# Load the tokenizer that belongs to the chosen checkpoint.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Split the raw text into token strings (no ids yet) and display them.
tokens = tokenizer.tokenize(sentence)
tokens

['Transformers',
 'are',
 'super',
 '##cal',
 '##if',
 '##rag',
 '##ilis',
 '##tic',
 '##ex',
 '##pia',
 '##lid',
 '##oc',
 '##ious',
 '.']

**Components of the tokenizer**

In [4]:
# Calling the tokenizer directly runs the whole pipeline: the result holds
# input_ids, token_type_ids and attention_mask (see output).
tokenizer(sentence)

{'input_ids': [101, 25267, 1132, 7688, 7867, 8914, 20484, 22279, 2941, 11708, 15748, 17299, 13335, 4179, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
# Map each token string to its vocabulary id. The result has 14 ids, whereas
# the direct tokenizer call returns 16: it adds 101 at the start and 102 at the end.
tokenizer.convert_tokens_to_ids(tokens)

[25267,
 1132,
 7688,
 7867,
 8914,
 20484,
 22279,
 2941,
 11708,
 15748,
 17299,
 13335,
 4179,
 119]

In [6]:
# Round-trip check: convert the tokens to ids and decode them back to text.
# The ids are derived from `tokens` instead of being pasted in as literals, so
# the cell stays correct if the sentence or the checkpoint is changed.
tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens))

'Transformers are supercalifragilisticexpialidocious.'

**Attention masks**

Attention masks are tensors with the same shape as the input IDs tensor, filled with 1s and 0s:
- 1s indicate that the corresponding tokens should be attended to.
- 0s mean that these tokens can be ignored.

**Sequence lengths**

Most models support sequences of up to 512 or 1024 tokens. There are two ways to handle sequences that are longer than this limit:
- Truncate sequences when they exceed the supported number of tokens
- Use a model that can support longer sequences

In [7]:
# Tokenize with padding to the longest sequence. Only one sentence is passed
# here, and the output is identical to the plain tokenizer(sentence) call.
inputs = tokenizer(sentence, padding="longest")
inputs

{'input_ids': [101, 25267, 1132, 7688, 7867, 8914, 20484, 22279, 2941, 11708, 15748, 17299, 13335, 4179, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
# Tokenization step on its own: text -> token strings (same result as the
# earlier tokenize call on this sentence).
tokens = tokenizer.tokenize(sentence)
tokens

['Transformers',
 'are',
 'super',
 '##cal',
 '##if',
 '##rag',
 '##ilis',
 '##tic',
 '##ex',
 '##pia',
 '##lid',
 '##oc',
 '##ious',
 '.']

In [9]:
# Conversion step on its own: token strings -> vocabulary ids, kept in `ids`.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[25267,
 1132,
 7688,
 7867,
 8914,
 20484,
 22279,
 2941,
 11708,
 15748,
 17299,
 13335,
 4179,
 119]

**Special tokens**

In [10]:
# Decoding the full input_ids exposes the special tokens the tokenizer added:
# the text comes back wrapped in [CLS] ... [SEP].
tokenizer.decode(inputs["input_ids"])

'[CLS] Transformers are supercalifragilisticexpialidocious. [SEP]'

**From tokenizer to model**

In [11]:
# With return_tensors='pt' the fields come back as tensors with an extra outer
# (batch) dimension instead of plain Python lists (compare with the earlier outputs).
inputs = tokenizer(sentence, padding=True, return_tensors='pt')
inputs

{'input_ids': tensor([[  101, 25267,  1132,  7688,  7867,  8914, 20484, 22279,  2941, 11708,
         15748, 17299, 13335,  4179,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classification model are loaded from the same checkpoint so
# that the ids produced by one match the vocabulary expected by the other.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two input sentences, processed together as one batch.
sequences = [
    "Transformers are brilliant.",
    "They have changed NLP forever",
]

# Pad/truncate and return tensors so the batch can be passed straight to the model.
inputs = tokenizer(
    sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# NOTE(review): the logits displayed below carry a grad_fn, i.e. autograd is
# tracking this forward pass; consider torch.no_grad() if only predictions are needed.
output = model(**inputs)
output

SequenceClassifierOutput([('logits', tensor([[-4.2946,  4.6448],
                                   [-3.1958,  3.2765]], grad_fn=<AddmmBackward>))])

In [13]:
# Shape of the raw scores: 2 x 2 here, presumably (number of input sequences,
# number of labels) -- both are 2 in this example, so confirm with another batch size.
output.logits.shape

torch.Size([2, 2])

In [14]:
# Normalize the logits over the last dimension so each row sums to 1.
torch.softmax(output.logits, dim=-1)

tensor([[1.3111e-04, 9.9987e-01],
        [1.5433e-03, 9.9846e-01]], grad_fn=<SoftmaxBackward>)

In [15]:
# Mapping from class index to label name as stored in the model config.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}