In [2]:
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from transformers import AutoTokenizer

In [5]:
# Name of the pretrained checkpoint; the same name is reused further down
# when the classification model is loaded.
checkpoint = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# Display the tokenizer's repr: vocabulary size, max length, special tokens
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
# Calling the tokenizer returns a dictionary-style result with three keys:
# input_ids, token_type_ids and attention_mask
tokenizer("hello world")

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [8]:
# Get the tokens only: plain strings, no ids and no special tokens
tokens = tokenizer.tokenize("hello world")
tokens

['hello', 'world']

In [9]:
# Map each token to its vocabulary id
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[7592, 2088]

In [10]:
# Convert the ids back to their token strings (inverse of the previous step)
tokenizer.convert_ids_to_tokens(ids)

['hello', 'world']

In [13]:
# decode() does the same and, in addition, joins the tokens back into a string
tokenizer.decode(ids)

'hello world'

In [15]:
# Two steps in one: sentence -> tokens -> ids.
# Unlike the manual tokenize + convert_tokens_to_ids route, encode() also adds
# the special-token ids 101 ([CLS]) and 102 ([SEP]) around the text.
# NOTE: the input here is "hello word" (not "hello world"), so the second
# word's id differs from the earlier cells.
ids = tokenizer.encode("hello word")
ids

[101, 7592, 2773, 102]

In [16]:
# Special BERT tokens [CLS] and [SEP] wrap the text: this is the true model input
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'hello', 'word', '[SEP]']

In [17]:
# Decoding these ids keeps the special tokens in the resulting string
tokenizer.decode(ids)

'[CLS] hello word [SEP]'

In [18]:
# Keep the full tokenizer output for later use; at this point the values are
# plain Python lists, not tensors.
model_inputs = tokenizer("hello world")
model_inputs

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [19]:
# Tokenize a batch: a list of sentences yields one list of ids per sentence,
# each with its own length (no padding is applied here).
data = ["I like cats.", "Do you like cats too?"]
tokenizer(data)

{'input_ids': [[101, 1045, 2066, 8870, 1012, 102], [101, 2079, 2017, 2066, 8870, 2205, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

## Things are getting more interesting: running the tokens through a model

In [20]:
from transformers import AutoModelForSequenceClassification

In [21]:
# Load BERT with a sequence-classification head on top of the same checkpoint.
# The bare `model` expression prints the module architecture.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [22]:
# This will not work!
# model_inputs = tokenizer("hello world")
# `model_inputs` is expected to still hold the list-based encoding shown in the
# comment above; the model accepts torch tensors, not Python lists.
# The failure is the point of this cell, so the error is caught and printed
# instead of aborting a "Restart & Run All".
try:
    outputs = model(**model_inputs)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: ignored

In [24]:
# Fix: ask the tokenizer for PyTorch tensors via return_tensors="pt".
# Each value is now a tensor with a leading batch dimension of 1.
model_inputs = tokenizer("hello world", return_tensors="pt")
model_inputs

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [25]:
# By default the classification head has two outputs (a binary classifier),
# hence two logits for the single input sentence.
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1600, -0.0532]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [26]:
# The logits are meaningless at this point: the classifier head was newly
# initialized when the model was loaded and has not been trained.
outputs.logits

tensor([[-0.1600, -0.0532]], grad_fn=<AddmmBackward0>)

In [28]:
# Create another model with three outputs instead of the default two.
# This rebinds `model`, so later cells use the 3-label version.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Run the same inputs through the 3-label model: three logits this time
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0855, -0.0961,  0.1948]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [31]:
# The output object exposes the logits as an attribute
outputs.logits

tensor([[ 0.0855, -0.0961,  0.1948]], grad_fn=<AddmmBackward0>)

In [32]:
# Same tensor as above, accessed like a dictionary key
outputs["logits"]

tensor([[ 0.0855, -0.0961,  0.1948]], grad_fn=<AddmmBackward0>)

In [33]:
# Another way: index it like a tuple (here position 0 gives the logits)
outputs[0]

tensor([[ 0.0855, -0.0961,  0.1948]], grad_fn=<AddmmBackward0>)

In [34]:
# Convert to a NumPy array: detach from the autograd graph (the tensor carries
# a grad_fn), move to CPU, then convert.
outputs.logits.detach().cpu().numpy()

array([[ 0.08552645, -0.09610875,  0.19475536]], dtype=float32)

## Processing Multiple Strings

In [35]:
# Remember this will not work: without padding the two sentences encode to
# different lengths, so they cannot be packed into one rectangular tensor.
# The failure is the point of this cell, so the error is caught and printed
# instead of aborting a "Restart & Run All".
data = [
    "I like cats.",
    "Do you like cats too?",
]
try:
    model_inputs = tokenizer(data, return_tensors="pt")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the call unexpectedly succeeds; show the result.
    print(model_inputs)

ValueError: ignored

In [37]:
# Correct way to do it: pad the shorter sentence so that both rows have the
# same length and can be returned as tensors.
data = ["I like cats.", "Do you like cats too?"]
model_inputs = tokenizer(
    data,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
model_inputs

{'input_ids': tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [38]:
# Look at the padding at the end of the first row: id 0 is the [PAD] token
model_inputs["input_ids"]

tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]])

In [39]:
# The attention mask is 0 at the padded positions and 1 everywhere else
model_inputs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [40]:
# Run the padded two-sentence batch through the model
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1654, -0.1472,  0.4832],
        [ 0.2007, -0.1435,  0.4759]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [41]:
# 2x3 tensor: two documents, 3 classes (from num_labels=3 when the model was created)
outputs.logits

tensor([[ 0.1654, -0.1472,  0.4832],
        [ 0.2007, -0.1435,  0.4759]], grad_fn=<AddmmBackward0>)