# Tokenizers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [1]:
# Simplest possible word-level tokenization: break the sentence on whitespace.
sentence = "Jim Henson was a puppeteer"
tokenized_text = sentence.split()
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


Load the tokenizer with the model-specific `transformers.BertTokenizer` class:

In [2]:
from transformers import BertTokenizer

# Instantiate the tokenizer for this checkpoint through the BERT-specific class.
checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


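Load the same checkpoint with `transformers.AutoTokenizer`, which picks the matching tokenizer class from the checkpoint name: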

In [3]:
from transformers import AutoTokenizer

# Instantiate the tokenizer for this checkpoint through the generic Auto class.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [17]:
# Calling the tokenizer directly returns the complete encoding for the sentence:
# input_ids, token_type_ids and attention_mask.
tokenizer("Let's try to tokenize!")

{'input_ids': [101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# Save the tokenizer under ../my-first-model (relative to the notebook's working
# directory); the call returns a tuple of file paths inside that directory.
tokenizer.save_pretrained("../my-first-model")

('../my-first-model/tokenizer_config.json',
 '../my-first-model/special_tokens_map.json',
 '../my-first-model/vocab.txt',
 '../my-first-model/added_tokens.json',
 '../my-first-model/tokenizer.json')

In [22]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# First step of the encoding pipeline: split the raw text into (sub)word tokens.
# No IDs are produced at this stage.
sequence = "Let's try to tokenize!"
tokens = tokenizer.tokenize(sequence)
print(tokens)

['Let', "'", 's', 'try', 'to', 'token', '##ize', '!']


In [23]:
# Second step: map each token string to its integer ID in the tokenizer's vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[2421, 112, 188, 2222, 1106, 22559, 3708, 106]


In [25]:
# Final step: wrap the raw IDs into model-ready inputs; the printed input_ids gain
# one extra ID at the start and one at the end compared with `ids`.
final_inputs = tokenizer.prepare_for_model(ids)
model_input_ids = final_inputs["input_ids"]
print(model_input_ids)

[101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102]


In [26]:
# Token IDs for the example sentence, including the leading 101 and trailing 102
# that decode to the [CLS] and [SEP] markers.
input_ids = [101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102]

# Turn the IDs back into readable text.
decoded_string = tokenizer.decode(input_ids)
print(decoded_string)

[CLS] Let's try to tokenize! [SEP]


In [21]:
from transformers import AutoTokenizer

# Tokenize the same sentence with a different checkpoint to compare the token pieces.
albert_checkpoint = "albert-base-v1"
text = "Let's try to tokenize!"

tokenizer = AutoTokenizer.from_pretrained(albert_checkpoint)
tokens = tokenizer.tokenize(text)
print(tokens)

['▁let', "'", 's', '▁try', '▁to', '▁to', 'ken', 'ize', '!']


In [27]:
from transformers import AutoTokenizer

# Encode then decode with a third checkpoint to see which special tokens it adds.
roberta_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint)

inputs = tokenizer("Let's try to tokenize!")
round_trip = tokenizer.decode(inputs["input_ids"])
print(round_trip)

<s>Let's try to tokenize!</s>
