In [2]:
import os

# Point the Hugging Face caches at a custom location.
# NOTE(review): these are absolute paths on a Windows D: drive; adjust them
# when running on a different machine.
os.environ.update(
    HF_HOME="D:/huggingface",
    HF_DATASETS_CACHE="D:/huggingface/datasets",
)

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer associated with the "bert-base-cased" checkpoint.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# 当执行tokenization时，我们会丢失一些信息，并且也不容易看出token属于哪一个单词

In [4]:
# The BERT tokenizer discards repeated spaces: both sentences below print the
# same list of input ids.
for text in ("hello, how are you?", "hello, how are         you?"):
    print(tokenizer(text)["input_ids"])

[101, 19082, 117, 1293, 1132, 1128, 136, 102]
[101, 19082, 117, 1293, 1132, 1128, 136, 102]


![](https://chushi123.oss-cn-beijing.aliyuncs.com/img/202203031634218.png)

# Fast tokenizers能保持单词和token的对应关系

In [8]:
from transformers import AutoTokenizer

# Reload the tokenizer so the cell can run on its own, then encode one sentence.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentence = "Let's talk about tokenizers superpowers."
encoding = tokenizer(sentence)

# First the tokens, then for every token the index of the word it belongs to
# (the output shows None for the special [CLS] / [SEP] tokens).
for view in (encoding.tokens(), encoding.word_ids()):
    print(view)

['[CLS]', 'Let', "'", 's', 'talk', 'about', 'token', '##izer', '##s', 'super', '##power', '##s', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, 7, None]


![](https://chushi123.oss-cn-beijing.aliyuncs.com/img/202203031638253.png)

## 甚至能保持token与原始文本的索引区间对应关系

In [10]:
from transformers import AutoTokenizer

# Reload the tokenizer so the cell can run on its own.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Ask for the offset mapping as well: one (start, end) character span per
# token, with (0, 0) for the special tokens as the output shows.
sample_text = "Let's talk about tokenizers superpowers."
encoding = tokenizer(sample_text, return_offsets_mapping=True)

print(encoding.tokens())
print(encoding["offset_mapping"])

['[CLS]', 'Let', "'", 's', 'talk', 'about', 'token', '##izer', '##s', 'super', '##power', '##s', '.', '[SEP]']
[(0, 0), (0, 3), (3, 4), (4, 5), (6, 10), (11, 16), (17, 22), (22, 26), (26, 27), (28, 33), (33, 38), (38, 39), (39, 40), (0, 0)]


![](https://chushi123.oss-cn-beijing.aliyuncs.com/img/202203031646501.png)

## 获得单词在原始文本的索引区间

In [15]:
# Character span, in the original text, of the word with index 5 (zero-based),
# i.e. "tokenizers".
span = encoding.word_to_chars(5)
start, end = span
print(start, end)

17 27


## Fast tokenizers 会记录每一个 token 在原始文本中对应的字符区间

![](https://chushi123.oss-cn-beijing.aliyuncs.com/img/202203031707110.jpg)