# Tokenizer基本使用

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample sentence used throughout the basic tokenizer walkthrough.
sen = "弱小的我也有大梦想!"

## Step1 加载与保存

In [None]:
# Load from the Hugging Face Hub: passing the model name loads the corresponding tokenizer.
# tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
# tokenizer

In [None]:
# Save the tokenizer to a local directory.
# tokenizer.save_pretrained('./roberta_tokenizer')

In [3]:
# Load the tokenizer from a local checkpoint directory and display it.
local_model_dir = '../../models/roberta-base-finetuned-dianping-chinese/'
tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizerFast(name_or_path='../../models/roberta-base-finetuned-dianping-chinese/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Step2 句子分词

In [4]:
# Split the sentence into a list of tokens (strings, not ids yet) and display it.
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

## Step3 查看词典

In [5]:
# Display the tokenizer's vocabulary as a token -> id mapping.
# NOTE(review): this renders the entire mapping (vocab_size is reported as 21128
# in this notebook), so the output is very long; consider showing only a sample.
tokenizer.vocab

{'##come': 12097,
 '##拜': 15933,
 '##適': 19957,
 '讧': 6373,
 '##閥': 20342,
 '##侵': 13966,
 '##;': 13333,
 '##挙': 15963,
 '##杠': 16396,
 '飛': 7606,
 '##悩': 15702,
 '銃': 7066,
 '##喔': 14652,
 '##丸': 13766,
 '污': 3738,
 'street': 9471,
 '##煉': 17257,
 '##腿': 18654,
 '##con': 10281,
 '闵': 7314,
 '亚': 762,
 '8591': 12376,
 '淩': 3913,
 '漣': 4032,
 '牒': 4279,
 '##lin': 9029,
 '##尋': 15261,
 '##犧': 17361,
 '##ina': 9314,
 '##倾': 14024,
 'w': 165,
 '##hy': 9943,
 '##煙': 17263,
 '設': 6257,
 '##哺': 14586,
 '326': 12044,
 '##寄': 15221,
 '蓄': 5898,
 '渴': 3951,
 '鶩': 7872,
 'low': 10611,
 '枣': 3365,
 '赓': 6607,
 '##冈': 14139,
 '##压': 14384,
 '282': 11431,
 '##嗔': 14682,
 'mail': 8313,
 '莹': 5816,
 '##奔': 15001,
 '##暱': 16330,
 '茭': 5757,
 'traction': 12600,
 '╱': 447,
 '[unused44]': 44,
 '嘣': 1663,
 'dear': 11694,
 '##rame': 13185,
 '##寞': 15231,
 '##娴': 15090,
 '枷': 3374,
 'img': 11412,
 '##ky': 10218,
 '１５': 11213,
 '##をお': 11236,
 '##hip': 11489,
 '##ince': 13199,
 '##凄': 14170,
 'ᆸ': 325,
 '##悱'

In [6]:
# Size of the tokenizer's vocabulary.
tokenizer.vocab_size

21128

## Step4 索引转换

In [7]:
# Convert the token sequence into the corresponding sequence of ids.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [8]:
# Convert the id sequence back into a token sequence.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

In [9]:
# Join the token sequence back into a single string.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

### 更便捷的实现方式

In [10]:
# Convert a string directly into an id sequence ("encoding").
# With add_special_tokens=True the result is wrapped in [CLS] (101) and [SEP] (102).
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]

In [11]:
# Convert an id sequence back into a string ("decoding").
# skip_special_tokens=False keeps [CLS] / [SEP] in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'

## Step5 填充与截断

In [12]:
# Padding: pad the encoded ids on the right up to max_length=15.
ids = tokenizer.encode(sen, padding='max_length', max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [13]:
# Truncation: cut the encoded ids down to max_length=5 (special tokens included).
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

## Step6 其他输入部分

In [14]:
# Re-encode with padding so that `ids` contains pad positions again
# (the truncation example overwrote `ids`).
ids = tokenizer.encode(sen, padding='max_length', max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [15]:
# Build the auxiliary model inputs by hand to show what the tokenizer produces.
# Compare against the tokenizer's own pad id instead of a hard-coded 0, so the
# mask stays correct for tokenizers whose [PAD] id is not 0.
pad_id = tokenizer.pad_token_id
attention_mask = [0 if idx == pad_id else 1 for idx in ids]
# Single-sentence input: every position belongs to segment 0.
token_type_ids = [0] * len(ids)

ids, attention_mask, token_type_ids

([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Step7 快速调用方式

In [16]:
# Calling the tokenizer directly returns input_ids, token_type_ids and
# attention_mask in one step. This replaces encode_plus, which is deprecated
# in favour of __call__ and produces the same result here.
inputs = tokenizer(sen, padding='max_length', max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

## Step8 处理batch数据

In [17]:
# A small batch of sentences: passing a list encodes all of them in one call.
sens = [
    '弱小的我也有大梦想',
    '有梦想谁都了不起',
    '追逐梦想的心，比梦想本身，更可贵',
]
res = tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [18]:
%%time
# Baseline: encode one sentence per call, 1000 separate calls.
for i in range(1000):
    tokenizer(sen)

CPU times: total: 78.1 ms
Wall time: 72.8 ms


In [19]:
%%time
# Batched: encode the same 1000 sentences in a single call.
res = tokenizer([sen]*1000)

CPU times: total: 203 ms
Wall time: 11 ms


In [20]:
# Re-display the tokenizer's configuration.
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizerFast(name_or_path='../../models/roberta-base-finetuned-dianping-chinese/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Fast / Slow Tokenizer

In [21]:
# Mixed Chinese/English sentence for the fast vs. slow tokenizer comparison.
# Note that this rebinds `sen`, replacing the sentence used in the first part.
sen = "弱小的我也有大Dreaming!"

In [22]:
# Default load: AutoTokenizer returns the fast tokenizer here (is_fast=True).
fast_model_dir = '../../models/roberta-base-finetuned-dianping-chinese/'
fast_tokenizer = AutoTokenizer.from_pretrained(fast_model_dir)
fast_tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizerFast(name_or_path='../../models/roberta-base-finetuned-dianping-chinese/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [23]:
# use_fast=False requests the slow tokenizer (is_fast=False) for the same checkpoint.
slow_model_dir = '../../models/roberta-base-finetuned-dianping-chinese/'
slow_tokenizer = AutoTokenizer.from_pretrained(slow_model_dir, use_fast=False)
slow_tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='../../models/roberta-base-finetuned-dianping-chinese/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [24]:
%%time
# Fast tokenizer: one sentence per call, 10000 separate calls.
for i in range(10000):
    fast_tokenizer(sen)

CPU times: total: 688 ms
Wall time: 671 ms


In [25]:
%%time
# Slow tokenizer: one sentence per call, 10000 separate calls.
for i in range(10000):
    slow_tokenizer(sen)

CPU times: total: 1.67 s
Wall time: 1.67 s


In [26]:
%%time
# Fast tokenizer: all 10000 sentences in a single batched call.
res = fast_tokenizer([sen] * 10000)

CPU times: total: 1.03 s
Wall time: 156 ms


In [27]:
%%time
# Slow tokenizer: all 10000 sentences in a single batched call.
res = slow_tokenizer([sen] * 10000)

CPU times: total: 1.55 s
Wall time: 1.54 s


In [28]:
# Ask the fast tokenizer to also return 'offset_mapping': a (start, end) span
# into the input string for every token.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [29]:
# Index of the word each token came from; sub-word pieces of one word share an
# index, and the special tokens at both ends map to None.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [30]:
# The slow (pure-Python) tokenizer does not support offset mappings: this call
# raises NotImplementedError. The error is the point of the demonstration, so
# catch and print it instead of letting it abort a "Restart & Run All".
try:
    inputs = slow_tokenizer(sen, return_offsets_mapping=True)
except NotImplementedError as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

# 特殊的Tokenizer的加载

In [None]:
# Loading a tokenizer that is not an official Hugging Face implementation
# requires trust_remote_code=True.
# NOTE(review): trust_remote_code presumably lets code shipped with the model
# repository run locally — only enable it for sources you trust.
# NOTE(review): this cell has no execution count in the saved notebook; confirm
# that the outputs further down were produced after actually running it.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer

In [37]:
# Display the object currently bound to `tokenizer`.
# NOTE(review): the saved execution count of this cell is later than those of
# the cells that follow it, so this output reflects state after they ran —
# re-run the notebook top to bottom to confirm what is shown here.
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizerFast(name_or_path='chatglm_tokenizer', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [34]:
# Save the tokenizer files into the local "chatglm_tokenizer" directory;
# the call returns the paths of the files it wrote.
tokenizer.save_pretrained("chatglm_tokenizer")

('chatglm_tokenizer\\tokenizer_config.json',
 'chatglm_tokenizer\\special_tokens_map.json',
 'chatglm_tokenizer\\vocab.txt',
 'chatglm_tokenizer\\added_tokens.json',
 'chatglm_tokenizer\\tokenizer.json')

In [35]:
# Reload the tokenizer from the local "chatglm_tokenizer" directory.
# NOTE(review): the files listed by the save step (vocab.txt, tokenizer.json, ...)
# look like a standard BERT tokenizer, so trust_remote_code may be unnecessary
# here — confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained("chatglm_tokenizer", trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [36]:
# Round trip: encode the sentence to ids, then decode the ids back to text.
round_trip_ids = tokenizer.encode(sen)
tokenizer.decode(round_trip_ids)

'[CLS] 弱 小 的 我 也 有 大 dreaming! [SEP]'