# *1. 零训练样本分类*

In [1]:
from transformers import pipeline

In [2]:
classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
result = classifier(
    "This is a course about the Transformers library.",
    candidate_labels = ["education", "politics", "business", "comment"]
)

In [4]:
print(result)

{'sequence': 'This is a course about the Transformers library.', 'labels': ['education', 'comment', 'business', 'politics'], 'scores': [0.6840799450874329, 0.21549372375011444, 0.0737949013710022, 0.026631459593772888]}


# *2.情感分析*

In [5]:
from transformers import pipeline

In [None]:
classifier = pipeline("sentiment-analysis")

In [7]:
result = classifier("I've been waiting for HuggingFace course my whole life.")

In [8]:
print(result)

[{'label': 'POSITIVE', 'score': 0.985034167766571}]


In [9]:
results = classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

In [10]:
print(results)

[{'label': 'POSITIVE', 'score': 0.9598049521446228}, {'label': 'NEGATIVE', 'score': 0.9994558691978455}]


# *3.文本生成*

In [11]:
from transformers import pipeline

In [None]:
generator = pipeline("text-generation", model="distilgpt2")

In [13]:
results = generator("In this course, we will teach you how to")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [14]:
print(results)

[{'generated_text': 'In this course, we will teach you how to use a combination of the common sense and knowledge available as well as the well-developed approach that helps identify a problem in our system. To learn more, click here.\nFurther Resources'}]


In [15]:
results = generator(
    "In this course, we will teach you how to",
    num_return_sequences=2,
    truncation=True,
    max_length=30
)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [16]:
print(results)

[{'generated_text': 'In this course, we will teach you how to build scalable networks within this app that, without even using a built-in JavaScript module, will render'}, {'generated_text': 'In this course, we will teach you how to perform some interesting techniques in our language and how to use a method which you can use in your programming'}]


* 生成古诗词

In [None]:
generator = pipeline("text-generation", model="uer/gpt2-chinese-poem")

In [18]:
results = generator(
    "[CLS] 万 叠 春 山 积 雨 晴，",
    max_length=40,
    num_return_sequences=2,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
print(results)

[{'generated_text': '[CLS] 万 叠 春 山 积 雨 晴， 施 与 客 闲 。 夜 堂 闻 杜 鹃 泣 血 ， 朝 食 杏 花 疑 象 龙 。 一 物 不 同 之 字 法 ，'}, {'generated_text': '[CLS] 万 叠 春 山 积 雨 晴， 兹 千 顷 波 。 楼 上 多 好 风 终 日 ， 客 中 岁 晚 奈 情 何 。 见 远 峰 浓 又 淡 ，'}]


# *遮盖词填充*

In [20]:
from transformers import pipeline

In [None]:
unmasker = pipeline("fill-mask")

In [22]:
results = unmasker("This course will teach you all about <mask> models.", top_k=2)

In [23]:
print(results)

[{'score': 0.19198468327522278, 'token': 30412, 'token_str': ' mathematical', 'sequence': 'This course will teach you all about mathematical models.'}, {'score': 0.042092032730579376, 'token': 38163, 'token_str': ' computational', 'sequence': 'This course will teach you all about computational models.'}]


# *命名实体识别*

In [24]:
from transformers import pipeline

In [None]:
ner = pipeline("ner", grouped_entities=True)

In [26]:
results = ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")

In [27]:
print(results)

[{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18}, {'entity_group': 'ORG', 'score': 0.9796019, 'word': 'Hugging Face', 'start': 33, 'end': 45}, {'entity_group': 'LOC', 'score': 0.9932106, 'word': 'Brooklyn', 'start': 49, 'end': 57}]


# *自动问答*

In [28]:
from transformers import pipeline

In [None]:
question_answerer = pipeline("question-answering")

In [30]:
answer = question_answerer(
    question="Where do I work?",
    context="My name is Sylvain and I work at Hugging Face in Brooklyn"
)

In [31]:
print(answer)

{'score': 0.6949766278266907, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}


# *自动摘要*

In [32]:
from transformers import pipeline

In [None]:
summarizer = pipeline("summarization")

In [34]:
results = summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of
    graduates in traditional engineering disciplines such as mechanical, civil,
    electrical, chemical, and aeronautical engineering declined, but in most of
    the premier American universities engineering curricula now concentrate on
    and encourage largely the study of engineering science. As a result, there
    are declining offerings in engineering subjects dealing with infrastructure,
    the environment, and related issues, and greater concentration on high
    technology subjects, largely supporting increasingly complex scientific
    developments. While the latter is important, it should not be at the expense
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other
    industrial countries in Europe and Asia, continue to encourage and advance
    the teaching of engineering. Both China and India, respectively, graduate
    six and eight times as many traditional engineers as does the United States.
    Other industrial countries at minimum maintain their output, while America
    suffers an increasingly serious decline in the number of engineering graduates
    and a lack of well-educated engineers.
    """
)

In [35]:
print(results)

[{'summary_text': ' America has changed dramatically during recent years . The number of engineering graduates in the U.S. has declined in traditional engineering disciplines such as mechanical, civil, electrical, chemical, and aeronautical engineering . Rapidly developing economies such as China and India, as well as other industrial countries in Europe and Asia, continue to encourage and advance engineering .'}]


# *使用分词器进行预处理*

In [36]:
from transformers import AutoTokenizer

In [37]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [38]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

In [39]:
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

In [40]:
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


# *将预处理好的输入送入模型*

In [41]:
from transformers import AutoTokenizer, AutoModel

In [42]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# 分词器
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# 模型
model = AutoModel.from_pretrained(checkpoint)

In [43]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

In [44]:
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

In [45]:
outputs = model(**inputs)

In [46]:
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


## *对于情感分类的任务，最后需要使用的是一个文本分类的head，使用AtuoModelForSequenceClassification*

In [47]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [48]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [49]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

In [50]:
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)

In [51]:
print(outputs.logits.shape)

torch.Size([2, 2])


In [52]:
# 所有 Transformers 模型都会输出 logits 值，因为训练时的损失函数通常会自动结合激活函数（例如 SoftMax）与实际的损失函数（例如交叉熵 cross entropy）。
print(outputs.logits)

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [53]:
# 将他们转换为概率值，还需要让他们经过一个softmax层
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [54]:
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}


# *分词器分词*

In [55]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [57]:
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

In [59]:
print(tokens)
print(tokenizer(sequence))

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [61]:
ids = tokenizer.convert_tokens_to_ids(tokens)

In [64]:
print(ids)

# 采用encode 简化操作, 其中101和102分别是[CLS] [SEP]对应的token的IDs
print(tokenizer.encode(sequence))

[7993, 170, 13809, 23763, 2443, 1110, 3014]
[101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102]


In [65]:
# 采用decode将ids转为tokens并且将拆分的词进行了合并
decode_tokens = tokenizer.decode(ids)

print(decode_tokens)

token_ids = tokenizer.encode(sequence)

decode_tokens = tokenizer.decode(token_ids)

print(decode_tokens)

Using a Transformer network is simple
[CLS] Using a Transformer network is simple [SEP]


# *处理多行文本*

In [67]:
# 演示多行文本处理过程
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)

ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

input_ids = torch.tensor([ids])

print("Input IDs:\n", input_ids)

output = model(input_ids)

print("Logits:\n", output.logits)


[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
Input IDs:
 tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits:
 tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [69]:
# 实际处理可以简化

tokenized_inputs = tokenizer(sequence, return_tensors="pt")

print("Inputs Keys:\n", tokenized_inputs.keys())

print("\nInput IDs:\n", tokenized_inputs["input_ids"])

print(tokenized_inputs)

output = model(**tokenized_inputs)

print(output.logits)

Inputs Keys:
 dict_keys(['input_ids', 'attention_mask'])

Input IDs:
 tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])
{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>)


# *padding*

In [4]:
# 直接填充padding id 会导致模型将pad_token_id也当做上下文进行编码
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [6]:
# 需要填充不参与编码，需要使用Attention Mask
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]

batched_attention_masks = [
    [1, 1, 1],
    [1, 1, 0]
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)

outputs = model(
    torch.tensor(batched_ids),
    attention_mask = torch.tensor(batched_attention_masks)
)
print(outputs.logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


# *直接使用分词器*

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!"
]

model_inputs = tokenizer(sequence)

print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [10]:
# padding 操作通过tokenizer的padding参数控制
"""
padding=True 等同于 padding="longest"
padding="longest" 将序列填充到当前batch中最长序列的长度
padding="max_length" 将所有序列填充到模型能够接受的最大长度，BERT模型的最大长度是512
"""
model_inputs = tokenizer(sequence, padding="longest")
print(model_inputs)
model_inputs = tokenizer(sequence, padding="max_length")
print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [15]:
# 截断操作 通过truncation参数控制
"""
truncation=True 将序列截断到模型能够接受的最大长度,超出部分将会截断
max_length 参数用来控制截断长度
"""
model_inputs = tokenizer(sequence, padding="longest", truncation=True)
print(f"truncation=True':\n{model_inputs}")

model_inputs = tokenizer(sequence, padding="longest", truncation=True, max_length=8)
print(f"truncation=True && max_length=8 :\n{model_inputs}")


truncation=True':
{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
truncation=True && max_length=8 :
{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0]]}


In [None]:
# 可以通过return_tensors参数指定返回的张量格式
"""
return_tersors="pt" 返回PyTorch张量
return_tensors="tf" 返回TensorFlow张量
return_tensors="np" 返回NumPy数组
"""
model_inputs = tokenizer(sequence, padding="longest", truncation=True, return_tensors="pt")
print(f"PyTorch类型张量：\n{model_inputs}")

model_inputs = tokenizer(sequence, padding="longest", truncation=True, return_tensors="tf")
print(f"TensorFlow类型张量：\n{model_inputs}")

model_inputs = tokenizer(sequence, padding="longest", truncation=True, return_tensors="np")
print(f"NumPy类型数组：\n{model_inputs}")


In [19]:
# 常用方式Pytorch张
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

tokens = tokenizer(sequence, padding=True, truncation=True, return_tensors="pt")

print(tokens)

outputs = model(**tokens)

print(outputs.logits)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>)


# *编码句子对*

In [23]:
# bert 模型支持"句子对"编码
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

inputs = tokenizer("This is the first sentence.", "This is the second one.")

print(f"IDs:\n{inputs}")

# token_type_ids 标识了与token序列的对齐位置，那些token数据第一个句子，那些token属于第二个句子
"""
就可以看到第一个句子
  "[CLS] sentencel [SEP]"
  所有 token 的 type ID 都为 0
而第二个句子
  "sentence2 [SEP]"
  对应的 token type ID 都为 1
"""
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])

print(f"Tokens:\n{tokens}")



IDs:
{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Tokens:
['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']


In [25]:
# 实际使用时，我们不需要去关注编码结果中是否包含 token_type_ids 项，分词器会根据 checkpoint 自动调整输出格式，例如

from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence1_list = ["First sentence.", "This is the second sentence.", "Third one."]
sentence2_list = ["First sentence is short.", "The second sentence is very very very long.", "ok."]

tokens = tokenizer(
    sentence1_list,
    sentence2_list,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
print(f"Tokens tensor:\n{tokens}")
print(f"IDs.shape:\n {tokens['input_ids'].shape}")

print(f"Tokens1:\n {tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])}")
print(f"Tokens2:\n {tokenizer.convert_ids_to_tokens(tokens['input_ids'][1])}")
print(f"Tokens3:\n {tokenizer.convert_ids_to_tokens(tokens['input_ids'][2])}")

Tokens tensor:
{'input_ids': tensor([[ 101, 2034, 6251, 1012,  102, 2034, 6251, 2003, 2460, 1012,  102,    0,
            0,    0,    0,    0,    0,    0],
        [ 101, 2023, 2003, 1996, 2117, 6251, 1012,  102, 1996, 2117, 6251, 2003,
         2200, 2200, 2200, 2146, 1012,  102],
        [ 101, 2353, 2028, 1012,  102, 7929, 1012,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
IDs.shape:
 torch.Size([3, 18])
Tokens1:
 ['[CLS]', 'first', 'sentence', '.', '[SEP]', 'first', 'sentence', 'is', 'short', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', 

# *添加token*

In [27]:
"""
我们还经常会遇到输入中需要包含特殊标记符的情况，例如使用和标记出文本中的实体。
由于这些自定义 token 并不在预训练模型原来的词表中，因此直接运用分词器处理就会出现问题
"""

# 下面效果没有达到我们的预期，将[ENT_START]和[ENT_END]当做特殊字符处理
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "Two [ENT_START] cars [ENT_END] collided in a [ENT_START] tunnel [ENT_END] this morning."

print(f"Tokens:\n{tokenizer.tokenize(sentence)}")


Tokens:
['two', '[', 'en', '##t', '_', 'start', ']', 'cars', '[', 'en', '##t', '_', 'end', ']', 'collided', 'in', 'a', '[', 'en', '##t', '_', 'start', ']', 'tunnel', '[', 'en', '##t', '_', 'end', ']', 'this', 'morning', '.']


# *添加新的token*
> 一些领域的专业词汇，例如使用多个词语的缩写拼接而成的医学术语，同样也不在模型的词表中，因此也会出现上面的问题。此时我们就需要将这些新 token 添加到模型的词表中，让分词器与模型可以识别并处理这些 token

In [46]:
"""
token 分为两种：
普通token：添加的token会统一转为小写字符
特殊符号token：不会自动转为小写，保留原始

***注意***
向词表中添加新 token 后，必须重置模型 embedding 矩阵的大小，
也就是向矩阵中添加新 token 对应的 embedding，这样模型才可以正常工作，将 token 映射到对应的 embedding。
"""
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# 查看所有的tokens vocab tokens
print(f"All Tokens:\n{tokenizer.vocab}")

# 查看特殊的tokens含义序列
print(f"特殊的Tokens:\n{tokenizer.special_tokens_map}")

# 获取vocabulary size， 通过vocab_size 和 len(tokenizer)都可获取
print(f"Before add vocabulary size: {len(tokenizer)}")

# 添加普通的tokens， 添加的token的ids会在最大的ids的数值上累加
num_added_tokens = tokenizer.add_tokens(["my_new_token1", "my_new_token2"])
print(f"添加的tokens数量：{num_added_tokens}")
# print(f"添加后的Tokens:\n{tokenizer.get_added_vocab()}")
# print(f"添加后的tokens序列：\n{tokenizer.vocab}")

# 添加token是防止添加已有的tokens
new_tokens = ["my_new_token1", "my_new_token3"]
# tokens 去重
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

print(f"去重后的Tokens:\n{new_tokens}")
# 添加tokens
num_added_tokens = tokenizer.add_tokens(list(new_tokens))
print(f"添加的tokens数量：{num_added_tokens}")
# 添加特殊的tokens
# 1. 通过add_special_tokens添加
num_added_tokens = tokenizer.add_special_tokens({"additional_special_tokens": ["[ENT_START]", "[ENT_END]"]})
print(f"添加的tokens数量：{num_added_tokens}")
print(f"添加后的特殊tokens：{tokenizer.special_tokens_map}")

# 2. 通过add_tokens添加
special_tokens = ["[ENT_START]", "[ENT_END]", "[NEW_TOKEN]"]
special_tokens = set(special_tokens) - set(tokenizer.vocab.keys())
print(f"去重后的Tokens:\n{special_tokens}")
num_added_tokens = tokenizer.add_tokens(list(special_tokens), special_tokens=True)
print(f"添加的tokens数量：{num_added_tokens}")
print(f"添加后的特殊tokens：{tokenizer.special_tokens_map}")
print(f"vocab:\n{tokenizer.vocab['[NEW_TOKEN]']}")

print(f"After add and vocabulary size: {len(tokenizer)}")



print()

# 得到了想要的结果
sentence = "Two [ENT_START] cars [ENT_END] collided in a [ENT_START] tunnel [ENT_END] this morning."

print(f"Tokens:\n{tokenizer.tokenize(sentence)}")




All Tokens:
特殊的Tokens:
{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
Before add vocabulary size: 30522
添加的tokens数量：2
去重后的Tokens:
{'my_new_token3'}
添加的tokens数量：1
添加的tokens数量：2
添加后的特殊tokens：{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[ENT_START]', '[ENT_END]']}
去重后的Tokens:
{'[NEW_TOKEN]'}
添加的tokens数量：1
添加后的特殊tokens：{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[ENT_START]', '[ENT_END]']}
vocab:
30527
After add and vocabulary size: 30528

Tokens:
['two', '[ENT_START]', 'cars', '[ENT_END]', 'collided', 'in', 'a', '[ENT_START]', 'tunnel', '[ENT_END]', 'this', 'morning', '.']


# *调整embedding矩阵*
> ****注意：****向词表中添加新 token 后，必须重置模型 embedding 矩阵的大小，也就是向矩阵中添加新 token 对应的 embedding，这样模型才可以正常工作，将 token 映射到对应的 embedding

In [6]:
# 添加tokens并embedding矩阵
from transformers import AutoTokenizer, AutoModel

checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

print(f"vocabulary size: {len(tokenizer)}")

# 添加tokens
num_added_tokens = tokenizer.add_tokens(["[ENT_START]", "[ENT_END]"], special_tokens=True)

print(f"添加的tokens数量：{num_added_tokens}")
print(f"vocabulary size: {len(tokenizer)}")

print(f"Before resize_token_embeddings:\n {model.embeddings.word_embeddings.weight.size()}")

# 调整模型embedding 矩阵
"""
默认：mean_resizing=True
为了使得新嵌入的分布与原有词汇的分布相匹配，通常会计算旧词嵌入（原始词汇表中的词）的均值和协方差，
并用这些统计量来初始化新词的嵌入。这样可以确保新词嵌入的初始值与已有词汇的嵌入空间在统计特性上保持一致，
从而帮助模型更好地学习

mean_resizing=False
新词嵌入将不会基于旧词嵌入的统计信息进行调整，而是采用其他初始化策略（例如随机初始化）来生成新词的嵌入
"""
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

print(f"After resize_token_embeddings:\n{model.embeddings.word_embeddings.weight.size()}")

# 随机生成矩阵 Randomly generated matrix
# 新 token 会添加在词表的末尾，因此只需打印出最后两行
# 如果mean_resizing=False,你多次运行上面的代码，就会发现每次打印出的和 的 embedding 是不同的。
print(model.embeddings.word_embeddings.weight[-2:,:])




vocabulary size: 30522
添加的tokens数量：2
vocabulary size: 30524
Before resize_token_embeddings:
 torch.Size([30522, 768])
After resize_token_embeddings:
torch.Size([30524, 768])
tensor([[-0.0007,  0.0133, -0.0028,  ..., -0.0118, -0.0090,  0.0044],
        [ 0.0213,  0.0009,  0.0035,  ...,  0.0199, -0.0079,  0.0064]],
       grad_fn=<SliceBackward0>)


# *Token embedding初始化*

In [3]:
from transformers import AutoTokenizer, AutoModel

checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# 添加tokens
num_added_tokens = tokenizer.add_tokens(["[ENT_START]", "[ENT_END]"], special_tokens=True)

model.resize_token_embeddings(len(tokenizer))

print(model.embeddings.word_embeddings.weight[-2:,:])



tensor([[-0.0221, -0.0379, -0.0354,  ..., -0.0287, -0.0341, -0.0267],
        [-0.0221, -0.0379, -0.0354,  ..., -0.0287, -0.0341, -0.0267]],
       grad_fn=<SliceBackward0>)


In [4]:
# 直接复制
import torch

with torch.no_grad():
  print(model.config.hidden_size)
  model.embeddings.word_embeddings.weight[-2:,:] = torch.zeros([2, model.config.hidden_size], requires_grad=True)

print(model.embeddings.word_embeddings.weight[-2:,:])

768
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SliceBackward0>)


In [9]:
# 常用的做法是采用已有的token的embedding来初始化token

import torch

token_id = tokenizer.convert_tokens_to_ids("entity")
# 取出对应token ids 对应的矩阵
token_embedding = model.embeddings.word_embeddings.weight[token_id]

print(f"Token ID:\n{token_id}\n")

# print(f"Token embedding:\n{token_embedding}\n")


with torch.no_grad():
  for i in range(1, num_added_tokens + 1):
    model.embeddings.word_embeddings.weight[-i:,:] = token_embedding.clone().detach().requires_grad_(True)

print(model.embeddings.word_embeddings.weight[-2:,:])

Token ID:
9178

tensor([[-0.0039, -0.0131, -0.0946,  ..., -0.0223,  0.0107, -0.0419],
        [-0.0039, -0.0131, -0.0946,  ..., -0.0223,  0.0107, -0.0419]],
       grad_fn=<SliceBackward0>)


In [10]:
# 初始化为已有的token值
"""
更为高级的做法是根据新添加 token 的语义来进行初始化
可以分别为和编写对应的描述，然后再对它们的值进行初始化
"""

descriptions = ["start of entity", "end of entity"]

with torch.no_grad():
  for i, description in enumerate(reversed(descriptions), start=1):
    # 分词
    tokenized = tokenizer.tokenize(description)
    print(tokenized)

    tokenized_ids = tokenizer.convert_tokens_to_ids(tokenized)

    new_embedding = model.embeddings.word_embeddings.weight[tokenized_ids].mean(axis=0)

    # print(new_embedding)

    model.embeddings.word_embeddings.weight[-i,:] = new_embedding.clone().detach().requires_grad_(True)

print(model.embeddings.word_embeddings.weight[-2:,:])

['end', 'of', 'entity']
['start', 'of', 'entity']
tensor([[-0.0340, -0.0144, -0.0441,  ..., -0.0016,  0.0318, -0.0151],
        [-0.0060, -0.0202, -0.0312,  ..., -0.0084,  0.0193, -0.0296]],
       grad_fn=<SliceBackward0>)
