## Chinese BERT with Whole Word Masking
For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**. 

**[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**  
Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu

This repository is developed based on: https://github.com/google-research/bert

You may also be interested in:
- Chinese BERT series: https://github.com/ymcui/Chinese-BERT-wwm
- Chinese MacBERT: https://github.com/ymcui/MacBERT
- Chinese ELECTRA: https://github.com/ymcui/Chinese-ELECTRA
- Chinese XLNet: https://github.com/ymcui/Chinese-XLNet
- Knowledge Distillation Toolkit - TextBrewer: https://github.com/airaria/TextBrewer

More resources by HFL: https://github.com/ymcui/HFL-Anthology

## 示例代码



In [None]:
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


# Build a fill-mask pipeline backed by the chinese-bert-wwm-ext checkpoint,
# pinned to a fixed model revision.
pipeline_ins = pipeline(
    'fill-mask',
    model='dienstag/chinese-bert-wwm-ext',
    model_revision='v1.0.0',
)

# Sentence containing a single [MASK] placeholder for the model to fill in.
masked_sentence = '巴黎是[MASK]国的首都。'
fill_result = pipeline_ins(masked_sentence)
print(fill_result)



## Citation
If you find the technical reports or resources useful, please cite the following technical reports in your paper.
- Primary: https://arxiv.org/abs/2004.13922
```
@inproceedings{cui-etal-2020-revisiting,
    title = "Revisiting Pre-Trained Models for {C}hinese Natural Language Processing",
    author = "Cui, Yiming  and
      Che, Wanxiang  and
      Liu, Ting  and
      Qin, Bing  and
      Wang, Shijin  and
      Hu, Guoping",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2020.findings-emnlp.58",
    pages = "657--668",
}
```
- Secondary: https://arxiv.org/abs/1906.08101  
```
@article{chinese-bert-wwm,
  title={Pre-Training with Whole Word Masking for Chinese BERT},
  author={Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing and Wang, Shijin and Hu, Guoping},
  journal={arXiv preprint arXiv:1906.08101},
  year={2019}
 }
```


In [4]:
from transformers import BertTokenizer

# Load the tokenizer stored alongside the pre-trained BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('/mnt/workspace/model/chinese_bert')

# Raw sentences and their labels, aligned by position.
texts = ['This is an example sentence.', 'Another example sentence.']
labels = ['label11', 'label12']


def _to_sample(sentence, tag):
    """Return a dict pairing the token ids of *sentence* with *tag*."""
    word_pieces = tokenizer.tokenize(sentence)
    piece_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    return {
        # Wrap the raw ids with the tokenizer's special tokens.
        'input_ids': tokenizer.build_inputs_with_special_tokens(piece_ids),
        'label': tag,
    }


# One labelled sample per (sentence, label) pair.
labeled_data = [_to_sample(sentence, tag) for sentence, tag in zip(texts, labels)]

# Show every labelled sample.
for entry in labeled_data:
    print(entry)

NameError: name 'BertForSequenceClassification' is not defined

In [None]:

# Raw sentences and their labels, aligned by position.
# NOTE(review): 'hite' in the first sentence looks like a typo for 'hate',
# which would contradict its 'positive' label — confirm before reusing this data.
texts = ['I hite this movie!', 'This book is amazing!', 'The weather is nice today.']
labels = ['positive', 'positive', 'neutral']

# Collects one dict per sentence: token ids plus the label.
labeled_data = []

for sentence, tag in zip(texts, labels):
    # Split into tokens, then map the tokens to vocabulary ids.
    piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
    labeled_data.append({
        # Wrap the raw ids with the tokenizer's special tokens.
        'input_ids': tokenizer.build_inputs_with_special_tokens(piece_ids),
        'label': tag,
    })

# Show every labelled sample.
for entry in labeled_data:
    print(entry)

{'input_ids': [101, 151, 10295, 8154, 8554, 11099, 106, 102], 'label': 'positive'}
{'input_ids': [101, 8554, 9106, 8310, 8413, 8139, 10112, 8291, 106, 102], 'label': 'positive'}
{'input_ids': [101, 8174, 8997, 12290, 8310, 10192, 11262, 119, 102], 'label': 'neutral'}


In [None]:
# Sentence to encode.
text = "Hello, how are you?"

# Options forwarded to tokenizer.encode_plus.
encode_options = {
    "add_special_tokens": True,      # add the special tokens
    "max_length": 512,               # maximum sequence length
    "padding": "max_length",         # pad the sequence up to max_length
    "truncation": True,              # truncate sequences that are too long
    "return_attention_mask": True,   # include the attention mask
    "return_token_type_ids": True,   # include the segment (token type) ids
}
encoding = tokenizer.encode_plus(text, **encode_options)

# Pull the three fields out of the encoding in one step.
input_ids, attention_mask, token_type_ids = (
    encoding[field] for field in ("input_ids", "attention_mask", "token_type_ids")
)

print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)
print("Token Type IDs:", token_type_ids)

Input IDs: [101, 8701, 117, 9510, 8995, 8357, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
import torch
import os
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, AutoModel

# Locate the checkpoint directory, then load tokenizer, config and model from it.
MODEL_DIR = os.path.abspath("/")
# Join relative components only: os.path.join discards everything before an
# absolute component, so passing '/mnt/...' would silently ignore MODEL_DIR.
model_dir = os.path.join(MODEL_DIR, 'mnt', 'workspace', 'model', 'chinese_L-12_H-768_A-12')

# Load all three artifacts from the same directory so they cannot drift apart.
tokenizer = BertTokenizer.from_pretrained(model_dir)
bert_config = BertConfig.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir, config=bert_config)
# NOTE(review): presumably this checkpoint is a plain pre-trained BERT without a
# fine-tuned classification head; if so, the predictions below are not
# meaningful until the model is fine-tuned — confirm.
model.eval()  # inference only

# Input sentences and their reference labels, aligned by position.
texts = ["I love this movie", "This is a great book", "I hate this product", "The weather is nice today"]
labels = [1, 1, 0, 1]

# Encode each sentence and classify it.
for text, label in zip(texts, labels):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,  # add the special tokens
        max_length=512,  # maximum sequence length
        padding="max_length",  # pad the sequence up to max_length
        truncation=True,  # truncate sequences that are too long
        return_attention_mask=True,  # include the attention mask
        return_token_type_ids=True,  # include the segment (token type) ids
    )

    # Wrap each list in an outer list to form a batch of one.
    input_ids = torch.tensor([encoding["input_ids"]])
    attention_mask = torch.tensor([encoding["attention_mask"]])
    token_type_ids = torch.tensor([encoding["token_type_ids"]])

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()

    print("Text:", text)
    print("True Label:", label)
    print("Predicted Label:", predicted_label)
    print("Probabilities:", probabilities)
    print()

transformers-cli convert --model_type bert --tf_checkpoint ./chinese_L-12_H-768_A-12/bert_model.ckpt --config ./chinese_L-12_H-768_A-12/config.json --pytorch_dump_output ./chinese_L-12_H-768_A-12/pytorch_model.bin

转换

wget https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip

In [34]:
# python
import torch

# Tensor created with requires_grad=True so a gradient can be read back from it.
x = torch.tensor([2.0], requires_grad=True)


def f(x):
    """Evaluate the polynomial 2*x**2 + 3*x - 4 at *x*."""
    quadratic_term = 2 * x**2
    linear_term = 3 * x
    return quadratic_term + linear_term - 4


# Forward pass: evaluate f at x.
y = f(x)

# Backward pass: fills in x.grad.
y.backward()

# Read the gradient back from the input tensor.
gradient = x.grad

print(gradient)

tensor([11.])
