# 先导入 Torch 版本的模型

In [1]:
# Add the root directory of the ChineseBert source tree to python sys.path
import sys
sys.path.append('/data1/workspace/research/ChineseBERT-Paddle/ChineseBert')

In [2]:
# Add the cloned paddlenlp source tree to python sys.path
import sys
sys.path.append('/data1/workspace/research/ChineseBERT-Paddle/Paddle_ChineseBert/PaddleNLP')

In [3]:
# ChineseBert Model
from datasets.bert_dataset import BertDataset
from models.modeling_glycebert import GlyceBertModel
# The same directory is handed to the tokenizer wrapper and to from_pretrained.
CHINESEBERT_PATH='./pretrain_models/torch/ChineseBERT-base/'
tokenizer = BertDataset(CHINESEBERT_PATH)
chinese_bert = GlyceBertModel.from_pretrained(CHINESEBERT_PATH)

In [4]:
# Keep a handle on the loaded torch model's config object.
config = chinese_bert.config

In [5]:
sentence = '我喜欢猫'
# tokenize_sentence returns two flat tensors: token ids and pinyin ids
# (presumably 8 pinyin ids per token, matching the reshape below).
input_ids, pinyin_ids = tokenizer.tokenize_sentence(sentence)
length = input_ids.shape[0]
# Add a leading batch dimension of size 1 to both inputs.
input_ids = input_ids.view(1, length)
pinyin_ids = pinyin_ids.view(1, length, 8)
# Call the module itself instead of .forward() so that any registered
# forward hooks are executed as well; [0] selects the first model output.
output_hidden = chinese_bert(input_ids, pinyin_ids)[0]

In [6]:
# Show the shape of the first model output for the sentence above.
print(output_hidden.shape)

torch.Size([1, 6, 768])


## 对比 ChineseBert 和 Bert 的网络参数

In [7]:
from transformers import BertModel, AutoModelForMaskedLM
# Parameter/buffer names of the loaded ChineseBert model ...
torch_chinese_bert_keys = chinese_bert.state_dict().keys()
# ... and of the stock "bert-base-chinese" BertModel, for comparison.
torch_bert_keys = BertModel.from_pretrained("bert-base-chinese").state_dict().keys()

In [8]:
# All parameter names of ChineseBert
list(torch_chinese_bert_keys)

['embeddings.position_ids',
 'embeddings.word_embeddings.weight',
 'embeddings.position_embeddings.weight',
 'embeddings.token_type_embeddings.weight',
 'embeddings.pinyin_embeddings.embedding.weight',
 'embeddings.pinyin_embeddings.conv.weight',
 'embeddings.pinyin_embeddings.conv.bias',
 'embeddings.glyph_embeddings.embedding.weight',
 'embeddings.glyph_map.weight',
 'embeddings.glyph_map.bias',
 'embeddings.map_fc.weight',
 'embeddings.map_fc.bias',
 'embeddings.LayerNorm.weight',
 'embeddings.LayerNorm.bias',
 'encoder.layer.0.attention.self.query.weight',
 'encoder.layer.0.attention.self.query.bias',
 'encoder.layer.0.attention.self.key.weight',
 'encoder.layer.0.attention.self.key.bias',
 'encoder.layer.0.attention.self.value.weight',
 'encoder.layer.0.attention.self.value.bias',
 'encoder.layer.0.attention.output.dense.weight',
 'encoder.layer.0.attention.output.dense.bias',
 'encoder.layer.0.attention.output.LayerNorm.weight',
 'encoder.layer.0.attention.output.LayerNorm.bias',

In [9]:
# Parameters that exist only in ChineseBert (all in the embedding layer)
set(torch_chinese_bert_keys).difference(torch_bert_keys)

{'embeddings.glyph_embeddings.embedding.weight',
 'embeddings.glyph_map.bias',
 'embeddings.glyph_map.weight',
 'embeddings.map_fc.bias',
 'embeddings.map_fc.weight',
 'embeddings.pinyin_embeddings.conv.bias',
 'embeddings.pinyin_embeddings.conv.weight',
 'embeddings.pinyin_embeddings.embedding.weight'}

In [10]:
# Parameters that Bert has but ChineseBert lacks
set(torch_bert_keys).difference(torch_chinese_bert_keys)

set()

## 模型参数 torch 转换 paddle

In [11]:
import paddle
import torch
import numpy as np

# Convert a PyTorch ChineseBERT checkpoint into a Paddle state dict file.
#
# ChineseBERT-base:  ./pretrain_models/torch/ChineseBERT-base/pytorch_model.bin
# ChineseBERT-large: ./pretrain_models/torch/ChineseBERT-large/pytorch_model.bin
torch_model_path = "./pretrain_models/torch/ChineseBERT-large/pytorch_model.bin"
# Every tensor is converted through .cpu().numpy() below, so load directly onto
# the CPU; this also lets a GPU-saved checkpoint load on a CPU-only machine.
torch_state_dict = torch.load(torch_model_path, map_location="cpu")
paddle_model_path = "./pretrain_models/paddle/ChineseBERT-large/model_state.pdparams"
paddle_state_dict = {}

# State_dict's keys mapping: from torch to paddle.
# The entries are applied in insertion order: 'attention.output.*' must be
# rewritten before the shorter 'output.*' patterns get a chance to match.
keys_dict = {
    # about encoder layer
    'encoder.layer': 'encoder.layers',
    'attention.self.query': 'self_attn.q_proj',      # Linear: weight transposed below
    'attention.self.key': 'self_attn.k_proj',        # Linear: weight transposed below
    'attention.self.value': 'self_attn.v_proj',      # Linear: weight transposed below
    'attention.output.dense': 'self_attn.out_proj',  # Linear: weight transposed below
    'attention.output.LayerNorm': 'norm1',           # LayerNorm: renamed only, NOT transposed
    'intermediate.dense': 'linear1',                 # Linear: weight transposed below
    'output.dense': 'linear2',                       # Linear: weight transposed below
    'output.LayerNorm': 'norm2',                     # LayerNorm: renamed only, NOT transposed
}

# Substrings of the (already renamed) key that mark a tensor to be transposed.
# Biases matched by these markers are 1-D, for which numpy's transpose() is a no-op.
transpose_markers = (
    'map_fc',
    'glyph_map',
    'linear',
    'proj',
    'dense.weight',
    'transform.weight',
    'seq_relationship.weight',
)

for torch_key in torch_state_dict:
    # Rename the torch key to the paddle naming scheme.
    paddle_key = torch_key
    for k in keys_dict:
        if k in paddle_key:
            paddle_key = paddle_key.replace(k, keys_dict[k])

    needs_transpose = (
        any(marker in paddle_key for marker in transpose_markers)
        or ('vocab' in paddle_key and 'weight' in paddle_key)
    )
    weight = torch_state_dict[torch_key].cpu().numpy()
    if needs_transpose:
        print("transpose(permute) ---------->")
        weight = weight.transpose()
    paddle_state_dict[paddle_key] = paddle.to_tensor(weight)

    # Log the shape before and after conversion for manual inspection.
    print("t: ", torch_key,"\t", torch_state_dict[torch_key].shape)
    print("p: ", paddle_key, "\t", paddle_state_dict[paddle_key].shape, "\n")

paddle.save(paddle_state_dict, paddle_model_path)

t:  bert.embeddings.position_ids 	 torch.Size([1, 512])
p:  bert.embeddings.position_ids 	 [1, 512] 

t:  bert.embeddings.word_embeddings.weight 	 torch.Size([23236, 1024])
p:  bert.embeddings.word_embeddings.weight 	 [23236, 1024] 

t:  bert.embeddings.position_embeddings.weight 	 torch.Size([512, 1024])
p:  bert.embeddings.position_embeddings.weight 	 [512, 1024] 

t:  bert.embeddings.token_type_embeddings.weight 	 torch.Size([2, 1024])
p:  bert.embeddings.token_type_embeddings.weight 	 [2, 1024] 

t:  bert.embeddings.pinyin_embeddings.embedding.weight 	 torch.Size([32, 128])
p:  bert.embeddings.pinyin_embeddings.embedding.weight 	 [32, 128] 

t:  bert.embeddings.pinyin_embeddings.conv.weight 	 torch.Size([1024, 128, 2])
p:  bert.embeddings.pinyin_embeddings.conv.weight 	 [1024, 128, 2] 

t:  bert.embeddings.pinyin_embeddings.conv.bias 	 torch.Size([1024])
p:  bert.embeddings.pinyin_embeddings.conv.bias 	 [1024] 

t:  bert.embeddings.glyph_embeddings.embedding.weight 	 torch.Size([23

# 使用 Paddle 转写 forward
* 我们按照源码的组成部分分批转写

## Tokenizer

### 对齐 Bert tokenizer 

In [12]:
# =========================================
# Align the paddle and torch bert tokenizers
# =========================================
from tokenizers import BertWordPieceTokenizer
from paddlenlp.transformers import BertTokenizer

# Both tokenizers are built from the same vocabulary file.
torch_token = BertWordPieceTokenizer('./pretrain_models/torch/ChineseBERT-base/vocab.txt')
berttokenizer = BertTokenizer('./pretrain_models/torch/ChineseBERT-base/vocab.txt')

sentence="我喜欢猫"

# torch: ids, tokens and offsets all come from a single encode() call
print("============ PyTorch ==================")
bert_tokens = torch_token.encode(sentence)
print(bert_tokens.ids)
print(bert_tokens.tokens)
print(bert_tokens.offsets)

print("============= Paddle =================")

# paddle: the three views are obtained from three separate calls
# ids
p_bert_tokens = berttokenizer.encode(sentence)
p_bert_tokens_ids = p_bert_tokens['input_ids']
print(p_bert_tokens_ids)

# tokens: add [CLS]/[SEP] by hand to line up with the torch output
p_bert_tokens_tokens = berttokenizer.tokenize(sentence)
p_bert_tokens_tokens.insert(0, '[CLS]')
p_bert_tokens_tokens.append('[SEP]')
print(p_bert_tokens_tokens)

# offsets: add (0, 0) for the two special tokens by hand
p_bert_tokens_offsets = berttokenizer.get_offset_mapping(sentence)
p_bert_tokens_offsets.insert(0, (0, 0))
p_bert_tokens_offsets.append((0, 0))
print(p_bert_tokens_offsets)

[101, 2769, 1599, 3614, 4344, 102]
['[CLS]', '我', '喜', '欢', '猫', '[SEP]']
[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (0, 0)]
[101, 2769, 1599, 3614, 4344, 102]
['[CLS]', '我', '喜', '欢', '猫', '[SEP]']
[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (0, 0)]


### 对齐 ChineseBert tokenizer

In [13]:
# =================================================
# PyTorch 的 ChineseBert BertDataset (tokenizer)
# =================================================

import json
import os
from typing import List

import tokenizers
import torch
from pypinyin import pinyin, Style
from tokenizers import BertWordPieceTokenizer


class BertDataset(object):
    """PyTorch-side ChineseBert tokenizer: sentence -> (token ids, pinyin ids)."""

    def __init__(self, bert_path, max_length: int = 512):
        """
        Args:
            bert_path: directory holding 'vocab.txt' and a 'config' sub-directory
            max_length: upper bound on the token count, enforced by an assert
        """
        super().__init__()
        self.max_length = max_length
        self.tokenizer = BertWordPieceTokenizer(os.path.join(bert_path, 'vocab.txt'))

        config_dir = os.path.join(bert_path, 'config')
        # pinyin map dict (its "char2idx" table is used for unknown pinyin strings)
        self.pinyin_dict = self._read_json(config_dir, 'pinyin_map.json')
        # char id map (loaded here, not read by the methods of this class)
        self.id2pinyin = self._read_json(config_dir, 'id2pinyin.json')
        # pinyin string -> ready-made id list
        self.pinyin2tensor = self._read_json(config_dir, 'pinyin2tensor.json')

    @staticmethod
    def _read_json(directory, filename):
        """Parse the UTF-8 JSON file `filename` located in `directory`."""
        with open(os.path.join(directory, filename), encoding='utf8') as fin:
            return json.load(fin)

    def tokenize_sentence(self, sentence):
        """
        Args:
            sentence: raw input string
        Returns:
            (input_ids, pinyin_ids): two 1-D LongTensors; pinyin_ids is the
            flattened list of per-token pinyin id rows
        """
        encoding = self.tokenizer.encode(sentence)
        token_ids = encoding.ids
        pinyin_rows = self.convert_sentence_to_pinyin_ids(sentence, encoding)
        # there must be exactly one pinyin row per token, within the length limit
        assert len(token_ids) <= self.max_length
        assert len(token_ids) == len(pinyin_rows)
        return torch.LongTensor(token_ids), torch.LongTensor(pinyin_rows).view(-1)

    def convert_sentence_to_pinyin_ids(self, sentence: str, tokenizer_output: tokenizers.Encoding) -> List[List[int]]:
        """
        Build one pinyin id row per token of `tokenizer_output`.

        Tokens that do not cover exactly one character, or whose character has
        no pinyin, get a row of eight zeros.
        """
        # one entry per character; non-Chinese characters yield 'not chinese'
        per_char = pinyin(sentence, style=Style.TONE3, heteronym=True, errors=lambda x: [['not chinese'] for _ in x])

        # character position -> pinyin id row
        pinyin_locs = {}
        for position, candidates in enumerate(per_char):
            reading = candidates[0]
            if reading == "not chinese":
                continue
            if reading in self.pinyin2tensor:
                pinyin_locs[position] = self.pinyin2tensor[reading]
                continue
            # unknown pinyin string: encode letter by letter, all zeros on any unknown letter
            encoded = [0] * 8
            for slot, letter in enumerate(reading):
                char2idx = self.pinyin_dict["char2idx"]
                if letter in char2idx:
                    encoded[slot] = char2idx[letter]
                else:
                    encoded = [0] * 8
                    break
            pinyin_locs[position] = encoded

        # walk the tokens and pick the row of the single character each one covers
        pinyin_ids = []
        for _token, (start, end) in zip(tokenizer_output.tokens, tokenizer_output.offsets):
            if end - start == 1 and start in pinyin_locs:
                pinyin_ids.append(pinyin_locs[start])
            else:
                pinyin_ids.append([0] * 8)

        return pinyin_ids

In [14]:
# =================================================
# Paddle 的 ChineseBert BertDataset (tokenizer)
# =================================================
import json
import os
from typing import List
from pypinyin import pinyin, Style

import paddle
from paddlenlp.transformers import BertTokenizer

class PaddleBertDataset(object):
    """Paddle-side ChineseBert tokenizer: sentence -> (token ids, pinyin ids)."""

    def __init__(self, bert_path, max_length: int = 512):
        """
        Args:
            bert_path: directory holding 'vocab.txt' and a 'config' sub-directory
            max_length: upper bound on the token count, enforced by an assert
        """
        super().__init__()
        self.max_length = max_length
        self.tokenizer = BertTokenizer(os.path.join(bert_path, 'vocab.txt'))

        config_dir = os.path.join(bert_path, 'config')
        # pinyin map dict (its "char2idx" table is used for unknown pinyin strings)
        self.pinyin_dict = self._read_json(config_dir, 'pinyin_map.json')
        # char id map (loaded here, not read by the methods of this class)
        self.id2pinyin = self._read_json(config_dir, 'id2pinyin.json')
        # pinyin string -> ready-made id list
        self.pinyin2tensor = self._read_json(config_dir, 'pinyin2tensor.json')

    @staticmethod
    def _read_json(directory, filename):
        """Parse the UTF-8 JSON file `filename` located in `directory`."""
        with open(os.path.join(directory, filename), encoding='utf8') as fin:
            return json.load(fin)

    def tokenize_sentence(self, sentence):
        """
        Args:
            sentence: raw input string
        Returns:
            (input_ids, pinyin_ids): two 1-D paddle tensors; pinyin_ids is the
            flattened list of per-token pinyin id rows
        """
        encoded = self.tokenizer.encode(sentence)
        token_ids = encoded['input_ids']
        pinyin_rows = self.convert_sentence_to_pinyin_ids(sentence)
        # there must be exactly one pinyin row per token, within the length limit
        assert len(token_ids) <= self.max_length
        assert len(token_ids) == len(pinyin_rows)

        return paddle.to_tensor(token_ids), paddle.to_tensor(pinyin_rows).reshape([-1])

    def convert_sentence_to_pinyin_ids(self, sentence: str) -> List[List[int]]:
        """
        Build one pinyin id row per token of `sentence`, special tokens included.

        Tokens that do not cover exactly one character, or whose character has
        no pinyin, get a row of eight zeros.
        """
        # offsets, padded by hand with (0, 0) for [CLS] and [SEP]
        offsets = self.tokenizer.get_offset_mapping(sentence)
        offsets.insert(0, (0, 0))
        offsets.append((0, 0))

        # tokens, padded by hand with the two special tokens
        tokens = self.tokenizer.tokenize(sentence)
        tokens.insert(0, '[CLS]')
        tokens.append('[SEP]')

        # one entry per character; non-Chinese characters yield 'not chinese'
        per_char = pinyin(sentence, style=Style.TONE3, heteronym=True, errors=lambda x: [['not chinese'] for _ in x])

        # character position -> pinyin id row
        pinyin_locs = {}
        for position, candidates in enumerate(per_char):
            reading = candidates[0]
            if reading == "not chinese":
                continue
            if reading in self.pinyin2tensor:
                pinyin_locs[position] = self.pinyin2tensor[reading]
                continue
            # unknown pinyin string: encode letter by letter, all zeros on any unknown letter
            row = [0] * 8
            for slot, letter in enumerate(reading):
                char2idx = self.pinyin_dict["char2idx"]
                if letter in char2idx:
                    row[slot] = char2idx[letter]
                else:
                    row = [0] * 8
                    break
            pinyin_locs[position] = row

        # walk the tokens and pick the row of the single character each one covers
        pinyin_ids = []
        for _token, (start, end) in zip(tokens, offsets):
            if end - start == 1 and start in pinyin_locs:
                pinyin_ids.append(pinyin_locs[start])
            else:
                pinyin_ids.append([0] * 8)

        return pinyin_ids

In [15]:
# torch: tokenize with the PyTorch BertDataset and print both id tensors
sentence = '我喜欢猫'
tokenizer = BertDataset(CHINESEBERT_PATH)
input_ids, pinyin_ids = tokenizer.tokenize_sentence(sentence)
print("============================== torch =============================")
print(input_ids.cpu().detach().numpy(), pinyin_ids.cpu().detach().numpy())
print()

# paddle: same sentence and same model directory through PaddleBertDataset;
# the printed arrays are compared by eye against the torch ones above
sentence = '我喜欢猫'
paddle_tokenizer = PaddleBertDataset(CHINESEBERT_PATH)
paddle_input_ids, paddle_pinyin_ids = paddle_tokenizer.tokenize_sentence(sentence)
print("============================== paddle =============================")
print(paddle_input_ids.cpu().detach().numpy(), paddle_pinyin_ids.cpu().detach().numpy())

[ 101 2769 1599 3614 4344  102] [ 0  0  0  0  0  0  0  0 28 20  3  0  0  0  0  0 29 14  3  0  0  0  0  0
 13 26  6 19  1  0  0  0 18  6 20  1  0  0  0  0  0  0  0  0  0  0  0  0]



W0912 23:18:46.616715 23468 device_context.cc:404] Please NOTE: device: 0, GPU Compute Capability: 6.0, Driver API Version: 11.0, Runtime API Version: 10.1
W0912 23:18:46.620927 23468 device_context.cc:422] device: 0, cuDNN Version: 8.0.


[ 101 2769 1599 3614 4344  102] [ 0  0  0  0  0  0  0  0 28 20  3  0  0  0  0  0 29 14  3  0  0  0  0  0
 13 26  6 19  1  0  0  0 18  6 20  1  0  0  0  0  0  0  0  0  0  0  0  0]


## PinyinEmbedding

In [16]:
# Torch PinyinEmbedding

import json
import os

from torch import nn
from torch.nn import functional as F


class PinyinEmbedding(nn.Module):
    """Embed per-token pinyin id sequences with an embedding + 1-D conv + max-pool."""

    def __init__(self, embedding_size: int, pinyin_out_dim: int, config_path):
        """
            Pinyin Embedding Module
        Args:
            embedding_size: the size of each embedding vector
            pinyin_out_dim: kernel number of conv
            config_path: directory that contains 'pinyin_map.json'
        """
        super(PinyinEmbedding, self).__init__()
        # Read the map as UTF-8 explicitly (as the tokenizer classes in this file do)
        # so that the result does not depend on the platform's default encoding.
        with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
            pinyin_dict = json.load(fin)
        self.pinyin_out_dim = pinyin_out_dim
        # one embedding row per entry of the 'idx2char' table
        self.embedding = nn.Embedding(len(pinyin_dict['idx2char']), embedding_size)
        self.conv = nn.Conv1d(in_channels=embedding_size,
                              out_channels=self.pinyin_out_dim,
                              kernel_size=2,
                              stride=1,
                              padding=0)

    def forward(self, pinyin_ids):
        """
        Args:
            pinyin_ids: (bs*sentence_length*pinyin_locs)

        Returns:
            pinyin_embed: (bs,sentence_length,pinyin_out_dim)
        """
        # input pinyin ids for 1-D conv
        embed = self.embedding(pinyin_ids)  # [bs,sentence_length,pinyin_locs,embed_size]
        bs, sentence_length, pinyin_locs, embed_size = embed.shape
        # fold batch and sentence dimensions together so every token is one conv sample
        view_embed = embed.view(-1, pinyin_locs, embed_size)  # [(bs*sentence_length),pinyin_locs,embed_size]
        # Conv1d expects channels before length
        input_embed = view_embed.permute(0, 2, 1)  # [(bs*sentence_length), embed_size, pinyin_locs]
        # conv + max_pooling over the whole remaining length
        pinyin_conv = self.conv(input_embed)  # [(bs*sentence_length),pinyin_out_dim,H]
        pinyin_embed = F.max_pool1d(pinyin_conv, pinyin_conv.shape[-1])  # [(bs*sentence_length),pinyin_out_dim,1]
        return pinyin_embed.view(bs, sentence_length, self.pinyin_out_dim)  # [bs,sentence_length,pinyin_out_dim]


In [17]:
# Paddle PinyinEmbedding

import json
import os
import paddle


class PaddlePinyinEmbedding(paddle.nn.Layer):
    """Paddle port of the pinyin embedding: embedding + 1-D conv + max-pool."""

    def __init__(self, embedding_size: int, pinyin_out_dim: int, config_path):
        """
            Pinyin Embedding Module
        Args:
            embedding_size: the size of each embedding vector
            pinyin_out_dim: kernel number of conv
            config_path: directory that contains 'pinyin_map.json'
        """
        super(PaddlePinyinEmbedding, self).__init__()
        # Read the map as UTF-8 explicitly (as the tokenizer classes in this file do)
        # so that the result does not depend on the platform's default encoding.
        with open(os.path.join(config_path, 'pinyin_map.json'), encoding='utf8') as fin:
            pinyin_dict = json.load(fin)
        self.pinyin_out_dim = pinyin_out_dim
        # one embedding row per entry of the 'idx2char' table
        self.embedding = paddle.nn.Embedding(len(pinyin_dict['idx2char']), embedding_size)
        self.conv = paddle.nn.Conv1D(in_channels=embedding_size,
                                     out_channels=self.pinyin_out_dim,
                                     kernel_size=2,
                                     stride=1,
                                     padding=0,
                                     bias_attr=True)

    def forward(self, pinyin_ids):
        """
        Args:
            pinyin_ids: (bs*sentence_length*pinyin_locs)

        Returns:
            pinyin_embed: (bs,sentence_length,pinyin_out_dim)
        """
        # input pinyin ids for 1-D conv
        embed = self.embedding(pinyin_ids)  # [bs,sentence_length,pinyin_locs,embed_size]
        bs, sentence_length, pinyin_locs, embed_size = embed.shape
        # fold batch and sentence dimensions together so every token is one conv sample
        view_embed = embed.reshape((-1, pinyin_locs, embed_size))  # [(bs*sentence_length),pinyin_locs,embed_size]
        # the conv layer uses the NCL layout: channels before length
        input_embed = view_embed.transpose([0, 2, 1])  # [(bs*sentence_length), embed_size, pinyin_locs]
        # conv + max_pooling over the whole remaining length
        pinyin_conv = self.conv(input_embed)  # [(bs*sentence_length),pinyin_out_dim,H]
        pinyin_embed = paddle.nn.functional.max_pool1d(pinyin_conv, pinyin_conv.shape[-1])  # [(bs*sentence_length),pinyin_out_dim,1]
        return pinyin_embed.reshape((bs, sentence_length, self.pinyin_out_dim))  # [bs,sentence_length,pinyin_out_dim]


In [18]:
# torch
print("============================== torch =============================")
sentence = '我喜欢猫'
tokenizer = BertDataset(CHINESEBERT_PATH)
input_ids, pinyin_ids = tokenizer.tokenize_sentence(sentence)
length = input_ids.shape[0]
print(f"length: {length}")
print("torch size:", input_ids.size())
print(pinyin_ids)
# Reshape the flat pinyin ids to (batch=1, tokens, 8 pinyin slots per token)
pinyin_ids = pinyin_ids.view(1, length, 8)
print(pinyin_ids, pinyin_ids.shape)


hidden_size = 768
config_path='./pretrain_models/torch/ChineseBERT-base/config/'
# NOTE: the layer is freshly constructed here; no pretrained weights are loaded in this cell
pinyin_embeddings = PinyinEmbedding(embedding_size=128, 
                                    pinyin_out_dim=hidden_size,
                                    config_path=config_path)

torch_pinyin_emb = pinyin_embeddings(pinyin_ids)
print(">>>>torch_pinyin_emb<<<<")
print(torch_pinyin_emb, torch_pinyin_emb.shape)
print()

# paddle
sentence = '我喜欢猫'
paddle_tokenizer = PaddleBertDataset(CHINESEBERT_PATH)
paddle_input_ids, paddle_pinyin_ids = paddle_tokenizer.tokenize_sentence(sentence)
print("============================== paddle =============================")
length = paddle_input_ids.shape[0]
print(f"length: {length}")
print("paddle size(shape)", paddle_input_ids.shape)
print(paddle_pinyin_ids)
# Same reshape as on the torch side: (batch=1, tokens, 8 pinyin slots per token)
paddle_pinyin_ids = paddle_pinyin_ids.reshape((1, length, 8))
print(paddle_pinyin_ids, paddle_pinyin_ids.shape)

hidden_size = 768
config_path='./pretrain_models/torch/ChineseBERT-base/config/'
# NOTE: also freshly constructed; no weights are copied over from the torch layer here
paddle_pinyin_embeddings = PaddlePinyinEmbedding(embedding_size=128, 
                                                 pinyin_out_dim=hidden_size,
                                                 config_path=config_path)
paddle_pinyin_emb = paddle_pinyin_embeddings(paddle_pinyin_ids)
print(">>>>paddle_pinyin_emb<<<<")
print(paddle_pinyin_emb, paddle_pinyin_emb.shape)

length: 6
torch size: torch.Size([6])
tensor([ 0,  0,  0,  0,  0,  0,  0,  0, 28, 20,  3,  0,  0,  0,  0,  0, 29, 14,
         3,  0,  0,  0,  0,  0, 13, 26,  6, 19,  1,  0,  0,  0, 18,  6, 20,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
tensor([[[ 0,  0,  0,  0,  0,  0,  0,  0],
         [28, 20,  3,  0,  0,  0,  0,  0],
         [29, 14,  3,  0,  0,  0,  0,  0],
         [13, 26,  6, 19,  1,  0,  0,  0],
         [18,  6, 20,  1,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0,  0,  0,  0]]]) torch.Size([1, 6, 8])
>>>>torch_pinyin_emb<<<<
tensor([[[-0.6347, -0.2307, -0.9765,  ..., -0.5059,  0.1007,  0.6182],
         [-0.4700,  1.3561, -0.0175,  ...,  0.9705,  0.7019,  0.6182],
         [ 0.1531, -0.0253, -0.1894,  ...,  0.5447,  0.7327,  0.6182],
         [ 0.7492,  1.5474,  0.8866,  ...,  0.4418,  0.6674,  0.8616],
         [ 0.0071,  0.6976,  0.9401,  ...,  0.8413,  0.3559,  0.8616],
         [-0.6347, -0.2307, -0.9765,  ..., -0.5059,  0.1007,  0.6182]]],
       gra

In [19]:
# Print both layer structures side by side to compare their sub-layers.
print(f"torch pinyin_embeddings: {pinyin_embeddings}")
print(f"paddle paddle_pinyin_embeddings: {paddle_pinyin_embeddings}")

torch pinyin_embeddings: PinyinEmbedding(
  (embedding): Embedding(32, 128)
  (conv): Conv1d(128, 768, kernel_size=(2,), stride=(1,))
)
paddle paddle_pinyin_embeddings: PaddlePinyinEmbedding(
  (embedding): Embedding(32, 128, sparse=False)
  (conv): Conv1D(128, 768, kernel_size=[2], data_format=NCL)
)


## GlyphEmbedding

In [20]:
# torch
from typing import List

import numpy as np
import torch
from torch import nn
class GlyphEmbedding(nn.Module):
    """Embedding table whose rows are the flattened glyph images of each vocabulary entry."""

    def __init__(self, font_npy_files: List[str]):
        """
        Args:
            font_npy_files: paths of .npy font arrays; the first axis of each
                array indexes the vocabulary, the last axis gives the font size
        """
        super(GlyphEmbedding, self).__init__()
        fonts = []
        for npy_path in font_npy_files:
            fonts.append(np.load(npy_path).astype(np.float32))

        reference = fonts[0]
        self.vocab_size = reference.shape[0]
        self.font_num = len(fonts)
        self.font_size = reference.shape[-1]

        # stack the fonts along a new axis 1, then flatten everything per vocabulary entry
        stacked = np.stack(fonts, axis=1)
        flat_weight = torch.from_numpy(stacked.reshape([self.vocab_size, -1]))
        flat_dim = self.font_size ** 2 * self.font_num
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=flat_dim,
            _weight=flat_weight,
        )

    def forward(self, input_ids):
        """
            look up the flattened glyph images for a batch of token ids
        Args:
            input_ids: [batch, sentence_length]
        Returns:
            images: [batch, sentence_length, self.font_num*self.font_size*self.font_size]
        """
        images = self.embedding(input_ids)
        return images

In [21]:
# paddle
from typing import List

import numpy as np
import paddle

class PaddleGlyphEmbedding(paddle.nn.Layer):
    """Paddle embedding table whose rows are the flattened glyph images of each vocabulary entry."""

    def __init__(self, font_npy_files: List[str]):
        """
        Args:
            font_npy_files: paths of .npy font arrays; the first axis of each
                array indexes the vocabulary, the last axis gives the font size
        """
        super(PaddleGlyphEmbedding, self).__init__()
        fonts = []
        for npy_path in font_npy_files:
            fonts.append(np.load(npy_path).astype(np.float32))

        reference = fonts[0]
        self.vocab_size = reference.shape[0]
        self.font_num = len(fonts)
        self.font_size = reference.shape[-1]

        # stack the fonts along a new axis 1, then flatten everything per vocabulary entry
        stacked = np.stack(fonts, axis=1)
        flat_dim = self.font_size ** 2 * self.font_num
        self.embedding = paddle.nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=flat_dim,
        )
        # overwrite the freshly initialised table with the glyph pixels
        self.embedding.weight.set_value(stacked.reshape([self.vocab_size, -1]))

    def forward(self, input_ids):
        """
            look up the flattened glyph images for a batch of token ids
        Args:
            input_ids: [batch, sentence_length]
        Returns:
            images: [batch, sentence_length, self.font_num*self.font_size*self.font_size]
        """
        images = self.embedding(input_ids)
        return images

In [22]:
config_path='./pretrain_models/torch/ChineseBERT-base/config/'
# Every .npy file in the config directory is a font array, in os.listdir order.
font_files = [
    os.path.join(config_path, file)
    for file in os.listdir(config_path)
    if file.endswith(".npy")
]
print(font_files)

['./pretrain_models/torch/ChineseBERT-base/config/STFANGSO.TTF24.npy', './pretrain_models/torch/ChineseBERT-base/config/STXINGKA.TTF24.npy', './pretrain_models/torch/ChineseBERT-base/config/方正古隶繁体.ttf24.npy']


In [23]:
# torch: build the glyph embedding from the font files and grab its weight table
print("============================== torch =============================")
glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)
t_emb_w = glyph_embeddings.state_dict()['embedding.weight'].cpu().detach().numpy()
print(f"torch glyph_embeddings: {glyph_embeddings}")

# paddle: same font files through the Paddle port
print("============================== paddle =============================")
paddle_glyph_embeddings = PaddleGlyphEmbedding(font_npy_files=font_files)
p_emb_w = paddle_glyph_embeddings.state_dict()['embedding.weight'].cpu().detach().numpy()
print(f"paddle paddle_glyph_embeddings: {paddle_glyph_embeddings}")

# Exact element-wise equality of the two weight tables
(p_emb_w == t_emb_w).all()

torch glyph_embeddings: GlyphEmbedding(
  (embedding): Embedding(23236, 1728)
)
paddle paddle_glyph_embeddings: PaddleGlyphEmbedding(
  (embedding): Embedding(23236, 1728, sparse=False)
)


True

In [24]:
# List the state-dict entry names exposed by the Paddle glyph embedding.
paddle_glyph_embeddings.state_dict().keys()

odict_keys(['embedding.weight'])

## FusionBertEmbeddings

In [25]:
# torch
import os

import torch
from torch import nn
class FusionBertEmbeddings(nn.Module):
    """
    Fuse word, pinyin and glyph embeddings through a linear layer, then add
    position and token_type embeddings, LayerNorm and dropout.
    """

    def __init__(self, config):
        """
        Args:
            config: model config; `config.name_or_path` must be a directory
                with a 'config' sub-directory holding the pinyin map and the
                .npy font arrays
        """
        super(FusionBertEmbeddings, self).__init__()
        config_path = os.path.join(config.name_or_path, 'config')
        font_files = [
            os.path.join(config_path, name)
            for name in os.listdir(config_path)
            if name.endswith(".npy")
        ]

        # Sub-modules are created in a fixed order: it determines the state_dict key order.
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.pinyin_embeddings = PinyinEmbedding(
            embedding_size=128,
            pinyin_out_dim=config.hidden_size,
            config_path=config_path,
        )
        self.glyph_embeddings = GlyphEmbedding(font_npy_files=font_files)

        # glyph_map projects the flattened glyph images (1728 values) to hidden_size;
        # map_fc fuses the concatenated word/pinyin/glyph vectors back to hidden_size.
        self.glyph_map = nn.Linear(1728, config.hidden_size)
        self.map_fc = nn.Linear(config.hidden_size * 3, config.hidden_size)
        # LayerNorm keeps its non-snake-case name to match TensorFlow checkpoint variable names
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids has shape (1, max_position_embeddings) and is saved with the state dict
        all_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", all_positions)

    def forward(self, input_ids=None, pinyin_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        """
        Args:
            input_ids: token ids, [bs, l]; also used for the glyph lookup
            pinyin_ids: pinyin ids passed to the pinyin embedding
            token_type_ids: optional; all zeros when omitted
            position_ids: optional; the first l positions when omitted
            inputs_embeds: optional precomputed word embeddings
        Returns:
            embeddings after fusion, LayerNorm and dropout
        """
        input_shape = input_ids.size() if input_ids is not None else inputs_embeds.size()[:-1]
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # char, pinyin and glyph views of every token, each [bs,l,hidden_size]
        pinyin_part = self.pinyin_embeddings(pinyin_ids)
        glyph_part = self.glyph_map(self.glyph_embeddings(input_ids))
        # fusion layer: concatenate on the feature axis and project back to hidden_size
        fused = self.map_fc(torch.cat((inputs_embeds, pinyin_part, glyph_part), 2))

        position_part = self.position_embeddings(position_ids)
        type_part = self.token_type_embeddings(token_type_ids)

        summed = fused + position_part + type_part
        return self.dropout(self.LayerNorm(summed))

In [26]:
# paddle
import os

import paddle
class PaddleFusionBertEmbeddings(paddle.nn.Layer):
    """
    Paddle version of the fused embedding layer.

    Combines word, pinyin and glyph embeddings through a linear fusion layer,
    then adds position and token-type embeddings before LayerNorm and dropout.
    """

    def __init__(self, config):
        """Create the sublayers from ``config``.

        The glyph font arrays are every ``*.npy`` file found in
        ``<config.name_or_path>/config``; the same directory is handed to the
        pinyin embedding as ``config_path``.
        """
        super().__init__()
        hidden = config.hidden_size
        config_path = os.path.join(config.name_or_path, 'config')
        # Collect the glyph font arrays in os.listdir order.
        font_files = [
            os.path.join(config_path, entry)
            for entry in os.listdir(config_path)
            if entry.endswith(".npy")
        ]

        self.word_embeddings = paddle.nn.Embedding(config.vocab_size, hidden, padding_idx=0)
        self.position_embeddings = paddle.nn.Embedding(config.max_position_embeddings, hidden)
        self.token_type_embeddings = paddle.nn.Embedding(config.type_vocab_size, hidden)
        self.pinyin_embeddings = PaddlePinyinEmbedding(
            embedding_size=128,
            pinyin_out_dim=hidden,
            config_path=config_path,
        )
        self.glyph_embeddings = PaddleGlyphEmbedding(font_npy_files=font_files)

        # Attribute names (including the non-snake-case ``LayerNorm``) mirror
        # the torch implementation so checkpoint keys line up.
        self.glyph_map = paddle.nn.Linear(1728, hidden, bias_attr=True)
        self.map_fc = paddle.nn.Linear(hidden * 3, hidden, bias_attr=True)
        self.LayerNorm = paddle.nn.LayerNorm(hidden, epsilon=config.layer_norm_eps)
        self.dropout = paddle.nn.Dropout(config.hidden_dropout_prob)

        # Buffer of shape (1, max_position_embeddings); saved with the state dict.
        default_positions = paddle.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", default_positions)

    def forward(self, input_ids=None, pinyin_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        """Return the fused embeddings after LayerNorm and dropout.

        ``input_ids`` is always used for the glyph lookup, even when
        ``inputs_embeds`` replaces the word-embedding lookup.
        ``token_type_ids`` defaults to zeros and ``position_ids`` to the
        leading ``seq_length`` entries of the registered buffer.
        """
        input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape[:-1]
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]
        if token_type_ids is None:
            token_type_ids = paddle.zeros(input_shape, dtype='int64')

        # Three views of every token, each [bs, l, hidden_size].
        char_part = self.word_embeddings(input_ids) if inputs_embeds is None else inputs_embeds
        pinyin_part = self.pinyin_embeddings(pinyin_ids)
        glyph_part = self.glyph_map(self.glyph_embeddings(input_ids))

        # Fusion layer: concatenate on the feature axis, project to hidden_size.
        fused = self.map_fc(paddle.concat((char_part, pinyin_part, glyph_part), 2))

        position_part = self.position_embeddings(position_ids)
        segment_part = self.token_type_embeddings(token_type_ids)

        summed = fused + position_part + segment_part
        return self.dropout(self.LayerNorm(summed))

In [27]:
config

BertConfig {
  "_name_or_path": "./pretrain_models/torch/ChineseBERT-base/",
  "architectures": [
    "GlyceBertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 23236
}

In [28]:
# torch: build the fused embedding layer from the shared config and print its
# module tree so it can be compared with the paddle layer below.
print("============================== torch =============================")
torch_fusion_emb = FusionBertEmbeddings(config)
print(torch_fusion_emb)

# paddle: same config, paddle implementation; the printed sublayers should
# correspond one-to-one with the torch output above.
print("============================== paddle =============================")
paddle_fusion_emb = PaddleFusionBertEmbeddings(config)
print(paddle_fusion_emb)

FusionBertEmbeddings(
  (word_embeddings): Embedding(23236, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (pinyin_embeddings): PinyinEmbedding(
    (embedding): Embedding(32, 128)
    (conv): Conv1d(128, 768, kernel_size=(2,), stride=(1,))
  )
  (glyph_embeddings): GlyphEmbedding(
    (embedding): Embedding(23236, 1728)
  )
  (glyph_map): Linear(in_features=1728, out_features=768, bias=True)
  (map_fc): Linear(in_features=2304, out_features=768, bias=True)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
PaddleFusionBertEmbeddings(
  (word_embeddings): Embedding(23236, 768, padding_idx=0, sparse=False)
  (position_embeddings): Embedding(512, 768, sparse=False)
  (token_type_embeddings): Embedding(2, 768, sparse=False)
  (pinyin_embeddings): PaddlePinyinEmbedding(
    (embedding): Embedding(32, 128, sparse=False)
    (conv): Conv1D(128, 768, kernel_size=[2],

## GlyceBertModel

In [29]:
# torch
import warnings

import torch
from torch import nn
from transformers.modeling_bert import BertEncoder, BertPooler
from transformers.modeling_bert import BertModel
from transformers.modeling_outputs import BaseModelOutputWithPooling

class GlyceBertModel(BertModel):
    r"""
    BERT-style encoder whose embedding layer is :class:`FusionBertEmbeddings`.

    :meth:`forward` accepts ``pinyin_ids`` in addition to the usual BERT
    inputs and hands it to the embedding layer together with ``input_ids``.

    Outputs when ``return_dict`` is false -- a tuple made of:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            First element of the encoder output.
        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
            ``self.pooler`` applied to the sequence output (``None`` when the pooler is ``None``).
        followed by the remaining encoder outputs (``encoder_outputs[1:]``), i.e. the optional
        **hidden_states** / **attentions** when they were requested.

    Outputs when ``return_dict`` is true -- a ``BaseModelOutputWithPooling`` carrying
    ``last_hidden_state``, ``pooler_output``, ``hidden_states`` and ``attentions``.

    Examples::

        tokenizer = BertDataset(model_dir)
        model = GlyceBertModel.from_pretrained(model_dir)
        input_ids, pinyin_ids = tokenizer.tokenize_sentence(sentence)
        length = input_ids.shape[0]
        outputs = model(input_ids.view(1, length), pinyin_ids.view(1, length, 8))
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Assign the fused embeddings plus a BERT encoder and pooler, then
        # run the inherited weight initialisation.
        self.embeddings = FusionBertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        pinyin_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run embeddings, encoder and pooler.

        pinyin_ids:
            Pinyin ids forwarded unchanged to ``self.embeddings``.
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Hidden states of an encoder, consumed by cross-attention when the
            model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Padding mask for ``encoder_hidden_states``; only used when the
            model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        Raises:
            ValueError: if both or neither of ``input_ids`` / ``inputs_embeds`` are given.
        """
        cfg = self.config

        # Unset flags fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Exactly one of input_ids / inputs_embeds determines shape and device.
        if input_ids is not None:
            if inputs_embeds is not None:
                raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
            input_shape = input_ids.size()
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # A caller may pass a [batch_size, from_seq_length, to_seq_length] mask;
        # the inherited helper makes it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # Cross-attention mask: only built in decoder mode when encoder states are supplied.
        encoder_extended_attention_mask = None
        if cfg.is_decoder and encoder_hidden_states is not None:
            enc_batch, enc_len, _ = encoder_hidden_states.size()
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones((enc_batch, enc_len), device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)

        # head_mask comes in as [num_heads] or [num_hidden_layers x num_heads]
        # (1.0 keeps a head) and is expanded per layer by the inherited helper.
        head_mask = self.get_head_mask(head_mask, cfg.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            pinyin_ids=pinyin_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output)
        else:
            pooled_output = None

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        return (sequence_output, pooled_output) + encoder_outputs[1:]

In [30]:
glyce_bert_model = GlyceBertModel(config)

In [31]:
list(glyce_bert_model.state_dict().keys())[:10]

['embeddings.position_ids',
 'embeddings.word_embeddings.weight',
 'embeddings.position_embeddings.weight',
 'embeddings.token_type_embeddings.weight',
 'embeddings.pinyin_embeddings.embedding.weight',
 'embeddings.pinyin_embeddings.conv.weight',
 'embeddings.pinyin_embeddings.conv.bias',
 'embeddings.glyph_embeddings.embedding.weight',
 'embeddings.glyph_map.weight',
 'embeddings.glyph_map.bias']

In [32]:
# paddle
import warnings

import paddle
from paddlenlp.transformers import BertModel

class BertPooler(paddle.nn.Layer):
    """
    Pools the encoder output into one vector per sequence.

    Takes the hidden state at position 0 of each sequence, applies a square
    linear projection and, when ``pool_act`` is ``"tanh"``, a Tanh activation.
    """

    def __init__(self, hidden_size, pool_act="tanh"):
        super().__init__()
        self.dense = paddle.nn.Linear(hidden_size, hidden_size)
        self.activation = paddle.nn.Tanh()
        self.pool_act = pool_act

    def forward(self, hidden_states):
        # "Pooling" here is just selecting the first token's hidden state.
        projected = self.dense(hidden_states[:, 0])
        if self.pool_act != "tanh":
            return projected
        return self.activation(projected)

class PaddleGlyceBertModel(BertModel):
    r"""
    Paddle counterpart of the torch ``GlyceBertModel``.

    Built on paddlenlp's ``BertModel``; the constructor assigns
    :class:`PaddleFusionBertEmbeddings` as the embedding layer, a
    ``paddle.nn.TransformerEncoder`` as the encoder and the local
    :class:`BertPooler` as the pooler.
    """

    def __init__(self, config):
        super().__init__(vocab_size=config.vocab_size)
        self.embeddings = PaddleFusionBertEmbeddings(config)
        layer_template = paddle.nn.TransformerEncoderLayer(
            config.hidden_size,
            config.num_attention_heads,
            config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            activation=config.hidden_act,
            attn_dropout=config.attention_probs_dropout_prob,
            act_dropout=0)
        self.encoder = paddle.nn.TransformerEncoder(layer_template, config.num_hidden_layers)
        self.pooler = BertPooler(config.hidden_size)
        self.apply(self.init_weights)

    def forward(
        self,
        input_ids=None,
        pinyin_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run embeddings, encoder and pooler.

        Returns ``(sequence_output, pooled_output)``; when
        ``output_hidden_states`` is truthy the first element is instead the
        list of per-layer encoder outputs. ``head_mask``,
        ``encoder_hidden_states``, ``encoder_attention_mask``,
        ``output_attentions`` and ``return_dict`` are accepted but not used.
        """
        if attention_mask is None:
            # Additive mask: -1e9 on padding positions, 0 elsewhere, with two
            # axes inserted at positions 1 and 2 for broadcasting.
            pad_flags = (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype)
            attention_mask = paddle.unsqueeze(pad_flags * -1e9, axis=[1, 2])

        embedding_output = self.embeddings(
            input_ids=input_ids,
            pinyin_ids=pinyin_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )

        if not output_hidden_states:
            sequence_output = self.encoder(embedding_output, attention_mask)
            return sequence_output, self.pooler(sequence_output)

        # Step through the layers by hand to keep each intermediate output.
        hidden = embedding_output
        per_layer_outputs = []
        for layer in self.encoder.layers:
            hidden = layer(hidden, src_mask=attention_mask)
            per_layer_outputs.append(hidden)
        if self.encoder.norm is not None:
            per_layer_outputs[-1] = self.encoder.norm(per_layer_outputs[-1])
        return per_layer_outputs, self.pooler(per_layer_outputs[-1])

In [33]:
paddle_glyce_bert_model = PaddleGlyceBertModel(config)


In [34]:
list(paddle_state_dict.keys())[:10]

['bert.embeddings.position_ids',
 'bert.embeddings.word_embeddings.weight',
 'bert.embeddings.position_embeddings.weight',
 'bert.embeddings.token_type_embeddings.weight',
 'bert.embeddings.pinyin_embeddings.embedding.weight',
 'bert.embeddings.pinyin_embeddings.conv.weight',
 'bert.embeddings.pinyin_embeddings.conv.bias',
 'bert.embeddings.glyph_embeddings.embedding.weight',
 'bert.embeddings.glyph_map.weight',
 'bert.embeddings.glyph_map.bias']

# 前向精度对比的方法
* 两个框架对同一个模型、同一个输入的前向输出，最大绝对误差应控制在 10^-4 以内，满足即说明复现成功

In [1]:
import sys
sys.path.append('/data1/workspace/research/ChineseBERT-Paddle/ChineseBert')

In [2]:
# torch: run the reference ChineseBERT-large model on one sentence and keep
# its last hidden state for the comparison against paddle further down.
from datasets.bert_dataset import BertDataset
from models.modeling_glycebert import GlyceBertModel
sentence = '我喜欢猫'
CHINESEBERT_PATH='./pretrain_models/torch/ChineseBERT-large/'
tokenizer = BertDataset(CHINESEBERT_PATH)
chinese_bert = GlyceBertModel.from_pretrained(CHINESEBERT_PATH)
# Eval mode, so the dropout layers do not perturb the forward output.
chinese_bert.eval()
input_ids, pinyin_ids = tokenizer.tokenize_sentence(sentence)
length = input_ids.shape[0]
# Add a batch dimension of 1; pinyin ids are reshaped to 8 entries per token.
input_ids = input_ids.view(1, length)
pinyin_ids = pinyin_ids.view(1, length, 8)
print(input_ids)
print(pinyin_ids)

# Element 0 of the model output is the last hidden state.
torch_output_hidden = chinese_bert.forward(input_ids, pinyin_ids)[0]
torch_output_hidden, torch_output_hidden.shape

tensor([[ 101, 2769, 1599, 3614, 4344,  102]])
tensor([[[ 0,  0,  0,  0,  0,  0,  0,  0],
         [28, 20,  3,  0,  0,  0,  0,  0],
         [29, 14,  3,  0,  0,  0,  0,  0],
         [13, 26,  6, 19,  1,  0,  0,  0],
         [18,  6, 20,  1,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0,  0,  0,  0]]])


(tensor([[[-0.0368, -0.0524, -0.1786,  ..., -0.1690,  0.1436,  0.0379],
          [-0.1318,  0.1298,  0.3270,  ...,  0.2858, -0.5699,  0.2182],
          [-0.3525, -0.0942, -0.1648,  ...,  0.2943, -0.7855, -0.0041],
          [-0.1875,  0.0362,  0.1704,  ..., -0.3582, -0.2260, -0.1740],
          [-0.0889,  0.3527, -0.2916,  ..., -0.0836, -0.1554,  0.2234],
          [-0.0337, -0.0427, -0.1809,  ..., -0.1581,  0.1669,  0.0269]]],
        grad_fn=<NativeLayerNormBackward>),
 torch.Size([1, 6, 1024]))

In [3]:
#chinese_bert.state_dict()

In [3]:
import sys
sys.path.append('/data1/workspace/research/ChineseBERT-Paddle/Paddle_ChineseBert/PaddleNLP')

In [4]:
# paddle: run the converted ChineseBERT-large model on the same sentence and
# keep its last hidden state for the comparison against torch.
import paddle
from paddlenlp.transformers import ChineseBertTokenizer
from paddlenlp.transformers import GlyceBertModel
sentence = '我喜欢猫'
CHINESEBERT_PADDLE_PATH = "./pretrain_models/paddle/ChineseBERT-large/"
tokenizer = ChineseBertTokenizer(CHINESEBERT_PADDLE_PATH)
glyce_bert_model = GlyceBertModel.from_pretrained(CHINESEBERT_PADDLE_PATH)
# Eval mode, matching the torch run.
glyce_bert_model.eval()
# The tokenizer output is indexed by 'input_ids' and 'pinyin_ids'.
token_input = tokenizer.tokenize_sentence(sentence)
input_ids = paddle.to_tensor(token_input['input_ids'])
pinyin_ids = paddle.to_tensor(token_input['pinyin_ids'])
# Add a leading batch dimension of 1.
input_ids = input_ids.unsqueeze(0)
pinyin_ids = pinyin_ids.unsqueeze(0)
print(input_ids)
print(pinyin_ids)

# Element 0 of the model output is the last hidden state.
paddle_output_hidden = glyce_bert_model.forward(input_ids, pinyin_ids)[0]
paddle_output_hidden

W0913 21:05:51.689723 19563 device_context.cc:404] Please NOTE: device: 0, GPU Compute Capability: 6.0, Driver API Version: 11.0, Runtime API Version: 10.1
W0913 21:05:51.693789 19563 device_context.cc:422] device: 0, cuDNN Version: 8.0.
[32m[2021-09-13 21:05:58,713] [    INFO][0m - Weights from pretrained model not used in GlyceBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'][0m


Tensor(shape=[1, 6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
       [[101 , 2769, 1599, 3614, 4344, 102 ]])
Tensor(shape=[1, 6, 8], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
       [[[0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ],
         [28, 20, 3 , 0 , 0 , 0 , 0 , 0 ],
         [29, 14, 3 , 0 , 0 , 0 , 0 , 0 ],
         [13, 26, 6 , 19, 1 , 0 , 0 , 0 ],
         [18, 6 , 20, 1 , 0 , 0 , 0 , 0 ],
         [0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ]]])


Tensor(shape=[1, 6, 1024], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
       [[[-0.03676526, -0.05237103, -0.17860234, ..., -0.16904099,  0.14357226,  0.03791038],
         [-0.13184090,  0.12977326,  0.32704630, ...,  0.28580564, -0.56986696,  0.21823926],
         [-0.35248795, -0.09423149, -0.16476731, ...,  0.29425690, -0.78550410, -0.00410738],
         [-0.18750003,  0.03615668,  0.17038533, ..., -0.35818714, -0.22604686, -0.17404869],
         [-0.08885855,  0.35273460, -0.29164734, ..., -0.08359668, -0.15540956,  0.22340605],
         [-0.03365400, -0.04272101, -0.18089685, ..., -0.15809794,  0.16688700,  0.02686221]]])

In [5]:
import paddle
import torch
import numpy as np

# Detach both framework outputs and move them to host memory as NumPy arrays.
t_output_hidden = torch_output_hidden.cpu().detach().numpy()
p_output_hidden = paddle_output_hidden.cpu().detach().numpy()

# Largest element-wise absolute deviation between the two forward passes.
diff = t_output_hidden - p_output_hidden
error = np.abs(diff).max()
print("最大误差:", error)

最大误差: 3.8146973e-06


# 附录
## 对比 paddlenlp 和 huggingface 的 `bert-base-chinese` 模型
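* paddlenlp 文档：（该文档内容较旧，但总体正确；需要注意 huggingface 现在的模型参数已不再使用 `attention.output.LayerNorm.gamma` 等旧名称，当前参数名为 `attention.output.LayerNorm.weight` / `attention.output.LayerNorm.bias`，见下文 huggingface 的参数列表）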
    https://paddlenlp.readthedocs.io/zh/latest/community/contribute_models/convert_pytorch_to_paddle.html

### paddlenlp

In [18]:
from paddlenlp.transformers import BertModel

p = BertModel.from_pretrained("bert-base-chinese")

[32m[2021-09-12 16:08:40,516] [    INFO][0m - Already cached /root/.paddlenlp/models/bert-base-chinese/bert-base-chinese.pdparams[0m
[32m[2021-09-12 16:08:44,825] [    INFO][0m - Weights from pretrained model not used in BertModel: ['cls.predictions.decoder_weight', 'cls.predictions.decoder_bias', 'cls.predictions.transform.weight', 'cls.predictions.transform.bias', 'cls.predictions.layer_norm.weight', 'cls.predictions.layer_norm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'][0m


In [20]:
p

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, sparse=False)
    (position_embeddings): Embedding(512, 768, sparse=False)
    (token_type_embeddings): Embedding(2, 768, sparse=False)
    (layer_norm): LayerNorm(normalized_shape=[768], epsilon=1e-12)
    (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)
  )
  (encoder): TransformerEncoder(
    (layers): LayerList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadAttention(
          (q_proj): Linear(in_features=768, out_features=768, dtype=float32)
          (k_proj): Linear(in_features=768, out_features=768, dtype=float32)
          (v_proj): Linear(in_features=768, out_features=768, dtype=float32)
          (out_proj): Linear(in_features=768, out_features=768, dtype=float32)
        )
        (linear1): Linear(in_features=768, out_features=3072, dtype=float32)
        (dropout): Dropout(p=0, axis=None, mode=upscale_in_train)
        (linear2): Linear(in_features=3072,

In [58]:
p.state_dict()['encoder.layers.1.norm2.weight']

Parameter containing:
Tensor(shape=[768], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
       [0.99000758, 0.97690040, 0.97765732, 0.94424844, 0.76797146, 0.95842028, 0.95088869, 0.95098066, 0.89577252, 1.00088620, 0.98037577, 0.92581815, 0.83142120, 0.92646289, 0.95957601, 0.94461876, 0.99346662, 1.01123142, 0.96349865, 0.94775075, 0.98592800, 0.94116277, 1.01536751, 0.95520437, 0.94929725, 0.96389103, 0.96608067, 0.94282484, 1.01474190, 0.97430092, 0.91588789, 0.98019701, 0.99701631, 0.94673103, 0.99644357, 0.96920210, 0.97043175, 0.95901281, 0.94662303, 0.98728698, 0.97292870, 0.95352584, 0.97319037, 1.00273454, 0.92802316, 0.95470178, 0.97651005, 0.97335207, 0.94828159, 0.96513742, 0.97811615, 0.54728281, 0.95744258, 0.96712947, 0.97800148, 0.96688694, 0.96835792, 0.98429048, 0.90828168, 1.00500143, 0.98842162, 0.96154082, 0.94576639, 0.91822588, 1.00190949, 0.98568445, 0.96719068, 0.94651729, 0.94914740, 0.96677554, 0.93641907, 0.95245361, 0.96893132, 0.93815476, 0.9943

In [59]:
# Print every parameter name with its shape. state_dict() is called once and
# iterated via items(), instead of being called again for each key.
for k, param in p.state_dict().items():
    print(k, param.shape)

embeddings.word_embeddings.weight [21128, 768]
embeddings.position_embeddings.weight [512, 768]
embeddings.token_type_embeddings.weight [2, 768]
embeddings.layer_norm.weight [768]
embeddings.layer_norm.bias [768]
encoder.layers.0.self_attn.q_proj.weight [768, 768]
encoder.layers.0.self_attn.q_proj.bias [768]
encoder.layers.0.self_attn.k_proj.weight [768, 768]
encoder.layers.0.self_attn.k_proj.bias [768]
encoder.layers.0.self_attn.v_proj.weight [768, 768]
encoder.layers.0.self_attn.v_proj.bias [768]
encoder.layers.0.self_attn.out_proj.weight [768, 768]
encoder.layers.0.self_attn.out_proj.bias [768]
encoder.layers.0.linear1.weight [768, 3072]
encoder.layers.0.linear1.bias [3072]
encoder.layers.0.linear2.weight [3072, 768]
encoder.layers.0.linear2.bias [768]
encoder.layers.0.norm1.weight [768]
encoder.layers.0.norm1.bias [768]
encoder.layers.0.norm2.weight [768]
encoder.layers.0.norm2.bias [768]
encoder.layers.1.self_attn.q_proj.weight [768, 768]
encoder.layers.1.self_attn.q_proj.bias [76

### huggingface

In [68]:
from transformers import BertModel
h = BertModel.from_pretrained('bert-base-chinese')

In [69]:
h

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [72]:
h.state_dict()['encoder.layer.0.attention.self.query.weight']

tensor([[ 1.1492e-01, -9.4048e-03,  5.8782e-03,  ...,  1.3729e-02,
          1.5892e-02, -8.1710e-02],
        [ 6.5357e-03, -2.2129e-02, -6.7798e-03,  ...,  4.7093e-05,
          4.5497e-02,  1.3203e-02],
        [ 1.3464e-02,  2.4955e-03,  3.5074e-02,  ...,  1.1137e-01,
         -3.1568e-02, -1.4776e-02],
        ...,
        [-5.0526e-02,  2.3847e-02,  1.0480e-02,  ...,  5.6518e-02,
          4.1081e-03,  7.1023e-02],
        [ 1.7295e-02, -8.8264e-02, -5.6218e-02,  ..., -4.2443e-02,
          3.8017e-02, -1.5388e-02],
        [ 1.5026e-02, -2.9466e-02, -1.5802e-03,  ...,  9.7084e-02,
         -3.4228e-02,  2.2910e-03]])

In [62]:
# Print every parameter name with its shape. state_dict() is called once and
# iterated via items(), instead of being called again for each key.
for k, param in h.state_dict().items():
    print(k, param.shape)

embeddings.position_ids torch.Size([1, 512])
embeddings.word_embeddings.weight torch.Size([21128, 768])
embeddings.position_embeddings.weight torch.Size([512, 768])
embeddings.token_type_embeddings.weight torch.Size([2, 768])
embeddings.LayerNorm.weight torch.Size([768])
embeddings.LayerNorm.bias torch.Size([768])
encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias torch.Size([768])
encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias torch.Size([768])
encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias torch.Size([768])
encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
encoder.layer.0.intermediate.dense.weight 

In [4]:
from paddlenlp.transformers import BertModel
bm = BertModel.from_pretrained('bert-base-chinese')

[32m[2021-09-12 17:49:57,477] [    INFO][0m - Already cached /root/.paddlenlp/models/bert-base-chinese/bert-base-chinese.pdparams[0m
W0912 17:49:57.479743 31618 device_context.cc:404] Please NOTE: device: 0, GPU Compute Capability: 6.0, Driver API Version: 11.0, Runtime API Version: 10.1
W0912 17:49:57.484134 31618 device_context.cc:422] device: 0, cuDNN Version: 8.0.


init_kwargs: {'vocab_size': 21128, 'hidden_size': 768, 'num_hidden_layers': 12, 'num_attention_heads': 12, 'intermediate_size': 3072, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'attention_probs_dropout_prob': 0.1, 'max_position_embeddings': 512, 'type_vocab_size': 2, 'initializer_range': 0.02, 'pad_token_id': 0}
----> init_class: BertModel
---> cls: <class 'paddlenlp.transformers.bert.modeling.BertModel'>


[32m[2021-09-12 17:50:04,158] [    INFO][0m - Weights from pretrained model not used in BertModel: ['cls.predictions.decoder_weight', 'cls.predictions.decoder_bias', 'cls.predictions.transform.weight', 'cls.predictions.transform.bias', 'cls.predictions.layer_norm.weight', 'cls.predictions.layer_norm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'][0m


In [5]:
bm.save_pretrained('./output/bert-base-chinese')