1、配置类（BertConfig）

In [42]:
import json
import copy

In [33]:
config_path = '../models/chinese-bert-wwm-ext/config.json'

In [26]:
config_dict = {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.0002,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": True,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}


In [53]:
class BertConfig(object):
    def __init__(
        self, 
        vocab_size,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02
    ):
        '''
        vocab_size：词表大小
        hidden_size：隐藏层神经元数
        num_hidden_layers：Transformer encoder 中的隐藏层数
        *num_attention_heads：*multi-head attention 的 head 数
        intermediate_size：encoder 的“中间”隐层神经元数（例如 feed-forward layer）
        hidden_act：隐藏层激活函数
        hidden_dropout_prob：隐层 dropout 率
        attention_probs_dropout_prob：注意力部分的 dropout
        max_position_embeddings：最大位置编码
        type_vocab_size：token_type_ids 的词典大小
        initializer_range：truncated_normal_initializer 初始化方法的 stdev
        '''
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
    
    # 从字典获得参数
    @classmethod    
    def from_json_dict(cls, json_object):
        config = BertConfig(vocab_size=None)
        for (key, value) in json_object.items():
            # 添加config的参数
            config.__dict__[key] = value
        return config
    
    # 从json文件获得参数
    @classmethod
    def from_json_file(cls, json_file):
        with open(json_file, 'r', encoding='utf8') as f:
            return cls.from_json_dict(json.load(f))
        
    def to_dict(self):
        output = copy.deepcopy(self.__dict__)
        return output
    
    def to_json_string(self):
        # indent: 格式缩进
        return json.dumps(self.to_dict(), indent=2) + '\n'

2、获取词向量（Embedding_lookup）

In [116]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [108]:
class BertEmbedding(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        max_sen_len=512,
        seg_type=2,
        embedding_size=128, 
        initializer_range=0.02,
        padding_idx=0,
        dropout=0.1,
    ):
        super().__init__()
        self.token_embedding = nn.Embedding(
            num_embeddings=vocab_size, 
            embedding_dim =embedding_size,
            padding_idx=padding_idx)
        
        self.pos_embedding = nn.Embedding(
            num_embeddings=max_sen_len, 
            embedding_dim =embedding_size,)
        
        self.seg_embedding = nn.Embedding(
            num_embeddings=seg_type, 
            embedding_dim =embedding_size,)
        
        self.dropout = nn.Dropout(dropout)
        
    def process_token(self, input_ids, seg_ids):
        assert input_ids.ndim == seg_ids.ndim 
        assert input_ids.ndim == 2
        
        # token_embed : [batch_size, sen, embedding_size]
        token_embed = self.token_embedding(input_ids)
        
        # pos_embed : [1,sen, embedding_size]
        # 由于pos_embed跟句子中token的位置有关，因此使用切片取出pos_embedding
        # 但此时embedding只有两个维度:[sen, embedding]，因此需要在第0维扩展
        # 方便后面广播相加
        pos_embed = self.pos_embedding.weight[:input_ids.shape[1]].unsqueeze(0)

        # seg_embed : [batch_size, sen, embedding_size]
        # 根据传入的seg_ids进行嵌入，一般seg_ids只有两个值
        seg_embed = self.seg_embedding(seg_ids)
        print(seg_embed.shape)
        return self.dropout(token_embed + pos_embed + seg_embed)

3、Transformer block(Encoder部分)

（1）MultiHead Attention

In [None]:
class Multihead_Attention(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.h = heads
        
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
        
    def attention(q, k, v, d_k, mask=None, dropout=None):
        sen_len = q.shape[2]
        scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
        # scores : [batch_size, heads, sen_len, sen_len] -> [2, 4, 5, 5]

        # mask掉那些为了padding长度增加的token，让其通过softmax计算后为0
        if mask is not None:
            # mask初始size : [batch_size, sen_len]
            # 将其填充为 [batch_size, 1, sen_len, sen_len]
            # 例：mask = tensor([[1., 1., 1., 0., 0.],
            #                    [1., 1., 1., 0., 0.]])      -> [2,5]
            # 经google的bert源码测试后得出
            # 填充后为    tensor([[[[1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.]]],
    #                             [[[1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.],
    #                               [1., 1., 1., 0., 0.]]]])  ->[2,1,5,5]
            mask = mask.unsqueeze(1).repeat(1, sen_len, 1).unsqueeze(1)
    #         mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)  
    if dropout is not None:
        scores = dropout(scores)   
    output = torch.matmul(scores, v)
    # output : [batch_size, heads, sen_len, d_k]
    return output
    
    # q, k, v 分别传入，以便cross-attention传参
    def forward(self, q, k, v, mask=None):
        # x : [batch_size, sen_len, d_model]
        bs = x.shape[0]
        # 将sen_len的维度与heads的维度互换，进行多头注意力的计算
        # 转换后的size : [batch_size, heads, sen_len, d_model]
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1,2)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1,2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1,2)
        
        attentioned = self.attention(q, k, v, self.d_k, mask, self.dropout)
        
        concat = attentioned.transpose(1,2).contiguous().view(bs, -1, self.model)
        output = self.out(concat)
        return output
            
        

（2）Layer Norm

（3）Feed Forward Layer

In [136]:
q = torch.rand([2,4,5,10])
k = torch.rand([2,4,5,10])
v = torch.rand([2,4,5,10])

In [137]:
d_k = 10

In [138]:
mask = torch.cat((torch.ones([2,3]), torch.zeros([2,2])), -1)
mask

tensor([[1., 1., 1., 0., 0.],
        [1., 1., 1., 0., 0.]])

In [166]:
test = torch.tensor([5.6534e-01, 6.2107e-01, 8.5721e-01, -1e9, -1e9])
F.softmax(test, dim=-1)

tensor([0.2944, 0.3113, 0.3942, 0.0000, 0.0000])

In [172]:
s = attention(q,k,v,d_k,mask)

RuntimeError: The size of tensor a (2) must match the size of tensor b (4) at non-singleton dimension 1