In [57]:
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

Bert主要难在前面的数据处理上

In [58]:
text = (
    'Hello, how are you? I am Romeo.\n' # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
    'Nice meet you too. How are you today?\n' # R
    'Great. My baseball team won the competition.\n' # J
    'Oh Congratulations, Juliet\n' # R
    'Thank you Romeo\n' # J
    'Where are you going today?\n' # R
    'I am going shopping. What about you?\n' # J
    'I am going to visit my grandmother. she is not very well' # R
)
#正则表达式把。 ？去掉
sentences = re.sub("[.,!?\\-]",'',text.lower()).split('\n')
word_list = list(set(" ".join(sentences).split())) # ['hello', 'how', 'are', 'you',...]
#pad 是用来让句子的长度相同，cls放在开头，sep夹在两句话中间 mask随机替换单词
word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}
for i,w in enumerate(word_list):
    word2idx[w] = i + 4
idx2word = {i:w for i,w in enumerate(word2idx)}
vocab_size = len(word2idx)

#每个字都是一个token 下面这个数组存的是每一句话
token_list =list()
for sentence in sentences:
    wordArray = [word2idx[s] for s in sentence.split()]
    token_list.append(wordArray)

In [59]:
#print(token_list)
#print(sentences)
#randrange(len(sentences)) #输出 0-8

In [60]:
#这里是模型的参数
maxlen = 30 #所有样本的句子长度都一样， 这里我们把所有batch的样本句子的长度都一样
batch_size = 6
max_pred = 5 # max tokens of prediction 我们一句话里有0.15个token需要做mask 
n_layers = 6 #表示 Encoder Layer 的数量
n_heads = 12 #multi head attention
d_model = 768 #表示 Token Embeddings、Segment Embeddings、Position Embeddings 的维度
d_ff = 768*4 # 4*d_model, FeedForward dimension 全连接神经网络得维度
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2 #一个样本是由多少句话构成的 在Bert的论文中是两句话构成的

下面代码中，positive表示两句话是连续的，negative表示两句话不是连续的，我们需要做到在一个 batch 中，这两个样本的比例为 1:1。随机选取的两句话是否连续，只要通过判断 tokens_a_index + 1 == tokens_b_index 即可

然后是随机 mask 一些 token，n_pred 变量代表的是即将 mask 的 token 数量，cand_maked_pos 代表的是有哪些位置是候选的、可以 mask 的（因为像 [SEP]，[CLS] 这些不能做 mask，没有意义），最后 shuffle() 一下，然后根据 random() 的值选择是替换为 [MASK] 还是替换为其它的 token

接下来会做两个 Zero Padding，第一个是为了补齐句子的长度，使得一个 batch 中的句子都是相同长度。第二个是为了补齐 mask 的数量，因为不同句子长度，会导致不同数量的单词进行 mask，我们需要保证同一个 batch 中，mask 的数量（必须）是相同的，所以也需要在后面补一些没有意义的东西，比方说 [0]

In [61]:
#下面我们要准备数据集了，这里是比较复杂的
def make_data():
    batch = []
    positive = negative =0 #pos 表示这两句话是相邻的，neg表示两句话不是相邻的
    #如果相邻 pos+1 不相邻 neg+1

    while positive != batch_size/2 or negative != batch_size/2:
        #这一步是随机拼接两句话
        #我们用tokens_a_index表示上一句话的索引，tokens_b_index表示下一句话的索引
        #因为这里有九句话，我们就从这9句话随机抽取两句话的索引来作为输入
        tokens_a_index,tokens_b_index =randrange(len(sentences)),randrange(len(sentences))
        #取出这两句话中每一个token的索引
        tokens_a,tokens_b = token_list[tokens_a_index],token_list[tokens_b_index]
        #下面是处理每句话的输入
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]
        #处理segment_ids 这里主要是让bert明白哪些是一句话
        segment_ids = [0] * (1 + len(tokens_a) +1) + [1] * (len(tokens_b) + 1)


        #下面是mask掉几个输入 只有15%的token做mask
        n_pred = min(max_pred, max(int(len(input_ids)*0.15),1))
        #候选的mask的位置， 因为cls，sep不能做mask，
        cand_maked_pos = [i for i,token in enumerate(input_ids)
                                        if token !=[word2idx['[CLS]']] and token !=[word2idx['[SEP]']]]
        #把侯选位置随机
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [],[]
        #取前n_pred个
        for pos in cand_maked_pos[:n_pred]:
            #把索引取出来
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            #bert有三种情况
            if random()<0.8:
                input_ids[pos] = word2idx['[MASK]']
            elif random()>0.9:
                #随机另外一个单词
                index = randint(0, vocab_size-1)
                while index < 4: #去掉无意义的字符
                    index = randint(0, vocab_size-1)
                input_ids[pos] = index
        #做完这些 我们就要给这个句子补pad
        n_pad =maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        #mask的个数也要相同
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_pos.extend([0] * n_pad)
            masked_tokens.extend([0] * n_pad)
        
        #判断这两句话是否是相邻的
        #这里我们要使得neg和posi的数量是1:1
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids,segment_ids,masked_tokens,masked_pos,True]) #是下一个
            positive += 1

        if tokens_b_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids,segment_ids,masked_tokens,masked_pos,False]) #不是下一个
            negative += 1
     
    return batch

In [62]:
batch = make_data()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
    torch.LongTensor(input_ids),  torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens),\
    torch.LongTensor(masked_pos), torch.LongTensor(isNext)

class MyDataSet(Data.Dataset):
  def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
    self.input_ids = input_ids
    self.segment_ids = segment_ids
    self.masked_tokens = masked_tokens
    self.masked_pos = masked_pos
    self.isNext = isNext
  
  def __len__(self):
    return len(self.input_ids)
  
  def __getitem__(self, idx):
    return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]

loader = Data.DataLoader(MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext), batch_size, True)

In [63]:
#除掉句子中没用的字符
def get_attn_pad_mask(seq_q, seq_k):
    '''
    seq_q: [batch_size, seq_len]
    seq_k: [batch_size, seq_len]
    seq_len could be src_len or it could be tgt_len
    seq_len in seq_q and seq_len in seq_k maybe not equal
    '''
    batch_size_q, len_q = seq_q.size()
    # eq(zero) is PAD token
    #每一个位置上的值和0比较，不是0就是F，是0就是True，再扩展一个维度，word_emb是三维的
    pad_attn_mask = seq_q.data.eq(0).unsqueeze(1)  # [batch_size, 1, len_k], False is masked
    #batchsize 表示这里有几句话 这里encoder和decoder都会调用
    return pad_attn_mask.expand(batch_size_q, len_q, len_q)  # [batch_size, len_q, len_k]

![Q_V_2_U79Z_GHA64_P_G9DA.png](https://i.loli.net/2021/10/18/HbkJTOnMErejiL1.png)

In [64]:
def gelu(x):
    #一个新的激活函数 bert论文中提出来的
    """Implementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


In [65]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding,self).__init__()
        self.tok_embed = nn.Embedding(vocab_size,d_model)
        self.pos_embed = nn.Embedding(maxlen,d_model)
        self.seg_embed = nn.Embedding(n_segments,d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self,x,seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # [seq_len] -> [batch_size, seq_len]
        embedding = self.tok_embed(x) + self.pos_embed(pos) +self.seg_embed(seg)
        return self.norm(embedding)


In [66]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        '''
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, seq_len, seq_len]
        '''
        #乘上k的转置 变成[lenq,lenk]
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size, n_heads, len_q, len_k]
        #把attn_mask矩阵中为True的地方替换为-1e9
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is True.
        
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V) # [batch_size, n_heads, len_q, d_v]
        return context


In [67]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)
    def forward(self, input_Q, input_K, input_V, attn_mask):
        '''
        input_Q: [batch_size, len_q, d_model]
        input_K: [batch_size, len_k, d_model]
        input_V: [batch_size, len_v(=len_k), d_model]
        attn_mask: [batch_size, seq_len, seq_len]
        '''
        residual, batch_size = input_Q, input_Q.size(0)
        # (B, S, D) -proj-> (B, S, D_new) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        #下面这一步我们给他做一个维度的变换，这是为了后面计算好算
        q_s = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # Q: [batch_size, n_heads, len_q, d_k]
        k_s = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # K: [batch_size, n_heads, len_k, d_k]
        v_s = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # V: [batch_size, n_heads, len_v(=len_k), d_v]
        #中间增加的维度扩城到n_heads个
        #对于encoder这里的attn_mask就是去除的填充字符
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size, n_heads, seq_len, seq_len]

        # context: [batch_size, n_heads, len_q, d_v], attn: [batch_size, n_heads, len_q, len_k]
        context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size, seq_len, n_heads * d_v]
        output = self.fc(context)
        return nn.LayerNorm(d_model)(output + residual) # output: [batch_size, seq_len, d_model]


In [68]:
#前馈神经网络 相比transformer这里的激活函数已经改变了
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_ff) -> (batch_size, seq_len, d_model)
        return self.fc2(gelu(self.fc1(x)))

In [69]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        '''
        enc_inputs: [batch_size, src_len, d_model]
        enc_self_attn_mask: [batch_size, src_len, src_len]
        '''
        # enc_outputs: [batch_size, src_len, d_model], attn: [batch_size, n_heads, src_len, src_len]
        enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size, src_len, d_model]
        return enc_outputs


In [72]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        #单词转换成输入向量的维度
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])#_ 是占位符， 表示不在意变量 的 值 只是用于循环遍历n次
        self.fc = nn.Sequential(
            nn.Linear(d_model,d_model),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(d_model,2)
        self.linear = nn.Linear(d_model,d_model)
        self.activ2 = gelu
        embed_weight = self.embedding.tok_embed.weight
        self.fc2 = nn.Linear(d_model, vocab_size, bias=False)
        self.fc2.weight = embed_weight
        
    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids) # [batch_size, seq_len, d_model]
        #要把输入进去的多余字符p去掉，同时保证矩阵的大小
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids) # [batch_size, maxlen, maxlen]
        enc_self_attns = []
        for layer in self.layers:
            # output: [batch_size, seq_len, d_model], 
            output = layer(output , enc_self_attn_mask)
        #下面我们取出第一列CLS的token
        h_pooled = self.fc(output[:,0])
        #然后我们返回这一行的结果
        logits_clsf = self.classifier(h_pooled)
        
        #下面这两行就是把预测位置的词往前移到对应位置，计算loss的时候正好可以一一对应。
        masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        
        h_masked = self.activ2(self.linear(h_masked)) # [batch_size, max_pred, d_model]
        logits_lm = self.fc2(h_masked) # [batch_size, max_pred, vocab_size]
        return logits_lm, logits_clsf
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.001)

In [73]:
for epoch in range(180):
    for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
      logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
      #下面这一行主要实现了将[batch_size, max_pred, d_model]转换为[batch_size * max_pred, d_model]
      #这里一个batch是一句话
      loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1)) # for masked LM
      loss_lm = (loss_lm.float()).mean()
      loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
      loss = loss_lm + loss_clsf
      if (epoch + 1) % 10 == 0:
          print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

Epoch: 0010 loss = 0.933714
Epoch: 0020 loss = 0.780609
Epoch: 0030 loss = 0.803110
Epoch: 0040 loss = 0.769826
Epoch: 0050 loss = 0.776828
Epoch: 0060 loss = 0.842285
Epoch: 0070 loss = 0.830547
Epoch: 0080 loss = 0.759959
Epoch: 0090 loss = 0.745616
Epoch: 0100 loss = 0.746445
Epoch: 0110 loss = 0.777969
Epoch: 0120 loss = 0.726676
Epoch: 0130 loss = 0.752499
Epoch: 0140 loss = 0.779684
Epoch: 0150 loss = 0.723779
Epoch: 0160 loss = 0.764170
Epoch: 0170 loss = 0.645281
Epoch: 0180 loss = 0.735849


In [77]:
# Predict mask tokens ans isNext
input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[5]
print(text)
print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])

logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), \
                 torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ',[pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_clsf else False)

Hello, how are you? I am Romeo.
Hello, Romeo My name is Juliet. Nice to meet you.
Nice meet you too. How are you today?
Great. My baseball team won the competition.
Oh Congratulations, Juliet
Thank you Romeo
Where are you going today?
I am going shopping. What about you?
I am going to visit my grandmother. she is not very well
['[CLS]', '[MASK]', 'meet', 'you', 'too', '[MASK]', 'are', 'you', 'today', '[SEP]', 'great', 'my', 'baseball', 'team', 'won', 'the', 'competition', '[SEP]']
masked tokens list :  [37, 39]
predict masked tokens list :  [37, 39]
isNext :  True
predict isNext :  True
