预训练语言模型：
- 作为特征提取器
- 作为encoder参与下游任务微调

两种方式在使用上非常类似，差别是后者在训练过程中原预训练语言模型的参数也允许优化。

主要内容：
- 以XLNET为例介绍 HuggingFace transformers 组件的使用套路
- 以XLNET为例介绍如何接下游任务的文本分类和抽取式问答

In [1]:
import os 
import torch
import torch.nn as nn 
import torch.functional as F 
!pip install transformers
from transformers import XLNetModel, XLNetTokenizer, XLNetConfig

ModuleNotFoundError: No module named 'torch'

In [None]:
# Load the pretrained 'xlnet-base-cased' tokenizer and encoder by name.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# The two flags ask the model to also return per-layer hidden states and
# attention maps; the encoding cell further down reads all three outputs.
model = XLNetModel.from_pretrained('xlnet-base-cased',
                                  output_hidden_states=True,
                                  output_attentions=True)

In [None]:
# 本地加载XLNET模型
# MODEL_PATH = r'D:\data\nlp\xlnet-model/'
# config = XLNetConfig.from_json_file(os.path.join(MODEL_PATH, 'xlnet-base-cased-config.json'))
# config.output_hidden_states = True
# config.output_attentions = True

# tokenizer = XLNetTokenizer(os.path.join(MODEL_PATH, 'xlnet-base-cased-spiece.model'))
# model = XLNetModel.from_pretrained(MODEL_PATH, config=config)

# 1 句子到token_id的转换

In [None]:
# Use the tokenizer to turn a raw sentence into model input, one step at a time.
sentence= 'This is an interesting review session'

# Tokenization: split the sentence into tokens.
tokens = tokenizer.tokenize(sentence)
print('Tokens: {}'.format(tokens))

# Convert the tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the special tokens: <cls>, <sep>
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# Wrap the ids in a list (batch of one) and convert to a PyTorch tensor.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens Pytorch: {}".format(tokens_pt))

In [None]:
# One-stop call: invoke the tokenizer directly on the sentence and request
# PyTorch tensors as output.
tokens_pt2 = tokenizer(sentence, return_tensors='pt')
print("Tokens Pytorch: {}".format(tokens_pt2))

In [None]:
# Batch processing: tokenize several sentences at once, first without and
# then with padding enabled.
sentences = ['The ultimate answer to life, universe and time is 42.',
            'Take a towel for a space travel.']
print("Batch tokenization:\n", tokenizer(sentences)["input_ids"])
print("With padding:\n", tokenizer(sentences, padding=True)['input_ids'])

In [None]:
# Sentence-pair input: pass two segments and inspect the resulting tokens,
# ids and token type ids.
multi_seg_input = tokenizer("This is segment A", "This is segment B.")
print("Multi segment token (str): {}".format(tokenizer.convert_ids_to_tokens(multi_seg_input['input_ids'])))
print("Multi segment token (int): {}".format(multi_seg_input['input_ids']))
print("Multi segment type       : {}".format(multi_seg_input['token_type_ids']))

# 2 模型encoding

In [None]:
# The model is expected to be in eval mode by default (the flag is printed
# below); encode the input sentence with it.
print("Is training mode?", model.training)
sentence = "The ultimate answer to life, universe and time is 42."
# The keyword is `return_tensors` (plural); without it the ids are not
# returned as PyTorch tensors and cannot be fed to the model.
tokens_pt = tokenizer(sentence, return_tensors='pt')
print('Token: {}'.format(tokenizer.convert_ids_to_tokens(tokens_pt["input_ids"][0])))

# Index the outputs rather than tuple-unpacking them: this works both when
# the model returns a plain tuple and when it returns a dict-like output
# object (unpacking the latter would yield its keys, not the tensors).
# NOTE(review): assumes the outputs are ordered (last hidden state, all hidden
# states, attentions), i.e. no cached `mems` entry in between - confirm for
# the installed transformers version.
outputs = model(**tokens_pt)
final_layer_h, all_layer_h, attentions = outputs[0], outputs[1], outputs[2]
# Expected to print 0: the last per-layer hidden state should equal the final output.
print(torch.sum(final_layer_h - all_layer_h[-1]).item())

final_layer_h.shape, len(all_layer_h), len(attentions)

# 3 下游任务

### 例1 文本分类

In [2]:
class XLNetSeqSummary(nn.Module):
    """Collapse a sequence of hidden states into one vector per sequence.

    The selected summary is passed through
    dropout -> linear -> activation -> dropout.

    Args:
        how: Summary strategy. 'cls' takes the hidden state at the last
            sequence position, 'mean' / 'max' pool over the sequence axis
            (dim 1).
        hidden_size: Size of the hidden states; the linear layer maps
            hidden_size -> hidden_size.
        activation: Module applied after the linear layer. Defaults to
            ``nn.GELU()`` when None.
        first_dropout: Module applied before the linear layer. Defaults to
            ``nn.Dropout(0.5)`` when None.
        last_dropout: Module applied after the activation. Defaults to
            ``nn.Dropout(0.5)`` when None.
    """

    def __init__(self, how='cls',
                 hidden_size = 768,
                 activation = None,
                 first_dropout = None,
                 last_dropout = None):
        super().__init__()
        self.how = how
        self.summary = nn.Linear(hidden_size, hidden_size)
        # Compare against None explicitly: a module that defines __len__
        # (e.g. an empty nn.Sequential) is falsy and would otherwise be
        # silently replaced by the default.
        self.activation = nn.GELU() if activation is None else activation
        self.first_dropout = nn.Dropout(0.5) if first_dropout is None else first_dropout
        self.last_dropout = nn.Dropout(0.5) if last_dropout is None else last_dropout

    def forward(self, hidden_states):
        """Summarize the hidden-state sequence into a sentence encoding.

        Args:
            hidden_states: Final-layer hidden states of the XLNet model,
                indexed as (batch, sequence, hidden).

        Returns:
            One vector per sequence, shape (batch, hidden_size).

        Raises:
            ValueError: If ``self.how`` is not 'cls', 'mean' or 'max'.
        """
        if self.how == 'cls':
            # NOTE(review): takes the LAST position as the <cls> representation;
            # this assumes the tokenizer puts <cls> at the end - confirm.
            output = hidden_states[:, -1]
        elif self.how == 'mean':
            output = hidden_states.mean(dim=1)
        elif self.how == 'max':
            # Tensor.max(dim=...) returns (values, indices); only the values
            # are the pooled representation.
            output = hidden_states.max(dim=1).values
        else:
            raise ValueError("Summary type {} not implemented".format(self.how))

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output

NameError: name 'nn' is not defined

In [None]:
class XLNetSentenceClassifier(nn.Module):
    """Sentence classifier: encoder -> sequence summary -> linear projection.

    Args:
        num_labels: Number of output classes (size of the logits).
        xlnet_model: Encoder module; it is called with the tokenizer output
            unpacked as keyword arguments and its first output is used.
        d_model: Hidden size of the encoder output.
    """

    def __init__(self, num_labels, xlnet_model, d_model=768):
        super().__init__()
        self.num_labels, self.d_model = num_labels, d_model
        self.transformer = xlnet_model
        summary_activation = nn.GELU()
        self.sequence_summary = XLNetSeqSummary('cls', d_model, summary_activation)
        self.logits_proj = nn.Linear(d_model, num_labels)

    def forward(self, model_inputs):
        """Return the unnormalized class logits for a batch of encoded inputs."""
        # Element 0 of the encoder output is taken as the hidden-state sequence.
        last_hidden = self.transformer(**model_inputs)[0]
        sentence_vec = self.sequence_summary(last_hidden)
        return self.logits_proj(sentence_vec)

def get_loss(criterion, logits, labels):
    """Apply ``criterion`` to ``(logits, labels)`` and return its result."""
    loss = criterion(logits, labels)
    return loss

In [None]:
# Sanity-check the forward pass and back-propagation of the classifier.
sentences = ['The ultimate answer to life, universe and time is 42.',
             'Take a towel for a space travel.']

labels = torch.LongTensor([0, 1])

# Instantiate the components: loss, classifier (2 labels) and optimizer.
criterion = nn.CrossEntropyLoss()
classifier = XLNetSentenceClassifier(2, model)
optimizer = torch.optim.AdamW(classifier.parameters())

# forward + loss
classifier.train()
optimizer.zero_grad()
logits = classifier(tokenizer(sentences, padding=True, return_tensors='pt'))
loss = get_loss(criterion, logits, labels)
print('loss: ', loss.item())

# backward step
loss.backward()
optimizer.step()
print("="*25)
print("Confirm that the gradients are computed for the original XLNet parameters.\n")
# Print every parameter's shape with the sum of its gradient, or None when
# no gradient was populated for it.
for param in classifier.parameters():
    print(param.shape, param.grad.sum() if not param.grad is None else param.grad)

### 例2 抽取式问答

In [None]:
class AnsStartLogits(nn.Module):
    """Score every token as a candidate start position of the answer span."""

    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, hidden_states, p_mask=None):
        """Return one start logit per token.

        Args:
            hidden_states: Encoder hidden states; the last axis has size
                ``hidden_size``.
            p_mask: Optional mask broadcastable to the logits. Where it is 1
                the logit is replaced by -1e30, where it is 0 the logit is kept.
        """
        scores = self.linear(hidden_states).squeeze(-1)
        if p_mask is None:
            return scores
        keep = 1 - p_mask
        return scores * keep - 1e30 * p_mask

In [None]:
class AnsEndLogits(nn.Module):
    """Score every token as a candidate end position of the answer span.

    The end score is conditioned on the start position: each token's hidden
    state is concatenated with the hidden state of the start token before
    being scored.
    """

    def __init__(self, hidden_size):
        super().__init__()
        scorer = [
            nn.Linear(hidden_size * 2, hidden_size),
            nn.Tanh(),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, 1),
        ]
        self.layer = nn.Sequential(*scorer)

    def forward(self, hidden_states, start_states, p_mask=None):
        """Return one end logit per token.

        Args:
            hidden_states: Encoder hidden states; last axis of size ``hidden_size``.
            start_states: Hidden state of the start token, with the same
                shape as ``hidden_states``.
            p_mask: Optional mask broadcastable to the logits. Where it is 1
                the logit is replaced by -1e30, where it is 0 the logit is kept.
        """
        paired = torch.cat((hidden_states, start_states), dim=-1)
        scores = self.layer(paired).squeeze(-1)
        if p_mask is None:
            return scores
        return scores * (1 - p_mask) - 1e30 * p_mask

In [None]:
class XLNetQuestionAnswering(nn.Module):
    """Extractive question-answering head on top of an encoder.

    Start positions are scored per token; end positions are scored per token
    conditioned on a start position (the gold one during training, the top-k
    predicted ones during inference).

    Args:
        num_labels: Accepted for signature compatibility; not used by this head.
        xlnet_model: Encoder module; called with the tokenizer output unpacked
            as keyword arguments, and its first output is used as the hidden states.
        d_model: Hidden size of the encoder output.
        top_k_start: Number of start candidates kept at inference.
        top_k_end: Number of end candidates kept per start candidate at inference.
    """

    def __init__(self, num_labels, xlnet_model, d_model=768,
                 top_k_start=2, top_k_end=2):
        super().__init__()
        self.transformer = xlnet_model
        self.start_logits = AnsStartLogits(d_model)
        self.end_logits = AnsEndLogits(d_model)  # d_model is the hidden size
        self.top_k_start = top_k_start
        self.top_k_end = top_k_end

    def forward(self, model_inputs, p_mask=None, start_positions=None):
        """Score answer spans.

        Args:
            model_inputs: Mapping of keyword arguments for the encoder.
            p_mask: Optional mask over token positions that cannot contain
                the answer (e.g. [CLS], [PAD], question tokens). 1.0 means the
                position is masked out, 0.0 means it is kept.
            start_positions: Gold start position of each answer. Provide it
                during training so that the end logits are computed with
                teacher forcing; shapes (bsz,), (bsz, 1) and (bsz, 1, 1) are
                accepted. Leave it as None at inference.

        Returns:
            Training (``start_positions`` given):
                ``(start_logits, end_logits)``.
            Inference (``start_positions`` is None):
                ``(start_top_probs, start_top_index, end_top_probs,
                end_top_index, start_logits, end_logits)`` where the end
                tensors are flattened to (bsz, top_k_start * top_k_end) with
                the start candidate as the slow axis.
        """
        transformer_outputs = self.transformer(**model_inputs)
        hidden_states = transformer_outputs[0]
        start_logits = self.start_logits(hidden_states, p_mask=p_mask)

        if start_positions is not None:
            # Training: teacher forcing - condition the end logits on the
            # hidden state at the gold start position.
            slen, hsz = hidden_states.shape[-2:]
            # reshape(-1, 1, 1) normalizes (bsz,), (bsz, 1) and (bsz, 1, 1).
            start_positions = start_positions.reshape(-1, 1, 1).expand(-1, -1, hsz)  # (bsz, 1, hsz)
            start_states = hidden_states.gather(-2, start_positions)  # (bsz, 1, hsz)
            start_states = start_states.expand(-1, slen, -1)  # (bsz, slen, hsz)
            end_logits = self.end_logits(hidden_states,
                                         start_states=start_states,
                                         p_mask=p_mask)
            return start_logits, end_logits

        # Inference: beam search - compute end logits for each of the
        # top_k_start start candidates.
        bsz, slen, hsz = hidden_states.size()  # batch size, seq len, hidden size
        start_probs = torch.softmax(start_logits, dim=-1)
        start_top_probs, start_top_index = torch.topk(start_probs, self.top_k_start, dim=-1)
        start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # (bsz, top_k_start, hsz)
        start_states = torch.gather(hidden_states, -2, start_top_index_exp)
        start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # (bsz, slen, top_k_start, hsz)

        hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states)
        # Add a trailing axis so the mask broadcasts over the start candidates.
        p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
        end_logits = self.end_logits(hidden_states_expanded,
                                     start_states=start_states,
                                     p_mask=p_mask)
        end_probs = torch.softmax(end_logits, dim=1)  # (bsz, slen, top_k_start)
        end_top_probs, end_top_index = torch.topk(end_probs, self.top_k_end, dim=1)
        # (bsz, top_k_end, top_k_start) -> (bsz, top_k_start, top_k_end)
        end_top_probs = torch.transpose(end_top_probs, 2, 1)
        end_top_index = torch.transpose(end_top_index, 2, 1)

        # Flatten so that entry i * top_k_end + j is the j-th end candidate
        # of the i-th start candidate.
        end_top_probs = end_top_probs.reshape(-1, self.top_k_start * self.top_k_end)
        end_top_index = end_top_index.reshape(-1, self.top_k_start * self.top_k_end)

        return start_top_probs, start_top_index, end_top_probs, end_top_index, start_logits, end_logits
            

def get_loss(criterion, start_logits, start_positions, end_logits, end_positions):
    """Return the mean of the start-position and end-position losses.

    ``criterion`` is applied first to the start pair and then to the end
    pair; the two results are averaged.
    """
    losses = (
        criterion(start_logits, start_positions),
        criterion(end_logits, end_positions),
    )
    return (losses[0] + losses[1])/2
    
        

### 检测用于训练的forward和backward

In [None]:
# Passage from which the answers are extracted.
context = r"""
    Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
    TensorFlow 2.0 and PyTorch.
    """
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]
# Hard-coded gold answer positions, one per question.
# NOTE(review): presumably token indices into the tokenized (question, context)
# pair - confirm against the tokenizer output.
start_positions = torch.LongTensor([95, 36, 110])
end_positions = torch.LongTensor([97, 88, 123])
# Per-question position masks: 1 = position cannot hold the answer, 0 = allowed.
# NOTE(review): the hard-coded lengths (125, 120, 125) are assumed to match the
# token count of each encoded pair - confirm.
p_mask = [[1]*12 + [0]*(125-14) + [1,1],
          [1]*7  + [0]*(120- 9) + [1,1],
          [1]*12 + [0]*(125-14) + [1,1]]

neg_log_loss = nn.CrossEntropyLoss()
# Arguments: num_labels, xlnet_model, d_model, top_k_start, top_k_end.
q_answer = XLNetQuestionAnswering(2, model, 768, 2, 2)
optimizer = torch.optim.AdamW(q_answer.parameters())

In [None]:
# One optimization step per question, printing diagnostics along the way.
q_answer.train()
for ith, question in enumerate(questions):
    # Reset the gradients at every step; otherwise gradients from earlier
    # questions accumulate into later optimizer steps.
    optimizer.zero_grad()
    start_logits, end_logits = q_answer(
        tokenizer(question,
                  context,
                  add_special_tokens=True,
                  return_tensors='pt'),
        p_mask=torch.ByteTensor(p_mask[ith]),
        start_positions=start_positions[ith].view(1, 1, 1)
        )
    # Use the loss defined for the QA section instead of relying on the
    # `criterion` left over from the classification cell.
    loss = get_loss(neg_log_loss,
                    start_logits,
                    start_positions[ith].view(-1),
                    end_logits,
                    end_positions[ith].view(-1))
    print("\n True Start:{}, True End:{}\n Pred Start Prob:{},Pred End Prob: {}\nPred Max Start: {}, Pred Max End: {}\nPred Max Start Prob: {}, Pred Max end Prob:{}\nLoss: {}\n".format(
        start_positions[ith].item(),
        end_positions[ith].item(),
        torch.sigmoid(start_logits[:, start_positions[ith]]).item(),
        torch.sigmoid(end_logits[:, end_positions[ith]]).item(),
        torch.argmax(start_logits).item(),
        torch.argmax(end_logits).item(),
        torch.sigmoid(torch.max(start_logits)).item(),
        torch.sigmoid(torch.max(end_logits)).item(),
        loss.item()
    ))
    print("="*25)
    loss.backward()
    optimizer.step()
print("\n Confirm that the gradients are computed for the original XLNET parameters")
# Gradients printed here are those of the last step.
for param in q_answer.parameters():
    print(param.shape, param.grad.sum() if param.grad is not None else param.grad)

### inference的forward以及实现Beam Search decoding

In [None]:
import numpy as np
def decode(start_probs, end_probs, topk):
    """Pick the ``topk`` most probable (start, end) pairs from the beam.

    Each pair is scored as
    P(start, end | sentence) = P(start | sentence) * P(end | start, sentence).

    Args:
        start_probs: 1-D array of start-candidate probabilities.
        end_probs: 1-D array of end-candidate probabilities, laid out so that
            entry ``i * top_k_end + j`` is the j-th end candidate of the
            i-th start candidate.
        topk: Number of best pairs to return.

    Returns:
        ``(start_ids, end_ids, probs)``: tuples holding, for each selected
        pair in descending score order, the index into ``start_probs``, the
        index into ``end_probs`` and the joint probability.
    """
    n_start = start_probs.shape[-1]
    n_end = end_probs.shape[-1] // n_start
    # One ((start_idx, flat_end_idx), joint_prob) entry per candidate pair.
    scored = [
        ((s, s * n_end + e), start_probs[s] * end_probs[s * n_end + e])
        for s in range(n_start)
        for e in range(n_end)
    ]
    scored.sort(key=lambda entry: entry[1], reverse=True)
    id_pairs, probs = zip(*scored[:topk])
    start_ids, end_ids = zip(*id_pairs)
    return start_ids, end_ids, probs

In [None]:
# inference
context = r"""
    Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
    TensorFlow 2.0 and PyTorch.
    """
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]
q_answer.eval()
for ith, question in enumerate(questions):
    inputs = tokenizer(question, context, add_special_tokens=True, return_tensors='pt')
    input_ids = inputs["input_ids"].tolist()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # No gradients are needed at inference time.
    with torch.no_grad():
        start_probs, start_index, end_probs, end_index, start_logits, end_logits = q_answer(
            inputs, p_mask=torch.ByteTensor(p_mask[ith]))
    pred_starts, pred_ends, probs = decode(
        start_probs.detach().squeeze().numpy(),
        end_probs.detach().squeeze().numpy(),
        2)
    # Only print the best answer. decode() returns indices into the beam,
    # which are mapped back to token positions here.
    start = start_index[:, pred_starts[0]].item()
    end = end_index[:, pred_ends[0]].item()
    print("="*25)
    print("True start: {}, True end: {}".format(
        start_positions[ith].item(),
        end_positions[ith].item()
        ))
    print("Max answer prob: {:0.8f}, start idx: {}, end idx: {}".format(
        probs[0],
        start,
        end,
    ))
    print("-"*25)
    print("Question: '{}'".format(question))
    # The predicted end position is treated as inclusive, so slice up to
    # end + 1; `start:end` would drop the last answer token.
    answer_ids = input_ids[start:end + 1]
    print("Answer: '{}'".format(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))))
    print("="*25)