# Pytorch入门实战（7）：基于BERT实现问答任务（Question Answering task）

In [414]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModel

# Globe Config

In [415]:
batch_size = 4
text_max_length = 512
summary_max_length = 48
epochs = 10

In [416]:
dataset = load_dataset("amazon_reviews_multi", "zh")

Reusing dataset amazon_reviews_multi (C:\Users\zhaohongfei1\.cache\huggingface\datasets\amazon_reviews_multi\zh\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

In [417]:
dataset["train"][0]

{'review_id': 'zh_0626061',
 'product_id': 'product_zh_0691762',
 'reviewer_id': 'reviewer_zh_0824776',
 'stars': 1,
 'review_body': '本人账号被盗，资金被江西（杨建）挪用，请亚马逊尽快查实，将本人的200元资金退回。本人已于2017年11月30日提交退货申请，为何到2018年了还是没解决？亚马逊是什么情况？请给本人一个合理解释。',
 'review_title': '此书不是本人购买',
 'language': 'zh',
 'product_category': 'book'}

In [418]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# Dataset And Dataloader

In [430]:
class SummarizationDataset(Dataset):

    def __init__(self, mode='train'):
        super(SummarizationDataset, self).__init__()
        self.dataset = dataset[mode]

    def __getitem__(self, index):
        data = self.dataset[index]
        text = data['review_body']
        summary = data['review_title']
        return text, summary

    def __len__(self):
        # return len(self.dataset)
        return 5

In [431]:
train_dataset = SummarizationDataset()

In [432]:
len(train_dataset)

5

In [433]:
def collate_fn(batch):
    # print(batch)
    text, summary = zip(*batch)
    text, summary = list(text), list(summary)

    # src是要送给bert的，所以不需要特殊处理，直接用tokenizer的结果即可
    src = tokenizer(text, padding='max_length', max_length=text_max_length, return_tensors='pt', truncation=True)
    tgt = tokenizer(summary, padding='max_length', max_length=summary_max_length, return_tensors='pt', truncation=True)

    tgt_y = {}
    for key, value in tgt.items():
        tgt_y[key] = value[:, 1:]

    for key, value in tgt.items():
        tgt[key] = value[:, :-1]

    n_tokens = tgt_y['attention_mask'].sum().item()

    return src, tgt, tgt_y, n_tokens

In [434]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Model

In [435]:
class SummarizationModel(nn.Module):

    def __init__(self):
        super(SummarizationModel, self).__init__()

        self.bert = AutoModel.from_pretrained("bert-base-chinese")
        decoder_layer = nn.TransformerDecoderLayer(d_model=768, nhead=8, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
        self.embeddings = self.bert.embeddings
        self.predictor = nn.Linear(768, tokenizer.vocab_size)

    def forward(self, src, tgt):
        last_hidden_state = self.bert(**src).last_hidden_state
        decoder_inputs = self.embeddings(tgt['input_ids'])
        decoder_outputs = self.decoder(tgt=decoder_inputs, memory=last_hidden_state)
        return decoder_outputs

In [436]:
model = SummarizationModel()

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [437]:
class SummarizationLoss(nn.Module):

    def __init__(self):
        super(SummarizationLoss, self).__init__()
        self.criteria = nn.CrossEntropyLoss()

    def forward(self, outputs, tgt_y, n_tokens):
        targets = tgt_y['input_ids'].flatten()
        outputs = outputs.view(-1, tokenizer.vocab_size)
        outputs[targets == 0] = 0
        return self.criteria(outputs, targets) / n_tokens

# Train

In [438]:
criteria = SummarizationLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

In [439]:
model.train()

total_loss = 0.
step = 0
for epoch in range(epochs):
    for batch in train_loader:
        src, tgt, tgt_y, n_tokens = batch
        outputs = model(src, tgt)
        outputs = model.predictor(outputs)
        loss = criteria(outputs, tgt_y, n_tokens)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss
        step += 1

        if step % 1 == 0:
            print("total loss:{}", total_loss)
            total_loss = 0

total loss:{} tensor(0.2324, grad_fn=<AddBackward0>)
total loss:{} tensor(1.9750, grad_fn=<AddBackward0>)


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 50331648 bytes.