# Pytorch入门实战（7）：基于BERT实现简单的中文文本摘要任务（Summarization task）

In [None]:
!pip install datasets
!pip install transformers

In [17]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModel

# Globe Config

In [18]:
batch_size = 4
text_max_length = 512
summary_max_length = 48
epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Device:", device)

Device: cuda


In [19]:
dataset = load_dataset("amazon_reviews_multi", "zh")

Using the latest cached version of the module from C:\Users\iiosn\.cache\huggingface\modules\datasets_modules\datasets\amazon_reviews_multi\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609 (last modified on Wed Aug 10 20:35:47 2022) since it couldn't be found locally at amazon_reviews_multi., or remotely on the Hugging Face Hub.
Reusing dataset amazon_reviews_multi (C:\Users\iiosn\.cache\huggingface\datasets\amazon_reviews_multi\zh\1.0.0\724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
dataset["train"][0]

{'review_id': 'zh_0626061',
 'product_id': 'product_zh_0691762',
 'reviewer_id': 'reviewer_zh_0824776',
 'stars': 1,
 'review_body': '本人账号被盗，资金被江西（杨建）挪用，请亚马逊尽快查实，将本人的200元资金退回。本人已于2017年11月30日提交退货申请，为何到2018年了还是没解决？亚马逊是什么情况？请给本人一个合理解释。',
 'review_title': '此书不是本人购买',
 'language': 'zh',
 'product_category': 'book'}

In [21]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# Dataset And Dataloader

In [22]:
class SummarizationDataset(Dataset):

    def __init__(self, mode='train'):
        super(SummarizationDataset, self).__init__()
        self.dataset = dataset[mode]

    def __getitem__(self, index):
        data = self.dataset[index]
        text = data['review_body']
        summary = data['review_title']
        return text, summary

    def __len__(self):
        # return len(self.dataset)
        return 1

In [23]:
train_dataset = SummarizationDataset()

In [24]:
len(train_dataset)

1

In [25]:
def collate_fn(batch):
    # print(batch)
    text, summary = zip(*batch)
    text, summary = list(text), list(summary)

    # src是要送给bert的，所以不需要特殊处理，直接用tokenizer的结果即可
    src = tokenizer(text, padding='max_length', max_length=text_max_length, return_tensors='pt', truncation=True)
    tgt = tokenizer(summary, padding='max_length', max_length=summary_max_length, return_tensors='pt', truncation=True)

    tgt_y = {}
    for key, value in tgt.items():
        tgt_y[key] = value[:, 1:]

    for key, value in tgt.items():
        tgt[key] = value[:, :-1]

    n_tokens = tgt_y['attention_mask'].sum().item()

    return src, tgt, tgt_y, n_tokens

In [26]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Model

In [27]:
class SummarizationModel(nn.Module):

    def __init__(self):
        super(SummarizationModel, self).__init__()

        self.bert = AutoModel.from_pretrained("bert-base-chinese")
        decoder_layer = nn.TransformerDecoderLayer(d_model=768, nhead=8, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
        self.embeddings = self.bert.embeddings
        self.predictor = nn.Linear(768, tokenizer.vocab_size)

    def forward(self, src, tgt):
        last_hidden_state = self.bert(**src).last_hidden_state
        decoder_inputs = self.embeddings(tgt['input_ids'])
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt['input_ids'].size(-1)).to(device)
        tgt_key_padding_mask = tgt['attention_mask'] == 0
        decoder_outputs = self.decoder(tgt=decoder_inputs, memory=last_hidden_state, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)
        return decoder_outputs

In [28]:
model = SummarizationModel()
model = model.to(device)

In [29]:
class SummarizationLoss(nn.Module):

    def __init__(self):
        super(SummarizationLoss, self).__init__()
        self.criteria = nn.CrossEntropyLoss(ignore_index=0)

    def forward(self, outputs, tgt_y, n_tokens):
        targets = tgt_y['input_ids'].flatten()
        outputs = outputs.view(-1, tokenizer.vocab_size)

        return self.criteria(outputs, targets) / n_tokens

# Train

In [30]:
criteria = SummarizationLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

In [31]:
model.train()

if torch.cuda.is_available():
    torch.cuda.empty_cache()

total_loss = 0.
step = 0

def to_device(dict_tensors):
    result_tensors = {}
    for key, value in dict_tensors.items():
        result_tensors[key] = value.to(device)
    return result_tensors

for epoch in range(10000):
    for batch in train_loader:
        src, tgt, tgt_y, n_tokens = batch
        src, tgt, tgt_y = to_device(src), to_device(tgt), to_device(tgt_y)
        outputs = model(src, tgt)
        outputs = model.predictor(outputs)
        loss = criteria(outputs, tgt_y, n_tokens)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss
        step += 1

        if step % 1 == 0:
            print("total loss:{}", total_loss)
            total_loss = 0


        # del batch, src, tgt, tgt_y, outputs
    # break

total loss:{} tensor(1.1201, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.9838, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.8892, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.8144, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.7552, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.7146, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.6837, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.6363, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.6121, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.5796, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.5673, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.5434, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.5082, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor(0.4862, device='cuda:0', grad_fn=<AddBackward0>)
total loss:{} tensor

KeyboardInterrupt: 

In [50]:
outputs

tensor([[[-0.1375,  0.0668, -0.2818,  ..., -1.1806,  0.0993, -0.1572],
         [-0.8236,  0.5126,  0.0020,  ..., -0.1308, -0.0932,  0.4920],
         [-0.5635, -0.8716,  0.3278,  ...,  0.2668, -0.8484,  0.1565],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0', grad_fn=<CopySlices>)

In [55]:
nn.Softmax(dim=-1)(outputs.view(-1, tokenizer.vocab_size)).max(dim=-1)

torch.return_types.max(
values=tensor([9.8903e-01, 9.8789e-01, 9.8701e-01, 9.8842e-01, 9.8890e-01, 9.9022e-01,
        9.8810e-01, 9.8694e-01, 9.8804e-01, 4.7331e-05, 4.7331e-05, 4.7331e-05,
        4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05,
        4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05,
        4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05,
        4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05,
        4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05,
        4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05, 4.7331e-05],
       device='cuda:0', grad_fn=<MaxBackward0>),
indices=tensor([3634,  741,  679, 3221, 3315,  782, 6579,  743,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,

In [60]:
criteria(outputs, tgt_y, n_tokens)

tensor(0.8949, device='cuda:0', grad_fn=<DivBackward0>)

In [72]:
outputs[:, 9:, 0] = 1

In [75]:
outputs

tensor([[[-0.1375,  0.0668, -0.2818,  ..., -1.1806,  0.0993, -0.1572],
         [-0.8236,  0.5126,  0.0020,  ..., -0.1308, -0.0932,  0.4920],
         [-0.5635, -0.8716,  0.3278,  ...,  0.2668, -0.8484,  0.1565],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0', grad_fn=<CopySlices>)

In [95]:
criteria(outputs, tgt_y, n_tokens)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<AsStridedBackward0>) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')


tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)

In [88]:
tgt_y['input_ids']

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0')

In [90]:
tgt_y

{'input_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        device='cuda:0')}