In [1]:
import torch 
from tqdm import tqdm
import torch.nn as nn 
from torch.optim import Adam
import numpy as np
import os
import json
import time
import pandas as pd
import glob
import bert_seq2seq
from torch.utils.data import Dataset, DataLoader
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert

In [2]:
vocab_path = "./vocab/bert_wmm_chinese_vocab.txt"
word2idx, keep_tokens = load_chinese_base_vocab(vocab_path=vocab_path, simplfied=True)
model_name = 'bert'
model_path = "./model_file/torch_model.bin"
model_save_path = "./model_file/trained_model/summary.bin"

精简后的词表大小为：13584


In [3]:
df = pd.read_csv("./dataset/train_with_summ.csv")
del df["Unnamed: 0"]
# train_len = df.article.__len__() // 10 * 8 
train_dataset = df

In [5]:
batch_size = 8
lr = 1e-5
maxlen=512

In [6]:
class BertDataset(Dataset):
    """
    针对特定数据集，定义相关的取数据方式
    """
    def __init__(self):
        super(BertDataset, self).__init__()
        # 拿到数据集
        
        self.dataset = train_dataset
        # 词->id
        self.idx2word = {k: v for v, k in word2idx.items()}
        # 分词器
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        # 得到单个数据
        summary, article = self.dataset["summary"][i], self.dataset["article"][i]
        # print(article)
        # print(summary)
        token_ids, token_type_ids = self.tokenizer.encode(
            article, summary, max_length=maxlen
        )
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        
        return output
    
    def __len__(self):
        return len(self.dataset)

In [7]:
def collate_fn(batch):
    """
    动态padding，batch为一部分sample
    """
    def padding(indice, max_length, pad_idx=0):
        """
        pad函数
        """
        pad_indice = [item + [pad_idx] * max(0, max_length-len(item)) for item in indice]
        
        return torch.tensor(pad_indice)
    token_ids = [data["token_ids"]for data in batch]
    max_length = max([len(t) for t in token_ids])
    token_type_ids = (data["token_type_ids"] for data in batch)

    token_ids_padded = padding(token_ids, max_length)
    token_type_ids_padded = padding(token_type_ids, max_length)
    # 任务目标
    target_ids_padded = token_ids_padded[:,1:].contiguous()

    return token_ids_padded, token_type_ids_padded, target_ids_padded

In [8]:
class Trainer:
    def __init__(self):
        # 判断是否有可用GPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # 定义模型
        self.bert_model = load_bert(word2idx, model_name=model_name)
        ## 加载预训练的模型参数～
        self.bert_model = nn.DataParallel(self.bert_model)
        # self.bert_model = nn.parallel.data_parallel(self.bert_model, patch, device_ids=[0,1,2])
        self.bert_model.module.load_pretrain_params(model_path, keep_tokens=keep_tokens)
        # 加载已经训练好的模型，继续训练

        # 将模型发送到计算设备(GPU或CPU)
        self.bert_model.module.set_device(self.device)
        # 声明需要优化的参数
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
        # 声明自定义的数据加载器
        dataset = BertDataset()
        self.dataloader =  DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    def train(self, epoch):
        # 一个epoch的训练
        self.bert_model.module.train()
        self.iteration(epoch, dataloader=self.dataloader, train=True)
    
    def save(self, save_path):
        """
        保存模型
        """
        self.bert_model.module.save_all_params(save_path)
        print("{} saved!".format(save_path))

    def iteration(self, epoch, dataloader, train=True):
        total_loss = 0
        start_time = time.time() ## 得到当前时间
        step = 0
        report_loss = 0
        for token_ids, token_type_ids, target_ids in tqdm(dataloader,position=0, leave=True):
            step += 1
            if step % 1000 == 0:
                self.bert_model.eval()
                test_data = eval_dataset[:200]
                for text in test_data:
                    print(self.bert_model.module.generate(text, beam_size=3))
                print("loss is " + str(report_loss))
                report_loss = 0
                # self.eval(epoch)
                self.bert_model.module.train()
            if step % 8000 == 0:
                self.save(model_save_path)

            # 因为传入了target标签，因此会计算loss并且返回
            predictions, loss = self.bert_model(token_ids,
                                                token_type_ids,
                                                labels=target_ids,
                                               
                                                )
            report_loss += loss.item()
            # 反向传播
            if train:
                # 清空之前的梯度
                self.optimizer.zero_grad()
                # 反向传播, 获取新的梯度
                loss.backward()
                # 用获取的梯度更新模型参数
                self.optimizer.step()

            # 为计算当前epoch的平均loss
            total_loss += loss.item()

        end_time = time.time()
        spend_time = end_time - start_time
        # 打印训练信息
        print("epoch is " + str(epoch)+". loss is " + str(total_loss) + ". spend time is "+ str(spend_time))
        # 保存模型
        self.save(model_save_path)


In [9]:
trainer = Trainer()
train_epoches = 5

for epoch in range(train_epoches):
    # 训练一个epoch
    trainer.train(epoch)

device: cuda
  0%|          | 0/6250 [00:00<?, ?it/s]./model_file/torch_model.bin loaded!
  0%|          | 0/6250 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 6.00 GiB total capacity; 4.41 GiB already allocated; 43.26 MiB free; 4.46 GiB reserved in total by PyTorch)

In [44]:
pd.read_csv("./dataset/train_with_summ.csv")['article'].max().__len__()

3275

In [35]:
test_word2idx, test_keep_tokens = load_chinese_base_vocab(vocab_path=vocab_path, simplfied=True)

精简后的词表大小为：13584


In [36]:
type(test_word2idx), type(test_keep_tokens)

(dict, list)

In [14]:
test_dataset = BertDataset()
test_dataset.__getitem__(1)

中新社西宁11月22日电(赵凛松)青海省林业厅野生动植物和自然保护区管理局高级工程师张毓22日向中新社记者确认：“经过中国林业科学院、中科院新疆生态与地理研究所和青海省林业厅的共同认定，出现在青海省海西州境内的三只体型较大的鸟为世界极度濒危的红鹳目红鹳科红鹳属的大红鹳。”11月18日，青海省海西州可鲁克湖—托素湖国家级陆生野生动物疫源疫病监测站在野外监测巡护过程中，在可鲁克湖西南岸入水口盐沼滩发现三只体型较大的鸟类。张毓说：“此前在该区域从未发现过这种体型的鸟类。”可鲁克湖—托素湖位于青海省柴达木盆地东北部，海拔2800米，水域湿地环境内的优势种动物主要是水禽，共有30余种。根据拍摄的照片以及视频，张毓根据动物学体型得出了初步结论，然后会同中国林业科学院和中科院新疆生态与地理研究所的相关专家，确认了这三只鸟为红鹳目红鹳科红鹳属的大红鹳。大红鹳也称为大火烈鸟、红鹤等，三只鸟类特征为大红鹳亚成体。根据世界自然保护联盟、世界濒危动物红色名录，该鸟主要分布于非洲、中亚、南亚等区域，分布广、种群数量较大，无威胁因子，以往在中国并无分布。但1997年在新疆野外首次发现并确定该鸟在中国境内有分布，为中国鸟类新纪录，2012年在四川也发现一只该鸟亚成体。此次野外发现在中国属第三次。“我们现在还无法判断这三只鸟从何而来。不过我个人倾向于是从中亚国家迁徙至此。”张毓强调说，该种鸟国内也有人工饲养，因此也有人判断为从动物园逃逸。“我们对这三只鸟进行了详尽的记录，如果明年这个时间还在此地出现这种鸟，那就能肯定是迁徙的鸟类，而不是从动物园里跑出来的。”由于目前可鲁克湖—托素湖已开始结冰，鸟类采食困难，不排除三只鸟由于无法获得能量补给而进行远距离迁飞的可能。青海省林业厅野生动物行政主管部门将随时做好野外救护的各项准备工作。(完)
青海首次野外发现濒危大火烈鸟 尚不清楚具体来源


{'token_ids': [2,
  602,
  3071,
  4750,
  6103,
  2021,
  8008,
  3197,
  8027,
  3087,
  4408,
  11,
  6525,
  1021,
  3249,
  12,
  7369,
  3760,
  4587,
  3258,
  587,
  1222,
  6927,
  4393,
  1118,
  3388,
  4187,
  1367,
  5530,
  4095,
  822,
  2742,
  1175,
  4950,
  4313,
  2127,
  7668,
  5175,
  2237,
  4821,
  2258,
  2374,
  3580,
  8027,
  3087,
  1301,
  602,
  3071,
  4750,
  6279,
  5340,
  4700,
  6269,
  7936,
  1,
  5205,
  6712,
  602,
  1642,
  3258,
  587,
  4804,
  2008,
  7266,
  408,
  602,
  4804,
  7266,
  3071,
  4436,
  4393,
  2476,
  578,
  1663,
  4313,
  4675,
  4853,
  2690,
  1367,
  7369,
  3760,
  4587,
  3258,
  587,
  1222,
  4536,
  964,
  1296,
  6269,
  2035,
  7922,
  1037,
  4283,
  1660,
  7369,
  3760,
  4587,
  3760,
  6103,
  2234,
  1760,
  977,
  4536,
  574,
  1270,
  758,
  1696,
  6670,
  1818,
  4536,
  7779,
  609,
  584,
  4416,
  3251,
  2326,
  3983,
  1212,
  4536,
  5171,
  7815,
  4578,
  5171,
  7815,
  4804,
  5171,
  781

In [15]:
[1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1].__len__()

23