# DuReader Robust

In [1]:
!pip install --upgrade paddlenlp -i https://mirror.baidu.com/pypi/simple/

Looking in indexes: https://mirror.baidu.com/pypi/simple/
Collecting paddlenlp
[?25l  Downloading https://mirror.baidu.com/pypi/packages/62/10/ccc761d3e3a994703f31a4d0f93db0d13789d1c624a0cbbe9fe6439ed601/paddlenlp-2.0.5-py3-none-any.whl (435kB)
[K     |████████████████████████████████| 440kB 10.1MB/s eta 0:00:01
Installing collected packages: paddlenlp
  Found existing installation: paddlenlp 2.0.1
    Uninstalling paddlenlp-2.0.1:
      Successfully uninstalled paddlenlp-2.0.1
Successfully installed paddlenlp-2.0.5


In [None]:
import json
import math
import os
import random
import time
from functools import partial

import numpy as np
import paddle
import paddlenlp as ppnlp
from paddle.io import DataLoader
from paddle.io import BatchSampler
from paddle.io import DistributedBatchSampler
from paddlenlp.data import Dict
from paddlenlp.data import Pad
from paddlenlp.data import Stack
from paddlenlp.data import Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.datasets import MapDataset
from paddlenlp.ops.optimizer import AdamW
from paddlenlp.transformers import BertForQuestionAnswering
from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers import ErnieForQuestionAnswering
from paddlenlp.transformers import ErnieTokenizer
from paddlenlp.transformers import ErnieGramForQuestionAnswering
from paddlenlp.transformers import ErnieGramModel
from paddlenlp.transformers import ErnieGramTokenizer
from paddlenlp.transformers import RobertaForQuestionAnswering
from paddlenlp.transformers import RobertaTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup

from config import Config
from utils import CrossEntropyLossForSQuAD
from utils import evaluate
from utils import predict
from utils import prepare_train_features
from utils import prepare_validation_features
from utils import set_seed



In [None]:
# Registry of supported backbones: model-type key -> (QA model class, tokenizer class).
# do_train / do_predict index this mapping with args.model_type.
MODEL_CLASSES = dict(
    bert=(BertForQuestionAnswering, BertTokenizer),
    ernie=(ErnieForQuestionAnswering, ErnieTokenizer),
    ernie_gram=(ErnieGramForQuestionAnswering, ErnieGramTokenizer),
    roberta=(RobertaForQuestionAnswering, RobertaTokenizer),
)

In [None]:
def do_train(args):
    """Fine-tune a question-answering model and keep the best checkpoint.

    The training pool is the DuReader-robust train split plus the CMRC2018
    train and dev splits; the DuReader-robust dev split is used for
    evaluation. Every ``args.save_steps`` steps (and at the final step) the
    model is evaluated on the dev split, and whenever the F1 score improves
    the model and tokenizer are saved to ``<args.output_dir>/best_model``.

    Args:
        args: Configuration object. Attributes read directly here:
            ``device``, ``model_type``, ``model_name_or_path``,
            ``max_seq_length``, ``doc_stride``, ``batch_size``,
            ``output_dir``, ``max_steps``, ``num_train_epochs``,
            ``learning_rate``, ``warmup_proportion``, ``adam_epsilon``,
            ``weight_decay``, ``log_steps`` and ``save_steps``. It is also
            passed whole to ``set_seed``. ``args.model_type`` is lower-cased
            in place.

    Raises:
        KeyError: If ``args.model_type`` is not a key of ``MODEL_CLASSES``.
    """
    paddle.set_device(args.device)

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)

    train_robust, dev_ds = load_dataset('dureader_robust', splits=['train', 'dev'])
    train_cmrc, dev_cmrc = load_dataset('cmrc2018', splits=['train', 'dev'])

    # Training pool: DuReader-robust train + all of CMRC2018 (train and dev).
    train_examples = [*train_robust, *train_cmrc, *dev_cmrc]
    train_ds = MapDataset(train_examples)

    train_trans_func = partial(
        prepare_train_features,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        tokenizer=tokenizer
    )
    train_ds.map(train_trans_func, batched=True)

    dev_trans_func = partial(
        prepare_validation_features,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        tokenizer=tokenizer
    )
    dev_ds.map(dev_trans_func, batched=True)

    # Batch samplers: shuffled for training, fixed order for evaluation.
    train_batch_sampler = DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True
    )
    dev_batch_sampler = BatchSampler(
        dataset=dev_ds,
        batch_size=args.batch_size,
        shuffle=False
    )

    # Collate functions: pad the id sequences, stack the answer positions.
    train_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64")
    }): fn(samples)

    dev_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    # Data loaders.
    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=train_batchify_fn,
        return_list=True
    )
    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=dev_batchify_fn,
        return_list=True
    )

    output_dir = os.path.join(args.output_dir, 'best_model')
    os.makedirs(output_dir, exist_ok=True)

    # This directory is only created here; nothing in this function reads
    # from or writes to it.
    init_checkpoint = os.path.join(args.output_dir, 'model_cmrc2018')
    os.makedirs(init_checkpoint, exist_ok=True)

    model = model_class.from_pretrained(args.model_name_or_path)

    num_batches = len(train_data_loader)
    # A positive max_steps overrides the epoch count.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
    else:
        num_training_steps = num_batches * args.num_train_epochs
    num_train_epochs = math.ceil(num_training_steps / num_batches)

    lr_scheduler = LinearDecayWithWarmup(
        learning_rate=args.learning_rate,
        total_steps=num_training_steps,
        warmup=args.warmup_proportion
    )

    # Names of the parameters that receive weight decay: everything except
    # bias and LayerNorm parameters. Iterating over the whole model (rather
    # than `model.roberta`) works for every entry in MODEL_CLASSES and also
    # covers the QA head.
    decay_params = {
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params
    )

    criterion = CrossEntropyLossForSQuAD()

    best_val_f1 = 0.0
    global_step = 0
    tic_train = time.time()
    for epoch in range(1, num_train_epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):

            global_step += 1

            input_ids, segment_ids, start_positions, end_positions = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, (start_positions, end_positions))

            if global_step % args.log_steps == 0:
                print("global step %d, epoch: %d, batch: %d/%d, loss: %.5f,  speed: %.2f step/s, lr: %1.16e" % (
                    global_step, epoch, step, num_batches, loss, args.log_steps / (time.time() - tic_train), lr_scheduler.get_lr()))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            is_last_step = global_step == num_training_steps
            if global_step % args.save_steps == 0 or is_last_step:
                # NOTE(review): assumes evaluate() leaves the model in
                # training mode when it returns - confirm in utils.
                em, f1 = evaluate(model=model, data_loader=dev_data_loader)

                print("global step: %d, eval dev Exact Mactch: %.5f, f1_score: %.5f" % (global_step, em, f1))

                if f1 > best_val_f1:
                    best_val_f1 = f1

                    print("save model at global step: %d, best eval f1_score: %.5f" % (global_step, best_val_f1))

                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                if is_last_step:
                    # The step budget is exhausted: stop training entirely.
                    return


In [None]:
def do_predict(args):
    """Predict answers for the DuReader-robust test split with the saved model.

    Loads the tokenizer and model from ``<args.output_dir>/best_model``,
    runs ``predict`` over the test split, writes the predictions to
    ``prediction.json`` in the current working directory, and prints the
    question, context and predicted answer of the first five examples.

    Args:
        args: Configuration object. Attributes read here: ``device``,
            ``output_dir``, ``model_type``, ``max_seq_length``,
            ``doc_stride`` and ``batch_size``.

    Raises:
        KeyError: If ``args.model_type`` (case-insensitive) is not a key of
            ``MODEL_CLASSES``.
    """
    paddle.set_device(args.device)

    output_dir = os.path.join(args.output_dir, "best_model")

    # 1. Load the test split.
    test_ds = load_dataset('dureader_robust', splits='test')

    # Lower-case the key here as well, so prediction also works when it is
    # run without do_train having normalised args.model_type first.
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type.lower()]
    tokenizer = tokenizer_class.from_pretrained(output_dir)

    # 2. Convert the examples to model input features.
    test_trans_func = partial(
        prepare_validation_features,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        tokenizer=tokenizer
    )
    test_ds.map(test_trans_func, batched=True)

    # Fixed-order batches for inference.
    test_batch_sampler = BatchSampler(
        dataset=test_ds,
        batch_size=args.batch_size,
        shuffle=False
    )

    # Pad the id sequences of each batch to a common length.
    test_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    test_data_loader = DataLoader(
        dataset=test_ds,
        batch_sampler=test_batch_sampler,
        collate_fn=test_batchify_fn,
        return_list=True
    )

    model = model_class.from_pretrained(output_dir)

    all_predictions = predict(model, test_data_loader)

    # Can also write all_nbest_json and scores_diff_json files if needed
    with open('prediction.json', "w", encoding='utf-8') as writer:
        writer.write(
            json.dumps(
                all_predictions, ensure_ascii=False, indent=4) + "\n")

    # Print a short preview: the first five examples with their predictions.
    for shown, example in enumerate(test_data_loader.dataset.data):
        if shown >= 5:
            break
        print()
        print('问题：',example['question'])
        print('原文：',''.join(example['context']))
        print('答案：',all_predictions[example['id']])

In [None]:
# Run configuration consumed by do_train / do_predict.
args = Config(**{
    # Backbone: key into MODEL_CLASSES and the pretrained weights to load.
    'model_type': 'roberta',
    'model_name_or_path': 'roberta-wwm-ext-large',
    # The best checkpoint is written to <output_dir>/best_model.
    'output_dir': './outputs/dureader-robust/',

    # Input features and batching.
    'max_seq_length': 384,
    'batch_size': 22,

    # Optimisation schedule.
    'learning_rate': 5e-5,
    'num_train_epochs': 10,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01,

    # Log every `log_steps` steps; evaluate on dev every `save_steps` steps.
    'log_steps': 10,
    'save_steps': 500,
})

In [7]:
# Fine-tune with the configuration above; the checkpoint with the best dev F1
# is saved under <args.output_dir>/best_model.
do_train(args)

[2021-07-06 23:05:33,999] [    INFO] - Found /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/vocab.txt
[2021-07-06 23:07:57,384] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/roberta_chn_large.pdparams


global step 10, epoch: 1, batch: 10/2350, loss: 5.86297,  speed: 0.71 step/s, lr: 1.9148936170212765e-07
global step 20, epoch: 1, batch: 20/2350, loss: 5.86746,  speed: 0.67 step/s, lr: 4.0425531914893619e-07
global step 30, epoch: 1, batch: 30/2350, loss: 5.92419,  speed: 0.67 step/s, lr: 6.1702127659574471e-07
global step 40, epoch: 1, batch: 40/2350, loss: 5.76049,  speed: 0.67 step/s, lr: 8.2978723404255318e-07
global step 50, epoch: 1, batch: 50/2350, loss: 5.56990,  speed: 0.67 step/s, lr: 1.0425531914893618e-06
global step 60, epoch: 1, batch: 60/2350, loss: 5.51904,  speed: 0.66 step/s, lr: 1.2553191489361703e-06
global step 70, epoch: 1, batch: 70/2350, loss: 5.41459,  speed: 0.66 step/s, lr: 1.4680851063829787e-06
global step 80, epoch: 1, batch: 80/2350, loss: 5.29762,  speed: 0.66 step/s, lr: 1.6808510638297873e-06
global step 90, epoch: 1, batch: 90/2350, loss: 4.90931,  speed: 0.66 step/s, lr: 1.8936170212765956e-06
global step 100, epoch: 1, batch: 100/2350, loss: 4.695

KeyboardInterrupt: 

In [None]:
# Predict on the test split with the saved best model; this writes
# prediction.json and prints the first five predictions.
do_predict(args)

---

In [None]:
from paddlenlp.datasets import load_dataset


# Scratch exploration: load the train/dev splits of both corpora for inspection.
train_robust, dev_robust = load_dataset('dureader_robust', splits=('train', 'dev'))
train_cmrc, dev_cmrc = load_dataset('cmrc2018', splits=['train', 'dev'])


100%|██████████| 20038/20038 [00:00<00:00, 58154.62it/s]
100%|██████████| 7236/7236 [00:00<00:00, 25550.67it/s]
100%|██████████| 3222/3222 [00:00<00:00, 28489.79it/s]


In [None]:
# Number of examples in each split, in the order: robust train, robust dev,
# cmrc train, cmrc dev.
tuple(len(split) for split in (train_robust, dev_robust, train_cmrc, dev_cmrc))

(14520, 1417, 10142, 3219)

In [None]:
# Accumulator for the merged training examples (filled by the cells below).
train_dataset = []

In [None]:
# Add every DuReader-robust training example to the merged pool.
train_dataset.extend(train_robust)

In [None]:
# Add every CMRC2018 training example to the merged pool.
train_dataset.extend(train_cmrc)

In [None]:
# Sanity check: the merged pool should be as large as the two train splits combined.
len(train_dataset), len(train_robust) + len(train_cmrc)

(24662, 24662)

In [None]:
# Show both label lists as the cell result; wrapping each in print() inside a
# tuple expression only adds a spurious (None, None) result.
train_robust.label_list, train_cmrc.label_list

None
None


(None, None)

In [None]:
# Also fold the CMRC2018 dev split into the training pool.
train_dataset.extend(dev_cmrc)

In [None]:
# Size of the merged pool after adding the CMRC2018 dev split.
len(train_dataset)

27881

In [None]:
from paddlenlp.datasets import MapDataset

# Wrap the merged list of examples in a MapDataset.
train_ds = MapDataset(train_dataset)

In [None]:
# Confirm the type of the wrapped dataset.
type(train_ds)

paddlenlp.datasets.dataset.MapDataset

In [2]:
from paddlenlp.datasets import load_dataset

In [3]:
# Load the DuReader-robust train and dev splits for the augmentation experiment below.
train_robust, dev_ds = load_dataset('dureader_robust', splits=['train', 'dev'])

100%|██████████| 20038/20038 [00:00<00:00, 64438.60it/s]


In [6]:
# Materialise the examples into a plain list. Note that this rebinds
# `train_robust` from the object returned by load_dataset to a list.
train_robust = list(train_robust)

In [8]:
# Check the container type and the number of examples after the conversion.
type(train_robust), len(train_robust)

(list, 14520)

In [9]:
# Inspect the first example to see which fields it carries.
train_robust[0]

{'id': '0a25cb4bc1ab6f474c699884e04601e4',
 'title': '',
 'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。',
 'question': '仙剑奇侠传3第几集上天界',
 'answers': ['第35集'],
 'answer_starts': [0]}

In [10]:
# Accumulator for the original examples plus their augmented variants.
train_robust_aug = []

In [11]:
import random

In [12]:
from paddlenlp.transformers import ErnieGramTokenizer

In [13]:
# Tokenizer for the 'ernie-gram-zh' pretrained model.
tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

[2021-07-18 23:13:02,309] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/vocab.txt
100%|██████████| 78/78 [00:00<00:00, 3670.26it/s]


In [None]:
for example in train_robust:
    # Always keep the original example.
    train_robust_aug.append(example)
    for it in range(5):
        # TODO: the augmentation step was never written. Build an augmented
        # copy of `example` here and append it to train_robust_aug.
        pass

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions.