# DuReader-Checklist

In [1]:
!pip install --upgrade paddlenlp -i https://pypi.org/simple

Collecting paddlenlp
[?25l  Downloading https://files.pythonhosted.org/packages/b0/12/a827fac49f02eb642b9c0b7906e1684c24d87d866c6ccc9f40f76c41fc3e/paddlenlp-2.0.6-py3-none-any.whl (485kB)
[K     |████████████████████████████████| 491kB 25kB/s eta 0:00:016
Installing collected packages: paddlenlp
  Found existing installation: paddlenlp 2.0.1
    Uninstalling paddlenlp-2.0.1:
      Successfully uninstalled paddlenlp-2.0.1
Successfully installed paddlenlp-2.0.6


In [2]:
import json
import math
import os
import random
import time
from functools import partial

import numpy as np
import paddle
from paddle.io import BatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddlenlp.data import Dict
from paddlenlp.data import Pad
from paddlenlp.data import Stack
from paddlenlp.data import Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.datasets import MapDataset
from paddlenlp.ops.optimizer import AdamW
from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers import ErnieTokenizer
from paddlenlp.transformers import ErnieGramTokenizer
from paddlenlp.transformers import RobertaTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup

from models import BertForQuestionAnswering
from models import ErnieForQuestionAnswering
from models import ErnieGramForQuestionAnswering
from models import RobertaForQuestionAnswering

from config import Config
from dataset import DuReaderChecklist

from utils import compute_prediction_checklist
from utils import CrossEntropyLossForChecklist
from utils import evaluate
from utils import predict
from utils import prepare_train_features
from utils import prepare_validation_features
from utils import set_seed


In [3]:

# Lookup table: lower-case model-type key -> (question-answering model class,
# tokenizer class). do_train / do_predict index it with args.model_type.
MODEL_CLASSES = dict(
    bert=(BertForQuestionAnswering, BertTokenizer),
    ernie=(ErnieForQuestionAnswering, ErnieTokenizer),
    ernie_gram=(ErnieGramForQuestionAnswering, ErnieGramTokenizer),
    roberta=(RobertaForQuestionAnswering, RobertaTokenizer),
)

In [4]:
def do_train(args):
    """Fine-tune a question-answering model on the DuReader-Checklist data.

    Builds the tokenizer, datasets and data loaders described by ``args``,
    trains with AdamW under a ``LinearDecayWithWarmup`` schedule, evaluates on
    the dev set every ``args.save_steps`` steps (and at the final step), and
    saves the model and tokenizer to ``<args.output_dir>/best_model`` whenever
    the dev F1 score improves on the best value seen so far.

    Args:
        args: Configuration object. Attributes read directly here:
            ``device``, ``model_type``, ``model_name_or_path``, ``train_file``,
            ``dev_file``, ``batch_size``, ``output_dir``, ``max_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_proportion``,
            ``adam_epsilon``, ``weight_decay``, ``logging_steps`` and
            ``save_steps``. ``args`` is also forwarded to ``set_seed``, the
            feature-preparation helpers and ``evaluate``.

    Side effects:
        ``args.model_type`` is lower-cased in place; the checkpoint directory
        is created; progress is printed to stdout.

    Raises:
        AssertionError: If ``args.train_file`` is ``None``.
        KeyError: If the lower-cased ``args.model_type`` is not a key of
            ``MODEL_CLASSES``.
    """
    paddle.set_device(args.device)

    # Normalise the key in place so later lookups on the same args object agree.
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)

    assert args.train_file is not None, "--train_file should be set when training!"
    train_ds = DuReaderChecklist().read(args.train_file)
    dev_ds = DuReaderChecklist().read(args.dev_file)

    # Convert raw examples into model features (batched mapping).
    train_trans_func = partial(
        prepare_train_features,
        tokenizer=tokenizer,
        args=args
    )
    train_ds.map(train_trans_func, batched=True)

    dev_trans_func = partial(
        prepare_validation_features,
        tokenizer=tokenizer,
        args=args
    )
    dev_ds.map(dev_trans_func, batched=True)

    # Define the batchify (collate) functions. Variable-length fields are
    # padded; label fields are stacked as int64.
    train_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64"),
        "answerable_label": Stack(dtype="int64")
    }): fn(samples)

    dev_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    # Define the batch samplers: shuffled for training, in order for dev.
    train_batch_sampler = DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True
    )
    dev_batch_sampler = BatchSampler(
        dataset=dev_ds,
        batch_size=args.batch_size,
        shuffle=False
    )

    # Build the data loaders.
    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=train_batchify_fn,
        return_list=True
    )

    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=dev_batchify_fn,
        return_list=True
    )

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    output_dir = os.path.join(args.output_dir, 'best_model')
    os.makedirs(output_dir, exist_ok=True)

    # Start from the pretrained weights. To continue from a previously saved
    # checkpoint instead, load from `output_dir` here.
    model = model_class.from_pretrained(args.model_name_or_path)

    # A positive max_steps overrides the epoch count; the number of epochs is
    # then derived so that the step budget is reached in the last epoch.
    num_batches = len(train_data_loader)
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        num_batches * args.num_train_epochs)
    num_train_epochs = math.ceil(num_training_steps / num_batches)

    lr_scheduler = LinearDecayWithWarmup(
        learning_rate=args.learning_rate,
        total_steps=num_training_steps,
        warmup=args.warmup_proportion
    )

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    # A set is used because the optimizer probes membership once per parameter.
    decay_params = {
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ("bias", "norm"))
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params
    )

    criterion = CrossEntropyLossForChecklist()

    best_val_f1 = 0.0

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, num_train_epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):

            global_step += 1

            # Field order matches the key order of train_batchify_fn.
            input_ids, segment_ids, start_positions, end_positions, answerable_label = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, (start_positions, end_positions, answerable_label))

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d/%d, loss: %.5f, speed: %.2f step/s, lr: %1.16e"
                    % (global_step, epoch, step, num_batches, loss,
                       args.logging_steps / (time.time() - tic_train), lr_scheduler.get_lr()))

                # Restart the timer so "speed" covers only the latest window.
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                dev_em, dev_f1 = evaluate(model=model, data_loader=dev_data_loader, args=args)

                print("global step: %d, eval dev Exact Match: %.5f, f1_score: %.5f" % (global_step, dev_em, dev_f1))

                # Keep only the checkpoint with the best dev F1 so far.
                if dev_f1 > best_val_f1:
                    best_val_f1 = dev_f1

                    print("save model at global step: %d, best eval f1_score: %.5f" % (global_step, best_val_f1))

                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                # Step budget exhausted: stop training entirely rather than
                # only leaving the inner batch loop.
                if global_step == num_training_steps:
                    return

In [5]:
def do_predict(args, prediction_file='prediction.json', num_examples_to_show=5):
    """Run the saved best checkpoint on the test set and write its predictions.

    Loads the tokenizer and model from ``<args.output_dir>/best_model``,
    converts ``args.test_file`` into features, calls ``predict`` and dumps the
    returned predictions as JSON. The first few examples are printed together
    with their predicted answers as a quick sanity check.

    Args:
        args: Configuration object. Attributes read directly here: ``device``,
            ``output_dir``, ``test_file``, ``model_type`` and ``batch_size``.
            ``args`` is also forwarded to ``prepare_validation_features`` and
            ``predict``.
        prediction_file: Path of the JSON file the predictions are written to.
        num_examples_to_show: Maximum number of test examples to print.

    Raises:
        KeyError: If the lower-cased ``args.model_type`` is not a key of
            ``MODEL_CLASSES``.
    """
    paddle.set_device(args.device)

    output_dir = os.path.join(args.output_dir, "best_model")

    # 1. Load the test set.
    test_ds = DuReaderChecklist().read(args.test_file)

    # Lower-case the key exactly as do_train does, so that prediction also
    # works when it is run on its own with a mixed-case model_type.
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type.lower()]
    tokenizer = tokenizer_class.from_pretrained(output_dir)

    # 2. Convert the raw examples into model features (token ids).
    test_trans_func = partial(
        prepare_validation_features,
        tokenizer=tokenizer,
        args=args
    )
    test_ds.map(test_trans_func, batched=True)

    # Test batch sampler: keep the original example order.
    test_batch_sampler = BatchSampler(
        dataset=test_ds,
        batch_size=args.batch_size,
        shuffle=False
    )

    # Test batchify function: pad the variable-length inputs.
    test_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    # Test data loader.
    test_data_loader = DataLoader(
        dataset=test_ds,
        batch_sampler=test_batch_sampler,
        collate_fn=test_batchify_fn,
        return_list=True
    )

    model = model_class.from_pretrained(output_dir)

    all_predictions = predict(model, test_data_loader, args)

    # Can also write all_nbest_json and scores_diff_json files if needed
    with open(prediction_file, "w", encoding='utf-8') as writer:
        writer.write(
            json.dumps(
                all_predictions, ensure_ascii=False, indent=4) + "\n")

    # Print a short preview: question, context and predicted answer.
    for index, example in enumerate(test_data_loader.dataset.data):
        if index >= num_examples_to_show:
            break
        print()
        print('问题：', example['question'])
        print('原文：', ''.join(example['context']))
        print('答案：', all_predictions[example['id']])

In [6]:
# Experiment configuration shared by do_train and do_predict.
args = Config(
    # Model selection; model_type is looked up in MODEL_CLASSES.
    model_type='roberta',
    model_name_or_path='roberta-wwm-ext-large',
    # Checkpoint directory and dataset files.
    output_dir='./outputs/dureader-checklist/',
    train_file='./checklist_data/train.json',
    dev_file='./checklist_data/dev.json',
    test_file='./checklist_data/test.json',
    # Training hyper-parameters.
    max_seq_length=384,
    batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=10,
    logging_steps=20,
    save_steps=200,
    warmup_proportion=0.1,
    weight_decay=0.01,
)

In [None]:
# Fine-tune the model; the checkpoint with the best dev F1 is saved under
# <output_dir>/best_model.
do_train(args)

[2021-07-25 07:01:35,350] [    INFO] - Found /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/vocab.txt


In [None]:
# Load the checkpoint saved under <output_dir>/best_model, predict on the test
# file and write the results to prediction.json.
do_predict(args)

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions.