In [1]:
# !unzip /home/aistudio/data/data78992/lcqmc.zip -d /home/aistudio/data/
# !unzip /home/aistudio/data/data78992/paws-x-zh.zip -d /home/aistudio/data/
# !unzip /home/aistudio/data/data78992/bq_corpus.zip -d /home/aistudio/data/

## 1. Load dataset

In [2]:
def load_dataset(fpath, num_row_skip=0):
    """Load tab-separated sentence-pair file(s) into lists of tuples.

    A file whose path contains the substring ``"test"`` is read as unlabelled
    data and yields ``(text_a, text_b)`` tuples. Any other file is read as
    labelled data and yields ``(text_a, text_b, int(label))`` tuples; labelled
    lines that do not split into exactly three tab-separated fields are
    silently skipped.

    Args:
        fpath: A single path (``str``) or a ``list``/``tuple`` of paths.
        num_row_skip: Number of leading lines (e.g. a header) to discard from
            every file. If a file has fewer lines than this, it simply
            produces no examples.

    Returns:
        A list of tuples when ``fpath`` is a ``str``; otherwise a list holding
        one such list per path, in the order given.

    Raises:
        TypeError: If ``fpath`` is neither a ``str`` nor a ``list``/``tuple``.
    """

    def read(fp):
        # The context manager guarantees the handle is closed once the
        # generator is exhausted; the encoding is pinned so that reading the
        # corpus does not depend on the platform's locale.
        with open(fp, encoding="utf-8") as data:
            for _ in range(num_row_skip):
                # A bare next() would raise StopIteration inside a generator,
                # which Python converts to RuntimeError; stop cleanly instead.
                if next(data, None) is None:
                    return

            is_test = "test" in fp
            for line in data:
                stripped = line.strip()
                if is_test:
                    # Tolerate blank lines (typically a trailing newline at
                    # the end of the file) instead of failing on fields[1].
                    if not stripped:
                        continue
                    fields = stripped.split('\t')
                    yield fields[0], fields[1]
                else:
                    fields = stripped.split('\t')
                    if len(fields) == 3:
                        yield fields[0], fields[1], int(fields[2])

    if isinstance(fpath, str):
        return list(read(fpath))
    elif isinstance(fpath, (list, tuple)):
        return [list(read(fp)) for fp in fpath]
    else:
        raise TypeError("Input fpath must be a str or a list/tuple of str")

In [3]:
# Read the three BQ Corpus splits in one call; the result order follows the
# order of the paths passed in.
train_set, dev_set, test_set = load_dataset([
    './data/bq_corpus/train.tsv',
    './data/bq_corpus/dev.tsv',
    './data/bq_corpus/test.tsv',
])
# Fold the dev split into the training data, so no examples are held out
# for validation from here on.
train_set = [*train_set, *dev_set]

## 2. Transform text

In [4]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import RobertaModel as SeqClfModel
from paddlenlp.transformers import RobertaTokenizer as PTMTokenizer
import numpy as np


# Name of the pretrained checkpoint; the same constant is used for both the
# tokenizer here and the encoder weights loaded later in the notebook.
MODEL_NAME = "roberta-wwm-ext-large"
# Module-level tokenizer: it is the default argument of get_trans_fn and
# supplies the padding ids used by batchify_fn.
tokenizer = PTMTokenizer.from_pretrained(MODEL_NAME)


def example_converter(example, max_seq_length, tokenizer):
    """Turn one ``(text_a, text_b, label)`` example into model inputs.

    Args:
        example: A 3-item sequence ``(text_a, text_b, label)``.
        max_seq_length: Forwarded to the tokenizer as ``max_seq_len``.
        tokenizer: Callable accepting ``text``, ``text_pair`` and
            ``max_seq_len`` keyword arguments and returning a mapping with
            ``"input_ids"`` and ``"token_type_ids"`` entries.

    Returns:
        ``(input_ids, token_type_ids, label)`` where ``label`` is a
        one-element ``int64`` NumPy array.
    """
    first_text, second_text, target = example
    features = tokenizer(
        text=first_text,
        text_pair=second_text,
        max_seq_len=max_seq_length,
    )
    # The label is wrapped in a length-1 array rather than left as a scalar.
    label_array = np.array([target], dtype="int64")
    return features["input_ids"], features["token_type_ids"], label_array


def get_trans_fn(max_seq_length=128, tokenizer=tokenizer):
    """Build a single-argument transform bound to a length limit and tokenizer.

    Args:
        max_seq_length: Sequence-length limit handed to ``example_converter``.
        tokenizer: Tokenizer handed to ``example_converter``; defaults to the
            module-level ``tokenizer`` as it existed when this function was
            defined.

    Returns:
        A callable taking one example and returning
        ``example_converter(example, max_seq_length, tokenizer)``.
    """
    def trans_fn(example):
        return example_converter(example, max_seq_length, tokenizer)

    return trans_fn


def batchify_fn(
    samples,
    fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64"),
    ),
):
    """Collate a list of ``(input_ids, token_type_ids, label)`` samples.

    The default ``fn`` is built once, at definition time: it pads the first
    field with the tokenizer's pad token id, pads the second field with the
    tokenizer's pad token-type id, and stacks the third field as ``int64``.

    Args:
        samples: The samples making up one batch.
        fn: Collating callable applied to ``samples``.

    Returns:
        Whatever ``fn(samples)`` returns.
    """
    return fn(samples)


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap raw examples in a ``DataLoader`` that applies ``trans_fn``.

    Args:
        dataset: Examples to serve; either a ``MapDataset`` or any object
            ``MapDataset`` can wrap. With ``test=True`` it must be an iterable
            of tuples.
        trans_fn: Per-example transform registered through ``dataset.map``.
        batchify_fn: Collate function given to the ``DataLoader``.
        test: When true, a placeholder label ``0`` is appended to every
            example so unlabelled data has the same arity as labelled data.
        batch_size: Batch size forwarded to ``sampler``.
        shuffle: Shuffle flag forwarded to ``sampler``. The default is
            ``True`` even when ``test=True``, so callers that need a stable
            order must pass ``shuffle=False`` themselves.
        sampler: Batch-sampler class, called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    examples = [item + (0,) for item in dataset] if test else dataset

    # Wrap only when needed; an incoming MapDataset is reused as-is, which
    # means the map() call below is applied to the caller's own object.
    mapped = examples if isinstance(examples, MapDataset) else MapDataset(examples)
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

[2022-03-29 04:19:21,581] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/vocab.txt


In [5]:
# Tokenisation and batching settings shared by every loader below.
max_seq_length = 128
batch_size = 32

trans_fn = get_trans_fn(max_seq_length)

# Training loader: relies on create_dataloader's default shuffle=True.
train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn, batch_size=batch_size
)
# No dev loader is built: dev_set was already merged into train_set.
# dev_loader = create_dataloader(dev_set, trans_fn, batchify_fn, batch_size=batch_size)

# Test loader: shuffling is disabled so predictions keep the file's row
# order; test=True appends a placeholder label to each unlabelled pair.
test_loader = create_dataloader(
    test_set,
    trans_fn,
    batchify_fn,
    test=True,
    shuffle=False,
    batch_size=batch_size,
)

## 3. Model building

In [6]:
from paddle import nn
import paddle


class PTM(nn.Layer):
    """Pretrained encoder followed by a two-layer classification head.

    The head is ``Linear(hidden, hidden // 2) -> ReLU -> Linear(hidden // 2,
    num_class)``, where ``hidden`` is read from
    ``pretrained_model.config["hidden_size"]``.

    Args:
        pretrained_model: Encoder exposing ``config["hidden_size"]``. When
            called as ``pretrained_model(input_ids, token_type_ids)`` it must
            return exactly two values; the second one is used as the sequence
            representation (presumably the pooled output -- confirm against
            the encoder in use).
        dropout: Dropout probability applied to the encoder representation.
        num_class: Number of output logits.
    """

    def __init__(self, pretrained_model, dropout=0.1, num_class=2):
        super().__init__()

        self.ptm = pretrained_model
        hidden_size = self.ptm.config["hidden_size"]
        bottleneck_size = hidden_size // 2

        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_size, bottleneck_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(bottleneck_size, num_class)

    def encoder(self, input_ids, token_type_ids):
        """Return the dropout-regularised encoder representation."""
        # Only the second of the encoder's two outputs is kept.
        _, representation = self.ptm(input_ids, token_type_ids)
        return self.dropout(representation)

    def forward(self, input_ids, token_type_ids):
        """Return unnormalised class logits for the given inputs."""
        features = self.encoder(input_ids, token_type_ids)
        return self.fc2(self.relu(self.fc1(features)))

In [7]:
from paddlenlp.transformers import LinearDecayWithWarmup

# Training hyper-parameters; get_model() reads weight_decay and lr_scheduler
# from module scope.
epoch = 4
warmup_proportion = 0.0
weight_decay = 0.0

# The schedule spans every optimisation step of the run:
# (batches per epoch) x (number of epochs), starting from a rate of 2e-5.
lr_scheduler = LinearDecayWithWarmup(
    2e-5,
    epoch * len(train_loader),
    warmup_proportion,
)

def get_model(model):
    """Wrap a network in ``paddle.Model`` prepared for training.

    The optimizer is ``AdamW`` driven by the module-level ``lr_scheduler`` and
    ``weight_decay``. Weight decay is restricted to parameters whose
    structured name contains neither ``"bias"`` nor ``"norm"``.

    Args:
        model: The network whose parameters will be optimised.

    Returns:
        A ``paddle.Model`` prepared with the optimizer, a
        ``CrossEntropyLoss`` criterion and an ``Accuracy`` metric.
    """
    excluded_markers = ("bias", "norm")
    # Collected by the parameter's own ``.name`` (not the structured name),
    # because that is the identifier the decay predicate is checked against.
    decay_params = {
        param.name
        for structured_name, param in model.named_parameters()
        if all(marker not in structured_name for marker in excluded_markers)
    }

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda name: name in decay_params,
    )

    wrapped = paddle.Model(model)
    wrapped.prepare(
        optimizer,
        paddle.nn.CrossEntropyLoss(),
        paddle.metric.Accuracy(),
    )
    return wrapped

In [8]:
# Load the pretrained encoder, attach the classification head, then wrap the
# result for training. After this cell `model` is the prepared wrapper
# returned by get_model, not the bare PTM layer.
ptm = SeqClfModel.from_pretrained(MODEL_NAME)
model = get_model(PTM(ptm))

[2022-03-29 04:19:21,621] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/roberta_chn_large.pdparams
W0329 04:19:21.623505  1556 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0329 04:19:21.627106  1556 device_context.cc:465] device: 0, cuDNN Version: 7.6.


## 4. Model training

In [9]:
# Train for `epoch` passes over train_loader, logging every 100 steps.
# No evaluation data is passed, so only training loss/accuracy is reported.
model.fit(train_loader, epochs=epoch, verbose=2, log_freq=100)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/4
step  100/3438 - loss: 0.5379 - acc: 0.6462 - 334ms/step
step  200/3438 - loss: 0.4260 - acc: 0.7072 - 332ms/step
step  300/3438 - loss: 0.3677 - acc: 0.7335 - 335ms/step
step  400/3438 - loss: 0.6715 - acc: 0.7488 - 337ms/step
step  500/3438 - loss: 0.3896 - acc: 0.7637 - 340ms/step
step  600/3438 - loss: 0.3720 - acc: 0.7730 - 342ms/step
step  700/3438 - loss: 0.4872 - acc: 0.7813 - 342ms/step
step  800/3438 - loss: 0.3558 - acc: 0.7872 - 343ms/step
step  900/3438 - loss: 0.2119 - acc: 0.7915 - 342ms/step
step 1000/3438 - loss: 0.2769 - acc: 0.7967 - 341ms/step
step 1100/3438 - loss: 0.3397 - acc: 0.8007 - 340ms/step
step 1200/3438 - loss: 0.3669 - acc: 0.8048 - 340ms/step
step 1300/3438 - loss: 0.4902 - acc: 0.8086 - 340ms/step
step 1400/3438 - loss: 0.4080 - acc: 0.8121 - 341ms/step
step 1500/3438 - loss: 0.4065 - acc: 0.8156 - 341ms/step
step 1600/3438 - loss: 0.

## 5. Prediction

In [10]:
import paddle.nn.functional as F


# Run inference over the (unshuffled) test loader. Only the first element of
# the result is consumed; it is iterated batch by batch (presumably one array
# of logits per batch -- confirm against paddle.Model.predict).
logits = model.predict(test_loader)

predictions = []
for batch in logits[0]:
    # Convert logits to probabilities, then keep the most likely class id
    # for every row of the batch.
    probs = F.softmax(paddle.to_tensor(batch), axis=1)
    predictions.extend(paddle.argmax(probs, axis=1).numpy().tolist())

Predict begin...
Predict samples: 10000


In [11]:
# Write the submission file: a header row followed by one
# "<index>\t<prediction>" row per entry of `predictions`.
# Each row is written with a *leading* newline, so the file does not end with
# a trailing newline. The `with` block closes the file on exit, so no explicit
# close() is needed; the encoding is pinned so the output does not depend on
# the platform's locale.
with open('bq_corpus.tsv', 'w', encoding='utf-8') as f:
    f.write("index\tprediction")
    for idx, p in enumerate(predictions):
        f.write(f"\n{idx}\t{p}")