## 1. Load dataset

In [1]:
def load_dataset(fpath, test=False, num_row_to_skip=0):
    """Read a text-classification data file.

    Args:
        fpath: Path of the file to read. The file is decoded as UTF-8.
        test: If True, every remaining line is returned as one unlabeled text.
        num_row_to_skip: Number of leading lines (e.g. a header) to discard.
            If the file has fewer lines than this, the result is simply empty.

    Returns:
        When ``test`` is True: a list with one stripped string per line.
        Otherwise: a tuple ``(examples, idx_to_label)`` where ``examples`` is a
        list of ``[text, idx]`` pairs and ``idx_to_label`` maps each integer
        class index to its label string. Lines that do not consist of exactly
        three tab-separated fields (``idx``, ``label``, ``text``) are skipped.

    Raises:
        ValueError: If the first field of a three-field line is not an integer.
    """
    # The context manager guarantees the handle is closed on every exit path.
    with open(fpath, encoding='utf-8') as data:
        for _ in range(num_row_to_skip):
            # A default avoids StopIteration when the file is shorter than the skip count.
            next(data, None)

        if test:
            return [line.strip() for line in data]

        out = []
        idx_to_label = {}
        for line in data:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                idx, label, text = fields
                idx = int(idx)
                idx_to_label[idx] = label
                out.append([text, idx])

    return out, idx_to_label

In [2]:
# Load the labelled training file: `train_set` holds [text, idx] pairs and
# `idx_to_label` maps each class index to its label name.
train_set, idx_to_label = load_dataset('./data/data12701/Train.txt')
# Display the number of examples, the first two examples and the label mapping.
len(train_set), train_set[:2], idx_to_label

(752471,
 [['上证50ETF净申购突增', 0], ['交银施罗德保本基金将发行', 0]],
 {0: '财经',
  1: '彩票',
  2: '房产',
  3: '股票',
  4: '家居',
  5: '教育',
  6: '科技',
  7: '社会',
  8: '时尚',
  9: '时政',
  10: '体育',
  11: '星座',
  12: '游戏',
  13: '娱乐'})

In [3]:
# # split the train_set into train and dev sets
# from random import shuffle, seed

# seed(43)
# shuffle(train_set)

# train_set, dev_set = train_set[:652471], train_set[652471: ]

In [4]:
# Load the unlabeled test file; with test=True each line becomes one plain text string.
test_set = load_dataset('./data/data12701/Test.txt', test=True)
# Display the number of test texts and the first two of them.
len(test_set), test_set[:2]

(83599, ['北京君太百货璀璨秋色 满100省353020元', '教育部：小学高年级将开始学习性知识'])

## 2. Transform text

In [5]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForSequenceClassification as SeqClfModel
from paddlenlp.transformers import BertTokenizer as PTMTokenizer
import numpy as np


# Name of the pretrained checkpoint; it is used for the tokenizer here and for
# SeqClfModel.from_pretrained in the model-preparation cell.
MODEL_NAME = "bert-wwm-chinese"
tokenizer = PTMTokenizer.from_pretrained(MODEL_NAME)


def example_converter(example, tokenizer, max_seq_length=128):
    """Turn one ``(text, label)`` example into model-ready features.

    Args:
        example: A two-item sequence ``(text, label)``.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_seq_len=...)``
            whose result is indexed by ``"input_ids"`` and ``"token_type_ids"``.
        max_seq_length: Value forwarded to the tokenizer as ``max_seq_len``.

    Returns:
        A tuple ``(input_ids, token_type_ids, label)``; ``label`` is a
        one-element ``int64`` NumPy array wrapping the original label.
    """
    raw_text, raw_label = example
    features = tokenizer(text=raw_text, max_seq_len=max_seq_length)
    ids, type_ids = features["input_ids"], features["token_type_ids"]
    return ids, type_ids, np.array([raw_label], dtype="int64")


def get_trans_fn(text_encoder, max_seq_length=128):
    """Build a one-argument transform bound to a tokenizer and a length limit.

    Args:
        text_encoder: Tokenizer handed to ``example_converter``.
        max_seq_length: Length limit handed to ``example_converter``.

    Returns:
        A callable taking a single example and returning
        ``example_converter(example, text_encoder, max_seq_length)``.
    """
    def convert(ex):
        return example_converter(ex, text_encoder, max_seq_length)

    return convert


def get_batchify_fn(tokenizer=tokenizer):
    """Build the collate function that turns a list of samples into a batch.

    Args:
        tokenizer: Object providing ``pad_token_id`` and ``pad_token_type_id``;
            defaults to the notebook-level ``tokenizer``.

    Returns:
        A callable ``batchify_fn(samples)`` that applies a ``Tuple`` of three
        collators: ``Pad`` with the token pad id, ``Pad`` with the token-type
        pad id, and ``Stack`` with dtype ``int64``.
    """
    # Built once per call to get_batchify_fn and bound as the default of `fn`.
    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64")
    )

    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap a dataset in a ``DataLoader`` after applying ``trans_fn``.

    Args:
        dataset: Examples to load; wrapped in ``MapDataset`` unless it already
            is one.
        trans_fn: Per-example transform applied through ``MapDataset.map``.
        batchify_fn: Collate function passed to the ``DataLoader``.
        test: If True, each item of ``dataset`` is paired with a dummy label 0
            before any further processing.
        batch_size: Batch size passed to the sampler.
        shuffle: Shuffle flag passed to the sampler.
        sampler: Batch-sampler class, called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    # Unlabeled items get a placeholder label so they share the [text, label] layout.
    examples = [[item, 0] for item in dataset] if test else dataset

    if not isinstance(examples, MapDataset):
        examples = MapDataset(examples)

    examples.map(trans_fn)

    return DataLoader(
        examples,
        batch_sampler=sampler(examples, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

[2022-03-25 04:56:44,624] [    INFO]

 - Downloading http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt and saved to /home/aistudio/.paddlenlp/models/bert-wwm-chinese




[2022-03-25 04:56:44,628] [    INFO]

 - Downloading bert-wwm-chinese-vocab.txt from http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt




  0%|          | 0/107 [00:00<?, ?it/s]

100%|██████████| 107/107 [00:00<00:00, 53818.27it/s]




In [6]:
# Tokenization length limit and batch size shared by both loaders.
max_seq_length = 64
batch_size = 64

trans_fn = get_trans_fn(tokenizer, max_seq_length)
batchify_fn = get_batchify_fn()

# The training loader keeps create_dataloader's default shuffle=True.
train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn, batch_size=batch_size)
# dev_loader = create_dataloader(dev_set, trans_fn, batchify_fn, batch_size=batch_size)
# The test loader is not shuffled, so batches follow the order of test_set.
test_loader = create_dataloader(
    test_set, trans_fn, batchify_fn,
    shuffle=False, test=True, batch_size=batch_size)

## 3. Model preparation

In [7]:
import paddle
from paddlenlp.transformers import LinearDecayWithWarmup


# Pretrained sequence classifier with one output class per entry of idx_to_label.
model = SeqClfModel.from_pretrained(MODEL_NAME, num_classes=len(idx_to_label))

# Training hyperparameters.
learning_rate = 5e-5
epochs = 10
warmup_proportion = 0.1
weight_decay = 0.01

# One optimizer/scheduler step is taken per batch, for `epochs` passes over train_loader.
num_training_steps = len(train_loader) * epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)

# Names of the parameters that receive weight decay: every parameter whose
# name contains neither "bias" nor "norm". The set is built once up front so
# the callback below is a constant-time membership test instead of rebuilding
# the full name list on every invocation.
decay_params = {
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
}

optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in decay_params)

criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()


# @paddle.no_grad()
# def evaluate(model, criterion, metric, data_loader):
#     """
#     Given a dataset, it evals model and computes the metric.

#     Args:
#         model(obj:`paddle.nn.Layer`): A model to classify texts.
#         data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
#         criterion(obj:`paddle.nn.Layer`): It can compute the loss.
#         metric(obj:`paddle.metric.Metric`): The evaluation metric.
#     """
#     model.eval()
#     metric.reset()
#     losses = []
#     for batch in data_loader:
#         input_ids, token_type_ids, labels = batch
#         logits = model(input_ids, token_type_ids)
#         loss = criterion(logits, labels)
#         losses.append(loss.numpy())
#         correct = metric.compute(logits, labels)
#         metric.update(correct)
#         accu = metric.accumulate()
#     print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
#     model.train()
#     metric.reset()

[2022-03-25 04:56:45,026] [    INFO]

 - Downloading http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese.pdparams and saved to /home/aistudio/.paddlenlp/models/bert-wwm-chinese




[2022-03-25 04:56:45,028] [    INFO]

 - Downloading bert-wwm-chinese.pdparams from http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese.pdparams




  0%|          | 0/399504 [00:00<?, ?it/s]

  1%|▏         | 5663/399504 [00:00<00:06, 56625.46it/s]

  3%|▎         | 12655/399504 [00:00<00:06, 60050.30it/s]

  5%|▍         | 19595/399504 [00:00<00:06, 62578.60it/s]

  7%|▋         | 26520/399504 [00:00<00:05, 64439.20it/s]

  9%|▊         | 34432/399504 [00:00<00:05, 68237.02it/s]

 11%|█         | 42319/399504 [00:00<00:05, 71111.79it/s]

 12%|█▏        | 49789/399504 [00:00<00:04, 72149.15it/s]

 14%|█▍        | 56999/399504 [00:00<00:04, 72132.77it/s]

 16%|█▌        | 64070/399504 [00:00<00:04, 71699.51it/s]

 18%|█▊        | 71400/399504 [00:01<00:04, 72170.46it/s]

 20%|█▉        | 78568/399504 [00:01<00:04, 72021.92it/s]

 22%|██▏       | 85906/399504 [00:01<00:04, 72422.73it/s]

 23%|██▎       | 93084/399504 [00:01<00:04, 70359.51it/s]

 25%|██▌       | 100151/399504 [00:01<00:04, 70450.38it/s]

 27%|██▋       | 107401/399504 [00:01<00:04, 71051.12it/s]

 29%|██▊       | 114767/399504 [00:01<00:03, 71811.44it/s]

 31%|███       | 121940/399504 [00:01<00:03, 71703.86it/s]

 32%|███▏      | 129171/399504 [00:01<00:03, 71883.37it/s]

 34%|███▍      | 136370/399504 [00:01<00:03, 71912.83it/s]

 36%|███▌      | 143802/399504 [00:02<00:03, 72615.83it/s]

 38%|███▊      | 151064/399504 [00:02<00:03, 71842.75it/s]

 40%|███▉      | 158434/399504 [00:02<00:03, 72386.78it/s]

 41%|████▏     | 165675/399504 [00:02<00:03, 71943.25it/s]

 43%|████▎     | 173079/399504 [00:02<00:03, 72555.40it/s]

 45%|████▌     | 180844/399504 [00:02<00:02, 74009.37it/s]

 47%|████▋     | 188633/399504 [00:02<00:02, 75131.43it/s]

 49%|████▉     | 196340/399504 [00:02<00:02, 75699.62it/s]

 51%|█████     | 204087/399504 [00:02<00:02, 76221.61it/s]

 53%|█████▎    | 212041/399504 [00:02<00:02, 77185.30it/s]

 55%|█████▌    | 219996/399504 [00:03<00:02, 77879.56it/s]

 57%|█████▋    | 227937/399504 [00:03<00:02, 78329.67it/s]

 59%|█████▉    | 235780/399504 [00:03<00:02, 78354.50it/s]

 61%|██████    | 243620/399504 [00:03<00:02, 76867.60it/s]

 63%|██████▎   | 251316/399504 [00:03<00:01, 75772.82it/s]

 65%|██████▍   | 259218/399504 [00:03<00:01, 76716.22it/s]

 67%|██████▋   | 267146/399504 [00:03<00:01, 77465.69it/s]

 69%|██████▉   | 275117/399504 [00:03<00:01, 78125.18it/s]

 71%|███████   | 283228/399504 [00:03<00:01, 78992.15it/s]

 73%|███████▎  | 291226/399504 [00:03<00:01, 79284.94it/s]

 75%|███████▍  | 299332/399504 [00:04<00:01, 79809.15it/s]

 77%|███████▋  | 307440/399504 [00:04<00:01, 80184.02it/s]

 79%|███████▉  | 315463/399504 [00:04<00:01, 76594.24it/s]

 81%|████████  | 323159/399504 [00:04<00:01, 61553.64it/s]

 83%|████████▎ | 329813/399504 [00:04<00:01, 57955.28it/s]

 84%|████████▍ | 336001/399504 [00:04<00:01, 54464.17it/s]

 86%|████████▌ | 342549/399504 [00:04<00:00, 57357.53it/s]

 88%|████████▊ | 350589/399504 [00:04<00:00, 62752.97it/s]

 90%|████████▉ | 358665/399504 [00:04<00:00, 67250.78it/s]

 92%|█████████▏| 366701/399504 [00:05<00:00, 70710.34it/s]

 94%|█████████▍| 374816/399504 [00:05<00:00, 73547.90it/s]

 96%|█████████▌| 382627/399504 [00:05<00:00, 74859.49it/s]

 98%|█████████▊| 390609/399504 [00:05<00:00, 76278.73it/s]

100%|█████████▉| 398624/399504 [00:05<00:00, 77399.97it/s]

100%|██████████| 399504/399504 [00:05<00:00, 72677.94it/s]




W0325 04:56:50.602037   258 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0325 04:56:50.608598   258 device_context.cc:465] device: 0, cuDNN Version: 7.6.


## 4. Model training

In [8]:
import paddle.nn.functional as F


# Fine-tune the classifier for `epochs` passes over train_loader, logging every 5000 steps.
# `global_step` counts optimizer steps across all epochs; `step` restarts at 1 each epoch.
global_step = 0
# NOTE(review): the model was already created in the model-preparation cell, before
# this device selection — confirm the call takes effect where intended.
paddle.set_device("gpu")
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_loader, start=1):
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        # NOTE(review): metric.reset() is never called in this loop, so `acc` presumably
        # reflects every batch seen since training began, not just the current batch
        # or epoch — confirm this is the intended figure (it is later written to result.txt).
        acc = metric.accumulate()

        global_step += 1
        if global_step % 5000 == 0 :
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, loss, acc))

        # Backpropagate, update the weights, advance the LR schedule, then clear
        # the gradients before the next batch.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
    # evaluate(model, criterion, metric, dev_loader)


# model.save_pretrained('/home/aistudio/checkpoint')
# tokenizer.save_pretrained('/home/aistudio/checkpoint')

global step 5000, epoch: 1, batch: 5000, loss: 0.24993, acc: 0.86628




global step 10000, epoch: 1, batch: 10000, loss: 0.26144, acc: 0.90113




global step 15000, epoch: 2, batch: 3242, loss: 0.17364, acc: 0.91426




global step 20000, epoch: 2, batch: 8242, loss: 0.12977, acc: 0.92193




global step 25000, epoch: 3, batch: 1484, loss: 0.06254, acc: 0.92760




global step 30000, epoch: 3, batch: 6484, loss: 0.03021, acc: 0.93246




global step 35000, epoch: 3, batch: 11484, loss: 0.12317, acc: 0.93582




global step 40000, epoch: 4, batch: 4726, loss: 0.07457, acc: 0.93946




global step 45000, epoch: 4, batch: 9726, loss: 0.10953, acc: 0.94228




global step 50000, epoch: 5, batch: 2968, loss: 0.00956, acc: 0.94503




global step 55000, epoch: 5, batch: 7968, loss: 0.08340, acc: 0.94746




global step 60000, epoch: 6, batch: 1210, loss: 0.08326, acc: 0.94960




global step 65000, epoch: 6, batch: 6210, loss: 0.04879, acc: 0.95189




global step 70000, epoch: 6, batch: 11210, loss: 0.11789, acc: 0.95374




global step 75000, epoch: 7, batch: 4452, loss: 0.09232, acc: 0.95580




global step 80000, epoch: 7, batch: 9452, loss: 0.14026, acc: 0.95761




global step 85000, epoch: 8, batch: 2694, loss: 0.08094, acc: 0.95936




global step 90000, epoch: 8, batch: 7694, loss: 0.01834, acc: 0.96105




global step 95000, epoch: 9, batch: 936, loss: 0.01378, acc: 0.96258




global step 100000, epoch: 9, batch: 5936, loss: 0.00491, acc: 0.96413




global step 105000, epoch: 9, batch: 10936, loss: 0.04909, acc: 0.96551




global step 110000, epoch: 10, batch: 4178, loss: 0.04353, acc: 0.96687




global step 115000, epoch: 10, batch: 9178, loss: 0.01131, acc: 0.96814




## 5. Prediction

In [9]:
# Predicted class index for every test text, in test_loader order.
predictions = []

# Inference only: eval mode disables dropout so predictions are deterministic,
# and no_grad skips building the autograd graph.
model.eval()
with paddle.no_grad():
    for batch in test_loader:
        # The third field is the dummy label added by create_dataloader(test=True).
        input_ids, segment_ids, _dummy_labels = batch
        logits = model(input_ids, segment_ids)
        # Softmax is monotonic, so argmax over the raw logits selects the same class.
        preds = paddle.argmax(logits, axis=1).numpy().tolist()
        predictions.extend(preds)

In [10]:
# Write the training accuracy on the first line, then one predicted label name
# per line with no trailing newline. UTF-8 is set explicitly because the label
# names are Chinese; the `with` block closes the file.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write(f'Acc: {acc}\n')
    # join() also copes with an empty prediction list, where indexing [0] would fail.
    f.write('\n'.join(idx_to_label[p] for p in predictions))