## 1. Load dataset

In [1]:
def load_dataset(fpath, test=False, num_row_to_skip=0):
    """Read a tab-separated text-classification file.

    Args:
        fpath: Path of the file to read.
        test: When True, every remaining line is returned as a stripped
            string and no label parsing is attempted.
        num_row_to_skip: Number of leading lines (e.g. a header) to discard.
            Skipping more lines than the file contains is not an error.

    Returns:
        If ``test`` is True, a list with one stripped string per line.
        Otherwise a tuple ``(examples, idx_to_label)`` where ``examples`` is a
        list of ``[text, idx]`` pairs and ``idx_to_label`` maps each integer
        index to the label string seen with it.  Lines that do not split
        into exactly three tab-separated fields are ignored.

    Raises:
        ValueError: If the first field of a three-field line is not an integer.
    """
    # The context manager guarantees the handle is closed, and the explicit
    # encoding keeps the read independent of the platform's default locale.
    with open(fpath, encoding='utf-8') as data:
        for _ in range(num_row_to_skip):
            # The default value avoids a bare StopIteration on short files.
            next(data, None)

        if test:
            return [line.strip() for line in data]

        out = []
        idx_to_label = {}
        for line in data:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                idx, label, text = fields
                idx = int(idx)
                idx_to_label[idx] = label
                out.append([text, idx])

    return out, idx_to_label

In [2]:
# Load the labelled training file: a list of [text, idx] pairs plus the
# idx -> label-name mapping built while reading it.
train_set, idx_to_label = load_dataset('./data/data12701/Train.txt')
# Display the example count, the first two examples and the label mapping.
len(train_set), train_set[:2], idx_to_label

(752471,
 [['上证50ETF净申购突增', 0], ['交银施罗德保本基金将发行', 0]],
 {0: '财经',
  1: '彩票',
  2: '房产',
  3: '股票',
  4: '家居',
  5: '教育',
  6: '科技',
  7: '社会',
  8: '时尚',
  9: '时政',
  10: '体育',
  11: '星座',
  12: '游戏',
  13: '娱乐'})

In [3]:
# # split the train_set into train and dev sets
# from random import shuffle, seed

# seed(43)
# shuffle(train_set)

# train_set, dev_set = train_set[:652471], train_set[652471: ]

In [4]:
# Load the test file in test mode: one stripped line per entry, no labels.
test_set = load_dataset('./data/data12701/Test.txt', test=True)
# Display the number of test lines and the first two of them.
len(test_set), test_set[:2]

(83599, ['北京君太百货璀璨秋色 满100省353020元', '教育部：小学高年级将开始学习性知识'])

## 2. Transform text

In [5]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForSequenceClassification as SeqClfModel
from paddlenlp.transformers import BertTokenizer as PTMTokenizer
import numpy as np


# Name of the pretrained checkpoint; the tokenizer below is loaded from it and
# the classifier built later in the notebook uses the same constant.
MODEL_NAME = "bert-wwm-chinese"
tokenizer = PTMTokenizer.from_pretrained(MODEL_NAME)


def example_converter(example, tokenizer, max_seq_length=128):
    """Turn one ``(text, label)`` example into model-ready features.

    Args:
        example: A two-item sequence ``(text, label)``.
        tokenizer: Callable invoked as
            ``tokenizer(text=..., max_seq_len=...)`` that returns a mapping
            with ``"input_ids"`` and ``"token_type_ids"`` entries.
        max_seq_length: Value forwarded to the tokenizer as ``max_seq_len``.

    Returns:
        A tuple ``(input_ids, token_type_ids, label)`` where the first two
        items are taken unchanged from the tokenizer output and ``label`` is
        a one-element ``int64`` numpy array.
    """
    raw_text, class_idx = example
    features = tokenizer(text=raw_text, max_seq_len=max_seq_length)
    ids, segments = features["input_ids"], features["token_type_ids"]
    return ids, segments, np.array([class_idx], dtype="int64")


def get_trans_fn(text_encoder, max_seq_length=128):
    """Build a one-argument transform bound to a tokenizer and length limit.

    Args:
        text_encoder: Tokenizer passed through to ``example_converter``.
        max_seq_length: Length limit passed through to ``example_converter``.

    Returns:
        A callable that takes a single example and returns
        ``example_converter(example, text_encoder, max_seq_length)``.
    """
    def convert(ex):
        return example_converter(ex, text_encoder, max_seq_length)

    return convert


def get_batchify_fn(tokenizer=tokenizer):
    """Build the collate function that turns a list of samples into a batch.

    Args:
        tokenizer: Object providing ``pad_token_id`` and
            ``pad_token_type_id``; defaults to the notebook-level tokenizer
            as it was bound when this function was defined.

    Returns:
        A callable ``batchify_fn(samples, fn=...)`` that applies a ``Tuple``
        of two ``Pad`` collators and one ``Stack`` collator to ``samples``.
        Presumably the three collators line up with the
        ``(input_ids, token_type_ids, label)`` triples produced by
        ``example_converter`` -- confirm against the paddlenlp.data docs.
    """
    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64"),
    )

    # The collator is bound as a default argument so the returned callable
    # keeps the same (samples, fn=...) signature as before.
    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap a dataset in a ``DataLoader`` with a transform and collate function.

    Args:
        dataset: A ``MapDataset`` or an iterable of examples.  With
            ``test=True`` it is an iterable of bare items without labels.
        trans_fn: Per-example transform registered through ``dataset.map``.
        batchify_fn: Collate function handed to the ``DataLoader``.
        test: When True, each item is paired with a dummy label ``0`` so it
            has the same two-field layout as a labelled example.
        batch_size: Batch size passed to the sampler.
        shuffle: Shuffle flag passed to the sampler.
        sampler: Batch-sampler class, called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    examples = [[item, 0] for item in dataset] if test else dataset

    mapped = examples if isinstance(examples, MapDataset) else MapDataset(examples)
    # The return value of map() is discarded, so this relies on map()
    # registering the transform on the dataset object itself -- confirm for
    # the installed paddlenlp version.
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

[2022-03-25 02:33:20,042] [    INFO]

 - Downloading http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt and saved to /home/aistudio/.paddlenlp/models/bert-wwm-chinese




[2022-03-25 02:33:20,045] [    INFO]

 - Downloading bert-wwm-chinese-vocab.txt from http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt




  0%|          | 0/107 [00:00<?, ?it/s]

100%|██████████| 107/107 [00:00<00:00, 55826.66it/s]




In [6]:
# Sequence-length limit and batch size shared by the train and test loaders.
max_seq_length = 64
batch_size = 512

trans_fn = get_trans_fn(tokenizer, max_seq_length)
batchify_fn = get_batchify_fn()

train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn, batch_size=batch_size)
# dev_loader = create_dataloader(dev_set, trans_fn, batchify_fn, batch_size=batch_size)

# The test data is unlabelled (test=True) and is not shuffled, so predictions
# come out in the same order as the lines of the test file.
test_loader = create_dataloader(
    test_set, trans_fn, batchify_fn,
    test=True, shuffle=False, batch_size=batch_size)

## 3. Model preparation

In [7]:
import paddle
from paddlenlp.transformers import LinearDecayWithWarmup


# Classifier head sized to the number of labels found in the training file.
model = SeqClfModel.from_pretrained(MODEL_NAME, num_classes=len(idx_to_label))

# Fine-tuning hyper-parameters.
learning_rate = 5e-5
epochs = 5
warmup_proportion = 0.1
weight_decay = 0.01

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
num_training_steps = len(train_loader) * epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)

# Names of the parameters that receive weight decay: everything whose
# qualified name contains neither "bias" nor "norm".  The set is built once
# here so the optimizer callback is a constant-time membership test instead
# of re-walking model.named_parameters() on every call.
decay_param_names = {
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
}

optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in decay_param_names)

criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()


# @paddle.no_grad()
# def evaluate(model, criterion, metric, data_loader):
#     """
#     Given a dataset, it evals model and computes the metric.

#     Args:
#         model(obj:`paddle.nn.Layer`): A model to classify texts.
#         data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
#         criterion(obj:`paddle.nn.Layer`): It can compute the loss.
#         metric(obj:`paddle.metric.Metric`): The evaluation metric.
#     """
#     model.eval()
#     metric.reset()
#     losses = []
#     for batch in data_loader:
#         input_ids, token_type_ids, labels = batch
#         logits = model(input_ids, token_type_ids)
#         loss = criterion(logits, labels)
#         losses.append(loss.numpy())
#         correct = metric.compute(logits, labels)
#         metric.update(correct)
#         accu = metric.accumulate()
#     print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
#     model.train()
#     metric.reset()

[2022-03-25 02:33:20,426] [    INFO]

 - Downloading http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese.pdparams and saved to /home/aistudio/.paddlenlp/models/bert-wwm-chinese




[2022-03-25 02:33:20,429] [    INFO]

 - Downloading bert-wwm-chinese.pdparams from http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese.pdparams




  0%|          | 0/399504 [00:00<?, ?it/s]

  1%|▏         | 5392/399504 [00:00<00:07, 53917.48it/s]

  3%|▎         | 13014/399504 [00:00<00:06, 59104.82it/s]

  5%|▌         | 20882/399504 [00:00<00:05, 63871.97it/s]

  7%|▋         | 28862/399504 [00:00<00:05, 67939.57it/s]

  9%|▉         | 36489/399504 [00:00<00:05, 70240.34it/s]

 11%|█         | 44126/399504 [00:00<00:04, 71971.31it/s]

 13%|█▎        | 51820/399504 [00:00<00:04, 73391.32it/s]

 15%|█▍        | 59473/399504 [00:00<00:04, 74304.03it/s]

 17%|█▋        | 67559/399504 [00:00<00:04, 76156.33it/s]

 19%|█▉        | 75433/399504 [00:01<00:04, 76911.64it/s]

 21%|██        | 83018/399504 [00:01<00:04, 76431.75it/s]

 23%|██▎       | 90918/399504 [00:01<00:03, 77183.08it/s]

 25%|██▍       | 98587/399504 [00:01<00:03, 75450.96it/s]

 27%|██▋       | 106394/399504 [00:01<00:03, 76217.95it/s]

 29%|██▊       | 114119/399504 [00:01<00:03, 76522.72it/s]

 31%|███       | 121919/399504 [00:01<00:03, 76956.82it/s]

 33%|███▎      | 129996/399504 [00:01<00:03, 78061.15it/s]

 35%|███▍      | 137953/399504 [00:01<00:03, 78504.63it/s]

 37%|███▋      | 145889/399504 [00:01<00:03, 78757.42it/s]

 38%|███▊      | 153765/399504 [00:02<00:03, 78001.87it/s]

 40%|████      | 161567/399504 [00:02<00:03, 77637.94it/s]

 42%|████▏     | 169333/399504 [00:02<00:02, 77569.03it/s]

 44%|████▍     | 177190/399504 [00:02<00:02, 77864.78it/s]

 46%|████▋     | 185289/399504 [00:02<00:02, 78775.16it/s]

 48%|████▊     | 193170/399504 [00:02<00:02, 78541.39it/s]

 50%|█████     | 201027/399504 [00:02<00:02, 77889.82it/s]

 52%|█████▏    | 208819/399504 [00:02<00:02, 77178.42it/s]

 54%|█████▍    | 216773/399504 [00:02<00:02, 77869.28it/s]

 56%|█████▋    | 224906/399504 [00:02<00:02, 78875.10it/s]

 58%|█████▊    | 232901/399504 [00:03<00:02, 79192.64it/s]

 60%|██████    | 241072/399504 [00:03<00:01, 79929.25it/s]

 62%|██████▏   | 249246/399504 [00:03<00:01, 80463.62it/s]

 64%|██████▍   | 257297/399504 [00:03<00:01, 79874.15it/s]

 66%|██████▋   | 265289/399504 [00:03<00:01, 79549.74it/s]

 68%|██████▊   | 273506/399504 [00:03<00:01, 80317.68it/s]

 70%|███████   | 281567/399504 [00:03<00:01, 80404.43it/s]

 73%|███████▎  | 289790/399504 [00:03<00:01, 80941.96it/s]

 75%|███████▍  | 297887/399504 [00:03<00:01, 80456.64it/s]

 77%|███████▋  | 305936/399504 [00:03<00:01, 76829.52it/s]

 79%|███████▊  | 313991/399504 [00:04<00:01, 77907.37it/s]

 81%|████████  | 322201/399504 [00:04<00:00, 79118.37it/s]

 83%|████████▎ | 330327/399504 [00:04<00:00, 79746.64it/s]

 85%|████████▍ | 338378/399504 [00:04<00:00, 79972.16it/s]

 87%|████████▋ | 346389/399504 [00:04<00:00, 78146.23it/s]

 89%|████████▊ | 354223/399504 [00:04<00:00, 65861.82it/s]

 90%|█████████ | 361152/399504 [00:04<00:00, 60245.23it/s]

 92%|█████████▏| 367510/399504 [00:04<00:00, 58230.31it/s]

 94%|█████████▎| 373890/399504 [00:04<00:00, 59795.62it/s]

 95%|█████████▌| 380736/399504 [00:05<00:00, 62154.49it/s]

 97%|█████████▋| 388644/399504 [00:05<00:00, 66417.98it/s]

 99%|█████████▉| 396695/399504 [00:05<00:00, 70097.39it/s]

100%|██████████| 399504/399504 [00:05<00:00, 75295.38it/s]




W0325 02:33:25.803910   255 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0325 02:33:25.809233   255 device_context.cc:465] device: 0, cuDNN Version: 7.6.


## 4. Model training

In [8]:
import paddle.nn.functional as F


# Fine-tuning loop.  `global_step` counts batches across all epochs, while
# `step` restarts at 1 in every epoch.
global_step = 0
paddle.set_device("gpu")
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_loader, start=1):
        # Each batch is the (input_ids, token_type_ids, label) triple built by
        # the collate function.
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        # NOTE(review): `metric` is never reset inside this loop, so `acc` is
        # presumably a running value over every batch seen so far rather than
        # a per-epoch figure -- confirm against paddle.metric.Accuracy.
        # The last value of `acc` is reused when the result file is written.
        acc = metric.accumulate()

        global_step += 1
        # Progress is logged only once every 5000 batches.
        if global_step % 5000 == 0 :
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, loss, acc))

        # Backward pass, parameter update, learning-rate schedule advance,
        # then gradients are cleared before the next batch.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
    # evaluate(model, criterion, metric, dev_loader)


# model.save_pretrained('/home/aistudio/checkpoint')
# tokenizer.save_pretrained('/home/aistudio/checkpoint')

global step 5000, epoch: 4, batch: 590, loss: 0.09597, acc: 0.94230




## 5. Prediction

In [9]:
# Predicted class index for every test example, in test-file order.
predictions = []

# Inference must run in evaluation mode (dropout disabled) and without
# autograd bookkeeping; otherwise predictions are perturbed by dropout and
# memory is spent on gradients that are never used.  The previous mode is
# remembered so re-running the training cell afterwards behaves as before.
was_training = model.training
model.eval()
with paddle.no_grad():
    for batch in test_loader:
        # The third field is the dummy label added for test data; ignore it.
        input_ids, segment_ids, _ = batch
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        preds = paddle.argmax(probs, axis=1).numpy().tolist()
        predictions.extend(preds)
if was_training:
    model.train()

In [10]:
# Write the last training-accuracy value on the first line, followed by one
# predicted label name per line (no trailing newline), in prediction order.
# UTF-8 is requested explicitly because the label names are Chinese text; the
# `with` block closes the file, so no explicit close() is needed.  Joining
# the labels also copes with an empty prediction list.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write(f'Acc: {acc}\n')
    f.write('\n'.join(idx_to_label[p] for p in predictions))