## 1. Load dataset

In [1]:
def load_dataset(fpath, test=False, num_row_to_skip=0, encoding='utf-8'):
    """Read a line-oriented dataset file into memory.

    Args:
        fpath: Path of the text file to read.
        test: When True, the file is treated as unlabeled: every remaining
            line is stripped and returned as-is.
        num_row_to_skip: Number of leading lines (e.g. a header) to discard
            before parsing. Skipping more lines than the file contains
            simply yields an empty result.
        encoding: Text encoding used to decode the file. Set explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        If ``test`` is True, a list of stripped lines.
        Otherwise a tuple ``(out, idx_to_label)`` where ``out`` is a list of
        ``[text, idx]`` pairs and ``idx_to_label`` maps each integer ``idx``
        to its label string. Lines that do not split into exactly three
        tab-separated fields (``idx<TAB>label<TAB>text``) are skipped.

    Raises:
        ValueError: If the first field of a three-field line is not an integer.
    """
    # The context manager guarantees the handle is closed on every return path.
    with open(fpath, encoding=encoding) as data:
        for _ in range(num_row_to_skip):
            # A default avoids a bare StopIteration when the file is shorter
            # than the requested skip count.
            next(data, None)

        if test:
            return [line.strip() for line in data]

        out = []
        idx_to_label = {}
        for line in data:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                idx, label, text = fields
                idx = int(idx)
                idx_to_label[idx] = label
                out.append([text, idx])

    return out, idx_to_label

In [2]:
# Parse the labelled training file into [text, idx] pairs plus the idx -> label-name mapping.
train_set, idx_to_label = load_dataset('./data/data12701/Train.txt')
# Display the row count, the first two rows and the full label mapping.
len(train_set), train_set[:2], idx_to_label

(752471,
 [['上证50ETF净申购突增', 0], ['交银施罗德保本基金将发行', 0]],
 {0: '财经',
  1: '彩票',
  2: '房产',
  3: '股票',
  4: '家居',
  5: '教育',
  6: '科技',
  7: '社会',
  8: '时尚',
  9: '时政',
  10: '体育',
  11: '星座',
  12: '游戏',
  13: '娱乐'})

In [3]:
# Split the labelled data into train and dev sets.
from random import shuffle, seed

DEV_SIZE = 100000  # number of shuffled examples held out for validation

seed(43)  # fixed seed so the shuffle (and therefore the split) is reproducible
shuffle(train_set)

# Derive the cut point from the data size instead of hard-coding an absolute
# index, so the split stays valid if the training file changes.
split_at = len(train_set) - DEV_SIZE
assert split_at > 0, "train_set has no more than DEV_SIZE rows; nothing would be left for training"

# NOTE: this rebinds train_set, so re-running this cell without re-running the
# loading cell carves another DEV_SIZE rows off the already reduced train set.
train_set, dev_set = train_set[:split_at], train_set[split_at:]

In [4]:
# Load the unlabeled test file: with test=True each line comes back as one stripped string.
test_set = load_dataset('./data/data12701/Test.txt', test=True)
# Display the number of test texts and the first two of them.
len(test_set), test_set[:2]

(83599, ['北京君太百货璀璨秋色 满100省353020元', '教育部：小学高年级将开始学习性知识'])

## 2. Transform text

In [5]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp import transformers
import numpy as np


# Name of the pretrained checkpoint; the classifier is later loaded from this same name.
MODEL_NAME = "ernie-1.0"
# Tokenizer loaded for MODEL_NAME; also the default tokenizer of get_batchify_fn.
tokenizer = transformers.ErnieTokenizer.from_pretrained(MODEL_NAME)


def example_converter(example, tokenizer, max_seq_length=128):
    """Turn one ``(text, label)`` example into model input fields.

    Args:
        example: A two-item sequence ``(text, label)``.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_seq_len=...)``
            that returns a mapping with ``"input_ids"`` and
            ``"token_type_ids"`` entries.
        max_seq_length: Forwarded to the tokenizer as ``max_seq_len``.

    Returns:
        A tuple ``(input_ids, token_type_ids, label)`` where ``label`` is a
        one-element ``int64`` NumPy array.
    """
    sentence, class_idx = example
    features = tokenizer(text=sentence, max_seq_len=max_seq_length)
    ids, segment_ids = features["input_ids"], features["token_type_ids"]
    # Wrap the scalar label in a length-1 int64 array.
    label_array = np.array([class_idx], dtype="int64")
    return ids, segment_ids, label_array


def get_trans_fn(text_encoder, max_seq_length=128):
    """Return a one-argument transform bound to a tokenizer and max length.

    The returned callable takes a single example and forwards it to
    ``example_converter`` together with ``text_encoder`` and
    ``max_seq_length``.
    """
    def trans_fn(ex):
        return example_converter(ex, text_encoder, max_seq_length)

    return trans_fn


def get_batchify_fn(tokenizer=tokenizer):
    """Build the collate function used to assemble a batch of samples.

    The collator is a ``Tuple`` of three field handlers: a ``Pad`` using
    ``tokenizer.pad_token_id``, a ``Pad`` using
    ``tokenizer.pad_token_type_id`` and an int64 ``Stack``. They are
    presumably applied position-wise to the
    ``(input_ids, token_type_ids, label)`` triples produced by
    ``example_converter`` -- confirm against the PaddleNLP data docs.

    Args:
        tokenizer: Object providing ``pad_token_id`` and
            ``pad_token_type_id``; defaults to the notebook-level tokenizer.

    Returns:
        A callable ``batchify_fn(samples)`` that applies the collator.
    """
    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64"),
    )

    # The collator is bound as a default argument so it is built exactly once.
    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap a dataset in a ``DataLoader`` after applying ``trans_fn``.

    Args:
        dataset: A ``MapDataset`` or any object accepted by ``MapDataset(...)``.
            With ``test=True`` it must be an iterable of bare texts.
        trans_fn: Per-example transform handed to ``dataset.map``.
        batchify_fn: Collate function passed to the ``DataLoader``.
        test: When True, each item is paired with a placeholder label ``0``
            so it has the same ``[text, label]`` layout as labelled data.
        batch_size: Batch size given to the sampler.
        shuffle: Shuffle flag given to the sampler.
        sampler: Batch-sampler class (or factory) called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    if test:
        dataset = [[text, 0] for text in dataset]

    mapped = dataset if isinstance(dataset, MapDataset) else MapDataset(dataset)

    # The return value of map() is ignored, so this relies on map() updating
    # the dataset in place -- TODO confirm against the MapDataset docs.
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

[2022-03-24 01:43:04,962] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/ernie/vocab.txt and saved to /home/aistudio/.paddlenlp/models/ernie-1.0
[2022-03-24 01:43:04,965] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/ernie/vocab.txt
100%|██████████| 90/90 [00:00<00:00, 2881.80it/s]


In [6]:
# Tokenization and batching settings shared by all three loaders.
max_seq_length = 16
batch_size = 256

trans_fn = get_trans_fn(tokenizer, max_seq_length)
batchify_fn = get_batchify_fn()

train_loader = create_dataloader(
    dataset=train_set,
    trans_fn=trans_fn,
    batchify_fn=batchify_fn,
    batch_size=batch_size,
)
dev_loader = create_dataloader(
    dataset=dev_set,
    trans_fn=trans_fn,
    batchify_fn=batchify_fn,
    batch_size=batch_size,
)
# The test texts are unlabeled (test=True) and are not shuffled, so the
# predictions come out in the same order as the lines of the test file.
test_loader = create_dataloader(
    dataset=test_set,
    trans_fn=trans_fn,
    batchify_fn=batchify_fn,
    test=True,
    batch_size=batch_size,
    shuffle=False,
)

## 3. Model preparation

In [8]:
import paddle 
from paddlenlp.transformers import LinearDecayWithWarmup


# Classifier head sized to the number of labels found in the training file.
model = transformers.ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=len(idx_to_label))

learning_rate = 5e-5; epochs = 5
warmup_proportion = 0.1; weight_decay = 0.01
# One scheduler step is taken per training batch, so the schedule spans all epochs.
num_training_steps = len(train_loader) * epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)

# Names of the parameters that receive weight decay: every parameter whose
# structured name contains neither "bias" nor "norm". Built once as a set;
# previously the whole list was rebuilt inside the lambda on every call.
decay_param_names = {
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
}
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in decay_param_names)

criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()


@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """
    Run the model over a dataset, print and return the mean loss and metric.

    The model is switched to eval mode for the pass and always switched back
    to train mode afterwards; ``metric`` is reset both before and after, so
    counts never leak between evaluation and the caller's own bookkeeping.

    Args:
        model(obj:`paddle.nn.Layer`): A model to classify texts.
        criterion(obj:`paddle.nn.Layer`): It can compute the loss.
        metric(obj:`paddle.metric.Metric`): The evaluation metric.
        data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches
            of ``(input_ids, token_type_ids, labels)``.

    Returns:
        tuple: ``(mean_loss, accu)`` -- the mean of the per-batch losses
        (``nan`` when the loader yields no batches) and the accumulated metric value.
    """
    model.eval()
    metric.reset()
    losses = []
    try:
        for batch in data_loader:
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)
            losses.append(loss.numpy())
            correct = metric.compute(logits, labels)
            metric.update(correct)

        # Accumulate once after the loop: avoids recomputing it per batch and
        # keeps `accu` defined even when the loader yields no batches.
        accu = metric.accumulate()
        mean_loss = float(np.mean(losses)) if losses else float("nan")
        print("eval loss: %.5f, accu: %.5f" % (mean_loss, accu))
    finally:
        # Restore training state even if a batch raised.
        model.train()
        metric.reset()
    return mean_loss, accu

[2022-03-24 01:44:02,683] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-1.0/ernie_v1_chn_base.pdparams


## 4. Model training

In [9]:
import paddle.nn.functional as F


LOG_EVERY_N_STEPS = 200                      # print training progress every N optimizer steps
CHECKPOINT_DIR = '/home/aistudio/checkpoint'  # where the fine-tuned model and tokenizer are saved

global_step = 0
paddle.set_device("gpu")

# Start from a known state instead of relying on whatever an earlier cell or
# an interrupted run left behind.
model.train()
metric.reset()

for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_loader, start=1):
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)

        global_step += 1
        if global_step % LOG_EVERY_N_STEPS == 0:
            # Running accuracy since the metric was last reset; only computed when logged.
            acc = metric.accumulate()
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, loss, acc))

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
    # evaluate() resets the metric and puts the model back in train mode, so
    # each epoch's running accuracy starts from zero.
    evaluate(model, criterion, metric, dev_loader)


model.save_pretrained(CHECKPOINT_DIR)
tokenizer.save_pretrained(CHECKPOINT_DIR)

global step 200, epoch: 1, batch: 200, loss: 0.87745, acc: 0.36180
global step 400, epoch: 1, batch: 400, loss: 0.39251, acc: 0.60229
global step 600, epoch: 1, batch: 600, loss: 0.37153, acc: 0.69855
global step 800, epoch: 1, batch: 800, loss: 0.44869, acc: 0.75039
global step 1000, epoch: 1, batch: 1000, loss: 0.30168, acc: 0.78260
global step 1200, epoch: 1, batch: 1200, loss: 0.27262, acc: 0.80453
global step 1400, epoch: 1, batch: 1400, loss: 0.24838, acc: 0.82132
global step 1600, epoch: 1, batch: 1600, loss: 0.20881, acc: 0.83400
global step 1800, epoch: 1, batch: 1800, loss: 0.21726, acc: 0.84434
global step 2000, epoch: 1, batch: 2000, loss: 0.16612, acc: 0.85281
global step 2200, epoch: 1, batch: 2200, loss: 0.24940, acc: 0.85981
global step 2400, epoch: 1, batch: 2400, loss: 0.24411, acc: 0.86590
eval loss: 0.18752, accu: 0.94085
global step 2600, epoch: 2, batch: 51, loss: 0.15898, acc: 0.94462
global step 2800, epoch: 2, batch: 251, loss: 0.17325, acc: 0.94472
global step

## 5. Prediction

In [10]:
predictions = []

# evaluate() switches the model back to train mode when it finishes, so
# inference must select eval mode explicitly; otherwise any train-mode-only
# behaviour (e.g. dropout) would still be active while predicting.
model.eval()

# Gradients are not needed for prediction.
with paddle.no_grad():
    for batch in test_loader:
        # The third field is the placeholder label added for test data; a
        # named variable avoids clobbering IPython's `_` (last output).
        input_ids, segment_ids, _placeholder_labels = batch
        logits = model(input_ids, segment_ids)
        # Softmax is monotonic, so the argmax of the raw logits is the predicted class.
        preds = paddle.argmax(logits, axis=1).numpy().tolist()
        predictions.extend(preds)

In [11]:
# Write one predicted label name per line, with no trailing newline.
# join() also handles an empty prediction list (the file is simply empty),
# and the explicit encoding keeps the Chinese labels independent of the
# platform default. The with-block closes the file, so no f.close() is needed.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(idx_to_label[p] for p in predictions))

In [12]:
# Shell escape: compress result.txt into result.txt.zip with the `zip` command-line tool.
!zip result.txt.zip result.txt

  adding: result.txt (deflated 89%)
