## 1. Load dataset

In [1]:
def load_dataset(fpath, test=False, num_row_to_skip=0):
    """Read a text-classification data file into Python lists.

    Args:
        fpath: Path of the text file to read (decoded as UTF-8).
        test: When True, every remaining line is treated as one unlabeled
            text and returned stripped of surrounding whitespace.
        num_row_to_skip: Number of leading lines (e.g. a header) to discard.
            Skipping more lines than the file contains yields empty results.

    Returns:
        If ``test`` is True: a list of stripped lines.
        Otherwise: a tuple ``(examples, idx_to_label)`` where ``examples`` is
        a list of ``[text, idx]`` pairs and ``idx_to_label`` maps the integer
        class index to its label string.

    Raises:
        ValueError: if the first field of a three-field row is not an integer.
    """
    # The context manager guarantees the handle is released even when a
    # malformed row makes int() raise part-way through the file.
    with open(fpath, encoding='utf-8') as data:
        for _ in range(num_row_to_skip):
            # A default for next() avoids leaking StopIteration to the caller
            # when the file is shorter than the requested number of rows.
            if next(data, None) is None:
                break

        if test:
            return [line.strip() for line in data]

        out = []
        idx_to_label = {}
        for line in data:
            fields = line.strip().split('\t')
            # Rows that do not have exactly idx, label and text are ignored.
            if len(fields) == 3:
                idx, label, text = fields
                idx = int(idx)
                idx_to_label[idx] = label
                out.append([text, idx])

    return out, idx_to_label

In [2]:
# Parse the labelled training file into [text, idx] pairs plus the
# idx -> label lookup table, then preview the size and first two examples.
train_set, idx_to_label = load_dataset(fpath='./data/data12701/Train.txt')
(len(train_set), train_set[:2], idx_to_label)

(752471,
 [['上证50ETF净申购突增', 0], ['交银施罗德保本基金将发行', 0]],
 {0: '财经',
  1: '彩票',
  2: '房产',
  3: '股票',
  4: '家居',
  5: '教育',
  6: '科技',
  7: '社会',
  8: '时尚',
  9: '时政',
  10: '体育',
  11: '星座',
  12: '游戏',
  13: '娱乐'})

In [3]:
# split the train_set into train and dev sets
from random import shuffle, seed

# Number of shuffled examples held out for validation. Deriving the split
# index from the data length (instead of hard-coding it) keeps the dev set
# the same size if the input file or the number of skipped rows changes.
DEV_SIZE = 100000
assert len(train_set) > DEV_SIZE, "not enough examples to hold out a dev set"

# Fixed seed so the train/dev partition is reproducible.
seed(43)
shuffle(train_set)

n_total = len(train_set)
split_at = n_total - DEV_SIZE
# NOTE: this rebinds train_set, so re-run the loading cell before re-running
# this one; otherwise train_set shrinks by DEV_SIZE again.
train_set, dev_set = train_set[:split_at], train_set[split_at:]
assert len(train_set) + len(dev_set) == n_total

In [4]:
# The test file carries no labels, so each line is loaded as raw text.
test_set = load_dataset(fpath='./data/data12701/Test.txt', test=True)
(len(test_set), test_set[:2])

(83599, ['北京君太百货璀璨秋色 满100省353020元', '教育部：小学高年级将开始学习性知识'])

## 2. Transform text

In [5]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.embeddings import TokenEmbedding
import jieba


# Name of the pretrained embedding to load; its vocabulary is reused below
# to turn segmented tokens into integer ids.
name = "w2v.baidu_encyclopedia.target.word-word.dim300"
token_embedding = TokenEmbedding(
    embedding_name=name,
)
vocab = token_embedding.vocab


def text_encoder(text):
    """Segment ``text`` with jieba and look every token up in ``vocab``.

    Returns the list of vocabulary lookups, one per token, in token order.
    """
    token_ids = []
    for token in jieba.lcut(text):
        token_ids.append(vocab[token])
    return token_ids


def example_converter(example, text_encoder):
    """Encode the text half of a ``(text, label)`` example.

    Args:
        example: Two-item sequence holding the raw text and its label.
        text_encoder: Callable applied to the raw text.

    Returns:
        A tuple ``(text_encoder(text), label)``; the label is passed through
        untouched.
    """
    raw_text, label = example
    return text_encoder(raw_text), label


def get_trans_fn(text_encoder=text_encoder):
    """Build a one-argument transform that encodes a single example.

    The returned callable forwards its example, together with the given
    ``text_encoder``, to ``example_converter``.
    """

    def _transform(ex):
        return example_converter(ex, text_encoder)

    return _transform


def get_batchify_fn():
    """Build the collate function that turns a list of samples into a batch.

    The first field of each sample goes through ``Pad`` (filled with the id
    of the ``[PAD]`` token), the second through ``Stack`` as int64.
    """
    collate = Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),
        Stack(dtype="int64"),
    )

    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap raw examples in a ``DataLoader``.

    Args:
        dataset: Examples to serve; a plain sequence or a ``MapDataset``.
        trans_fn: Per-example transform registered through ``dataset.map``.
        batchify_fn: Collate function handed to the ``DataLoader``.
        test: When True, ``dataset`` holds bare texts and each one is paired
            with a dummy label ``0`` so it has the same layout as a training
            example.
        batch_size: Number of examples per batch.
        shuffle: Forwarded to the batch sampler.
        sampler: Batch sampler class, called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    if test:
        dataset = [[text, 0] for text in dataset]

    if not isinstance(dataset, MapDataset):
        dataset = MapDataset(dataset)

    dataset.map(trans_fn)

    return DataLoader(
        dataset,
        batch_sampler=sampler(dataset, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

[2022-03-26 05:30:12,804] [    INFO] - Loading token embedding...
W0326 05:30:17.112741  5713 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0326 05:30:17.118315  5713 device_context.cc:465] device: 0, cuDNN Version: 7.6.
[2022-03-26 05:30:21,462] [    INFO] - Finish loading embedding vector.
[2022-03-26 05:30:21,504] [    INFO] - Token Embedding info:             
Unknown index: 635963             
Unknown token: [UNK]             
Padding index: 635964             
Padding token: [PAD]             
Shape :[635965, 300]


In [6]:
batch_size = 1024
trans_fn = get_trans_fn(text_encoder=text_encoder)
batchify_fn = get_batchify_fn()

# Train and dev loaders use the default shuffling.
train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn,
    batch_size=batch_size,
)
dev_loader = create_dataloader(
    dev_set, trans_fn, batchify_fn,
    batch_size=batch_size,
)
# The test loader must keep the file order so that predictions line up with
# the input lines.
test_loader = create_dataloader(
    test_set, trans_fn, batchify_fn,
    test=True,
    shuffle=False,
    batch_size=batch_size,
)

## 3. Model preparation

In [7]:
import paddle 
import paddle.nn as nn
import paddle.nn.functional as F


class CNN(nn.Layer):
    """Convolutional text classifier: embedding -> Conv1D(s) -> max-pool -> MLP.

    Args:
        embedding: Layer mapping token ids to vectors; assumed to return a
            tensor laid out as (batch, seq_len, embedding_dim) -- TODO confirm.
        output_dim: Number of classes (size of the logits).
        embedding_dim: Width of the embedding vectors; used as the number of
            input channels of every convolution.
        dropout: Dropout probability, applied to the embeddings and to the
            hidden layer.
        padding_idx: Accepted for interface compatibility; not used by this
            class.
        num_filter: Output channels of each convolution.
        filter_sizes: Kernel size of each parallel convolution.
        hidden_dim: Width of the hidden fully connected layer.
        activation: Activation layer applied after the convolutions and after
            ``fc1``. ``None`` (the default) creates a fresh ``nn.ReLU()`` for
            this instance.
    """

    def __init__(self, 
                 embedding,
                 output_dim,
                 embedding_dim,
                 dropout=0.5, 
                 padding_idx=vocab['[PAD]'],
                 num_filter=256,
                 filter_sizes=(3,),
                 hidden_dim=50,
                 activation=None):
        
        super().__init__()

        # Materialise once so generators / other iterables work with len().
        filter_sizes = tuple(filter_sizes)

        self.embedding = embedding
        self.dropout = nn.Dropout(dropout)
        
        self.convs = nn.LayerList([
            nn.Conv1D(
                in_channels=embedding_dim,
                out_channels=num_filter,
                kernel_size=fz
            ) for fz in filter_sizes
        ])
        self.fc1 = nn.Linear(len(filter_sizes) * num_filter, hidden_dim)
        # Build the default activation per instance rather than in the
        # signature, where a single Layer object would be created at
        # definition time and shared by every model.
        self.act = nn.ReLU() if activation is None else activation
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def encoder(self, embd):
        """Convolve, activate and max-pool the embedded sequence.

        Returns the per-filter maxima of all convolutions concatenated along
        axis 1.
        """
        # Swap the last two axes so the embedding dimension becomes the
        # channel axis that Conv1D convolves over.
        embd = embd.transpose((0,2,1))
        conved = [self.act(conv(embd)) for conv in self.convs]
        # Pool every feature map down to length 1, then drop that axis.
        max_pooled = [F.adaptive_max_pool1d(conv, output_size=1).squeeze(2) for conv in conved]
        pooled_concat = paddle.concat(max_pooled, axis=1)
        return pooled_concat

    def forward(self, text_ids):
        """Return the unnormalised class scores (logits) for ``text_ids``."""
        embd = self.dropout(self.embedding(text_ids))
        encoded = self.encoder(embd)
        hidden = self.dropout(self.act(self.fc1(encoded)))
        logits = self.fc2(hidden)
        return logits


In [8]:

from paddlenlp.transformers import LinearDecayWithWarmup

# Training hyper-parameters.
epoch = 30
weight_decay = 0.001
warmup_proportion = 0.01

# The schedule spans every optimisation step of the whole run:
# batches per epoch times the number of epochs.
lr_scheduler = LinearDecayWithWarmup(
    learning_rate=5e-3,
    total_steps=len(train_loader) * epoch,
    warmup=warmup_proportion,
)

def get_model(model):
    """Wrap a network in ``paddle.Model`` ready for ``fit`` / ``predict``.

    Builds an AdamW optimizer driven by the module-level ``lr_scheduler`` and
    ``weight_decay``, a cross-entropy loss and an accuracy metric, and
    registers them through ``prepare``.

    Args:
        model: The network to train.

    Returns:
        The prepared ``paddle.Model`` wrapper.
    """
    # Parameters whose structured name mentions "bias" or "norm" are excluded
    # from weight decay.
    no_decay_markers = ["bias", "norm"]
    decay_params = [
        param.name
        for pname, param in model.named_parameters()
        if all(marker not in pname for marker in no_decay_markers)
    ]

    def _should_decay(param_name):
        return param_name in decay_params

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=_should_decay,
    )
    criterion = paddle.nn.CrossEntropyLoss()

    wrapped = paddle.Model(model)
    metric = paddle.metric.Accuracy()
    wrapped.prepare(optimizer, criterion, metric)
    return wrapped

In [9]:
# One output unit per label; 300 matches the "dim300" embedding loaded above.
model = get_model(CNN(token_embedding, len(idx_to_label), 300))

## 4. Model training

In [10]:
from paddle.callbacks import EarlyStopping

# Stop once the monitored evaluation value has not improved for 5 evaluations
# and keep the best model seen so far.
earlystop = EarlyStopping(patience=5, save_best_model=True)

# Train with the dev loader as evaluation data so early stopping has
# something to monitor.
model.fit(
    train_data=train_loader,
    eval_data=dev_loader,
    epochs=epoch,
    callbacks=[earlystop],
    verbose=2,
    log_freq=100,
    save_dir="ckpt",
    save_freq=100,
)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/30


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.638 seconds.
Prefix dict has been built successfully.


step 100/638 - loss: 0.9639 - acc: 0.4530 - 132ms/step
step 200/638 - loss: 0.6233 - acc: 0.6103 - 126ms/step
step 300/638 - loss: 0.5796 - acc: 0.6876 - 125ms/step
step 400/638 - loss: 0.4914 - acc: 0.7321 - 124ms/step
step 500/638 - loss: 0.4557 - acc: 0.7610 - 123ms/step
step 600/638 - loss: 0.4493 - acc: 0.7813 - 123ms/step
step 638/638 - loss: 0.2977 - acc: 0.7876 - 122ms/step
save checkpoint at /home/aistudio/ckpt/0
Eval begin...
step 98/98 - loss: 0.2864 - acc: 0.9158 - 108ms/step
Eval samples: 100000
Epoch 2/30
step 100/638 - loss: 0.3053 - acc: 0.9033 - 124ms/step
step 200/638 - loss: 0.2993 - acc: 0.9036 - 120ms/step
step 300/638 - loss: 0.3253 - acc: 0.9032 - 119ms/step
step 400/638 - loss: 0.4150 - acc: 0.9043 - 119ms/step
step 500/638 - loss: 0.3626 - acc: 0.9045 - 118ms/step
step 600/638 - loss: 0.2961 - acc: 0.9049 - 118ms/step
step 638/638 - loss: 0.3896 - acc: 0.9050 - 118ms/step
Eval begin...
step 98/98 - loss: 0.2572 - acc: 0.9246 - 105ms/step
Eval samples: 100000
Ep

## 5. Prediction

In [11]:

import paddle.nn.functional as F


# Flat list of predicted class indices, in test-file order.
predictions = []
logits = model.predict(test_loader)

# logits[0] holds one array of scores per batch.
for batch in logits[0]:
    batch = paddle.to_tensor(batch)
    probs = F.softmax(batch, axis=1)
    # Most probable class for every row of the batch.
    preds = paddle.argmax(probs, axis=1).numpy().tolist()
    predictions += preds

Predict begin...
Predict samples: 83599


In [12]:
# Write one predicted label per line, with no trailing newline.
# The labels are Chinese strings, so the encoding is pinned to UTF-8 instead
# of relying on the platform default. Joining also handles an empty
# prediction list, and the `with` block closes the file on its own.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(idx_to_label[p] for p in predictions))