In [1]:
!pip install paddlepaddle --upgrade

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting paddlepaddle
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/e4/42/3a91bff89038a9773b2df69610293e759ed6e1de6fe115503d1132805103/paddlepaddle-2.2.2-cp37-cp37m-manylinux1_x86_64.whl (108.4 MB)
     |████████████████████████████████| 108.4 MB 3.2 MB/s            
Installing collected packages: paddlepaddle
Successfully installed paddlepaddle-2.2.2
You should consider upgrading via the '/opt/conda/envs/python35-paddle120-env/bin/python -m pip install --upgrade pip' command.


## 1. Load dataset

In [2]:
def load_dataset(fpath, test=False, num_row_to_skip=0):
    """Read a line-oriented text dataset from ``fpath``.

    Args:
        fpath: Path of the text file to read (decoded as UTF-8).
        test: When True, every remaining line is returned as an unlabeled text.
        num_row_to_skip: Number of leading lines to discard before parsing.

    Returns:
        If ``test`` is True: a list with one stripped line per entry.
        Otherwise: a tuple ``(examples, idx_to_label)`` where ``examples`` is a
        list of ``[text, idx]`` pairs and ``idx_to_label`` maps each integer
        ``idx`` to its label string. Lines that do not split into exactly
        three tab-separated fields (``idx``, ``label``, ``text``) are skipped.

    Raises:
        ValueError: If the first field of a three-field line is not an integer.
    """
    # The context manager guarantees the handle is closed (it used to leak), and
    # the explicit encoding keeps decoding independent of the platform locale.
    with open(fpath, encoding='utf-8') as data:
        for _ in range(num_row_to_skip):
            # Default of None: skipping past the end of a short file yields an
            # empty result instead of leaking a StopIteration to the caller.
            next(data, None)

        if test:
            return [line.strip() for line in data]

        out = []
        idx_to_label = {}
        for line in data:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                idx, label, text = fields
                idx = int(idx)
                idx_to_label[idx] = label
                out.append([text, idx])

    return out, idx_to_label

In [3]:
# Load the labelled training file, then preview its size, first two rows and label map.
train_path = './data/data12701/Train.txt'
train_set, idx_to_label = load_dataset(train_path)
len(train_set), train_set[:2], idx_to_label

(752471,
 [['上证50ETF净申购突增', 0], ['交银施罗德保本基金将发行', 0]],
 {0: '财经',
  1: '彩票',
  2: '房产',
  3: '股票',
  4: '家居',
  5: '教育',
  6: '科技',
  7: '社会',
  8: '时尚',
  9: '时政',
  10: '体育',
  11: '星座',
  12: '游戏',
  13: '娱乐'})

In [4]:
# Split the labelled examples into train and dev sets.
# NOTE: this cell shuffles `train_set` in place and then rebinds it to the
# training slice, so it is not idempotent -- re-run the loading cell before
# running this one again, otherwise the split is taken from already-split data.
from random import shuffle, seed

# Number of shuffled examples held out for validation
# (752471 loaded examples -> 652471 train / 100000 dev).
DEV_SIZE = 100000

assert 0 < DEV_SIZE < len(train_set), "DEV_SIZE must be smaller than the dataset"

seed(43)  # fixed seed so the shuffle, and therefore the split, is reproducible
shuffle(train_set)

split_at = len(train_set) - DEV_SIZE
train_set, dev_set = train_set[:split_at], train_set[split_at:]

In [5]:
# Load the unlabeled test file (one text per line) and preview its size and first two rows.
test_path = './data/data12701/Test.txt'
test_set = load_dataset(test_path, test=True)
len(test_set), test_set[:2]

(83599, ['北京君太百货璀璨秋色 满100省353020元', '教育部：小学高年级将开始学习性知识'])

## 2. Transform text

In [6]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.data import Vocab


class TextVectorizer:
    """Tokenize text and map each token to an integer id from a built vocabulary.

    Usage: construct with a tokenizer callable (``str -> iterable of tokens``),
    call ``build_vocab`` on the training texts, then call the instance on a
    string (returns a list of ids) or on a list of strings (returns a list of
    id lists).

    Tokens missing from the vocabulary are mapped to the id of ``UNK_TOKEN``
    when the vocabulary contains that token. The lookup uses ``dict.get`` so it
    never inserts new keys into the mapping; this keeps ``len(self)`` stable
    even if the mapping is a ``defaultdict`` (which the PaddleNLP ``Vocab``
    mapping may be -- TODO confirm against the installed version).
    """

    # Special tokens registered with the vocabulary in build_vocab.
    UNK_TOKEN = '[UNK]'
    PAD_TOKEN = '[PAD]'

    def __init__(self, tokenizer=None):
        """Store the tokenizer; the vocabulary stays unset until build_vocab runs."""
        self.tokenize = tokenizer
        self.vocab_to_idx = None
        self._V = None

    def build_vocab(self, text):
        """Build the vocabulary from an iterable of raw texts."""
        tokens = list(map(self.tokenize, text))
        self._V = Vocab.build_vocab(
            tokens, unk_token=self.UNK_TOKEN, pad_token=self.PAD_TOKEN)
        self.vocab_to_idx = self._V.token_to_idx

    def _unk_index(self):
        """Return the id of UNK_TOKEN, or None when the mapping has no such token."""
        return self.vocab_to_idx.get(self.UNK_TOKEN)

    def text_encoder(self, text):
        """Encode one string into a list of ids, or a list of strings into a list of id lists."""
        if isinstance(text, list):
            return [self(t) for t in text]

        mapping = self.vocab_to_idx
        unk_idx = self._unk_index()  # resolved once per text, not once per token
        tks = self.tokenize(text)
        if unk_idx is None:
            # No unknown-token entry to fall back to: use strict indexing.
            return [mapping[tk] for tk in tks]
        return [mapping.get(tk, unk_idx) for tk in tks]

    def __len__(self):
        """Number of entries in the vocabulary mapping."""
        return len(self.vocab_to_idx)

    def __getitem__(self, w):
        """Return the id of token ``w``, using the same unknown-token fallback as text_encoder."""
        unk_idx = self._unk_index()
        if unk_idx is None:
            return self.vocab_to_idx[w]
        return self.vocab_to_idx.get(w, unk_idx)

    def __call__(self, text):
        """Encode ``text``; raises ValueError when no vocabulary has been built."""
        if self.vocab_to_idx:
            return self.text_encoder(text)
        raise ValueError("No vocab is built!")


def example_converter(example, text_encoder, include_seq_len):
    """Encode one ``(text, label)`` example.

    Args:
        example: A two-item sequence ``(text, label)``.
        text_encoder: Callable that turns ``text`` into a sequence of ids.
        include_seq_len: Whether to also return the length of the encoded text.

    Returns:
        ``(encoded, length, label)`` when ``include_seq_len`` is truthy,
        otherwise ``(encoded, label)``.
    """
    raw_text, target = example
    token_ids = text_encoder(raw_text)
    if not include_seq_len:
        return token_ids, target
    return token_ids, len(token_ids), target


def get_trans_fn(text_encoder, include_seq_len):
    """Return a one-argument function that converts an example via ``example_converter``.

    The returned function closes over ``text_encoder`` and ``include_seq_len``
    so it can be applied to each example on its own.
    """
    def trans_fn(ex):
        return example_converter(ex, text_encoder, include_seq_len)

    return trans_fn


def get_batchify_fn(include_seq_len):
    """Build the collate function used to turn a list of samples into a batch.

    The first field of every sample goes through ``Pad(axis=0, pad_val=0)``;
    each remaining field goes through ``Stack(dtype="int64")``. With
    ``include_seq_len`` there are two such trailing fields (length and label),
    otherwise one (label).

    Returns:
        A callable ``batchify_fn(samples, fn=...)`` that applies the combined
        ``Tuple`` collator to ``samples``.
    """
    int_stack = Stack(dtype="int64")
    # A single Stack instance is shared by every trailing field.
    trailing = [int_stack, int_stack] if include_seq_len else [int_stack]
    collate = Tuple(Pad(axis=0, pad_val=0), *trailing)

    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap ``dataset`` in a ``DataLoader`` that transforms and batches its examples.

    Args:
        dataset: Sequence of examples, or an existing ``MapDataset``.
        trans_fn: Per-example transform registered through ``MapDataset.map``.
        batchify_fn: Collate function handed to the ``DataLoader``.
        test: When True, each item of ``dataset`` is paired with a dummy label 0
            so it has the same ``[text, label]`` layout as labelled examples.
        batch_size: Number of examples per batch.
        shuffle: Whether the batch sampler shuffles the examples.
        sampler: Batch sampler class, called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    examples = [[item, 0] for item in dataset] if test else dataset

    if not isinstance(examples, MapDataset):
        examples = MapDataset(examples)

    # NOTE: map() is called on the dataset object itself, so a MapDataset passed
    # in by the caller receives the transform as well.
    examples.map(trans_fn)

    batches = sampler(examples, shuffle=shuffle, batch_size=batch_size)
    return DataLoader(examples, batch_sampler=batches, collate_fn=batchify_fn)


In [7]:
# Build a character-level vocabulary (tokenizer = list) from the training texts.
text = [example[0] for example in train_set]
V = TextVectorizer(tokenizer=list)
V.build_vocab(text)
print("Number of vocab (char):", len(V))

Number of vocab (char): 5207


In [14]:
# Settings shared by all three loaders.
include_seq_len = True
batch_size = 256

trans_fn = get_trans_fn(V, include_seq_len=include_seq_len)
batchify_fn = get_batchify_fn(include_seq_len=include_seq_len)

train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn, batch_size=batch_size)
dev_loader = create_dataloader(
    dev_set, trans_fn, batchify_fn, batch_size=batch_size)
# test=True attaches dummy labels; shuffle=False leaves the test examples unshuffled.
test_loader = create_dataloader(
    test_set, trans_fn, batchify_fn,
    test=True, shuffle=False, batch_size=batch_size)

## 3. Model building

In [15]:
import paddle, paddlenlp
import paddle.nn as nn
import paddle.nn.functional as F


class LSTM(nn.Layer):
    """Text classifier: embedding -> LSTM encoder -> tanh hidden layer -> class logits."""

    def __init__(self,
                 vocab_size,
                 num_classes,
                 emb_dim=128,
                 padding_idx=0,
                 lstm_hidden_size=198,
                 direction='forward',
                 lstm_layers=1,
                 dropout_rate=0.0,
                 pooling_type=None,
                 fc_hidden_size=96):
        """Create the sub-layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            num_classes: Size of the output (logit) layer.
            emb_dim: Embedding dimension, also the encoder input size.
            padding_idx: Index passed to the embedding as its padding index.
            lstm_hidden_size: Hidden size passed to the LSTM encoder.
            direction: Direction passed to the LSTM encoder
                (the notebook uses 'forward' and 'bidirectional').
            lstm_layers: Number of LSTM layers.
            dropout_rate: Dropout passed to the LSTM encoder.
            pooling_type: Pooling type passed to the LSTM encoder.
            fc_hidden_size: Width of the hidden fully connected layer.
        """
        super().__init__()

        # Token ids -> dense vectors.
        self.embedder = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=padding_idx)

        # Sequence of vectors -> a single text representation.
        self.lstm_encoder = paddlenlp.seq2vec.LSTMEncoder(
            emb_dim,
            lstm_hidden_size,
            num_layers=lstm_layers,
            direction=direction,
            dropout=dropout_rate,
            pooling_type=pooling_type)

        # The encoder reports its own output width, so the FC layer follows
        # whatever the direction / hidden size settings produce.
        encoder_out_dim = self.lstm_encoder.get_output_dim()
        self.fc = nn.Linear(encoder_out_dim, fc_hidden_size)
        self.output_layer = nn.Linear(fc_hidden_size, num_classes)

    def forward(self, text, seq_len):
        """Return the class logits for a batch.

        Args:
            text: Batch of token ids (presumably padded to a common length -- TODO confirm).
            seq_len: Per-example sequence lengths forwarded to the encoder.
        """
        token_vectors = self.embedder(text)
        text_vector = self.lstm_encoder(token_vectors, sequence_length=seq_len)
        hidden = paddle.tanh(self.fc(text_vector))
        return self.output_layer(hidden)


def get_model(model):
    """Wrap a network in ``paddle.Model`` and prepare it for training.

    The wrapper is prepared with an Adam optimizer (learning rate 5e-4) over
    the wrapper's parameters, a cross-entropy loss and an accuracy metric.

    Returns:
        The prepared ``paddle.Model`` instance.
    """
    wrapped = paddle.Model(model)
    adam = paddle.optimizer.Adam(
        learning_rate=5e-4, parameters=wrapped.parameters())
    wrapped.prepare(adam,
                    paddle.nn.CrossEntropyLoss(),
                    paddle.metric.Accuracy())
    return wrapped

In [16]:
# Bidirectional LSTM sized to the vocabulary and the label set, then wrapped for training.
network = LSTM(len(V), len(idx_to_label), direction='bidirectional')
model = get_model(network)

## 4. Model training

In [17]:
from paddle.callbacks import EarlyStopping

# Early-stopping callback with a patience of 3.
earlystop = EarlyStopping(patience=3)

# Train for up to 10 epochs, evaluating on the dev loader and logging every 200 steps.
model.fit(
    train_data=train_loader,
    eval_data=dev_loader,
    epochs=10,
    log_freq=200,
    verbose=2,
    callbacks=[earlystop],
)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step  200/2549 - loss: 0.8721 - acc: 0.4932 - 470ms/step
step  400/2549 - loss: 0.6373 - acc: 0.6327 - 472ms/step
step  600/2549 - loss: 0.5097 - acc: 0.6966 - 475ms/step
step  800/2549 - loss: 0.5186 - acc: 0.7331 - 474ms/step
step 1000/2549 - loss: 0.4713 - acc: 0.7580 - 474ms/step
step 1200/2549 - loss: 0.4068 - acc: 0.7759 - 474ms/step
step 1400/2549 - loss: 0.3895 - acc: 0.7900 - 474ms/step
step 1600/2549 - loss: 0.4493 - acc: 0.8010 - 473ms/step
step 1800/2549 - loss: 0.3655 - acc: 0.8102 - 476ms/step
step 2000/2549 - loss: 0.3071 - acc: 0.8175 - 478ms/step
step 2200/2549 - loss: 0.2517 - acc: 0.8239 - 478ms/step
step 2400/2549 - loss: 0.3058 - acc: 0.8293 - 480ms/step
step 2549/2549 - loss: 0.2621 - acc: 0.8328 - 481ms/step
Eval begin...
step 200/391 - loss: 0.3705 - acc: 0.8881 - 180ms/step
step 391/391 - loss: 0.2344 - acc: 0.8889 - 178ms/step
Eval samples: 

## 5. Prediction

In [18]:
# Run the model over the test loader; logits[0] is iterated batch by batch below.
logits = model.predict(test_loader)

# For every batch, take the class with the highest softmax probability
# and collect the class indices into one flat list.
predictions = []
for batch in logits[0]:
    probs = F.softmax(paddle.to_tensor(batch), axis=1)
    predictions += paddle.argmax(probs, axis=1).numpy().tolist()

Predict begin...
Predict samples: 83599


In [19]:
# Write one predicted label name per line, with no trailing newline.
# - encoding is explicit so the label strings are written as UTF-8 regardless
#   of the platform's default locale;
# - the `with` block closes the file, so no explicit close() is needed;
# - join() also handles an empty prediction list (it writes an empty file
#   instead of failing on predictions[0]).
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(idx_to_label[p] for p in predictions))

In [20]:
# Compress result.txt into result.txt.zip using the system `zip` command.
!zip result.txt.zip result.txt

updating: result.txt (deflated 89%)
