In [1]:
!pip install paddlepaddle --upgrade

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
You should consider upgrading via the '/opt/conda/envs/python35-paddle120-env/bin/python -m pip install --upgrade pip' command.


## 1. Load dataset

In [2]:
def load_dataset(fpath, test=False, num_row_to_skip=0, encoding='utf-8'):
    """Read a tab-separated dataset file into memory.

    Args:
        fpath: Path of the text file to read.
        test: When True, every remaining line is returned as one stripped
            string and no label parsing is done.
        num_row_to_skip: Number of leading lines (e.g. a header) to discard.
            Skipping past the end of the file yields an empty result instead
            of raising.
        encoding: Text encoding used to decode the file.

    Returns:
        If ``test`` is True, a list of stripped lines.
        Otherwise a tuple ``(examples, idx_to_label)`` where ``examples`` is
        a list of ``[text, idx]`` pairs and ``idx_to_label`` maps each integer
        ``idx`` to its label string. Lines that do not split into exactly
        three tab-separated fields are ignored.
    """
    # The context manager guarantees the file handle is closed on every path.
    with open(fpath, encoding=encoding) as data:
        for _ in range(num_row_to_skip):
            # Stop early if the file has fewer lines than requested.
            if next(data, None) is None:
                break

        if test:
            return [line.strip() for line in data]

        out = []
        idx_to_label = {}
        for line in data:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                idx, label, text = fields
                idx = int(idx)
                idx_to_label[idx] = label
                out.append([text, idx])

    return out, idx_to_label

In [3]:
# Parse the labelled training file into examples plus the idx -> label lookup.
train_set, idx_to_label = load_dataset(fpath='./data/data12701/Train.txt', test=False)
# Preview: number of examples, the first two examples, and the label mapping.
(len(train_set), train_set[0:2], idx_to_label)

(752471,
 [['上证50ETF净申购突增', 0], ['交银施罗德保本基金将发行', 0]],
 {0: '财经',
  1: '彩票',
  2: '房产',
  3: '股票',
  4: '家居',
  5: '教育',
  6: '科技',
  7: '社会',
  8: '时尚',
  9: '时政',
  10: '体育',
  11: '星座',
  12: '游戏',
  13: '娱乐'})

In [4]:
# Shuffle reproducibly, then keep the first 652471 examples for training and
# hold out the remainder as the dev set.
# NOTE: this cell rebinds `train_set`; re-running it without reloading the
# data shuffles the already-truncated list and leaves `dev_set` empty.
from random import shuffle, seed

seed(43)
shuffle(train_set)

dev_set = train_set[652471:]
train_set = train_set[:652471]

In [5]:
# Unlabelled test file: one stripped line per example.
test_set = load_dataset(fpath='./data/data12701/Test.txt', test=True)
# Preview: number of examples and the first two lines.
(len(test_set), test_set[0:2])

(83599, ['北京君太百货璀璨秋色 满100省353020元', '教育部：小学高年级将开始学习性知识'])

## 2. Transform text

In [6]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.data import Vocab


class TextVectorizer:
    """Tokenize text and map the tokens to integer ids from a built vocabulary.

    The instance is callable: ``vectorizer(text)`` encodes one string (or a
    list of strings) once ``build_vocab`` has been run.
    """

    # Special tokens registered with the vocabulary in ``build_vocab``.
    UNK_TOKEN = '[UNK]'
    PAD_TOKEN = '[PAD]'

    def __init__(self, tokenizer=None):
        """Store the tokenizer; no vocabulary exists until ``build_vocab``.

        Args:
            tokenizer: Callable turning one string into a sequence of tokens
                (for example ``list`` for character-level tokens).
        """
        self.tokenize = tokenizer
        self.vocab_to_idx = None
        self._V = None

    def build_vocab(self, text):
        """Build the token -> id mapping from an iterable of raw strings."""
        tokens = list(map(self.tokenize, text))
        self._V = Vocab.build_vocab(
            tokens, unk_token=self.UNK_TOKEN, pad_token=self.PAD_TOKEN)
        self.vocab_to_idx = self._V.token_to_idx

    def text_encoder(self, text):
        """Encode a string into a list of ids; a list of strings is encoded item by item.

        Tokens missing from the vocabulary are mapped to the id of
        ``UNK_TOKEN`` when that token is present in the mapping. The lookup
        uses ``get`` so it never depends on (or triggers) any implicit
        default behaviour of the underlying mapping object.

        Raises:
            KeyError: If a token is unknown and the mapping has no ``UNK_TOKEN``.
        """
        if isinstance(text, list):
            return [self(t) for t in text]

        lookup = self.vocab_to_idx
        tks = self.tokenize(text)
        unk_idx = lookup.get(self.UNK_TOKEN)
        if unk_idx is None:
            # No unknown-token entry: fall back to strict lookup.
            return [lookup[tk] for tk in tks]
        return [lookup.get(tk, unk_idx) for tk in tks]

    def __len__(self):
        """Number of entries in the vocabulary (0 before ``build_vocab``)."""
        if self.vocab_to_idx is None:
            return 0
        return len(self.vocab_to_idx)

    def __getitem__(self, w):
        """Return the id stored for token ``w`` by direct lookup in the mapping."""
        return self.vocab_to_idx[w]

    def __call__(self, text):
        """Encode ``text`` via ``text_encoder``.

        Raises:
            ValueError: If no (non-empty) vocabulary has been built yet.
        """
        if self.vocab_to_idx:
            return self.text_encoder(text)
        raise ValueError("No vocab is built!")


def example_converter(example, text_encoder, include_seq_len):
    """Encode one ``(text, label)`` example.

    Args:
        example: Two-item sequence holding the raw text and its label.
        text_encoder: Callable applied to the raw text.
        include_seq_len: Whether to also return the length of the encoding.

    Returns:
        ``(encoded, length, label)`` when ``include_seq_len`` is truthy,
        otherwise ``(encoded, label)``.
    """
    raw_text, target = example
    token_ids = text_encoder(raw_text)
    if not include_seq_len:
        return token_ids, target
    return token_ids, len(token_ids), target


def get_trans_fn(text_encoder, include_seq_len):
    """Bind an encoder and the sequence-length flag into a one-argument example transform."""

    def trans_fn(ex):
        return example_converter(ex, text_encoder, include_seq_len)

    return trans_fn


def get_batchify_fn(include_seq_len):
    """Create the collate function that turns a list of samples into a batch.

    The first field of every sample goes through ``Pad(axis=0, pad_val=0)``;
    each remaining field goes through ``Stack(dtype="int64")``. Two stacked
    fields are expected when ``include_seq_len`` is truthy, otherwise one.
    """
    stacker = Stack(dtype="int64")
    # The same Stack instance is reused for every stacked field.
    stacked_fields = [stacker, stacker] if include_seq_len else [stacker]
    collate = Tuple(Pad(axis=0, pad_val=0), *stacked_fields)

    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset, trans_fn, batchify_fn, test=False,
                      batch_size=128, shuffle=True, sampler=BatchSampler):
    """Wrap a dataset in a ``DataLoader`` with a transform and a collate function.

    Args:
        dataset: Examples to load. Anything that is not already a
            ``MapDataset`` is wrapped in one.
        trans_fn: Per-example transform registered through ``dataset.map``.
        batchify_fn: Collate function handed to the ``DataLoader``.
        test: When True, each item is paired with a placeholder label ``0``.
        batch_size: Batch size given to the sampler.
        shuffle: Shuffle flag given to the sampler.
        sampler: Batch-sampler class (or factory) called with the dataset,
            ``shuffle`` and ``batch_size``.

    Returns:
        The constructed ``DataLoader``.
    """
    # Unlabelled items get a dummy label so they share the [text, label] layout.
    examples = [[item, 0] for item in dataset] if test else dataset
    if not isinstance(examples, MapDataset):
        examples = MapDataset(examples)

    # When the caller passed a MapDataset, map() is applied to that same object.
    examples.map(trans_fn)

    batches = sampler(examples, shuffle=shuffle, batch_size=batch_size)
    return DataLoader(examples, batch_sampler=batches, collate_fn=batchify_fn)


In [7]:
# Build a character-level vocabulary (tokenizer = list) from the texts in train_set.
V = TextVectorizer(list)
text = [example[0] for example in train_set]
V.build_vocab(text)
print("Number of vocab (char):", len(V))

Number of vocab (char): 5207


In [8]:
# Batching configuration shared by all three loaders.
include_seq_len = False
batch_size = 256

trans_fn = get_trans_fn(V, include_seq_len=include_seq_len)
batchify_fn = get_batchify_fn(include_seq_len=include_seq_len)

train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn, batch_size=batch_size)
dev_loader = create_dataloader(
    dev_set, trans_fn, batchify_fn, batch_size=batch_size)
# test=True pairs each unlabelled line with a placeholder label;
# shuffle=False keeps the examples in their original order.
test_loader = create_dataloader(
    test_set, trans_fn, batchify_fn, test=True, shuffle=False, batch_size=batch_size)

## 3. Model building

In [9]:
import paddle 
import paddle.nn as nn
import paddle.nn.functional as F


class CNN(nn.Layer):
    """Convolutional text classifier: embedding -> Conv1D(s) -> max-pool -> two linear layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_dim: Size of the final linear layer's output (one logit per class).
        embedding_dim: Embedding width; also the convolutions' input channels.
        padding_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
        num_filter: Output channels of every convolution.
        filter_sizes: Kernel size of each convolution; one ``Conv1D`` per entry.
        hidden_dim: Width of the hidden linear layer.
        activation: Layer applied after each convolution and after the hidden
            linear layer. ``None`` (the default) creates a fresh ``nn.ReLU()``.
    """

    def __init__(self,
                 vocab_size,
                 output_dim,
                 embedding_dim=100,
                 padding_idx=0,
                 num_filter=256,
                 filter_sizes=(3,),
                 hidden_dim=50,
                 activation=None):

        super().__init__()

        # Create the default activation per instance rather than in the
        # signature, where a single layer object would be built once at
        # definition time and shared by every CNN.
        if activation is None:
            activation = nn.ReLU()

        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx)

        self.convs = nn.LayerList([
            nn.Conv1D(
                in_channels=embedding_dim,
                out_channels=num_filter,
                kernel_size=fz
            ) for fz in filter_sizes
        ])
        # The pooled outputs of all convolutions are concatenated, hence
        # len(filter_sizes) * num_filter input features.
        self.dense = nn.Linear(len(filter_sizes) * num_filter, hidden_dim)
        self.activation = activation
        self.dense_out = nn.Linear(hidden_dim, output_dim)

    def encoder(self, embd):
        """Convolve, activate and max-pool the embedded input, then concatenate.

        Assumes ``embd`` is laid out as (batch, sequence, embedding) — TODO
        confirm; the transpose moves the embedding axis to position 1, where
        the convolutions take their ``in_channels``.
        """
        embd = embd.transpose((0,2,1))
        conved = [self.activation(conv(embd)) for conv in self.convs]
        # Pool every channel down to a single value and drop that length-1 axis.
        max_pooled = [F.adaptive_max_pool1d(conv, output_size=1).squeeze(2) for conv in conved]
        pooled_concat = paddle.concat(max_pooled, axis=1)
        return pooled_concat

    def forward(self, text_ids):
        """Return the output of the final linear layer (no softmax applied) for ``text_ids``."""
        text_embd = self.embedding(text_ids)
        encoded = self.encoder(text_embd)
        hidden_out = self.activation(self.dense(encoded))
        out_logits = self.dense_out(hidden_out)
        return out_logits


def get_model(model, learning_rate=5e-4):
    """Wrap a network in ``paddle.Model`` and prepare it for training.

    Args:
        model: The network to wrap.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The ``paddle.Model`` wrapper, prepared with an Adam optimizer,
        a cross-entropy loss and an accuracy metric.
    """
    # A separate name keeps the raw network and its wrapper distinguishable.
    wrapped = paddle.Model(model)
    optimizer = paddle.optimizer.Adam(
        parameters=wrapped.parameters(), learning_rate=learning_rate)
    criterion = paddle.nn.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()
    wrapped.prepare(optimizer, criterion, metric)
    return wrapped

In [10]:
# Vocabulary size sets the embedding table; one output per label in idx_to_label.
model = get_model(CNN(vocab_size=len(V), output_dim=len(idx_to_label)))

## 4. Model training

In [11]:
from paddle.callbacks import EarlyStopping

# Train for at most 20 epochs, evaluating on dev_loader, with an
# early-stopping callback configured with patience=3.
earlystop = EarlyStopping(patience=3)

model.fit(
    train_data=train_loader,
    eval_data=dev_loader,
    epochs=20,
    verbose=2,
    log_freq=200,
    callbacks=[earlystop],
)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/20


  return (isinstance(seq, collections.Sequence) and


step  200/2549 - loss: 0.8965 - acc: 0.4866 - 103ms/step
step  400/2549 - loss: 0.6837 - acc: 0.6349 - 103ms/step
step  600/2549 - loss: 0.4480 - acc: 0.7015 - 103ms/step
step  800/2549 - loss: 0.5344 - acc: 0.7391 - 103ms/step
step 1000/2549 - loss: 0.4914 - acc: 0.7645 - 102ms/step
step 1200/2549 - loss: 0.5045 - acc: 0.7827 - 102ms/step
step 1400/2549 - loss: 0.3949 - acc: 0.7969 - 102ms/step
step 1600/2549 - loss: 0.3352 - acc: 0.8082 - 102ms/step
step 1800/2549 - loss: 0.2884 - acc: 0.8172 - 102ms/step
step 2000/2549 - loss: 0.2896 - acc: 0.8250 - 102ms/step
step 2200/2549 - loss: 0.2584 - acc: 0.8315 - 101ms/step
step 2400/2549 - loss: 0.2963 - acc: 0.8372 - 102ms/step
step 2549/2549 - loss: 0.3044 - acc: 0.8409 - 102ms/step
Eval begin...
step 200/391 - loss: 0.3078 - acc: 0.9015 - 37ms/step
step 391/391 - loss: 0.2728 - acc: 0.9019 - 36ms/step
Eval samples: 100000
Epoch 2/20
step  200/2549 - loss: 0.3650 - acc: 0.9131 - 101ms/step
step  400/2549 - loss: 0.1999 - acc: 0.9140 - 10

## 5. Prediction

In [12]:
# Collect the predicted class index of every test example, batch by batch.
# logits[0] is iterated as the sequence of per-batch score arrays.
# NOTE(review): assumes each batch is a NumPy array (or tensor) exposing
# .argmax(axis=...) and .tolist() — confirm against paddle.Model.predict.
predictions = []
logits = model.predict(test_loader)
for batch in logits[0]:
    # Softmax is monotonic, so the arg-max of the raw scores is already the
    # predicted class; no tensor conversion or normalisation is needed.
    predictions.extend(batch.argmax(axis=1).tolist())

Predict begin...
Predict samples: 83599


In [13]:
# Write one predicted label name per line, with no trailing newline.
# UTF-8 is set explicitly so the label names are encoded identically on every
# platform; join() also copes with an empty prediction list, and the `with`
# block closes the file by itself.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(idx_to_label[p] for p in predictions))

In [14]:
# Compress result.txt into result.txt.zip with the shell's zip tool.
!zip result.txt.zip result.txt

updating: result.txt (deflated 89%)
