In [1]:
# !unzip /home/aistudio/data/data78992/lcqmc.zip -d /home/aistudio/data/
# !unzip /home/aistudio/data/data78992/paws-x-zh.zip -d /home/aistudio/data/
# !unzip /home/aistudio/data/data78992/bq_corpus.zip -d /home/aistudio/data/

## 1. Load dataset

In [2]:
def load_dataset(fpath, num_row_skip=0):
    """Load one or several tab-separated sentence-pair files into lists.

    Args:
        fpath: A path (``str``) or a list/tuple of paths.
        num_row_skip: Number of leading lines to drop from every file
            (e.g. a header row). Skipping past the end of a file simply
            yields no examples.

    Returns:
        For a single path, a list of examples; for a list/tuple of paths,
        one such list per path, in the same order. Files whose path contains
        the substring ``"test"`` yield ``(text_a, text_b)`` pairs; all other
        files yield ``(text_a, text_b, int(label))`` triples, and lines that
        do not split into exactly three tab-separated fields are ignored.

    Raises:
        TypeError: If ``fpath`` is neither a ``str`` nor a list/tuple.
    """

    def read(fp):
        # The path is inspected (not the content) to decide whether labels exist.
        is_test = "test" in fp

        # `with` guarantees the handle is released even if parsing fails, and
        # the explicit encoding keeps non-ASCII text independent of the locale.
        with open(fp, encoding="utf-8") as data:
            for _ in range(num_row_skip):
                # A default prevents StopIteration from escaping the generator,
                # which Python would turn into a RuntimeError.
                next(data, None)

            for line in data:
                stripped = line.strip()
                if not stripped:
                    # Blank lines (typically a trailing newline) carry no example.
                    continue

                fields = stripped.split('\t')
                if is_test:
                    yield fields[0], fields[1]
                elif len(fields) == 3:
                    yield fields[0], fields[1], int(fields[2])

    if isinstance(fpath, str):
        return list(read(fpath))
    elif isinstance(fpath, (list, tuple)):
        return [list(read(fp)) for fp in fpath]
    else:
        raise TypeError("Input fpath must be a str or a list/tuple of str")

In [3]:
# Read the three PAWS-X (zh) splits with a single call; the path containing
# "test" is loaded as unlabeled pairs, the other two as labeled triples.
train_set, dev_set, test_set = load_dataset([
    './data/paws-x-zh/train.tsv',
    './data/paws-x-zh/dev.tsv',
    './data/paws-x-zh/test.tsv',
])
# Quick size check while exploring: len(train_set), len(dev_set), len(test_set)

# Fold the dev examples into the training data; dev_set itself stays bound.
train_set = [*train_set, *dev_set]

## 2. Transform text

In [4]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.embeddings import TokenEmbedding
import jieba


# Identifier of the pretrained embedding handed to TokenEmbedding below.
name = "w2v.baidu_encyclopedia.target.word-word.dim300"
# The loading log captured in this notebook reports shape [635965, 300] with
# [UNK] at index 635963 and [PAD] at index 635964.
token_embedding = TokenEmbedding(embedding_name=name)
# Vocabulary object used by text_encoder (token lookup) and batchify_fn ([PAD] id).
vocab = token_embedding.vocab


def text_encoder(text):
    """Segment ``text`` with jieba and look every token up in ``vocab``.

    Returns a list holding one ``vocab[token]`` result per token, in the
    order jieba produced them. How out-of-vocabulary tokens are resolved is
    whatever ``vocab``'s item lookup does (presumably the [UNK] id - verify).
    """
    ids = []
    for token in jieba.lcut(text):
        ids.append(vocab[token])
    return ids


def example_converter(example, text_encoder):
    """Encode both texts of a ``(text_a, text_b, label)`` example.

    Args:
        example: A sequence of exactly three items: two texts and a label.
        text_encoder: Callable applied to ``text_a`` first, then ``text_b``.

    Returns:
        ``(text_encoder(text_a), text_encoder(text_b), label)``; the label is
        passed through untouched.
    """
    first, second, target = example
    encoded_first = text_encoder(first)
    encoded_second = text_encoder(second)
    return (encoded_first, encoded_second, target)


def get_trans_fn(text_encoder=text_encoder):
    """Build a one-argument transform that runs ``example_converter``.

    Args:
        text_encoder: Encoder bound into the returned callable; defaults to
            the module-level ``text_encoder``.

    Returns:
        A callable taking a single example and returning
        ``example_converter(example, text_encoder)``.
    """
    def trans_fn(ex):
        return example_converter(ex, text_encoder)

    return trans_fn


def batchify_fn(samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),
        Pad(axis=0, pad_val=vocab['[PAD]']),
        Stack(dtype="int64"))):
    """Collate a list of ``(text_a_ids, text_b_ids, label)`` samples.

    The collator is built once, at definition time, and kept as the default
    of ``fn``: the two id sequences go through ``Pad`` using the ``[PAD]`` id
    from ``vocab`` and the labels go through ``Stack`` as int64.

    Returns:
        Whatever ``fn(samples)`` returns.
    """
    return fn(samples)


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap examples in a ``MapDataset``, transform them and build a ``DataLoader``.

    Args:
        dataset: Examples, either a ``MapDataset`` or something ``MapDataset``
            accepts. With ``test=True`` each example must support
            ``example + (0,)`` (i.e. be a tuple).
        trans_fn: Per-example transform handed to ``MapDataset.map``.
        batchify_fn: Collate function handed to the ``DataLoader``.
        test: When true, a placeholder label ``0`` is appended to every
            example so unlabeled data has the same arity as labeled data.
        batch_size: Forwarded to ``sampler``.
        shuffle: Forwarded to ``sampler``.
        sampler: Batch-sampler class, called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    if test:
        dataset = [example + (0,) for example in dataset]

    mapped = dataset if isinstance(dataset, MapDataset) else MapDataset(dataset)

    # The return value of map() is ignored, so this relies on map() applying
    # the transform to `mapped` itself; a MapDataset passed in by the caller
    # is therefore transformed too.
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

[2022-03-26 00:00:44,401] [    INFO] - Loading token embedding...
W0326 00:00:49.819521  2389 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0326 00:00:49.824138  2389 device_context.cc:465] device: 0, cuDNN Version: 7.6.
[2022-03-26 00:00:55,279] [    INFO] - Finish loading embedding vector.
[2022-03-26 00:00:55,282] [    INFO] - Token Embedding info:             
Unknown index: 635963             
Unknown token: [UNK]             
Padding index: 635964             
Padding token: [PAD]             
Shape :[635965, 300]


In [5]:
batch_size = 64
trans_fn = get_trans_fn()

# Training batches use create_dataloader's defaults (shuffle=True, labeled data).
train_loader = create_dataloader(
    train_set,
    trans_fn,
    batchify_fn,
    batch_size=batch_size,
)
# No dev loader: dev_set was merged into train_set when the data was loaded.
# dev_loader = create_dataloader(dev_set, trans_fn, batchify_fn, batch_size=batch_size)

# Test batches keep file order and receive a placeholder label (test=True).
test_loader = create_dataloader(
    test_set,
    trans_fn,
    batchify_fn,
    test=True,
    shuffle=False,
    batch_size=batch_size,
)

## 3. Model building

In [6]:
import paddle 
import paddle.nn as nn
import paddle.nn.functional as F


class CNN(nn.Layer):
    """Sentence-pair classifier: shared 1-D CNN encoder plus a 4-layer MLP head.

    Both id sequences are embedded with the same ``embedding`` layer and
    encoded by the same convolutions. The two encodings, their difference and
    their element-wise product are concatenated and fed to ``fc1``..``fc4``.

    Args:
        embedding: Callable layer mapping token ids to vectors whose last
            axis has size ``embedding_dim``.
        output_dim: Size of the logits produced by ``fc4``.
        embedding_dim: Number of input channels of every convolution.
        padding_idx: Accepted for interface compatibility; not used anywhere
            in this class.
        num_filter: Output channels of every convolution.
        filter_sizes: Kernel sizes, one ``Conv1D`` per entry.
        activation: Layer applied after every convolution and after
            ``fc1``..``fc3``. The default instance is created once, when the
            class is defined.
    """

    def __init__(self,
                 embedding,
                 output_dim,
                 embedding_dim,
                 padding_idx=vocab['[PAD]'],
                 num_filter=256,
                 filter_sizes=(3,),
                 activation=nn.ReLU()):

        super().__init__()

        # Width of one encoded sentence: one max-pooled vector per kernel size.
        feat_dim = len(filter_sizes) * num_filter

        self.embedding = embedding

        conv_layers = []
        for kernel in filter_sizes:
            conv_layers.append(
                nn.Conv1D(in_channels=embedding_dim,
                          out_channels=num_filter,
                          kernel_size=kernel))
        self.convs = nn.LayerList(conv_layers)

        # fc1 takes four blocks of width feat_dim: a, b, a - b, a * b.
        self.fc1 = nn.Linear(feat_dim * 4, feat_dim)
        self.activation = activation
        self.fc2 = nn.Linear(feat_dim, feat_dim // 2)
        self.fc3 = nn.Linear(feat_dim // 2, feat_dim // 4)
        self.fc4 = nn.Linear(feat_dim // 4, output_dim)

    def encoder(self, embd):
        """Encode embedded tokens into one vector per example.

        ``embd`` has its last two axes swapped before the convolutions; each
        activated feature map is max-pooled down to length 1 and the pooled
        vectors are concatenated along axis 1.
        """
        channels_first = embd.transpose((0, 2, 1))

        pooled = []
        for conv in self.convs:
            feature_map = self.activation(conv(channels_first))
            pooled.append(
                F.adaptive_max_pool1d(feature_map, output_size=1).squeeze(2))

        return paddle.concat(pooled, axis=1)

    def forward(self, text_a_ids, text_b_ids):
        """Return the logits for a batch of ``(text_a_ids, text_b_ids)`` pairs."""
        embedded_a = self.embedding(text_a_ids)
        embedded_b = self.embedding(text_b_ids)

        encoded_a = self.encoder(embedded_a)
        encoded_b = self.encoder(embedded_b)

        pair_features = paddle.concat(
            [encoded_a, encoded_b, encoded_a - encoded_b, encoded_a * encoded_b],
            axis=-1)

        hidden = pair_features
        for fc in (self.fc1, self.fc2, self.fc3):
            hidden = self.activation(fc(hidden))
        return self.fc4(hidden)

In [16]:
from paddlenlp.transformers import LinearDecayWithWarmup

# Number of passes over train_loader; also passed as `epochs` to model.fit.
epoch = 20
# Weight-decay coefficient read by get_model when it builds AdamW.
weight_decay = 0.001
# 0.0 is passed as the scheduler's warmup argument below.
warmup_proportion = 0.0
# Schedule sized for the whole run: len(train_loader) batches per epoch times
# `epoch` epochs. get_model reads this module-level object, so every optimizer
# built by get_model shares this one scheduler instance.
lr_scheduler = LinearDecayWithWarmup(5e-4, len(train_loader) * epoch,
                                         warmup_proportion)

def get_model(model):
    """Wrap a network in ``paddle.Model`` prepared for training.

    Builds an AdamW optimizer over ``model.parameters()`` using the
    module-level ``lr_scheduler`` and ``weight_decay``. Weight decay is
    applied only to parameters whose name from ``named_parameters()``
    contains neither "bias" nor "norm". The loss is ``CrossEntropyLoss`` and
    the metric is ``Accuracy``.

    Args:
        model: The network to wrap; must provide ``named_parameters()`` and
            ``parameters()``.

    Returns:
        The prepared ``paddle.Model`` wrapper.
    """
    excluded_markers = ["bias", "norm"]

    decay_params = []
    for qualified_name, param in model.named_parameters():
        if all(marker not in qualified_name for marker in excluded_markers):
            # The optimizer callback receives `param.name`, not the qualified name.
            decay_params.append(param.name)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fn = paddle.nn.CrossEntropyLoss()

    wrapped = paddle.Model(model)
    accuracy = paddle.metric.Accuracy()
    wrapped.prepare(optimizer, loss_fn, accuracy)
    return wrapped

In [17]:
# Two output classes; embedding_dim=300 matches the "dim300" embedding loaded above.
# `model` ends up bound to the prepared paddle.Model wrapper returned by get_model.
model = get_model(
    CNN(embedding=token_embedding, output_dim=2, embedding_dim=300)
)

## 4. Model training

In [None]:
# Train for `epoch` epochs, logging every 100 steps. Only train_loader is
# passed (no evaluation data), so the logged accuracy is measured on the
# training batches.
model.fit(train_loader, epochs=epoch, verbose=2, log_freq=100)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/20
step 100/799 - loss: 0.6855 - acc: 0.5355 - 43ms/step
step 200/799 - loss: 0.6819 - acc: 0.5444 - 43ms/step
step 300/799 - loss: 0.6815 - acc: 0.5472 - 43ms/step
step 400/799 - loss: 0.6595 - acc: 0.5477 - 43ms/step
step 500/799 - loss: 0.6932 - acc: 0.5530 - 43ms/step
step 600/799 - loss: 0.6689 - acc: 0.5589 - 43ms/step
step 700/799 - loss: 0.6306 - acc: 0.5615 - 43ms/step
step 799/799 - loss: 0.6883 - acc: 0.5632 - 43ms/step
Epoch 2/20
step 100/799 - loss: 0.6881 - acc: 0.6127 - 43ms/step
step 200/799 - loss: 0.6318 - acc: 0.6113 - 43ms/step
step 300/799 - loss: 0.5764 - acc: 0.6099 - 43ms/step
step 400/799 - loss: 0.6433 - acc: 0.6095 - 43ms/step
step 500/799 - loss: 0.6036 - acc: 0.6088 - 43ms/step
step 600/799 - loss: 0.6195 - acc: 0.6107 - 43ms/step
step 700/799 - loss: 0.6686 - acc: 0.6104 - 43ms/step
step 799/799 - loss: 0.6211 - acc: 0.6119 - 43ms/step
Epoc

## 5. Prediction

In [10]:
import paddle.nn.functional as F


# Predicted class index for every test example, in test_loader order.
predictions = []
logits = model.predict(test_loader)

# logits[0] is iterated batch by batch (presumably the per-batch outputs of
# the network's single output - verify against paddle.Model.predict).
for batch_logits in logits[0]:
    batch_probs = F.softmax(paddle.to_tensor(batch_logits), axis=1)
    batch_preds = paddle.argmax(batch_probs, axis=1)
    predictions.extend(batch_preds.numpy().tolist())

Predict begin...
Predict samples: 2000


In [11]:
# Write the submission file: a header row followed by one "index<TAB>prediction"
# row per test example. Newlines are written *before* each row, so the file
# has no trailing newline.
# The `with` block closes the file on exit, so no explicit close() is needed;
# the encoding is spelled out so the output does not depend on the locale.
with open('paws-x.tsv', 'w', encoding='utf-8') as f:
    f.write("index\tprediction")
    for idx, p in enumerate(predictions):
        f.write(f"\n{idx}\t{p}")