# 基于paddlenlp的文本相似度判断

使用paddlenlp预训练模型进行文本相似度判断，自然语言处理起步！

# 项目背景

在新闻媒体行业中，通常需要处理大量的文本内容，对新闻编辑而言，文本处理是一项繁重的工作。本文作为自然语言处理的起步训练，主要使用paddlenlp预训练模型完成文本相似度判断，并借此体验使用paddlepaddle进行深度学习的完整步骤。

# 数据集

本文作为起步文章，使用千言文本相似度判断数据集，数据包括100000行训练数据和10000行预测数据。

使用的套件：使用了 PaddleNlp

使用了什么优化器：使用了 AdamW

调整了哪些参数：
（比如batchsize设置多少、是否使用warmup、base lr是否调整、是否尝试lr stepdecay等，这样做是否有提升等）
learning_rate

心得：paddlenlp提供了常用的预训练模型，对普通开发者友好，pytorch用户可以快速转换paddle。

科大讯飞NLP算法赛baseline：学术论文分类挑战赛
https://aistudio.baidu.com/aistudio/projectdetail/2201232

# 安装paddlenlp包

In [1]:
# 完整训练代码
!pip install -U -q paddlenlp

# 数据预处理
数据集解压和数据读取

In [2]:
# 准备数据
#!unzip -qo /home/aistudio/data/bq_corpus.zip -d /home/aistudio/dataset
#!rm -rf /home/aistudio/dataset/__MACOSX/
from paddlenlp.datasets import dataset

In [3]:
# Read a tab-separated corpus file into a PaddleNLP MapDataset.
def get_data(path, is_test=False):
    """Load a tab-separated sentence-pair file.

    Each non-empty line is expected to hold ``title<TAB>paire`` for test
    data, or ``title<TAB>paire<TAB>label`` for labelled data.

    Args:
        path: Path of the ``.tsv`` file to read.
        is_test: When true, the label column is not read.

    Returns:
        A ``dataset.MapDataset`` of dicts with the keys ``"title"`` and
        ``"paire"``, plus an integer ``"label"`` for labelled data.

    Raises:
        IndexError: A non-empty line has fewer columns than required.
        ValueError: The label column cannot be parsed as an integer.
    """
    records = []

    # Decode explicitly as UTF-8 rather than relying on the platform default
    # encoding, which may not be able to read Chinese text.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing empty line) instead of
                # failing on the missing columns.
                continue
            tmp = line.split("\t")
            record = {"title": tmp[0], "paire": tmp[1]}
            if not is_test:
                record["label"] = int(tmp[2])
            records.append(record)

    return dataset.MapDataset(records)

In [4]:
# Load the three splits of the bq_corpus dataset. The train and dev files are
# read with a label column; the test file is read without one.
train_ = get_data('/home/aistudio/dataset/bq_corpus/train.tsv')
dev_ = get_data('/home/aistudio/dataset/bq_corpus/dev.tsv')
test_ = get_data('/home/aistudio/dataset/bq_corpus/test.tsv', is_test=True)

# 导入所需包并定义预训练模型

In [5]:
# 导入python包
import paddlenlp as pdnlp
from paddlenlp.data import Pad, Tuple, Stack
from paddlenlp.transformers import ErnieGramTokenizer, ErnieGramForSequenceClassification
from functools import partial
import numpy as np
import paddle

In [6]:
# Load the pretrained "ernie-gram-zh" weights with a 2-class classification
# head, together with the tokenizer of the same pretrained model.
ernie_model = ErnieGramForSequenceClassification.from_pretrained("ernie-gram-zh", num_classes=2)
ernie_tokenizer = ErnieGramTokenizer.from_pretrained("ernie-gram-zh")

[2021-08-15 17:50:36,805] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/ernie_gram_zh.pdparams and saved to /home/aistudio/.paddlenlp/models/ernie-gram-zh
[2021-08-15 17:50:36,873] [    INFO] - Downloading ernie_gram_zh.pdparams from https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/ernie_gram_zh.pdparams
100%|██████████| 583566/583566 [00:24<00:00, 24009.16it/s]
[2021-08-15 17:51:17,041] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/vocab.txt
100%|██████████| 78/78 [00:00<00:00, 2579.60it/s]


# 定义数据处理方法

数据处理主要是将文字转为ID，生成paddlepaddle可以处理的批量数据

In [13]:
# Convert one sentence pair into token ids for the model.
# Each sentence is first cut to half of the length budget so that an overlong
# first sentence cannot crowd the second one out of the encoded sequence.
def convert_words(line, tokenizer, max_len, is_test=False):
    """Encode the ``title`` / ``paire`` sentences of one sample.

    The two sentences are passed to the tokenizer as ``text`` and
    ``text_pair`` instead of being joined into one string. Joining them
    erases the sentence boundary; encoding them as a pair lets the tokenizer
    mark it (separator token and segment ids, per the PaddleNLP tokenizer
    documentation).

    Args:
        line: Dict with the keys ``"title"`` and ``"paire"``; also
            ``"label"`` unless ``is_test`` is true.
        tokenizer: Callable accepting ``text``, ``text_pair`` and
            ``max_seq_len`` and returning a mapping with ``"input_ids"`` and
            ``"token_type_ids"``.
        max_len: Maximum length passed to the tokenizer as ``max_seq_len``;
            each raw sentence is pre-truncated to ``max_len // 2`` characters.
        is_test: When true, no label is read or returned.

    Returns:
        ``(input_ids, token_type_ids)`` when ``is_test`` is true, otherwise
        ``(input_ids, token_type_ids, label)`` where ``label`` is a
        one-element numpy array.
    """
    half_len = max_len // 2
    title = line['title'][:half_len]
    paire = line['paire'][:half_len]

    encoded_line = tokenizer(text=title, text_pair=paire, max_seq_len=max_len)

    input_ids = encoded_line['input_ids']
    token_type_ids = encoded_line['token_type_ids']

    if is_test:
        return input_ids, token_type_ids

    label = np.array([line['label']])
    return input_ids, token_type_ids, label

In [14]:
# Mini-batch size and the maximum sequence length handed to the tokenizer.
max_seq_length = 128
batch_size = 64

# Bind the tokenizer and the length limit so that the dataset can map raw
# samples with a single-argument function.
trans_func = partial(convert_words, tokenizer=ernie_tokenizer, max_len=max_seq_length)


def batchify_fn(
    samples,
    fn=Tuple(
        Pad(axis=0, pad_val=ernie_tokenizer.pad_token_id),       # input ids
        Pad(axis=0, pad_val=ernie_tokenizer.pad_token_type_id),  # segment ids
        Stack(dtype="int64"),                                    # labels
    ),
):
    """Collate a list of (input_ids, token_type_ids, label) samples into a list of batched fields."""
    return list(fn(samples))

In [15]:
# Build a batched data iterator over a dataset.
def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    """Wrap ``dataset`` in a ``paddle.io.DataLoader``.

    Args:
        dataset: Dataset to iterate; it must offer ``map`` when ``trans_fn``
            is given.
        mode: ``'train'`` selects a shuffling ``DistributedBatchSampler``;
            any other value selects a non-shuffling ``BatchSampler``.
        batch_size: Number of samples per batch.
        batchify_fn: Collate function passed to the loader as ``collate_fn``.
        trans_fn: Optional per-sample transform applied via ``dataset.map``.

    Returns:
        The configured ``paddle.io.DataLoader``.
    """
    is_train = mode == 'train'

    if trans_fn:
        dataset = dataset.map(trans_fn)

    # Only the training split is shuffled and uses the distributed sampler.
    if is_train:
        sampler_cls = paddle.io.DistributedBatchSampler
    else:
        sampler_cls = paddle.io.BatchSampler
    batch_sampler = sampler_cls(dataset, batch_size=batch_size, shuffle=is_train)

    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)

In [16]:
# Training loader: mode='train' makes create_dataloader shuffle the samples.
train_data_loader = create_dataloader(
    train_,
    mode='train',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)
# Validation loader: any mode other than 'train' keeps the original order.
dev_data_loader = create_dataloader(
    dev_,
    mode='dev',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)

# 定义神经网络需要的参数

主要是学习率、优化器、训练轮数等

In [40]:
from paddlenlp.transformers import LinearDecayWithWarmup

# Peak learning rate reached during training
learning_rate = 5e-5
# Number of training epochs
epochs = 12
# Proportion of the training steps used for learning-rate warmup
warmup_proportion = 0.1
# Weight-decay coefficient; acts as a regularizer to limit overfitting
weight_decay = 0.01

# Total number of optimizer steps: batches per epoch times number of epochs.
num_training_steps = len(train_data_loader) * epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)
# AdamW over all model parameters, driven by the warmup/decay schedule above.
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=ernie_model.parameters(),
    weight_decay=weight_decay,
    )

# Loss function and accuracy metric used by both training and evaluation.
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

# 定义评估函数 用于对训练效果进行评估

In [41]:
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """Print the mean loss and the accuracy of ``model`` over ``data_loader``.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. ``metric`` is reset both before and after the pass.

    Args:
        model: Callable as ``model(input_ids, token_type_ids)`` returning
            logits.
        criterion: Loss callable taking ``(logits, labels)``.
        metric: Metric object offering ``reset``, ``compute``, ``update`` and
            ``accumulate``.
        data_loader: Iterable yielding ``(input_ids, token_type_ids, labels)``
            batches.
    """
    model.eval()
    metric.reset()
    losses = []
    for input_ids, token_type_ids, labels in data_loader:
        logits = model(input_ids, token_type_ids)
        losses.append(criterion(logits, labels).numpy())
        metric.update(metric.compute(logits, labels))

    # Read the accumulated accuracy once, after the loop: this avoids
    # recomputing it for every batch and keeps `accu` defined even when the
    # loader yields no batches.
    accu = metric.accumulate()
    mean_loss = np.mean(losses) if losses else float("nan")
    print("eval loss: %.5f, accu: %.5f" % (mean_loss, accu))
    model.train()
    metric.reset()

# 进行深度网络训练并保存训练模型参数

In [19]:
import paddle.nn.functional as F


# Fine-tuning loop: one optimizer step per batch, progress logged every 100
# steps, and an evaluation pass on the dev set at the end of every epoch.
global_step = 0
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, segment_ids, labels = batch
        logits = ernie_model(input_ids, segment_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        # The metric accumulates over steps, so `acc` is a running accuracy;
        # it is reset inside evaluate() at the end of each epoch.
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        global_step += 1
        if global_step % 100 == 0 :
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, loss, acc))
        # Backpropagate, update the weights, advance the learning-rate
        # schedule, then clear the gradients before the next batch.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
    evaluate(ernie_model, criterion, metric, dev_data_loader)

global step 100, epoch: 1, batch: 100, loss: 0.71411, acc: 0.49984
global step 200, epoch: 1, batch: 200, loss: 0.65717, acc: 0.52219
global step 300, epoch: 1, batch: 300, loss: 0.49822, acc: 0.55984
global step 400, epoch: 1, batch: 400, loss: 0.46257, acc: 0.60328
global step 500, epoch: 1, batch: 500, loss: 0.47058, acc: 0.63500
global step 600, epoch: 1, batch: 600, loss: 0.54045, acc: 0.65846
global step 700, epoch: 1, batch: 700, loss: 0.44414, acc: 0.67699
global step 800, epoch: 1, batch: 800, loss: 0.36696, acc: 0.69078
global step 900, epoch: 1, batch: 900, loss: 0.40430, acc: 0.70462
global step 1000, epoch: 1, batch: 1000, loss: 0.32117, acc: 0.71609
global step 1100, epoch: 1, batch: 1100, loss: 0.45164, acc: 0.72503
global step 1200, epoch: 1, batch: 1200, loss: 0.43935, acc: 0.73393
global step 1300, epoch: 1, batch: 1300, loss: 0.40799, acc: 0.74214
global step 1400, epoch: 1, batch: 1400, loss: 0.37259, acc: 0.74837
global step 1500, epoch: 1, batch: 1500, loss: 0.366

In [20]:
# Save the fine-tuned model and the tokenizer into the same directory.
ernie_model.save_pretrained('/home/aistudio/models/bq')
ernie_tokenizer.save_pretrained('/home/aistudio/models/bq')

# 定义预测函数并进行预测

In [None]:
# Run batched inference over raw samples and map class indices to labels.
def predict(model, data, tokenizer, label_map, batch_size=1, max_len=128):
    """Predict a label for every sample in ``data``.

    Args:
        model: Callable as ``model(input_ids, segment_ids)`` returning logits
            with one row per sample.
        data: Iterable of raw samples accepted by ``convert_words`` (dicts
            with the keys ``"title"`` and ``"paire"``).
        tokenizer: Tokenizer passed to ``convert_words``; its
            ``pad_token_id`` and ``pad_token_type_id`` are used for padding.
        label_map: Mapping from predicted class index to output label.
        batch_size: Number of samples per forward pass; must be at least 1.
        max_len: Maximum sequence length passed to ``convert_words``.

    Returns:
        A list with one label per sample, in input order.

    Raises:
        ValueError: ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    examples = [
        convert_words(text, tokenizer, max_len=max_len, is_test=True)
        for text in data
    ]

    # Segment ids are padded with the token-type pad id, matching the
    # collate function used for training.
    batchify_fn = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    )

    results = []
    model.eval()
    # Gradients are not needed for inference.
    with paddle.no_grad():
        # The last slice may hold fewer than batch_size samples.
        for start in range(0, len(examples), batch_size):
            input_ids, segment_ids = batchify_fn(examples[start:start + batch_size])
            input_ids = paddle.to_tensor(input_ids)
            segment_ids = paddle.to_tensor(segment_ids)
            logits = model(input_ids, segment_ids)
            # Softmax does not change which class scores highest, so the
            # argmax is taken directly on the logits.
            idx = paddle.argmax(logits, axis=1).numpy().tolist()
            results.extend(label_map[i] for i in idx)
    return results

In [34]:
import pandas as pd
# Map each predicted class index to the label string stored in the results.
label_map = {0:'0', 1:'1'}
results = predict(ernie_model, test_, ernie_tokenizer, label_map, batch_size=batch_size)

# One row per test sample: its position in the test set and its prediction.
predict_results = pd.DataFrame(
    [{"index": idx, "prediction": results[idx]} for idx, _ in enumerate(test_)]
)
print(predict_results.shape)

(10000, 2)


In [39]:
# Write the predictions as a tab-separated file without the DataFrame index.
predict_results.to_csv('/home/aistudio/bq_corpus.tsv', sep='\t', index=None)

# 学习心得

paddlenlp提供了常用的预训练模型，对普通开发者友好，pytorch用户可以快速转换paddle。

# 实践项目
科大讯飞NLP算法赛baseline：学术论文分类挑战赛
https://aistudio.baidu.com/aistudio/projectdetail/2201232