In [None]:
# Upgrade pip itself before installing anything else
!pip install --upgrade pip
# Install the tqdm package
!pip install tqdm
# Print the NVIDIA driver / GPU status of this runtime
!nvidia-smi

In [None]:
import paddle
import paddlenlp
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

In [None]:
import csv
import functools
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Strip line-break characters from raw text
def clean_text(text):
    """Remove line-break characters from ``text``.

    A literal backslash-n sequence that is immediately followed by a real
    line break is replaced with "." first; every remaining carriage return
    and newline is then deleted.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # The substitution must run before the newlines are deleted: its pattern
    # ends in a real newline, so it could never match afterwards.
    text = re.sub(r"\\n\r?\n", ".", text)
    return text.replace("\r", "").replace("\n", "")

# Dataset reader used with load_dataset
def read_data(filepath, is_one_hot=True, encoding="utf-8"):
    """Yield ``{"text", "labels"}`` examples from a header-less CSV file.

    The first column is the text; everything after it is the label field.
    The file is parsed with ``csv.reader`` so that quoted fields (text
    containing commas, quotes or line breaks) are split correctly.

    Args:
        filepath: Path of the CSV file to read.
        is_one_hot: When true, the label field is treated as a list of label
            names (e.g. ``['Love', 'Joy']``) and converted to a multi-hot
            float vector ordered by the module-level ``label_vocab``.
            When false, the label field is parsed as comma-separated ints.
        encoding: Text encoding used to open the file.

    Yields:
        dict: ``{"text": cleaned text, "labels": list of float or int}``.

    Raises:
        ValueError: If a non-empty row has no label column.
    """
    # newline="" lets the csv module handle line breaks inside quoted fields.
    with open(filepath, encoding=encoding, newline="") as f:
        reader = csv.reader(f)
        for row in reader:
            if not row:
                # Blank line: nothing to parse.
                continue
            if len(row) < 2:
                raise ValueError(
                    "%s, line %d: expected a text column and a label column"
                    % (filepath, reader.line_num))
            # Unquoted label lists are split across columns by the parser;
            # joining them restores the original comma-separated field.
            label_field = ",".join(row[1:]).strip()
            if is_one_hot:
                # Reduce "['Love', 'Joy']" to ["Love", "Joy"].
                label_true = label_field.strip('"').strip('[').strip(']').replace("'", '').replace(' ', '').split(',')
                labels = [float(label_vocab[i] in label_true) for i in range(len(label_vocab))]
            else:
                labels = [int(d) for d in label_field.split(',')]
            yield {"text": clean_text(row[0]), "labels": labels}

In [None]:
# Load the raw training table and drop its ID column
dataset_path = './work/data/Train.csv'
dataset = pd.read_csv(dataset_path, encoding='gb18030').drop('ID', axis=1)

# Split 70/30 with a fixed seed and persist both parts as header-less CSV
train_ds, test_ds = train_test_split(
    dataset,
    test_size=0.3,
    random_state=42,
)
train_ds.to_csv('./work/data/train.csv', index=False, header=False)
test_ds.to_csv('./work/data/test.csv', index=False, header=False)

# Fixed index -> label-name mapping (hard-coded so that the class order is stable)
label_vocab = dict(enumerate(
    ['Anger', 'Expect', 'Love', 'Sorrow', 'Anxiety', 'Hate', 'Joy', 'Surprise']
))

# Re-read both splits through read_data; the names now refer to the loaded datasets
train_ds = load_dataset(read_data, filepath='./work/data/train.csv', lazy=False)
test_ds = load_dataset(read_data, filepath='./work/data/test.csv', lazy=False)

# Show one example from each split as a sanity check
print(train_ds[3])
print(test_ds[3])

In [None]:
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pretrained checkpoint name; the classifier gets one output per entry of label_vocab
model_name = "ernie-3.0-xbase-zh"
num_classes = len(label_vocab)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_classes=num_classes,
)

In [None]:
# Preprocessing: run the tokenizer on the text and carry the labels along
def preprocess_function(examples, tokenizer, max_seq_length):
    """Tokenize ``examples["text"]`` and attach ``examples["labels"]``.

    Args:
        examples: Mapping with "text" and "labels" entries.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_seq_len=...)``.
        max_seq_length: Value forwarded to the tokenizer as ``max_seq_len``.

    Returns:
        The tokenizer's output with an added "labels" entry.
    """
    encoded = tokenizer(
        text=examples["text"],
        max_seq_len=max_seq_length,
    )
    encoded["labels"] = examples["labels"]
    return encoded

# Bind the tokenizer and the maximum sequence length, then tokenize both splits
trans_func = functools.partial(
    preprocess_function,
    tokenizer=tokenizer,
    max_seq_length=100,
)
train_ds = train_ds.map(trans_func)
test_ds = test_ds.map(trans_func)

# Collate function: pads sequences to the longest one in the batch, then stacks them
collate_fn = DataCollatorWithPadding(tokenizer)

# Training loader: batches of 16, shuffled
train_batch_sampler = BatchSampler(train_ds, batch_size=16, shuffle=True)
train_data_loader = DataLoader(
    dataset=train_ds,
    batch_sampler=train_batch_sampler,
    collate_fn=collate_fn,
)

# Evaluation loader: batches of 16, original order
test_batch_sampler = BatchSampler(test_ds, batch_size=16, shuffle=False)
test_data_loader = DataLoader(
    dataset=test_ds,
    batch_sampler=test_batch_sampler,
    collate_fn=collate_fn,
)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from paddle.metric import Metric

# Custom MultiLabelReport evaluation metric
class MultiLabelReport(Metric):
    """
    AUC and F1 Score for multi-label text classification task.

    Probabilities and labels are accumulated across ``update`` calls.
    ``accumulate`` reports ROC-AUC together with the best F1 found by a
    threshold sweep and the precision/recall measured at that threshold.
    """

    def __init__(self, name='MultiLabelReport', average='micro'):
        """
        Args:
            name: Value returned by ``name()``.
            average: Averaging mode forwarded to the scikit-learn scorers.
        """
        super(MultiLabelReport, self).__init__()
        self.average = average
        self._name = name
        self.reset()

    @staticmethod
    def _to_numpy(values):
        """Return ``values`` as a NumPy array, using ``.numpy()`` when available."""
        return values.numpy() if hasattr(values, 'numpy') else np.asarray(values)

    def f1_score(self, y_prob):
        '''
        Returns the best F1 score found by sweeping the decision threshold
        over 0.00, 0.01, ..., 0.99, plus precision and recall at that
        threshold. All three are 0 when no threshold yields a positive F1.
        '''
        # Precision/recall are initialised up front: they used to be bound
        # only inside the "improved" branch, so a sweep in which F1 never
        # exceeded 0 ended in an UnboundLocalError.
        best_score, precison, recall = 0.0, 0.0, 0.0
        for threshold in [i * 0.01 for i in range(100)]:
            self.y_pred = y_prob > threshold
            # Accessed through the sklearn package on purpose: the bare name
            # f1_score may be rebound at notebook level.
            score = sklearn.metrics.f1_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
            if score > best_score:
                best_score = score
                precison = precision_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
                recall = recall_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        return best_score, precison, recall

    def reset(self):
        """
        Resets all of the metric state.
        """
        self.y_prob = None
        self.y_true = None

    def update(self, probs, labels):
        """
        Appends one batch of probabilities and labels along axis 0.

        Args:
            probs: Predicted probabilities (tensor exposing ``.numpy()`` or array-like).
            labels: Ground-truth labels (tensor exposing ``.numpy()`` or array-like).
        """
        probs = self._to_numpy(probs)
        labels = self._to_numpy(labels)
        if self.y_prob is None:
            self.y_prob = probs
        else:
            self.y_prob = np.append(self.y_prob, probs, axis=0)
        if self.y_true is None:
            self.y_true = labels
        else:
            self.y_true = np.append(self.y_true, labels, axis=0)

    def accumulate(self):
        """
        Computes the metrics over everything accumulated since the last reset.

        Returns:
            tuple: ``(auc, f1_score, precision, recall)``.

        Raises:
            ValueError: If ``update`` has not been called since the last reset.
        """
        if self.y_prob is None or self.y_true is None:
            raise ValueError("MultiLabelReport.accumulate() called before any update()")
        auc = roc_auc_score(
            y_score=self.y_prob, y_true=self.y_true, average=self.average)
        best_f1, precison, recall = self.f1_score(y_prob=self.y_prob)
        return auc, best_f1, precison, recall

    def name(self):
        """
        Returns metric name
        """
        return self._name

In [None]:
import time
import paddle.nn.functional as F

# Loss (binary cross-entropy on logits) and the custom multi-label metric
criterion = paddle.nn.BCEWithLogitsLoss()
metric = MultiLabelReport()

# AdamW optimizer driven by a StepDecay learning-rate schedule
lr = 2e-5
lr_scheduler = paddle.optimizer.lr.StepDecay(
    learning_rate=lr,
    step_size=1,
    gamma=0.5,
)
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=0.01,
)

In [None]:
import paddle
import numpy as np
import paddle.nn.functional as F

# Evaluation routine for the held-out split
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    """Run ``model`` over ``data_loader``, print the metrics, and return results.

    The model is put in eval mode for the pass and switched back to train
    mode afterwards; ``metric`` is reset before and after.

    Args:
        model: Model called as ``model(input_ids, token_type_ids)``.
        criterion: Loss called as ``criterion(logits, labels)``.
        metric: Metric object providing ``reset``/``update``/``accumulate``.
        data_loader: Iterable of batches with "input_ids", "token_type_ids"
            and "labels" entries.
        label_vocab: Mapping from class index to label name.
        if_return_results: Selects the return value (see below).

    Returns:
        If ``if_return_results`` is true, a list with one comma-joined string
        of label names (probability > 0.5) per example; otherwise the F1
        score reported by ``metric.accumulate()``.
    """
    model.eval()
    metric.reset()
    batch_losses, results = [], []

    for batch in data_loader:
        labels = batch['labels']
        logits = model(batch['input_ids'], batch['token_type_ids'])
        batch_losses.append(criterion(logits, labels).numpy())
        probs = F.sigmoid(logits)
        metric.update(probs, labels)
        if not if_return_results:
            continue
        # Keep every label whose probability exceeds 0.5
        for row in probs.tolist():
            picked = [label_vocab[idx] for idx, p in enumerate(row) if p > 0.5]
            results.append(','.join(picked))

    auc, f1_score, precison, recall = metric.accumulate()
    print("eval loss: %.5f, auc: %.5f, f1 score: %.5f, precison: %.5f, recall: %.5f" %
          (np.mean(batch_losses), auc, f1_score, precison, recall))
    model.train()
    metric.reset()
    return results if if_return_results else f1_score

In [None]:
epochs = 4  # number of training epochs
ckpt_dir = "ckpt"  # directory that receives the best checkpoint
log_steps = 50  # print training metrics every this many steps
eval_steps = 100  # evaluate (and maybe checkpoint) every this many steps

global_step = 0  # number of optimizer steps taken so far
tic_train = time.time()
best_f1_score = 0

# Model training
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # Forward pass, loss, per-class probabilities
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.sigmoid(logits)
        metric.update(probs, labels)

        global_step += 1
        if global_step % log_steps == 0:
            # accumulate() sweeps 100 thresholds over everything collected so
            # far, so it is only evaluated when its result is printed.
            # The F1 value gets its own name so that the f1_score function
            # imported from sklearn.metrics is not overwritten.
            auc, train_f1, _, _ = metric.accumulate()
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, auc: %.5f, f1 score: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, float(loss), auc, train_f1,
                    log_steps / (time.time() - tic_train)))
            tic_train = time.time()

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # Periodically evaluate; keep the model and tokenizer with the best F1
        if global_step % eval_steps == 0:
            save_dir = ckpt_dir
            os.makedirs(save_dir, exist_ok=True)
            eval_f1_score = evaluate(model, criterion, metric, test_data_loader, label_vocab, if_return_results=False)
            if eval_f1_score > best_f1_score:
                best_f1_score = eval_f1_score
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
            # Restart the timer so evaluation time is not counted as training speed
            tic_train = time.time()

    # Advance the learning-rate schedule once per epoch
    lr_scheduler.step()

In [None]:
# Final evaluation on the held-out split; keeps the per-example predicted label strings
results = evaluate(
    model,
    criterion,
    metric,
    data_loader=test_data_loader,
    label_vocab=label_vocab,
    if_return_results=True,
)

In [None]:
# 定义数据加载和处理函数
from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab
def convert_example(example, tokenizer, max_seq_length=64, is_test=False):
    """Tokenize one example into model inputs.

    Args:
        example: Mapping with a "text" entry (and a "label" entry when
            ``is_test`` is false).
        tokenizer: Callable invoked as ``tokenizer(text=..., max_seq_len=...)``
            whose result has "input_ids" and "token_type_ids" entries.
        max_seq_length: Value forwarded to the tokenizer as ``max_seq_len``.
        is_test: When true, no label is read or returned.

    Returns:
        ``(input_ids, token_type_ids)`` when ``is_test`` is true, otherwise
        ``(input_ids, token_type_ids, label)`` where ``label`` is an int64
        NumPy array wrapping ``example["label"]``.
    """
    encoded = tokenizer(text=example["text"], max_seq_len=max_seq_length)
    features = (encoded["input_ids"], encoded["token_type_ids"])
    if is_test:
        return features
    # NOTE(review): this reads the singular "label" key, while read_data in
    # this notebook yields "labels" — confirm before calling with is_test=False.
    label = np.array([example["label"]], dtype="int64")
    return features + (label,)

# Batched inference helper
def predict(model, data, tokenizer, label_vocab, batch_size=1, max_seq=64, threshold=0.45):
    """Predict label names for a list of examples.

    Args:
        model: Model called as ``model(input_ids, segment_ids)``; it is
            switched to eval mode and left there.
        data: Iterable of mappings with a "text" entry.
        tokenizer: Tokenizer passed to ``convert_example``; its pad ids are
            used to pad each batch.
        label_vocab: Mapping from class index to label name.
        batch_size: Number of examples per forward pass; must be positive.
        max_seq: Maximum sequence length forwarded to ``convert_example``.
        threshold: A label is emitted when its sigmoid probability is
            strictly greater than this value.

    Returns:
        list[str]: One comma-joined string of label names per example
        (an empty string when no probability exceeds the threshold).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    # Convert each input example into (input_ids, segment_ids)
    examples = [
        convert_example(item, tokenizer, max_seq_length=max_seq, is_test=True)
        for item in data
    ]

    # Token ids are padded with the pad token id; segment ids with the pad
    # token *type* id (the latter used to be padded with pad_token_id too).
    batchify_fn = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    )

    results = []
    model.eval()
    # Inference only: no gradients are needed
    with paddle.no_grad():
        # The final chunk may hold fewer than batch_size examples
        for start in range(0, len(examples), batch_size):
            input_ids, segment_ids = batchify_fn(examples[start:start + batch_size])
            logits = model(paddle.to_tensor(input_ids), paddle.to_tensor(segment_ids))
            probs = F.sigmoid(logits).tolist()
            # Keep every label whose probability exceeds the threshold
            for prob in probs:
                picked = [label_vocab[c] for c, pred in enumerate(prob) if pred > threshold]
                results.append(','.join(picked))
    return results

In [None]:
import pandas as pd
# Load the unlabeled test table, drop its ID column and keep a header-less copy
data = pd.read_csv("./work/test.csv", encoding='gb18030')
data = data.drop('ID', axis=1)
data.to_csv("./work/data.csv", header=False, index=False)

# One {"text": ...} example per row, taken from the first remaining column
data = data.values
for_test = [{"text": row[0]} for row in data]

# Model prediction: one comma-joined label string per example
labels = predict(model, for_test, tokenizer, label_vocab, batch_size=1)

# Collect all rows first and build the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call.
# NOTE(review): IDs are regenerated as 1..N instead of being read from the
# test file, and a single (or empty) prediction is written as the quoted
# string "[X]" while several labels are written as a Python list — both are
# kept exactly as before; confirm against the expected submission format.
submit_rows = []
for idx, label in enumerate(labels):
    label = label.split(",")
    if len(label) == 1:
        value = '"[' + label[0] + ']"'
    else:
        value = label
    submit_rows.append({'ID': idx + 1, 'Labels': value})

# Save the predictions
submit = pd.DataFrame(submit_rows, columns=['ID', 'Labels'])
submit.to_csv("./submit.csv", index=False)