In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# 构造文本相似度模型

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, BertPreTrainedModel, BertModel, TrainingArguments, Trainer
from typing import Optional
from torch.nn import CosineSimilarity, CosineEmbeddingLoss
import torch
import evaluate

In [3]:
df = pd.read_parquet("数据/文本相似模型.parquet")
df

Unnamed: 0,text1,text2,label
0,Docetaxel target prodrug for preventing liver ...,retroviruses; retroviral protease substrate li...,1
1,Docetaxel target prodrug for preventing liver ...,bone morphogenetic proteins; chemoprevention; ...,0
2,Docetaxel target prodrug for preventing liver ...,tetrahydroisoquinoline; szyldergemajn; lurbine...,0
3,Docetaxel target prodrug for preventing liver ...,asialoglycoproteinreceptor-mediated uptake; di...,1
4,Docetaxel target prodrug for preventing liver ...,ddchaohui@sina.com; colorectal cancer; lintao4...,0
...,...,...,...
95,"LUNG CANCER DIFFERENTIAL MARKER,An object of t...",immune checkpoint inhibitors; theimmune checkp...,0
96,"LUNG CANCER DIFFERENTIAL MARKER,An object of t...",lung; kwiatkowski; balasundaram; ding l; genes...,0
97,"LUNG CANCER DIFFERENTIAL MARKER,An object of t...",peptide nucleic acids; nucleic acid recognitio...,0
98,"LUNG CANCER DIFFERENTIAL MARKER,An object of t...",hypoxia; hypoxia-inducible factor-1alpha; tumo...,0


In [22]:
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'label'],
        num_rows: 21959
    })
    test: Dataset({
        features: ['text1', 'text2', 'label'],
        num_rows: 2440
    })
})

## 1、处理数据

In [23]:
model_path = "NeuML/pubmedbert-base-embeddings"

In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [25]:
def process_func(examples):
    sentences, labels = [], []

    for sent1, sent2, label in zip(examples["text1"], examples["text2"], examples["label"]):
        sentences.append(sent1)
        sentences.append(sent2)
        labels.append(1 if label == 1 else -1)

    token = tokenizer(sentences, max_length=256, truncation=True, padding="max_length", return_tensors="np")
    token = {k: v.reshape(-1, 2, 256) for k, v in token.items()}
    token["labels"] = labels
    return token

In [26]:
dataloader = dataset.map(process_func, batched=True, remove_columns=dataset["test"].column_names)
dataloader

Map:   0%|          | 0/21959 [00:00<?, ? examples/s]

Map:   0%|          | 0/2440 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 21959
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2440
    })
})

## 2、创建模型

In [27]:
class DualModel(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.post_init()

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            token_type_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            head_mask: Optional[torch.Tensor] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 分别获取句子1和句子2的输入
        text1_input_ids, text2_input_ids = input_ids[:, 0], input_ids[:, 1]
        text1_mask, text2_mask = attention_mask[:, 0], attention_mask[:, 1]
        text1_type_ids, text2_type_ids = token_type_ids[:, 0], token_type_ids[:, 1]

        # 句子1的获取向量表示
        text1_outputs = self.bert(
            text1_input_ids,
            attention_mask=text1_mask,
            token_type_ids=text1_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text1_pooled_output = text1_outputs[1]

        # 句子2的获取向量表示
        text2_outputs = self.bert(
            text2_input_ids,
            attention_mask=text2_mask,
            token_type_ids=text2_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text2_pooled_output = text2_outputs[1]

        # 计算相似度
        cos = CosineSimilarity()(text1_pooled_output, text2_pooled_output)

        # 计算loss
        if labels is not None:
            loss_fct = CosineEmbeddingLoss(margin=0.3)
            loss = loss_fct(text1_pooled_output, text2_pooled_output, labels)
            return loss, cos

        return None, cos

In [28]:
model = DualModel.from_pretrained(model_path)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 3、创建评估函数

In [29]:
metric = evaluate.combine([
    evaluate.load("accuracy", average="macro"),
    evaluate.load("f1", average="macro"),
    evaluate.load("precision", average="macro"),
    evaluate.load("recall", average="macro"),
])

In [30]:
def metric_fn(values):
    predictions, labels = values
    predictions = [int(p > 0.6) for p in predictions]
    labels = [int(label > 0) for label in labels]
    return metric.compute(predictions, labels)

## 4、创建训练参数

In [31]:
args = TrainingArguments(
    output_dir="模型/文本相似模型",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    eval_steps=100,
    eval_strategy="steps",
    learning_rate=1e-4,
    weight_decay=0.01,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    logging_steps=100,
    save_total_limit=2,
    save_steps=100,
)



## 5、创建训练器

In [32]:
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=metric_fn,
    train_dataset=dataloader["train"],
    eval_dataset=dataloader["test"],
)

## 6、训练

In [33]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.2346,0.196976,0.761066,0.606347,0.697205,0.53644
200,0.1983,0.207347,0.729098,0.58558,0.616095,0.557945
300,0.196,0.184515,0.767623,0.631579,0.692308,0.580645
400,0.1804,0.174469,0.779098,0.65068,0.711048,0.599761
500,0.1757,0.168646,0.789344,0.668387,0.726508,0.618877
600,0.1691,0.167674,0.788115,0.677882,0.708333,0.64994
700,0.1663,0.160186,0.793033,0.675241,0.731198,0.62724
800,0.2998,0.459877,0.343033,0.510833,0.343033,1.0
900,0.3726,0.459877,0.343033,0.510833,0.343033,1.0
1000,0.3287,0.459877,0.343033,0.510833,0.343033,1.0


TrainOutput(global_step=2061, training_loss=0.26238015205868237, metrics={'train_runtime': 1612.1979, 'train_samples_per_second': 40.862, 'train_steps_per_second': 1.278, 'total_flos': 1.7332655742517248e+16, 'train_loss': 0.26238015205868237, 'epoch': 3.0})

In [34]:
trainer.save_model("模型/文本相似模型")

## 7、推理

In [35]:
class SentenceSimilarityPipeline:
    def __init__(self, model_, tokenizer_):
        self.model = model_.eval()
        self.tokenizer = tokenizer_
        self.device = model_.device

    def __call__(self, text1, text2):
        texts = []
        if isinstance(text1, str) and isinstance(text2, str):
            texts.extend([text1, text2])
        elif isinstance(text1, str) and isinstance(text2, list):
            for text_ in text2:
                texts.extend([text1, text_])
        elif isinstance(text1, list) and isinstance(text2, list):
            assert len(text1) == len(text2), Exception("输入的长度要相同")
            for t1, t2 in zip(text1, text2):
                texts.extend([t1, t2])
        else:
            raise Exception("输入的格式有问题")

        token = self.tokenizer(texts, max_length=256, truncation=True, padding="max_length", return_tensors="pt")
        token = {k: v.reshape(-1, 2, 256).to(self.device) for k, v in token.items()}
        predict = self.model(**token)[1]
        return predict.cpu().detach().numpy()

In [36]:
pipe = SentenceSimilarityPipeline(model, tokenizer)

In [None]:
for text, a in dataset["test"].to_pandas().groupby("text1"):
    predicts = pipe(text, a["text2"].tolist())
    print((predicts > 0.5).astype(int))
    print(a["label"].tolist())