In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
import pandas as pd
from datasets import Dataset
import torch
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

In [3]:
df = pd.read_parquet("数据/多标签分类模型.parquet")
print(df.shape)
df

(100, 3)


Unnamed: 0,title,abstract,labels
0,Oct-3/4 regulates stem cell identity and cell ...,Although the transcriptional regulatory events...,[No Label]
1,Insulin directly stimulates VEGF-A production ...,Podocytes are critically important for maintai...,[New Finding]
2,Routine HIV screening--what counts in evidence...,,[Controversial]
3,Ultrasensitive fluorescent proteins for imagin...,Fluorescent calcium sensors are widely used to...,[Technical Advance]
4,Active VSG expression sites in Trypanosoma bru...,African trypanosomes regulate transcription di...,"[New Finding, Technical Advance]"
...,...,...,...
95,Decoding the signaling of a GPCR heteromeric c...,"Atypical antipsychotic drugs, such as clozapin...","[New Finding, Novel Drug Target]"
96,Ultrasound imaging for regional anesthesia in ...,The use of ultrasound guidance has provided an...,"[Technical Advance, Confirmation]"
97,"Quality of life, social support, and uncertain...",PURPOSE/OBJECTIVES: To examine the differences...,[Confirmation]
98,"Long-term safety and efficacy of indacaterol, ...","BACKGROUND: Indacaterol is an inhaled, long-ac...",[No Label]


In [33]:
label2idx = {}
count = {}

for labels in df["labels"].values:
    for label in labels:
        if label not in label2idx:
            label2idx[label] = len(label2idx)
            count[label] = 1
        else:
            count[label] += 1
count

{'No Label': 5741,
 'Controversial': 13959,
 'Technical Advance': 31197,
 'New Finding': 67240,
 'Good for Teaching': 30255,
 'Confirmation': 37176,
 'Interesting Hypothesis': 44630,
 'Novel Drug Target': 11754}

## 1、加载预训练模型

In [51]:
premodel = "NeuML/pubmedbert-base-embeddings"

In [35]:
tokenizer = AutoTokenizer.from_pretrained(premodel)

In [36]:
model = AutoModelForSequenceClassification.from_pretrained(premodel,
                                                           num_labels=len(label2idx),
                                                           problem_type="multi_label_classification",
                                                           )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 2、准备数据

In [None]:
def process_fn(text):
    token = tokenizer(text["abstract"], text_pair=text["title"], max_length=512, truncation=True, padding="max_length")
    num = [[label2idx[label_] for label_ in labels_] for labels_ in text["labels"]]
    one_hot_labels = np.zeros((len(num), len(label2idx)), dtype=float)
    for idx, key in enumerate(num):
        one_hot_labels[idx, key] = 1.0
    token["labels"] = one_hot_labels.tolist()
    return token


datasets = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=2025)
dataloader = datasets.map(process_fn, batched=True, remove_columns=["title", "abstract"])
dataloader

## 3、评估标准

In [38]:
def compute_metrics(values):
    predict, target = values
    predict = (torch.sigmoid(torch.tensor(predict)) > 0.5).int().numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(predict, target, average="weighted", zero_division=0)
    return {"precision": precision, "recall": recall, "f1": f1}

## 4、超参数

In [39]:
args = TrainingArguments(output_dir="模型",
                         eval_steps=200,
                         eval_strategy="steps",
                         per_device_train_batch_size=64,
                         per_device_eval_batch_size=64,
                         logging_steps=200,
                         save_steps=200,
                         save_total_limit=3,
                         learning_rate=1e-5,
                         num_train_epochs=2,
                         metric_for_best_model="f1",
                         weight_decay=0.01,
                         )

## 5、训练器

In [40]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataloader["train"],
    eval_dataset=dataloader["test"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt", padding="max_length",
                                          max_length=512),
)

## 6、训练

In [41]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1
200,0.4889,0.445237,0.55568,0.726675,0.611891
400,0.4307,0.425587,0.543609,0.722766,0.603532
600,0.4163,0.413126,0.581398,0.674085,0.608502
800,0.4118,0.408578,0.562283,0.687202,0.606489
1000,0.4082,0.406438,0.576044,0.693001,0.615277
1200,0.4071,0.402191,0.57327,0.682707,0.611866
1400,0.4006,0.401536,0.578785,0.686751,0.616167
1600,0.4,0.400529,0.561525,0.719454,0.612874
1800,0.4002,0.398545,0.564559,0.70447,0.614178
2000,0.3937,0.396637,0.557625,0.694708,0.609875


TrainOutput(global_step=3544, training_loss=0.40484509026762056, metrics={'train_runtime': 5802.9704, 'train_samples_per_second': 39.067, 'train_steps_per_second': 0.611, 'total_flos': 5.965101579352474e+16, 'train_loss': 0.40484509026762056, 'epoch': 2.0})

## 7、推理

In [42]:
from sklearn.metrics import classification_report

In [52]:
model_path = f"模型"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path, trust_remote_code=True, problem_type="multi_label_classification")
model = model.eval().to("cuda")

In [53]:
test_df = dataloader["test"].to_pandas()
test_df

Unnamed: 0,pmid,labels,input_ids,token_type_ids,attention_mask
0,21536967,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[101, 6123, 1024, 16705, 1999, 1043, 7630, 282...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,18263931,"[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[101, 8704, 1024, 1996, 6614, 1997, 2023, 2817...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,21450447,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[101, 7327, 6673, 7677, 4588, 13458, 2015, 202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,33627872,"[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]","[101, 1996, 3754, 2000, 5901, 15581, 2000, 311...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,21806284,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[101, 1996, 12353, 1997, 15965, 9113, 5991, 20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...
12590,17273971,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0]","[101, 4824, 7060, 1997, 4962, 1011, 3670, 8386...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
12591,30389745,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0]","[101, 2166, 1011, 2806, 2241, 3635, 3279, 1938...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
12592,20363178,"[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[101, 2058, 1996, 2197, 2261, 5109, 1010, 1259...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
12593,17463249,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[101, 1996, 8382, 10595, 2920, 1999, 1996, 245...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [54]:
test_df['input_ids'] = test_df['input_ids'].map(lambda x: x.tolist())
test_df['attention_mask'] = test_df['attention_mask'].map(lambda x: x.tolist())
test_df['token_type_ids'] = test_df['token_type_ids'].map(lambda x: x.tolist())
test_df['labels'] = test_df['labels'].map(lambda x: [int(s) for s in x.tolist()])

In [55]:
y_pred = []
for i in range(0, test_df.shape[0], 128):
    tokens = {
        "input_ids": torch.tensor(test_df.loc[i:i + 127, "input_ids"].tolist(), device="cuda"),
        "attention_mask": torch.tensor(test_df.loc[i:i + 127, "attention_mask"].tolist(), device="cuda"),
        "token_type_ids": torch.tensor(test_df.loc[i:i + 127, "token_type_ids"].tolist(), device="cuda"),
    }

    v = model(**tokens).logits.sigmoid().cpu().detach().numpy()
    y_pred.extend([[int(i) for i in line] for line in (v > 0.5)])

In [None]:
print(classification_report(test_df["labels"].tolist(), y_pred, target_names=list(label2idx.keys()), zero_division=0))