In [1]:
import os

# Process-level configuration; set before the ML libraries are imported below.
# CUDA_VISIBLE_DEVICES exposes only the GPU with index 1 to this process, and
# HF_ENDPOINT points Hugging Face Hub downloads at a mirror site.
os.environ.update(
    {
        'CUDA_VISIBLE_DEVICES': '1',
        'HF_ENDPOINT': 'https://hf-mirror.com',
    }
)

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
from datasets import Dataset
import numpy as np
import evaluate

In [5]:
# Read the labelled corpus from parquet; the preview below shows a text column
# ("Data") and an integer class column ("Label").
data_path = "数据/多分类模型.parquet"
df = pd.read_parquet(data_path)
print(df.shape)
df.head()

(100, 2)


Unnamed: 0,Data,Label
0,KeyWords: retroviruses; retroviral protease su...,1
1,KeyWords: bone morphogenetic proteins; chemopr...,0
2,KeyWords: tetrahydroisoquinoline; szyldergemaj...,0
3,KeyWords: asialoglycoproteinreceptor-mediated ...,1
4,KeyWords: ddchaohui@sina.com; colorectal cance...,0


### 加载预训练模型

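如果是多标签分类，只需在 `from_pretrained` 中增加 problem_type="multi_label_classification" 参数；此时每条样本的标签需要改为长度等于 `num_labels` 的 multi-hot 浮点向量（例如 `[1.0, 0.0, 1.0]`），而不是单个类别编号。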

In [None]:
# Checkpoint used as the encoder; a new classification head with `num_labels`
# outputs is created on top of it by AutoModelForSequenceClassification.
model_path = "NeuML/pubmedbert-base-embeddings"

# `trust_remote_code` is intentionally NOT enabled: it lets Python code shipped
# inside the model repository run locally, and downloads here go through the
# mirror configured in HF_ENDPOINT.
# NOTE(review): this checkpoint appears to use the stock BERT architecture, so
# no custom code should be needed; re-enable the flag only for a checkpoint
# that requires it and whose code has been audited.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

## 准备数据

In [None]:
def tokenize(text):
    """Encode the "Data" field and attach the class labels.

    Args:
        text: Mapping supplied by ``Dataset.map``; its ``"Data"`` entry is
            passed to the tokenizer and its ``"Label"`` entry is copied over.

    Returns:
        The tokenizer output, truncated to at most 512 tokens, with the
        labels stored under the ``"labels"`` key.
    """
    encoded = tokenizer(text["Data"], truncation=True, max_length=512)
    encoded["labels"] = text["Label"]
    return encoded


# Hold out 10% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition; without a seed
# the held-out rows change on every run and scores are not comparable.
datasets = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=42)

# Tokenize both splits in batches and drop every original column, leaving only
# the fields produced by `tokenize`. Despite its name, `dataloader` holds the
# tokenized splits (indexed by "train" / "test"), not a torch DataLoader.
dataloader = datasets.map(tokenize, batched=True, remove_columns=datasets["test"].column_names)

## 评估标准

In [None]:
# The averaging mode has to be supplied when scores are computed: keyword
# arguments given to `evaluate.load` are module-construction options and are
# not applied at scoring time, so `average="macro"` passed there has no effect.
# The modules are therefore loaded individually and "macro" is passed to
# `compute` for the metrics that support it (accuracy takes no averaging mode).
accuracy_metric = evaluate.load("accuracy")
macro_metrics = {name: evaluate.load(name) for name in ("f1", "precision", "recall")}


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer;
            the predicted class is the arg-max over the last logits axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; the last three are macro-averaged over the classes.
    """
    logit, labels = eval_pred
    predictions = np.argmax(logit, axis=-1)

    scores = accuracy_metric.compute(predictions=predictions, references=labels)
    for module in macro_metrics.values():
        # Each module returns a single-entry dict keyed by its own metric name.
        scores.update(module.compute(predictions=predictions, references=labels, average="macro"))
    return scores

## 训练器

In [None]:
# Evaluate, checkpoint and log once per epoch. A fixed 100-step interval never
# fires on the dataset loaded above (about 90 training rows at batch size 16
# for 5 epochs is roughly 30 optimizer steps), which would leave
# `load_best_model_at_end` with no evaluated checkpoint to restore.
# The save and eval strategies are kept identical, as best-model loading
# requires.
args = TrainingArguments(output_dir="./model",
                         eval_strategy="epoch",
                         save_strategy="epoch",
                         logging_strategy="epoch",
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=32,
                         save_total_limit=2,
                         learning_rate=1e-5,
                         num_train_epochs=5,
                         # "f1" is the key returned by compute_metrics.
                         metric_for_best_model="f1",
                         load_best_model_at_end=True,
                         weight_decay=0.01,
                         )

In [None]:
# Build the Trainer from the model, arguments, tokenized splits and metric
# function defined above.
train = Trainer(
    model=model,
    args=args,
    train_dataset=dataloader["train"],
    eval_dataset=dataloader["test"],
    compute_metrics=compute_metrics,
    # Pad each batch only up to its longest sequence instead of always to 512
    # tokens: inputs were already truncated to 512 during tokenization, and
    # padded positions are masked out, so fixed-length padding only adds
    # compute and memory on short texts.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest"),
)

# 训练

In [None]:
# Run fine-tuning with the Trainer configured above.
train.train()

# 测试

In [None]:
# Score the current model on the held-out split with compute_metrics.
train.evaluate(eval_dataset=dataloader["test"])

# 推理


In [None]:
from transformers import pipeline

In [None]:
# Wrap the fine-tuned model and its tokenizer in a text-classification
# pipeline. Device index 0 is the first GPU visible to this process.
infer = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0,
)

In [None]:
# Classify one sample text written in the same "KeyWords: ..." format as the
# training data shown earlier.
infer(["KeyWords: bone morphogenetic proteins; chemoprevention; cancer metastasis;"])