In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers.data import DataCollatorForTokenClassification
import evaluate
from datasets import Dataset
import numpy as np
import pandas as pd

# NER模型

In [3]:
label_list = \
    {0: 'O',
     1: 'B-PER',
     2: 'I-PER',
     3: 'B-ORG',
     4: 'I-ORG',
     5: 'B-LOC',
     6: 'I-LOC'}

In [4]:
df = pd.read_parquet('数据/Token分类模型.parquet')
print(df.shape)
df.head()

(100, 2)


Unnamed: 0,tokens,ner_tags
0,"[海, 钓, 比, 赛, 地, 点, 在, 厦, 门, 与, 金, 门, 之, 间, 的, ...","[0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, ..."
1,"[这, 座, 依, 山, 傍, 水, 的, 博, 物, 馆, 由, 国, 内, 一, 流, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[但, 作, 为, 一, 个, 共, 产, 党, 员, 、, 人, 民, 公, 仆, ，, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[在, 发, 达, 国, 家, ，, 急, 救, 保, 险, 十, 分, 普, 及, ，, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[日, 俄, 两, 国, 国, 内, 政, 局, 都, 充, 满, 变, 数, ，, 尽, ...","[5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
ner_data = Dataset.from_pandas(df).train_test_split(test_size=0.1)
ner_data

## 1、数据处理

In [None]:
model_path = 'hfl/chinese-macbert-base'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
def process_example(examples):
    # is_split_into_words 意思是输入的单词是分割的
    token = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True, max_length=512)
    labels = []
    for index, tags in enumerate(examples["ner_tags"]):
        word_ids = token.word_ids(index)
        label = [-100 if ids is None else tags[ids] for ids in word_ids]
        labels.append(label)
    token["labels"] = labels
    return token

In [None]:
net_dataset = ner_data.map(process_example, batched=True)

## 2、创建模型

In [None]:
model = AutoModelForTokenClassification.from_pretrained(model_path, num_labels=len(label_list), id2label=label_list)

## 3、创建评估函数

In [None]:
seqeval = evaluate.load("seqeval")
seqeval

In [None]:
def metrics(values):
    predict, labels = values
    index = np.argmax(predict, axis=-1)

    trues, predicates = [], []
    for predicate, label in zip(index, labels):
        true = [label_list[l] for l in label if l != -100]

        predicate = [label_list[p] for p, l in zip(predicate, label) if l != -100]
        trues.append(true)
        predicates.append(predicate)
    result = seqeval.compute(predictions=predicates, references=trues, mode="strict", scheme="IOB2")
    return {"f1": result["overall_f1"], "recall": result["overall_recall"], "precision": result["overall_precision"]}

## 4、训练参数

In [28]:
args = TrainingArguments(
    output_dir="结果",
    eval_strategy="steps",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    num_train_epochs=3,
    logging_steps=100,
)

## 5、创建训练器

In [29]:
train = Trainer(
    model=model,
    args=args,
    train_dataset=net_dataset["train"],
    eval_dataset=net_dataset["test"],
    compute_metrics=metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer, max_length=512, padding="max_length"),
)

In [30]:
train.train()



Step,Training Loss,Validation Loss,F1,Recall,Precision
100,0.0198,0.032274,0.929755,0.929441,0.93007
200,0.0138,0.03399,0.92306,0.935753,0.910706
300,0.0142,0.028721,0.931594,0.925609,0.937657
400,0.0162,0.030679,0.930164,0.921776,0.938705
500,0.013,0.031788,0.938753,0.939811,0.937697
600,0.0154,0.027053,0.940223,0.941389,0.93906
700,0.0162,0.022372,0.946942,0.949504,0.944395
800,0.0248,0.023178,0.946931,0.949279,0.944594
900,0.0113,0.022117,0.942707,0.947701,0.937765
1000,0.0109,0.023011,0.947085,0.95018,0.944009


  state_dict = torch.load(best_model_path, map_location="cpu")


TrainOutput(global_step=2349, training_loss=0.010168542122018748, metrics={'train_runtime': 2140.7318, 'train_samples_per_second': 35.088, 'train_steps_per_second': 1.097, 'total_flos': 1.9627931820017664e+16, 'train_loss': 0.010168542122018748, 'epoch': 3.0})

## 6、预测

In [31]:
from transformers import pipeline

In [32]:
# aggregation_strategy 指定聚合策略
ner = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")

In [34]:
pd.DataFrame(ner("张伟明天去协和医院上班地址在北京"))

Unnamed: 0,entity_group,score,word,start,end
0,PER,0.9992,张 伟,0,2
1,ORG,0.999289,协 和 医 院,5,9
2,LOC,0.999236,北 京,14,16
