# 实体识别

实体是指识别文本中具有特定意义的名词

In [None]:
import torch
import evaluate

from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification, BertForSequenceClassification, BertForTokenClassification

In [2]:
dataset = DatasetDict.load_from_disk("../../datas/ner_data")

In [18]:
label_list = dataset["train"].features["ner_tags"].feature.names

In [3]:
print(dataset["train"][0])

{'id': '0', 'tokens': ['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}


In [4]:
tokenizer = AutoTokenizer.from_pretrained("models/macbert-base")

In [6]:
def process_func(examples):
    tokenized_data = tokenizer(
        examples["tokens"], 
        max_length=128,
        truncation=True,
        padding=True,
        is_split_into_words=True
    )

    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word2ids = tokenized_data.word_ids(batch_index=idx) # words方法也ok, 返回token对应的word的索引
        each_sentence_labels = []
        for item in word2ids:
            if item is None: # [CLS] and [SEP] 返回None
                each_sentence_labels.append(-100)
            else:
                each_sentence_labels.append(label[item])
        labels.append(each_sentence_labels)
    
    tokenized_data["labels"] = labels
    return tokenized_data

In [7]:
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/20865 [00:00<?, ? examples/s]

Map:   0%|          | 0/2319 [00:00<?, ? examples/s]

Map:   0%|          | 0/4637 [00:00<?, ? examples/s]

In [46]:
model = AutoModelForTokenClassification.from_pretrained("models/macbert-base", num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at models/macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
seqeval = evaluate.load("seqeval")

In [None]:
def compute_metrics(pred):
    predictions, labels = pred
    predictions = predictions.argmax(axis=-1)

    # 将标签index转化为标签名
    predictions = [
        [label_list[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]

    return seqeval.compute(
        predictions=predictions,
        references=labels,
        mode="strict",
        scheme="IOB2"
    )

In [48]:
args = TrainingArguments(
    output_dir="trained/model_for_tokenClass",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    optim="adafactor",
    num_train_epochs=1,
    per_device_eval_batch_size=32,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="overall_f1"
)

In [49]:
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer)
)

In [50]:
trainer.train()

Step,Training Loss,Validation Loss,Loc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
200,0.0378,0.034849,"{'precision': 0.8852504897844948, 'recall': 0.9131062355658198, 'f1': 0.8989626261190848, 'number': 3464}","{'precision': 0.8043571123451516, 'recall': 0.8693444136657433, 'f1': 0.8355890836476593, 'number': 2166}","{'precision': 0.9740698985343855, 'recall': 0.9494505494505494, 'f1': 0.9616026711185309, 'number': 1820}",0.881113,0.909262,0.894966,0.989373
400,0.0316,0.026104,"{'precision': 0.9058531198233021, 'recall': 0.9471709006928406, 'f1': 0.9260513688964155, 'number': 3464}","{'precision': 0.8441901408450704, 'recall': 0.8855032317636196, 'f1': 0.8643533123028391, 'number': 2166}","{'precision': 0.9736842105263158, 'recall': 0.9758241758241758, 'f1': 0.9747530186608122, 'number': 1820}",0.903732,0.936242,0.919699,0.991694
600,0.0222,0.022837,"{'precision': 0.9222565253999438, 'recall': 0.9486143187066974, 'f1': 0.9352497509605805, 'number': 3464}","{'precision': 0.8521383075523203, 'recall': 0.8647276084949215, 'f1': 0.8583868010999083, 'number': 2166}","{'precision': 0.9740761169332598, 'recall': 0.9703296703296703, 'f1': 0.9721992843380126, 'number': 1820}",0.914312,0.92953,0.921858,0.992682


Trainer is attempting to log a value of "{'precision': 0.8852504897844948, 'recall': 0.9131062355658198, 'f1': 0.8989626261190848, 'number': 3464}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8043571123451516, 'recall': 0.8693444136657433, 'f1': 0.8355890836476593, 'number': 2166}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9740698985343855, 'recall': 0.9494505494505494, 'f1': 0.9616026711185309, 'number': 1820}" of type <class 'dict'> for key "eval/PER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9058531198233021, 'recall': 0.9471709006928406

TrainOutput(global_step=653, training_loss=0.049839773402188495, metrics={'train_runtime': 1430.4732, 'train_samples_per_second': 14.586, 'train_steps_per_second': 0.456, 'total_flos': 1363050820834560.0, 'train_loss': 0.049839773402188495, 'epoch': 1.0})

In [52]:
trainer.evaluate(eval_dataset=tokenized_data["validation"])

Trainer is attempting to log a value of "{'precision': 0.9409166206515737, 'recall': 0.9471928849360756, 'f1': 0.9440443213296399, 'number': 1799}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8774062816616008, 'recall': 0.8872950819672131, 'f1': 0.8823229750382069, 'number': 976}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9748283752860412, 'recall': 0.9659863945578231, 'f1': 0.970387243735763, 'number': 882}" of type <class 'dict'> for key "eval/PER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.019259439781308174,
 'eval_LOC': {'precision': 0.9409166206515737,
  'recall': 0.9471928849360756,
  'f1': 0.9440443213296399,
  'number': 1799},
 'eval_ORG': {'precision': 0.8774062816616008,
  'recall': 0.8872950819672131,
  'f1': 0.8823229750382069,
  'number': 976},
 'eval_PER': {'precision': 0.9748283752860412,
  'recall': 0.9659863945578231,
  'f1': 0.970387243735763,
  'number': 882},
 'eval_overall_precision': 0.931917211328976,
 'eval_overall_recall': 0.9357396773311457,
 'eval_overall_f1': 0.9338245326784008,
 'eval_overall_accuracy': 0.9938909730473099,
 'eval_runtime': 33.9168,
 'eval_samples_per_second': 68.373,
 'eval_steps_per_second': 2.152,
 'epoch': 1.0}

In [53]:
from transformers import pipeline

In [58]:
model.config.id2label = {idx: label for idx, label in enumerate(label_list)}

In [59]:
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [62]:
res = pipe("小明在北京上班")
res

[{'entity': 'B-PER',
  'score': 0.59083873,
  'index': 1,
  'word': '小',
  'start': 0,
  'end': 1},
 {'entity': 'I-PER',
  'score': 0.73853093,
  'index': 2,
  'word': '明',
  'start': 1,
  'end': 2},
 {'entity': 'B-LOC',
  'score': 0.9979704,
  'index': 4,
  'word': '北',
  'start': 3,
  'end': 4},
 {'entity': 'I-LOC',
  'score': 0.9978807,
  'index': 5,
  'word': '京',
  'start': 4,
  'end': 5}]