# Named Entity Recognition (NER)

### 1. Import libraries

In [1]:
import evaluate
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import load_dataset

### 2. Load dataset

In [2]:
ner_dataset = load_dataset("arrow", data_dir = "./peoples_daily_ner")

In [3]:
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

In [4]:
print(ner_dataset["train"][4])

{'id': '4', 'tokens': ['日', '俄', '两', '国', '国', '内', '政', '局', '都', '充', '满', '变', '数', '，', '尽', '管', '日', '俄', '关', '系', '目', '前', '是', '历', '史', '最', '佳', '时', '期', '，', '但', '其', '脆', '弱', '性', '不', '言', '自', '明', '。'], 'ner_tags': [5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [5]:
ner_dataset["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}

In [6]:
label_list = ner_dataset["train"].features["ner_tags"].feature.names

In [7]:
type(label_list)

list

In [8]:
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

`ner_tags` has the index of the labels in `label_list`. 

`label_list` is like a look-up list for the label numbers and the actual label.

### 4. Preprocess data

In [9]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-large")

In [10]:
ner_dataset["train"][0]["tokens"]

['海',
 '钓',
 '比',
 '赛',
 '地',
 '点',
 '在',
 '厦',
 '门',
 '与',
 '金',
 '门',
 '之',
 '间',
 '的',
 '海',
 '域',
 '。']

In [11]:
tokenizer(ner_dataset["train"][0]["tokens"])

{'input_ids': [[101, 3862, 102], [101, 7157, 102], [101, 3683, 102], [101, 6612, 102], [101, 1765, 102], [101, 4157, 102], [101, 1762, 102], [101, 1336, 102], [101, 7305, 102], [101, 680, 102], [101, 7032, 102], [101, 7305, 102], [101, 722, 102], [101, 7313, 102], [101, 4638, 102], [101, 3862, 102], [101, 1818, 102], [101, 511, 102]], 'token_type_ids': [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}

Because in the original data, each character is seprated in a string in the list, that's why in the tokenized data, each small list has 3 numbers inside, with 101 as first and 102 as the last. This is not correct.

Adding
```python
is_split_into_words = True
```
will let the tokenizer understand the separation of characters.

In [12]:
tokenizer(ner_dataset["train"][0]["tokens"], is_split_into_words = True)

{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

English words can be conevrt to multiple tokens each.

In [13]:
tokenizer("transportation")

{'input_ids': [101, 162, 10477, 10367, 10143, 8794, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [14]:
result = tokenizer("transportation tool")

In [15]:
result

{'input_ids': [101, 162, 10477, 10367, 10143, 8794, 11928, 8178, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
result.word_ids()

[None, 0, 0, 0, 0, 0, 1, 1, None]

In [17]:
tokenizer.decode([10367])

'##sp'

In [18]:
def process_function(examples):
    tokenized_examples = tokenizer(examples["tokens"],
                                   max_length = 128,
                                   truncation = True,
                                   padding = True,
                                   is_split_into_words = True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_examples.word_ids(batch_index = i) #get the i-th sample of the example
        label_ids = []
        #as some word match multiple ids, we need to do some special label assignment
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100) # ignore None in loss computation
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)

    tokenized_examples["labels"] = labels
    return tokenized_examples

In [19]:
tokenized_datasets = ner_dataset.map(process_function, batched = True)


In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4637
    })
})

In [21]:
print(tokenized_datasets["train"][4])

{'id': '4', 'tokens': ['日', '俄', '两', '国', '国', '内', '政', '局', '都', '充', '满', '变', '数', '，', '尽', '管', '日', '俄', '关', '系', '目', '前', '是', '历', '史', '最', '佳', '时', '期', '，', '但', '其', '脆', '弱', '性', '不', '言', '自', '明', '。'], 'ner_tags': [5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 3189, 915, 697, 1744, 1744, 1079, 3124, 2229, 6963, 1041, 4007, 1359, 3144, 8024, 2226, 5052, 3189, 915, 1068, 5143, 4680, 1184, 3221, 1325, 1380, 3297, 881, 3198, 3309, 8024, 852, 1071, 5546, 2483, 2595, 679, 6241, 5632, 3209, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### 5. Create the model

In [22]:
model = AutoModelForTokenClassification.from_pretrained("hfl/chinese-macbert-large", num_labels = len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
model.config.num_labels

7

By default, the model will only do 2 label classification. We need to adjust it to our use case.

In [24]:
for param in model.parameters():
    param.data = param.data.contiguous()

### 6. Set up evaluation function

In [25]:
seqeval = evaluate.load("seqeval")

In [26]:
seqeval

EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

In [27]:
import numpy as np

In [28]:
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = np.argmax(predictions, axis = -1)
    true_predictions = [
        [label_list[p] for p, l in zip (prediction, label) if l != -100] #iterate every token_id in the sample
        for prediction, label in zip(predictions, labels) #all the samples in one batch
    ]
    true_labels = [
        [label_list[l] for p, l in zip (prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels) #all the samples in one batch
    ]
    result = seqeval.compute(references = true_labels,
                             predictions = true_predictions,
                             scheme = "IOB2",
                             mode = "strict")
    
    return {
        "f1": result["overall_f1"]
    }

### 7. Configure training params

In [29]:
args = TrainingArguments(
    output_dir = "./modesl_for_ner",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 128,
    logging_steps = 30,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    metric_for_best_model = "f1",
    load_best_model_at_end = True , 
    disable_tqdm = True
)

In [30]:
args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=True,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=False,
fp16

### 8. Create trainer

In [31]:
from transformers import DataCollatorWithPadding
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_datasets["train"].select(range(6000)),
    eval_dataset = tokenized_datasets["validation"].select(range(1000)),
    data_collator = DataCollatorWithPadding(tokenizer = tokenizer),
    compute_metrics = eval_metric
)

In [32]:
model.device

device(type='mps', index=0)

In [33]:
model.to("cpu")

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

### 9. Train the model

In [34]:
trainer.train()

  return forward_call(*args, **kwargs)


{'loss': 0.4203, 'grad_norm': 1.7180335521697998, 'learning_rate': 4.935555555555556e-05, 'epoch': 0.04}
{'loss': 0.1212, 'grad_norm': 3.8996825218200684, 'learning_rate': 4.868888888888889e-05, 'epoch': 0.08}
{'loss': 0.0739, 'grad_norm': 2.683225631713867, 'learning_rate': 4.802222222222223e-05, 'epoch': 0.12}
{'loss': 0.1145, 'grad_norm': 4.836878776550293, 'learning_rate': 4.7355555555555555e-05, 'epoch': 0.16}
{'loss': 0.108, 'grad_norm': 4.652242660522461, 'learning_rate': 4.668888888888889e-05, 'epoch': 0.2}
{'loss': 0.1117, 'grad_norm': 0.708916187286377, 'learning_rate': 4.602222222222222e-05, 'epoch': 0.24}
{'loss': 0.0734, 'grad_norm': 3.203871488571167, 'learning_rate': 4.5355555555555554e-05, 'epoch': 0.28}
{'loss': 0.0625, 'grad_norm': 6.3980326652526855, 'learning_rate': 4.468888888888889e-05, 'epoch': 0.32}
{'loss': 0.0442, 'grad_norm': 0.8283454179763794, 'learning_rate': 4.4022222222222225e-05, 'epoch': 0.36}
{'loss': 0.0712, 'grad_norm': 7.60693883895874, 'learning_r

  return forward_call(*args, **kwargs)


{'loss': 0.0636, 'grad_norm': 1.3048202991485596, 'learning_rate': 3.268888888888889e-05, 'epoch': 1.04}
{'loss': 0.0311, 'grad_norm': 1.6440324783325195, 'learning_rate': 3.2022222222222224e-05, 'epoch': 1.08}
{'loss': 0.0277, 'grad_norm': 0.6418254375457764, 'learning_rate': 3.135555555555555e-05, 'epoch': 1.12}
{'loss': 0.0259, 'grad_norm': 0.7985669374465942, 'learning_rate': 3.068888888888889e-05, 'epoch': 1.16}
{'loss': 0.0321, 'grad_norm': 0.11533275991678238, 'learning_rate': 3.0022222222222223e-05, 'epoch': 1.2}
{'loss': 0.0262, 'grad_norm': 2.1946496963500977, 'learning_rate': 2.935555555555556e-05, 'epoch': 1.24}
{'loss': 0.0309, 'grad_norm': 0.8636915683746338, 'learning_rate': 2.8688888888888894e-05, 'epoch': 1.28}
{'loss': 0.0223, 'grad_norm': 3.0919342041015625, 'learning_rate': 2.8022222222222222e-05, 'epoch': 1.32}
{'loss': 0.0249, 'grad_norm': 1.7206865549087524, 'learning_rate': 2.7355555555555557e-05, 'epoch': 1.3599999999999999}
{'loss': 0.0289, 'grad_norm': 4.6070

  return forward_call(*args, **kwargs)


{'loss': 0.0086, 'grad_norm': 0.3652673065662384, 'learning_rate': 1.602222222222222e-05, 'epoch': 2.04}
{'loss': 0.011, 'grad_norm': 0.03480139747262001, 'learning_rate': 1.5355555555555557e-05, 'epoch': 2.08}
{'loss': 0.0079, 'grad_norm': 0.4935230314731598, 'learning_rate': 1.468888888888889e-05, 'epoch': 2.12}
{'loss': 0.0076, 'grad_norm': 0.08843041956424713, 'learning_rate': 1.4022222222222222e-05, 'epoch': 2.16}
{'loss': 0.0081, 'grad_norm': 0.10128812491893768, 'learning_rate': 1.3355555555555557e-05, 'epoch': 2.2}
{'loss': 0.0066, 'grad_norm': 0.0034808507189154625, 'learning_rate': 1.268888888888889e-05, 'epoch': 2.24}
{'loss': 0.0068, 'grad_norm': 0.033791378140449524, 'learning_rate': 1.2022222222222223e-05, 'epoch': 2.2800000000000002}
{'loss': 0.0103, 'grad_norm': 0.29339855909347534, 'learning_rate': 1.1355555555555556e-05, 'epoch': 2.32}
{'loss': 0.0043, 'grad_norm': 0.0006742799305357039, 'learning_rate': 1.068888888888889e-05, 'epoch': 2.36}
{'loss': 0.0027, 'grad_nor

TrainOutput(global_step=2250, training_loss=0.03971756872203615, metrics={'train_runtime': 3153.5725, 'train_samples_per_second': 5.708, 'train_steps_per_second': 0.713, 'train_loss': 0.03971756872203615, 'epoch': 3.0})

### 10. Evaluate the model

In [35]:
trainer.evaluate(tokenized_datasets["test"])

  return forward_call(*args, **kwargs)


{'eval_loss': 0.03460295498371124, 'eval_f1': 0.9404969778374749, 'eval_runtime': 164.7689, 'eval_samples_per_second': 28.142, 'eval_steps_per_second': 0.225, 'epoch': 3.0}


{'eval_loss': 0.03460295498371124,
 'eval_f1': 0.9404969778374749,
 'eval_runtime': 164.7689,
 'eval_samples_per_second': 28.142,
 'eval_steps_per_second': 0.225,
 'epoch': 3.0}

### 11. Use the model to predict

In [36]:
trainer.predict(tokenized_datasets["test"])

PredictionOutput(predictions=array([[[10.64678   , -2.239826  , -2.2844255 , ..., -2.6060598 ,
         -0.81991184, -1.2551738 ],
        [11.5474415 , -2.2178485 , -2.7366002 , ..., -3.2716274 ,
         -1.1731164 , -2.7360775 ],
        [11.684027  , -2.6310945 , -2.0666964 , ..., -2.5131216 ,
         -2.2834492 , -1.7167438 ],
        ...,
        [ 7.6248655 , -0.48450452, -2.0787547 , ..., -2.9007163 ,
          0.73124385, -2.8661065 ],
        [ 7.445551  , -1.5348887 , -1.0170684 , ..., -1.3277225 ,
         -0.65729487, -0.741101  ],
        [ 5.2149487 , -1.2520466 , -2.4231222 , ..., -1.7960234 ,
          3.3725803 , -0.70588726]],

       [[10.880301  , -2.3322723 , -2.2843578 , ..., -2.507852  ,
         -1.3147421 , -1.4865595 ],
        [11.582846  , -2.6003728 , -2.2398748 , ..., -2.9084034 ,
         -1.9265811 , -2.3096178 ],
        [11.616342  , -2.7918046 , -1.8694113 , ..., -2.8674319 ,
         -2.042788  , -1.936191  ],
        ...,
        [ 9.158684  , -1.

### 12. Predict with single sample

In [38]:
from transformers import pipeline

In [39]:
model.config.id2label = {idx: label for idx, label in enumerate(label_list)}

In [41]:
ner_pipe = pipeline("token-classification",
                    model = model,
                    tokenizer = tokenizer,
                    device = 0)
ner_pipe("张美肥出生在巴布亚新几内亚，就职于高腰集团。") #This will list out the named entity charater by character, and provide details.

Device set to use mps:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-PER',
  'score': 0.9994832,
  'index': 1,
  'word': '张',
  'start': 0,
  'end': 1},
 {'entity': 'I-PER',
  'score': 0.999395,
  'index': 2,
  'word': '美',
  'start': 1,
  'end': 2},
 {'entity': 'I-PER',
  'score': 0.9990043,
  'index': 3,
  'word': '肥',
  'start': 2,
  'end': 3},
 {'entity': 'B-LOC',
  'score': 0.9981299,
  'index': 7,
  'word': '巴',
  'start': 6,
  'end': 7},
 {'entity': 'I-LOC',
  'score': 0.99649304,
  'index': 8,
  'word': '布',
  'start': 7,
  'end': 8},
 {'entity': 'I-LOC',
  'score': 0.9958793,
  'index': 9,
  'word': '亚',
  'start': 8,
  'end': 9},
 {'entity': 'I-LOC',
  'score': 0.9366285,
  'index': 10,
  'word': '新',
  'start': 9,
  'end': 10},
 {'entity': 'I-LOC',
  'score': 0.9986533,
  'index': 11,
  'word': '几',
  'start': 10,
  'end': 11},
 {'entity': 'I-LOC',
  'score': 0.9985067,
  'index': 12,
  'word': '内',
  'start': 11,
  'end': 12},
 {'entity': 'I-LOC',
  'score': 0.9983771,
  'index': 13,
  'word': '亚',
  'start': 12,
  'end': 13},

In [42]:
ner_pipe = pipeline("token-classification",
                    model = model,
                    tokenizer = tokenizer,
                    device = 0, 
                    aggregation_strategy = "simple" #This will aggregate named entities
                   )
ner_pipe("张美肥出生在巴布亚新几内亚，就职于高腰集团。")

Device set to use mps:0


[{'entity_group': 'PER',
  'score': 0.9992942,
  'word': '张 美 肥',
  'start': 0,
  'end': 3},
 {'entity_group': 'LOC',
  'score': 0.9889525,
  'word': '巴 布 亚 新 几 内 亚',
  'start': 6,
  'end': 13},
 {'entity_group': 'ORG',
  'score': 0.9988554,
  'word': '高 腰 集 团',
  'start': 17,
  'end': 21}]

Bert will automatically add space between characters after decoding.

In [44]:
sen = "张美肥出生在巴布亚新几内亚，就职于高腰集团。"
result = ner_pipe(sen)

ner_result = {}

for r in result:
    if r["entity_group"] not in ner_result:
        ner_result[r["entity_group"]] = []
    ner_result[r["entity_group"]].append(sen[r["start"]:r["end"]])

ner_result

{'PER': ['张美肥'], 'LOC': ['巴布亚新几内亚'], 'ORG': ['高腰集团']}