In [None]:
!pip install transformers
!pip install datasets
!pip install seqeval

In [1]:
# Fetch the UN named-entity-recognition corpus; the loading cell below reads its
# tagged-training/ and tagged-test/ folders.
!git clone https://github.com/leslie-huang/UN-named-entity-recognition

Cloning into 'UN-named-entity-recognition'...
remote: Enumerating objects: 21580, done.[K
remote: Total 21580 (delta 0), reused 0 (delta 0), pack-reused 21580[K
Receiving objects: 100% (21580/21580), 14.70 MiB | 8.14 MiB/s, done.
Resolving deltas: 100% (21095/21095), done.


In [2]:
import os
import itertools
import pandas as pd
import random
from math import ceil
from datasets import Dataset

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch


directories = ['./UN-named-entity-recognition/tagged-training/', './UN-named-entity-recognition/tagged-test/']


def _read_tagged_sentences(file_path):
    """Parse one tagged file into a DataFrame with one row per sentence.

    Each non-blank line is expected to hold ``token<TAB>tag``; a line that is
    exactly a newline separates two sentences.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A DataFrame with a 'tokens' column (list of words per sentence) and a
        parallel 'ner_tags' column (list of tag strings per sentence).

    Raises:
        IndexError: if a non-blank line contains no tab character.
    """
    with open(file_path, 'r', encoding="utf8") as f:
        lines = f.readlines()

    # groupby yields alternating runs of blank / non-blank lines; keep only the
    # non-blank runs, each of which is one sentence.
    sentences = [
        list(group)
        for is_blank, group in itertools.groupby(lines, lambda line: line == '\n')
        if not is_blank
    ]
    tokens = [[line.split('\t')[0] for line in sentence] for sentence in sentences]
    # Strip only the line terminator: slicing off the last character would eat a
    # real character of the tag when the file's final line has no newline.
    entities = [[line.split('\t')[1].rstrip('\n') for line in sentence] for sentence in sentences]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})


data_files = []
for directory in directories:
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore the seeded train/test split further down) reproducible.
    for filename in sorted(os.listdir(directory)):
        data_files.append(_read_tagged_sentences(os.path.join(directory, filename)))

# ignore_index gives the combined frame a fresh 0..n-1 index.
dataset = pd.concat(data_files, ignore_index=True)

In [3]:
from collections import Counter

# Count how often each tag string occurs across all sentences and list the
# counts from most to least frequent.
tag_counter = Counter(itertools.chain.from_iterable(dataset["ner_tags"]))
tag_counter.most_common()

[('O', 135914),
 ('I-ORG', 3562),
 ('I-LOC', 3329),
 ('I-MISC', 2649),
 ('I-PER', 444),
 ('0', 7),
 ('I-', 2),
 ('I-PRG', 1),
 ('I-I-MISC', 1),
 ('I-OR', 1),
 ('VMISC', 1)]

In [4]:
# Tag strings that get collapsed into the plain "O" tag by clean_tags.
tags_to_remove = ["I-PRG", "I-I-MISC", "I-OR", "VMISC", "I-", "0"]

def clean_tags(tags):
    """Return a new list in which every tag found in ``tags_to_remove`` becomes "O".

    Args:
        tags: Iterable of tag strings for one sentence.

    Returns:
        A list of the same length; tags not listed in ``tags_to_remove`` are
        kept unchanged. The input is not modified.
    """
    return ["O" if tag in tags_to_remove else tag for tag in tags]
# Run clean_tags over every sentence's tag list, then recount the tags.
dataset["ner_tags"] = dataset["ner_tags"].apply(clean_tags)

tag_counter = Counter(tag for sentence_tags in dataset["ner_tags"] for tag in sentence_tags)
tag_counter.most_common()

[('O', 135927),
 ('I-ORG', 3562),
 ('I-LOC', 3329),
 ('I-MISC', 2649),
 ('I-PER', 444)]

In [5]:
from transformers import AutoTokenizer
# Load the pretrained tokenizer published under the "distilbert-base-cased" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

In [7]:
# Give each distinct tag an integer id, numbered in the order tag_counter stores
# its keys, and build the reverse lookup from id back to tag.
tag_to_ids = {tag: ix for ix, tag in enumerate(tag_counter)}
id_to_tag = dict(enumerate(tag_counter))

In [None]:
# Print each tag of the sentence stored at index label 0 next to its word position.
examples = dataset.loc[0]
for position, tag in enumerate(examples["ner_tags"]):
    print(position, tag)

In [13]:
# Tokenize the example sentence here so `tokenized_inputs` exists when the
# notebook is run top to bottom (it was previously only defined in a later
# cell), then show which word each sub-token belongs to. None entries are
# positions with no source word (the loop further down labels them -100).
tokenized_inputs = tokenizer([examples["tokens"]], truncation=True, is_split_into_words=True)
tokenized_inputs.word_ids()

[None,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 3,
 3,
 3,
 4,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 17,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 31,
 31,
 32,
 32,
 33,
 33,
 34,
 34,
 35,
 35,
 35,
 35,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 46,
 46,
 47,
 48,
 None]

In [21]:
# Walk through the label alignment for a single sentence (the sentence stored at
# index label 0) before wrapping the same logic in a function.
examples = dataset.loc[0]
label_all_tokens = True
# A one-element list makes the tokenizer treat the sentence as a batch of size one.
tokenized_inputs = tokenizer([examples["tokens"]], truncation=True, is_split_into_words=True)

labels = []
for i, label in enumerate([examples["ner_tags"]]):
    word_ids = tokenized_inputs.word_ids(batch_index=i)
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            # Position with no source word: give it the -100 placeholder label.
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            # '0' (digit zero) is a typo for 'O' in the raw data. Look up the id of
            # 'O' rather than hard-coding 0: id 0 belongs to whichever tag happens
            # to come first in tag_counter, which is not necessarily 'O'.
            tag = "O" if label[word_idx] == "0" else label[word_idx]
            label_ids.append(tag_to_ids[tag])
        else:
            # Continuation sub-token of a word when only first sub-tokens are labelled.
            label_ids.append(-100)
        previous_word_idx = word_idx
    labels.append(label_ids)
    

In [22]:
# Display the encoding the tokenizer produced for the example sentence.
tokenized_inputs

{'input_ids': [[101, 13397, 14255, 14867, 7926, 23591, 1828, 119, 8731, 1403, 8548, 26835, 10205, 1852, 1117, 1728, 1106, 1103, 11223, 1104, 1103, 1615, 2970, 1120, 1157, 9229, 118, 1248, 4912, 117, 1105, 19120, 1114, 13542, 1103, 3268, 1104, 1117, 8283, 117, 1430, 16409, 22494, 9517, 13609, 1161, 16164, 1161, 16890, 10680, 2586, 118, 148, 24486, 8057, 117, 1105, 1123, 2265, 5880, 1104, 1103, 1250, 1104, 1103, 9229, 118, 1148, 4912, 119, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [14]:
# Give each distinct tag an integer id, numbered in the order tag_counter stores
# its keys, and build the reverse lookup from id back to tag.
tag_to_ids = {tag: ix for ix, tag in enumerate(tag_counter)}
id_to_tag = dict(enumerate(tag_counter))

def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align word-level tags to sub-tokens.

    Adapted from https://huggingface.co/docs/transformers/tasks/token_classification

    Reads the module-level ``tokenizer`` and ``tag_to_ids``.

    Args:
        examples: Mapping with a "tokens" entry (one list of words per sentence)
            and a parallel "ner_tags" entry (one list of tag strings per sentence).
        label_all_tokens: When True (the default, matching the previous
            hard-coded behaviour) every sub-token of a word receives the word's
            tag id. When False only the first sub-token of each word does and
            the remaining sub-tokens receive -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sentence; positions without a source word are -100.

    Raises:
        KeyError: if a tag is not present in ``tag_to_ids``.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Position with no source word: give it the -100 placeholder label.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # '0' (digit zero) is a typo for 'O' in the raw data. Look up the
                # id of 'O' rather than hard-coding 0: id 0 belongs to whichever
                # tag happens to come first in the mapping.
                tag = "O" if label[word_idx] == "0" else label[word_idx]
                label_ids.append(tag_to_ids[tag])
            else:
                # Continuation sub-token when only first sub-tokens are labelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Hold out 20% of the sentences (rounded up) as the test split, using a fixed seed.
dataset_ix = set(dataset.index)
random.seed(42)
# random.sample requires a sequence (sets are rejected from Python 3.11 on), so
# sample from a sorted list of the index labels.
test_ix = random.sample(sorted(dataset_ix), ceil(len(dataset) * 0.2))
train_ix = dataset_ix - set(test_ix)

# Recent pandas versions refuse a set as a .loc indexer; pass a sorted list instead.
train_dataset = Dataset.from_pandas(dataset.loc[sorted(train_ix)])
test_dataset = Dataset.from_pandas(dataset.loc[test_ix])
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [18]:
# List the fields present on one tokenized training example.
tokenized_train_dataset[0].keys()

dict_keys(['tokens', 'ner_tags', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'])

In [20]:
# Show the original (pre-tokenization) words of the first training example.
tokenized_train_dataset[0]["tokens"]

['Kuwait',
 'congratulates',
 'Mr.',
 'Srgjan',
 'Kerim',
 'upon',
 'his',
 'election',
 'to',
 'the',
 'presidency',
 'of',
 'the',
 'General',
 'Assembly',
 'at',
 'its',
 'sixty-second',
 'session',
 ',',
 'and',
 'recalls',
 'with',
 'appreciation',
 'the',
 'efforts',
 'of',
 'his',
 'predecessor',
 ',',
 'Her',
 'Excellency',
 'Sheikha',
 'Haya',
 'Rashed',
 'Al-Khalifa',
 ',',
 'and',
 'her',
 'successful',
 'conduct',
 'of',
 'the',
 'work',
 'of',
 'the',
 'sixty-first',
 'session',
 '.']

In [87]:
from datasets import load_metric
import numpy as np

# Build the token-classification model with one output class per tag. Passing
# the tag <-> id maps stores the real tag names in the model config (and in any
# saved checkpoint) instead of the generic LABEL_0 ... LABEL_n placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=len(tag_counter),
    id2label=id_to_tag,
    label2id=tag_to_ids,
)

# Evaluate once per epoch; checkpoints and logs go under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)


# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed version.
metric = load_metric("seqeval")

# Position i holds the tag whose id is i (same key order used for tag_to_ids).
label_list = list(tag_counter.keys())

def compute_metrics(p):
    """Score predictions with the module-level ``metric``, skipping -100 labels.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with argmax over axis 2; ``labels`` holds the
            reference label ids, with -100 marking positions to ignore.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        "overall_*" entries of the metric result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose reference label is not the -100 placeholder.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
# Wire the model, arguments, data splits and metric function into a Trainer,
# run training, then evaluate on the held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()


loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.1

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.046761,0.782543,0.834671,0.807767,0.984884
2,0.080200,0.033703,0.859472,0.888443,0.873717,0.990067
3,0.080200,0.03588,0.866148,0.893258,0.879494,0.988987
4,0.020300,0.038187,0.858796,0.893258,0.875688,0.989758
5,0.020300,0.038702,0.87325,0.901284,0.887046,0.990067


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, ner_tags. If __index_level_0__, tokens, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1147
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, ner_tags. If __index_level_0__, tokens, ner_tags are not expected by `DistilBertForTokenClass

{'epoch': 5.0,
 'eval_accuracy': 0.9900669401857051,
 'eval_f1': 0.8870458135860978,
 'eval_loss': 0.03870198130607605,
 'eval_precision': 0.8732503888024883,
 'eval_recall': 0.9012841091492777,
 'eval_runtime': 2.9584,
 'eval_samples_per_second': 387.703,
 'eval_steps_per_second': 24.337}

In [88]:
# Move the model to the CPU; the inference cell below builds its input tensors
# with plain torch.tensor(...) calls and no explicit device.
model.to("cpu")

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [89]:
paragraph = '''Expressing deep concern about the impact of the food security crisis on the
assistance provided by United Nations humanitarian agencies, in particular the World
Food Programme, the United Nations Children’s Fund, the Office for the
Coordination of Humanitarian Affairs of the Secretariat and the Office of the United
Nations High Commissioner for Refugees'''

# Truncate so that an over-long paragraph cannot exceed the model's input limit.
tokens = tokenizer(paragraph, truncation=True)
# Add a leading batch dimension of size one.
input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)

# Inference only: switch off dropout and gradient tracking, and call the module
# itself rather than .forward() so registered hooks still run.
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Drop the batch dimension by indexing (unlike squeeze(), this stays correct for
# a one-token sequence) and pick the highest-scoring tag id per sub-token.
predicted_ids = outputs.logits[0].argmax(dim=-1)
predictions = [id_to_tag[int(i)] for i in predicted_ids]

# One sub-token string per input id, so `words` lines up with `predictions`.
words = tokenizer.convert_ids_to_tokens(tokens['input_ids'])
pd.DataFrame({'ner': predictions, 'words': words}).to_csv("aaa.csv", index=False)

In [77]:
# Preview the per-sub-token predictions as a table.
pd.DataFrame({'ner': predictions, 'words': words})

Unnamed: 0,ner,words
0,O,[CLS]
1,O,Express
2,O,##ing
3,O,deep
4,O,concern
...,...,...
59,O,Commissioner
60,O,for
61,MIS,Refuge
62,O,##es
