In [4]:
%load_ext autoreload
%autoreload 2

In [5]:

from transformers import ElectraTokenizerFast, ElectraForTokenClassification
from pprint import pprint

# Load the fast tokenizer and the token-classification model from the same
# pretrained checkpoint name so their vocabulary and label set match.
tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-small-finetuned-naver-ner")
model = ElectraForTokenClassification.from_pretrained("monologg/koelectra-small-finetuned-naver-ner")


In [6]:
# from transformers import ElectraTokenizer, ElectraForTokenClassification
# import torch

# tokenizer = ElectraTokenizer.from_pretrained("bhadresh-savani/electra-base-discriminator-finetuned-conll03-english")
# model = ElectraForTokenClassification.from_pretrained("bhadresh-savani/electra-base-discriminator-finetuned-conll03-english")

# inputs = tokenizer(
#     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
# )

# with torch.no_grad():
#     logits = model(**inputs).logits

# predicted_token_class_ids = logits.argmax(-1)

# # Note that tokens are classified rather than input words, which means that
# # there might be more predicted token classes than words.
# # Multiple token classes might account for the same word
# predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
# predicted_tokens_classes

In [7]:
model

ElectraForTokenClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_fea

In [15]:
import logging
from typing import Optional, Union

import torch
import numpy as np

from transformers import (
    BasicTokenizer,
    PreTrainedTokenizer,
    Pipeline
)


logger = logging.getLogger(__name__)


def custom_encode_plus(sentence,
                       tokenizer,
                       return_tensors=None):
    """Encode one whitespace-split sentence and record which tokens start a word.

    Each word from ``sentence.split()`` is tokenized on its own so that the
    first sub-token of every word can be flagged in ``tokens_mask``.

    Args:
        sentence: Raw text; words are obtained with ``str.split()``.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``num_special_tokens_to_add``, ``truncate_sequences``,
            ``build_inputs_with_special_tokens``,
            ``create_token_type_ids_from_sequences`` and the attributes
            ``unk_token``, ``cls_token``, ``sep_token``, ``model_max_length``.
        return_tensors: ``None`` or ``"pt"``. The output is always built from
            PyTorch tensors; any other value only triggers a warning.

    Returns:
        A tuple ``(encoded_inputs, words, tokens_mask)``:

        * ``encoded_inputs``: dict with ``"input_ids"`` and
          ``"token_type_ids"``, each a tensor with a leading batch dim of 1.
        * ``words``: the kept words wrapped in ``cls_token`` / ``sep_token``.
        * ``tokens_mask``: one flag per entry of ``input_ids`` (1 for special
          tokens and for the first sub-token of a word, 0 otherwise).
    """
    # Example of what a tokenizer's own encoding looks like:
    # {'input_ids': [2, 10841, 10966, 10832, 10541, 21509, 27660, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]}
    words = sentence.split()

    tokens = []
    tokens_mask = []
    for word in words:
        # An empty tokenization (bad-encoded word) is replaced by a single UNK
        # so every word still owns at least one token.
        word_tokens = tokenizer.tokenize(word) or [tokenizer.unk_token]
        tokens.extend(word_tokens)
        tokens_mask.extend([1] + [0] * (len(word_tokens) - 1))

    ids = tokenizer.convert_tokens_to_ids(tokens)
    total_len = len(ids) + tokenizer.num_special_tokens_to_add()
    if tokenizer.model_max_length and total_len > tokenizer.model_max_length:
        ids, _, _ = tokenizer.truncate_sequences(
            ids,
            pair_ids=None,
            num_tokens_to_remove=total_len - tokenizer.model_max_length,
            truncation_strategy="longest_first",
            stride=0,
        )
        # Keep the mask and the word list aligned with the truncated ids;
        # otherwise they describe tokens that are no longer in the input.
        kept = len(ids)
        if getattr(tokenizer, "truncation_side", "right") == "left":
            tokens_mask = tokens_mask[len(tokens_mask) - kept:]
            words = words[len(words) - sum(tokens_mask):]
        else:
            tokens_mask = tokens_mask[:kept]
            words = words[:sum(tokens_mask)]

    sequence = tokenizer.build_inputs_with_special_tokens(ids)
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids)
    # HARD-CODED: assumes the model input layout is `[CLS] + text + [SEP]`.
    #             The only way to safely cover every layout is to build the
    #             token mask inside the tokenizer library itself.
    tokens_mask = [1] + tokens_mask + [1]
    words = [tokenizer.cls_token] + words + [tokenizer.sep_token]

    if return_tensors not in (None, "pt"):
        logger.warning(
            "Unsupported return_tensors={!r}: output is always returned as PyTorch tensors.".format(
                return_tensors
            )
        )

    encoded_inputs = {
        "input_ids": torch.tensor([sequence]),
        "token_type_ids": torch.tensor([token_type_ids]),
    }

    return encoded_inputs, words, tokens_mask


In [16]:
sentence = "2009년 7월 FC서울을 떠나 잉글랜드 프리미어리그 볼턴 원더러스로 이적한 이청용은 크리스탈 팰리스와 독일 분데스리가2 VfL 보훔을 거쳐 지난 3월 K리그로 컴백했다. 행선지는 서울이 아닌 울산이었다"

In [17]:
# Encode the sample sentence. Despite its name, `tokens` holds the dict of
# model inputs returned by custom_encode_plus, not a list of token strings.
tokens, words, tokens_mask = custom_encode_plus(
                sentence,
                tokenizer
)

In [18]:
# Put every input tensor on the same device as the model. A bare `.to()` with
# no target does not move anything.
token = {name: tensor.to(model.device) for name, tensor in tokens.items()}

In [7]:
# Inference only: skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    entities = model(**token)

In [8]:
# First output, first batch item, as a NumPy array. `.cpu()` makes the
# conversion work wherever the tensor lives, matching the IPU cell below.
result = entities[0][0].detach().cpu().numpy()

In [9]:
# Softmax over the last axis. Subtracting the per-row maximum first leaves the
# result mathematically unchanged but keeps np.exp from overflowing.
exp_shifted = np.exp(result - result.max(axis=-1, keepdims=True))
score = exp_shifted / exp_shifted.sum(-1, keepdims=True)
labels_idx = score.argmax(axis=-1)

In [10]:
# Ids of the single encoded sentence, used to recover each token string.
input_ids = tokens["input_ids"].numpy()[0]
token_level_answer = []

for idx, label_idx in enumerate(labels_idx):
    # NOTE Every token is recorded, even when its `entity` is in `ignore_labels`
    answer = {
        "word": tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
        "score": score[idx][label_idx].item(),
        "entity": model.config.id2label[label_idx],
    }
    token_level_answer.append(answer)

In [8]:
import pandas as pd

In [12]:
# Tab-separated test split, given relative to the current working directory.
data_path = "../data/naver-ner/test.tsv"

In [13]:
# Read the file as two tab-separated columns named `text` and `label`; the file
# is treated as having no header row because `names` is supplied.
data = pd.read_csv(data_path, sep='\t', names = ['text', 'label'])

In [14]:
from torch.utils.data import Dataset
from dataclasses import dataclass


In [15]:
class SummaryDataset(Dataset):
    """NER dataset backed by a two-column, tab-separated file.

    Each row of the file supplies a ``text`` string and a ``label`` string;
    items are returned as ``{'text': ..., 'label': ...}`` dicts.

    Args:
        data_path: Path of the TSV file to read (no header row).
    """

    # Tag vocabulary; a tag's position in this tuple is its integer id.
    LABELS = ("O",
              "PER-B", "PER-I", "FLD-B", "FLD-I", "AFW-B", "AFW-I", "ORG-B", "ORG-I",
              "LOC-B", "LOC-I", "CVL-B", "CVL-I", "DAT-B", "DAT-I", "TIM-B", "TIM-I",
              "NUM-B", "NUM-I", "EVT-B", "EVT-I", "ANM-B", "ANM-I", "PLT-B", "PLT-I",
              "MAT-B", "MAT-I", "TRM-B", "TRM-I")
    # Built once at class creation instead of on every labels2id call.
    _LABEL_TO_ID = {label: i for i, label in enumerate(LABELS)}

    def __init__(self, data_path):
        data = pd.read_csv(data_path, sep='\t', names=['text', 'label'])

        # Column-wise extraction replaces the per-row `iloc` loop.
        self.texts = data['text'].tolist()
        self.labels = data['label'].tolist()

    def labels2id(self, label):
        """Return the integer id of a single NER tag.

        Raises:
            Exception: If ``label`` is not one of ``LABELS``.
        """
        try:
            return self._LABEL_TO_ID[label]
        except (KeyError, TypeError) as err:
            raise Exception('Not in NER labels') from err

    def __getitem__(self, index):
        """Return the ``index``-th sample as a ``{'text', 'label'}`` dict."""
        return {'text': self.texts[index], 'label': self.labels[index]}

    def __len__(self):
        """Return the number of samples."""
        # The original read `self.contexts`, which is never defined.
        return len(self.texts)

In [16]:
# Build the dataset from the same TSV path used above.
ner_dataset = SummaryDataset(data_path)

In [17]:
ner_dataset[0]

{'text': '나아가 한 스트로크를 하는 제한시간에 대한 지침도 있다 .',
 'label': 'O NUM-B NUM-I O O O O O O'}

In [18]:
class NERCollator:
    """Collate ``{'text', 'label'}`` samples into padded token-classification tensors.

    The text and label strings are split on whitespace, so each word must have
    exactly one tag. Words are tokenized with ``is_split_into_words=True`` and
    the tag id is placed on the first sub-token of every word; special tokens,
    padding and continuation sub-tokens receive ``label_pad_id``.

    This replaces a seq2seq collator that referenced attributes and helpers
    that were never defined and therefore could not run.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``input_ids``,
            ``attention_mask`` and ``word_ids(batch_index=...)``.
            NOTE(review): in transformers this requires a "fast" tokenizer.
        mapping: Optional ``{tag: id}`` dict. Defaults to the position of each
            tag in ``DEFAULT_LABELS``.
            NOTE(review): confirm the default order against the model's
            ``config.label2id`` before training.
        max_length: Truncation length passed to the tokenizer.
        label_pad_id: Label value for positions that should not be scored.
    """

    DEFAULT_LABELS = ("O",
                      "PER-B", "PER-I", "FLD-B", "FLD-I", "AFW-B", "AFW-I", "ORG-B", "ORG-I",
                      "LOC-B", "LOC-I", "CVL-B", "CVL-I", "DAT-B", "DAT-I", "TIM-B", "TIM-I",
                      "NUM-B", "NUM-I", "EVT-B", "EVT-I", "ANM-B", "ANM-I", "PLT-B", "PLT-I",
                      "MAT-B", "MAT-I", "TRM-B", "TRM-I")

    def __init__(self, tokenizer, mapping=None, max_length=512, label_pad_id=-100):
        self.tokenizer = tokenizer
        self.text = 'text'
        self.label = 'label'
        if mapping is None:
            mapping = {tag: idx for idx, tag in enumerate(self.DEFAULT_LABELS)}
        self.mapping = mapping
        self.max_length = max_length
        self.label_pad_id = label_pad_id

    def _tag_to_id(self, tag):
        """Map one tag string to its id, failing loudly on unknown tags."""
        try:
            return self.mapping[tag]
        except KeyError as err:
            raise ValueError("Unknown NER label: {!r}".format(tag)) from err

    def __call__(self, batch):
        """Turn a list of samples into ``input_ids`` / ``attention_mask`` / ``labels``.

        Raises:
            Exception: If the first sample lacks the text or label key.
            ValueError: If a sample's word and tag counts differ, or a tag is
                not in the mapping.
        """
        if self.text not in batch[0] or self.label not in batch[0]:
            raise Exception("Error: Undefined data keys")

        words_batch = [item[self.text].split() for item in batch]
        tags_batch = [item[self.label].split() for item in batch]
        for words, tags in zip(words_batch, tags_batch):
            if len(words) != len(tags):
                raise ValueError(
                    "Word/label count mismatch: {} words vs {} labels".format(len(words), len(tags))
                )

        encoded = self.tokenizer(words_batch,
                                 is_split_into_words=True,
                                 padding='longest',
                                 max_length=self.max_length,
                                 truncation=True,
                                 return_tensors='pt')

        labels = []
        for row, tags in enumerate(tags_batch):
            row_labels = []
            previous_word = None
            # word_ids covers only the tokens that survived truncation, so the
            # labels always have the same length as the padded input row.
            for word_id in encoded.word_ids(batch_index=row):
                if word_id is None or word_id == previous_word:
                    row_labels.append(self.label_pad_id)
                else:
                    row_labels.append(self._tag_to_id(tags[word_id]))
                previous_word = word_id
            labels.append(row_labels)

        return {'input_ids': encoded.input_ids,
                'attention_mask': encoded.attention_mask,
                'labels': torch.tensor(labels, dtype=torch.long)}

# IPU model inference

In [1]:
import os, sys
# Move the working directory up one level so the imports and relative paths
# used below resolve from there.
# NOTE(review): a relative chdir is not idempotent; re-running this cell moves
# up one more directory each time.
os.chdir('../')

In [2]:
from pipeline_electra import PipelinedElectraForTokenClassification
from easydict import EasyDict
import yaml

In [3]:
config_file = 'finetune/squad_configurations.yaml'
# NOTE(review): yaml.Loader can construct arbitrary Python objects. Switch to
# yaml.safe_load if this file is ever not fully trusted.
# The context manager closes the file handle, which the original left open.
with open(config_file) as config_fh:
    config = EasyDict(yaml.load(config_fh, Loader=yaml.Loader))


In [4]:
# Copy three settings out of `config.train_config` under shorter key names;
# this dict is what gets passed to the pipelined model loader.
train_ipu_config = {
    "layers_per_ipu": config.train_config.train_layers_per_ipu,
    "recompute_checkpoint_every_layer": config.train_config.train_recompute_checkpoint_every_layer,
    "embedding_serialization_factor": config.train_config.train_embedding_serialization_factor
}

In [5]:
# Load the same checkpoint name through the pipelined wrapper.
# NOTE(review): this rebinds `model`, which earlier in the notebook refers to
# the ElectraForTokenClassification instance; cells that use `model` behave
# differently depending on which of the two assignments ran last.
model = PipelinedElectraForTokenClassification.from_pretrained_transformers("monologg/koelectra-small-finetuned-naver-ner", train_ipu_config)

In [6]:
import poptorch
from finetune.run_squad_ipu import ipu_options

In [7]:
# Validation batch settings. The two derived values are plain products of the
# settings above them, so with every factor set to 1 both evaluate to 1.
valid_micro_batch_size = 1
valid_replication_factor = 1
valid_global_batch_size = valid_micro_batch_size * valid_replication_factor
valid_device_iterations = 1
valid_samples_per_iteration = valid_global_batch_size * valid_device_iterations

In [8]:
valid_samples_per_iteration

1

In [9]:
# Build the options object for inference (train_option=False).
# NOTE(review): the first positional argument is hard-coded to 1; its meaning
# is defined by `ipu_options` in finetune.run_squad_ipu — confirm there.
val_opts = ipu_options(1, valid_replication_factor, valid_device_iterations, train_option=False)

In [10]:
# Wrap the pipelined model for inference using the validation options.
inference_model = poptorch.inferenceModel(model, val_opts)

In [19]:
token

{'input_ids': tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
           6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
           5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
           6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
           5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
          10822,     3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]])}

In [1]:
#input_ids = torch.vstack((token['input_ids'],token['input_ids']))

In [2]:
#token_type_ids = torch.vstack((token['token_type_ids'],token['token_type_ids']))

In [147]:
# Run the wrapped model, passing the two input tensors positionally
# (input ids first, then token type ids).
entities = inference_model(token['input_ids'],token['token_type_ids'])

In [148]:
# Take the first element of the model output and convert it to NumPy.
# NOTE(review): `entities` is overwritten with a value derived from itself, so
# this cell is not safe to re-run on its own without re-running the one above.
entities = entities[0].cpu().numpy()
# Ids of the single encoded sentence, used to recover token strings.
input_ids = tokens["input_ids"].numpy()[0]

In [156]:
# Softmax over the last axis. Subtracting the per-row maximum first leaves the
# result mathematically unchanged but keeps np.exp from overflowing.
exp_shifted = np.exp(entities - entities.max(axis=-1, keepdims=True))
score = exp_shifted / exp_shifted.sum(-1, keepdims=True)
labels_idx = score.argmax(axis=-1)

token_level_answer = []
for idx, label_idx in enumerate(labels_idx):
    # NOTE Append every answer even though the `entity` is in `ignore_labels`
    token_level_answer += [
        {
            "word": tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
            "score": score[idx][label_idx].item(),
            "entity": model.config.id2label[label_idx],
        }
    ]

In [162]:
# Entity tags to leave out of the word-level result.
ignore_labels=["O"]
# When True, the first and last positions are dropped before aggregating.
ignore_special_tokens=True

In [164]:
# [FIX] Now let's change it to word-level NER
word_idx = 0
word_level_answer = []

# NOTE: Might not be safe. BERT, ELECTRA etc. won't make issues.
if ignore_special_tokens:
    words = words[1:-1]
    tokens_mask = tokens_mask[1:-1]
    token_level_answer = token_level_answer[1:-1]

for mask, ans in zip(tokens_mask, token_level_answer):
    if mask == 1:
        ans["word"] = words[word_idx]
        word_idx += 1
        if ans["entity"] not in ignore_labels:
            word_level_answer.append(ans)

In [165]:
word_level_answer

[{'word': '2009년', 'score': 0.9996205568313599, 'entity': 'DAT-B'},
 {'word': '7월', 'score': 0.9350952506065369, 'entity': 'DAT-I'},
 {'word': 'FC서울을', 'score': 0.9994584321975708, 'entity': 'ORG-B'},
 {'word': '잉글랜드', 'score': 0.9983227849006653, 'entity': 'LOC-B'},
 {'word': '프리미어리그', 'score': 0.9989780187606812, 'entity': 'ORG-B'},
 {'word': '볼턴', 'score': 0.9300729632377625, 'entity': 'ORG-B'},
 {'word': '원더러스로', 'score': 0.999349057674408, 'entity': 'ORG-I'},
 {'word': '이청용은', 'score': 0.9994910359382629, 'entity': 'PER-B'},
 {'word': '크리스탈', 'score': 0.9994640350341797, 'entity': 'ORG-B'},
 {'word': '팰리스와', 'score': 0.9991778135299683, 'entity': 'ORG-I'},
 {'word': '독일', 'score': 0.9977171421051025, 'entity': 'LOC-B'},
 {'word': '분데스리가2', 'score': 0.9814788699150085, 'entity': 'ORG-B'},
 {'word': 'VfL', 'score': 0.8719519376754761, 'entity': 'ORG-B'},
 {'word': '보훔을', 'score': 0.9938763380050659, 'entity': 'ORG-I'},
 {'word': '지난', 'score': 0.9963383078575134, 'entity': 'DAT-B'},