In [13]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [14]:

from transformers import ElectraTokenizerFast, ElectraForTokenClassification
from pprint import pprint

# Load the fast tokenizer and the token-classification model from the same checkpoint name.
checkpoint_name = "monologg/koelectra-small-finetuned-naver-ner"
tokenizer = ElectraTokenizerFast.from_pretrained(checkpoint_name)
model = ElectraForTokenClassification.from_pretrained(checkpoint_name)


In [15]:

import logging
from typing import Optional, Union

import torch
import numpy as np

from transformers import (
    BasicTokenizer,
    PreTrainedTokenizer,
    Pipeline
)


# Module-level logger; custom_encode_plus uses it to emit warnings.
logger = logging.getLogger(__name__)


def custom_encode_plus(sentence,
                       tokenizer,
                       return_tensors=None):
    """Encode a sentence word by word and record which sub-tokens start a word.

    The sentence is split on whitespace and every word is tokenized on its own,
    so a per-token mask can flag the first sub-token of each word (1) versus
    continuation sub-tokens (0).

    Args:
        sentence: Raw input text.
        tokenizer: Tokenizer object exposing ``tokenize``,
            ``convert_tokens_to_ids``, ``num_special_tokens_to_add``,
            ``truncate_sequences``, ``build_inputs_with_special_tokens``,
            ``create_token_type_ids_from_sequences`` and the ``unk_token``,
            ``cls_token``, ``sep_token`` and ``model_max_length`` attributes.
        return_tensors: ``None`` or ``"pt"``. The output is always converted to
            PyTorch tensors; any other value only triggers a warning.

    Returns:
        A tuple ``(encoded_inputs, words, tokens_mask)``:

        * ``encoded_inputs``: dict with ``"input_ids"`` and
          ``"token_type_ids"``, each a ``torch.Tensor`` with a leading batch
          dimension of 1.
        * ``words``: the whitespace-split words wrapped in the tokenizer's
          CLS/SEP token strings.
        * ``tokens_mask``: list of 0/1 flags with one entry per id the function
          tracks (special tokens are flagged 1).

        When the sequence is truncated, ``tokens_mask`` and ``words`` are
        trimmed as well so they stay aligned with ``input_ids``.
    """
    # Example of the un-tensorized encoding:
    # {'input_ids': [2, 10841, 10966, 10832, 10541, 21509, 27660, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]}
    words = sentence.split()

    tokens = []
    tokens_mask = []

    for word in words:
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            word_tokens = [tokenizer.unk_token]  # For handling the bad-encoded word
        tokens.extend(word_tokens)
        # First sub-token of the word is flagged 1, the remaining ones 0.
        tokens_mask.extend([1] + [0] * (len(word_tokens) - 1))

    ids = tokenizer.convert_tokens_to_ids(tokens)
    total_len = len(ids) + tokenizer.num_special_tokens_to_add()
    if tokenizer.model_max_length and total_len > tokenizer.model_max_length:
        ids, _, _ = tokenizer.truncate_sequences(
            ids,
            pair_ids=None,
            num_tokens_to_remove=total_len - tokenizer.model_max_length,
            truncation_strategy="longest_first",
            stride=0,
        )
        # Keep the mask and the word list in step with the truncated ids;
        # otherwise len(tokens_mask) no longer matches the model input length.
        kept = len(ids)
        if getattr(tokenizer, "truncation_side", "right") == "left":
            tokens_mask = tokens_mask[len(tokens_mask) - kept:]
            # Only words whose first sub-token survived keep an entry.
            words = words[len(words) - sum(tokens_mask):]
        else:
            tokens_mask = tokens_mask[:kept]
            words = words[:sum(tokens_mask)]

    sequence = tokenizer.build_inputs_with_special_tokens(ids)
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids)
    # HARD-CODED: As I know, most of the transformers architecture will be `[CLS] + text + [SEP]``
    #             Only way to safely cover all the cases is to integrate `token mask builder` in internal library.
    tokens_mask = [1] + tokens_mask + [1]
    words = [tokenizer.cls_token] + words + [tokenizer.sep_token]

    # Wrap each sequence in an outer list to add the batch dimension.
    encoded_inputs = {
        "input_ids": torch.tensor([sequence]),
        "token_type_ids": torch.tensor([token_type_ids]),
    }

    # Tensors are always PyTorch; warn only when a different format was requested.
    if return_tensors not in (None, "pt"):
        logger.warning(
            "Unable to convert output to tensors format {}, only PyTorch ('pt') tensors are supported; "
            "returning PyTorch tensors.".format(return_tensors)
        )

    return encoded_inputs, words, tokens_mask


In [16]:
# Example Korean sentence used as the input text for custom_encode_plus.
sentence = "2009년 7월 FC서울을 떠나 잉글랜드 프리미어리그 볼턴 원더러스로 이적한 이청용은 크리스탈 팰리스와 독일 분데스리가2 VfL 보훔을 거쳐 지난 3월 K리그로 컴백했다. 행선지는 서울이 아닌 울산이었다"

In [17]:
# Encode the example sentence: `tokens` is the dict of model inputs, `words` the
# word list with CLS/SEP added, `tokens_mask` the word-start flags.
tokens, words, tokens_mask = custom_encode_plus(sentence, tokenizer)

In [18]:
# Rebuild the inputs dict, passing every tensor through `Tensor.to()`.
# NOTE(review): no device or dtype is given, so this presumably leaves the tensors
# where they are; pass an explicit device if one is intended.
token = dict((key, value.to()) for key, value in tokens.items())

In [7]:
# Inference only: disable gradient tracking so no autograd graph is built for
# a forward pass whose output is detached in the next cell anyway.
with torch.no_grad():
    entities = model(**token)

In [8]:
# Take the first element of the model output, then its first batch item, detach it
# from autograd and convert it to a NumPy array.
# NOTE(review): presumably `entities[0]` holds the per-token classification logits — confirm.
result = entities[0][0].detach().numpy()

In [9]:
# Softmax over the last axis. Subtracting the per-row maximum before exponentiating
# leaves the result mathematically unchanged but keeps np.exp from overflowing on
# large logits; the exponentials are also computed only once.
score = np.exp(result - result.max(axis=-1, keepdims=True))
score /= score.sum(-1, keepdims=True)
# Index of the highest-probability label for every position.
labels_idx = score.argmax(axis=-1)

In [10]:
# Build one prediction record (token text, probability, label name) for every
# position in `labels_idx`.
input_ids = tokens["input_ids"].numpy()[0]
token_level_answer = []

for position, predicted in enumerate(labels_idx):
    # NOTE Append every answer even though the `entity` is in `ignore_labels`
    record = {
        "word": tokenizer.convert_ids_to_tokens(int(input_ids[position])),
        "score": score[position][predicted].item(),
        "entity": model.config.id2label[predicted],
    }
    token_level_answer.append(record)

# IPU model inference

In [1]:
import os, sys
# Move to the parent directory so project-relative imports and paths resolve.
# The starting directory is remembered the first time this cell runs, so running
# the cell again does not climb one more level each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [2]:
from pipeline_electra import PipelinedElectraForTokenClassification
from easydict import EasyDict
import yaml

In [3]:
config_file = 'finetune/squad_configurations.yaml'
# Read the YAML configuration inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
# NOTE(review): yaml.Loader can construct arbitrary Python objects. It is kept so the
# existing configuration loads exactly as before; prefer yaml.safe_load if the file
# only contains plain mappings, lists and scalars.
with open(config_file) as config_stream:
    config = EasyDict(yaml.load(config_stream.read(), Loader=yaml.Loader))


In [4]:
# Settings read from the `train_config` section of the loaded configuration and
# handed to the pipelined model constructor.
train_ipu_config = dict(
    layers_per_ipu=config.train_config.train_layers_per_ipu,
    recompute_checkpoint_every_layer=config.train_config.train_recompute_checkpoint_every_layer,
    embedding_serialization_factor=config.train_config.train_embedding_serialization_factor,
)

In [5]:
# Build the pipelined ELECTRA token classifier from the named checkpoint.
# NOTE: this rebinds `model`, replacing the Hugging Face model loaded earlier in the notebook.
model = PipelinedElectraForTokenClassification.from_pretrained_transformers(
    "monologg/koelectra-small-finetuned-naver-ner",
    train_ipu_config,
)

In [6]:
import poptorch
from finetune.run_squad_ipu import ipu_options

In [7]:
# Base validation settings.
valid_micro_batch_size = 1
valid_replication_factor = 1
valid_device_iterations = 1

# Sizes derived from the base settings.
valid_global_batch_size = valid_replication_factor * valid_micro_batch_size
valid_samples_per_iteration = valid_device_iterations * valid_global_batch_size

In [8]:
# Display the derived number of samples per iteration.
valid_samples_per_iteration

1

In [9]:
# Build the options object for validation (train_option=False); it is passed to
# poptorch.inferenceModel afterwards.
# NOTE(review): the leading literal 1 is positional; its meaning depends on the
# signature of ipu_options in finetune.run_squad_ipu — confirm and name it.
val_opts = ipu_options(1, valid_replication_factor, valid_device_iterations, train_option=False)

In [10]:
# Wrap the pipelined model with PopTorch's inference wrapper using the validation options.
inference_model = poptorch.inferenceModel(model, val_opts)

In [19]:
# Display the encoded inputs (input_ids and token_type_ids) that will be fed to the model.
token

{'input_ids': tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
           6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
           5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
           6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
           5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
          10822,     3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]])}

In [20]:
# Stack the single encoded sequence with itself along the first axis.
# NOTE(review): this rebinds `input_ids` and the result is not what the inference
# call receives; that call reads from `token` directly.
input_ids = torch.vstack([token["input_ids"]] * 2)

In [21]:
# Stack the segment ids with themselves along the first axis.
# NOTE(review): the inference call reads from `token` directly, so this value is
# not what the model receives.
token_type_ids = torch.vstack([token["token_type_ids"]] * 2)

In [22]:
# Display `token` again; the stacked tensors were assigned to separate names, so
# this dict still holds the original single-sequence inputs.
token

{'input_ids': tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
           6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
           5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
           6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
           5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
          10822,     3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]])}

In [23]:
# Run the PopTorch inference model on the encoded sentence; arguments are passed
# positionally (input ids first, then token type ids).
entities = inference_model(token["input_ids"], token["token_type_ids"])

Graph compilation: 100%|██████████| 100/100 [00:38<00:00]
