In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

from transformers import ElectraTokenizerFast, ElectraForTokenClassification
from pprint import pprint

tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-small-finetuned-naver-ner")
model = ElectraForTokenClassification.from_pretrained("monologg/koelectra-small-finetuned-naver-ner")


In [3]:
import logging
from typing import Optional, Union

import torch
import numpy as np

from transformers import (
    BasicTokenizer,
    PreTrainedTokenizer,
    Pipeline
)


logger = logging.getLogger(__name__)


def custom_encode_plus(sentence,
                       tokenizer,
                       return_tensors=None):
    # {'input_ids': [2, 10841, 10966, 10832, 10541, 21509, 27660, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]}
    words = sentence.split()

    tokens = []
    tokens_mask = []

    for word in words:
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            word_tokens = [tokenizer.unk_token]  # For handling the bad-encoded word
        tokens.extend(word_tokens)
        tokens_mask.extend([1] + [0] * (len(word_tokens) - 1))

    ids = tokenizer.convert_tokens_to_ids(tokens)
    len_ids = len(ids)
    total_len = len_ids + tokenizer.num_special_tokens_to_add()
    if tokenizer.model_max_length and total_len > tokenizer.model_max_length:
        ids, _, _ = tokenizer.truncate_sequences(
            ids,
            pair_ids=None,
            num_tokens_to_remove=total_len - tokenizer.model_max_length,
            truncation_strategy="longest_first",
            stride=0,
        )

    sequence = tokenizer.build_inputs_with_special_tokens(ids)
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids)
    # HARD-CODED: As I know, most of the transformers architecture will be `[CLS] + text + [SEP]``
    #             Only way to safely cover all the cases is to integrate `token mask builder` in internal library.
    tokens_mask = [1] + tokens_mask + [1]
    words = [tokenizer.cls_token] + words + [tokenizer.sep_token]

    encoded_inputs = {}
    encoded_inputs["input_ids"] = sequence
    encoded_inputs["token_type_ids"] = token_type_ids

    encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])

    if "token_type_ids" in encoded_inputs:
        encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])

    if "attention_mask" in encoded_inputs:
        encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])

    elif return_tensors is not None:
        logger.warning(
            "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
                return_tensors
            )
        )

    return encoded_inputs, words, tokens_mask


In [4]:
sentence = "2009년 7월 FC서울을 떠나 잉글랜드 프리미어리그 볼턴 원더러스로 이적한 이청용은 크리스탈 팰리스와 독일 분데스리가2 VfL 보훔을 거쳐 지난 3월 K리그로 컴백했다. 행선지는 서울이 아닌 울산이었다"

In [5]:
tokens, words, tokens_mask = custom_encode_plus(sentence, tokenizer)

In [6]:
tokens

{'input_ids': tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
           6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
           5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
           6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
           5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
          10822,     3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]])}

In [7]:
tokens

{'input_ids': tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
           6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
           5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
           6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
           5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
          10822,     3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]])}

In [8]:
tokens = tokenizer.encode_plus(sentence,
    padding='longest',
    max_length=512,
    truncation=True, 
    return_tensors='pt')

In [9]:
entities = model(**tokens)

In [10]:
result = entities[0][0].detach().numpy()

In [11]:
score = np.exp(result) / np.exp(result).sum(-1, keepdims=True)
labels_idx = score.argmax(axis=-1)

In [12]:
token_level_answer = []
input_ids = tokens["input_ids"].numpy()[0]

for idx, label_idx in enumerate(labels_idx):
    # NOTE Append every answer even though the `entity` is in `ignore_labels`
    token_level_answer += [
        {
            "word": tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
            "score": score[idx][label_idx].item(),
            "entity": model.config.id2label[label_idx],
        }
    ]

In [13]:
import pandas as pd

In [14]:
data_path = "../data/naver-ner/test.tsv"

In [15]:
data = pd.read_csv(data_path, sep='\t', names = ['text', 'label'])

In [16]:
from torch.utils.data import Dataset
from dataclasses import dataclass
from torch.utils.data import Dataset, DataLoader, RandomSampler, SubsetRandomSampler

In [17]:
class NERDataset(Dataset):
    def __init__(self, data_path):
        data = pd.read_csv(data_path, sep='\t', names = ['text', 'label'])
        
        self.texts = []
        self.labels = []
        for i in range(len(data)):
            self.texts.append(data.iloc[i].text)
            self.labels = self.labels2id(data.iloc[i].label.split())

                
    def labels2id(self, labels):
        labels_lst = ["O",
                "PER-B", "PER-I", "FLD-B", "FLD-I", "AFW-B", "AFW-I", "ORG-B", "ORG-I",
                "LOC-B", "LOC-I", "CVL-B", "CVL-I", "DAT-B", "DAT-I", "TIM-B", "TIM-I",
                "NUM-B", "NUM-I", "EVT-B", "EVT-I", "ANM-B", "ANM-I", "PLT-B", "PLT-I",
                "MAT-B", "MAT-I", "TRM-B", "TRM-I"]
        labels_dict = {label:i for i, label in enumerate(labels_lst)}
        id_value=[]
        try:
            for label in labels:
                id_value.append(labels_dict[label])
                print(id_value)
        except:
            raise Exception(f'Not in NER labels : {label}')
        return id_value

    def __getitem__(self, index):
        return {'text':self.texts[index], 'label':self.labels[index]}
        
    def __len__(self):
        return len(self.contexts)

In [18]:
@dataclass
class NERCollator:
    def __init__(self, tokenizer, mapping=None):
        self.tokenizer = tokenizer
        self.text = 'text'
        self.label = 'label'    

    def __call__(self, batch):
        if self.text not in batch[0] or self.label not in batch[0]:
            raise Exception("Error: Undefined data keys")

        sentence = [item[self.text]+self.tokenizer.eos_token for item in batch]

        source_batch = self.tokenizer.batch_encode_plus(sentence,
                    padding='longest',
                    max_length=512,
                    truncation=True, 
                    return_tensors='pt')
       

            
        return {'input_ids':source_batch.input_ids,
                 'attention_mask':source_batch.attention_mask,
                 'token_type_ids':source_batch.token_type_ids,
                 'labels': labels,
                 }
    
    def custom_encode_plus(self, sentence, tokenizer, return_tensors=None):

        words = sentence.split()

        tokens = []
        tokens_mask = []

        for word in words:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [tokenizer.unk_token]  # For handling the bad-encoded word
            tokens.extend(word_tokens)
            tokens_mask.extend([1] + [0] * (len(word_tokens) - 1))

        ids = tokenizer.convert_tokens_to_ids(tokens)
        len_ids = len(ids)
        total_len = len_ids + tokenizer.num_special_tokens_to_add()
        if tokenizer.model_max_length and total_len > tokenizer.model_max_length:
            ids, _, _ = tokenizer.truncate_sequences(
                ids,
                pair_ids=None,
                num_tokens_to_remove=total_len - tokenizer.model_max_length,
                truncation_strategy="longest_first",
                stride=0,
            )

        sequence = tokenizer.build_inputs_with_special_tokens(ids)
        token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids)
        # HARD-CODED: As I know, most of the transformers architecture will be `[CLS] + text + [SEP]``
        #             Only way to safely cover all the cases is to integrate `token mask builder` in internal library.
        tokens_mask = [1] + tokens_mask + [1]
        words = [tokenizer.cls_token] + words + [tokenizer.sep_token]

        encoded_inputs = {}
        encoded_inputs["input_ids"] = sequence
        encoded_inputs["token_type_ids"] = token_type_ids


        encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])

        if "token_type_ids" in encoded_inputs:
            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])

        if "attention_mask" in encoded_inputs:
            encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])

        elif return_tensors is not None:
            logger.warning(
                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
                    return_tensors
                )
            )

        return encoded_inputs, words, tokens_mask

In [19]:
def labels2id(labels):
    labels_lst = ["O",
            "PER-B", "PER-I", "FLD-B", "FLD-I", "AFW-B", "AFW-I", "ORG-B", "ORG-I",
            "LOC-B", "LOC-I", "CVL-B", "CVL-I", "DAT-B", "DAT-I", "TIM-B", "TIM-I",
            "NUM-B", "NUM-I", "EVT-B", "EVT-I", "ANM-B", "ANM-I", "PLT-B", "PLT-I",
            "MAT-B", "MAT-I", "TRM-B", "TRM-I"]
    labels_dict = {label:i for i, label in enumerate(labels_lst)}
    id_value=[]
    try:
        for label in labels:
            id_value.append(labels_dict[label])
    except:
        raise Exception(f'Not in NER labels : {label}')
    return id_value

In [20]:
data = pd.read_csv(data_path, sep='\t', names = ['text', 'label'])

texts = []
labels = []
for i in range(len(data)):
    texts.append(data.iloc[i].text)
    labels.append(labels2id(data.iloc[i].label.split()))

In [21]:
tokens, words, tokens_mask = custom_encode_plus(sentence, tokenizer)

In [22]:
len(tokens['input_ids'][0])

52

In [24]:
#labels

In [25]:
# from transformers import ElectraTokenizer, ElectraForTokenClassification
# import torch

# tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
# model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1

# outputs = model(**inputs, labels=labels)
# loss, scores = outputs[:2]

# IPU model inference

In [26]:
import os, sys
os.chdir('../')

In [27]:
from pipeline_electra import PipelinedElectraForTokenClassification
from easydict import EasyDict
import yaml

In [28]:
config_file = 'finetune/squad_configurations.yaml'
config = EasyDict(yaml.load(open(config_file).read(), Loader=yaml.Loader))


In [29]:
train_ipu_config = {
    "layers_per_ipu": config.train_config.train_layers_per_ipu,
    "recompute_checkpoint_every_layer": config.train_config.train_recompute_checkpoint_every_layer,
    "embedding_serialization_factor": config.train_config.train_embedding_serialization_factor
}

In [30]:
model = PipelinedElectraForTokenClassification.from_pretrained_transformers("monologg/koelectra-small-finetuned-naver-ner", train_ipu_config)

In [251]:
import poptorch
from finetune.run_squad_ipu import ipu_options

In [252]:
valid_micro_batch_size = 1
valid_replication_factor = 1
valid_global_batch_size = valid_micro_batch_size * valid_replication_factor
valid_device_iterations = 1
valid_samples_per_iteration = valid_global_batch_size * valid_device_iterations

In [253]:
valid_samples_per_iteration

1

In [254]:
val_opts = ipu_options(1, valid_replication_factor, valid_device_iterations, train_option=False)

In [255]:
inference_model = poptorch.inferenceModel(model, val_opts)

In [256]:
#token = tokenizer.encode_plus(sentence, return_tensors='pt', return_offsets_mapping=True)

In [257]:
word_list = sentence.split()
split_tokens = tokenizer.encode_plus(word_list, is_split_into_words=True, return_tensors='pt', return_offsets_mapping=True)

In [258]:
word_offset_mapping = split_tokens['offset_mapping']

In [259]:
#input_ids = torch.vstack((token['input_ids'],token['input_ids']))

In [260]:
#token_type_ids = torch.vstack((token['token_type_ids'],token['token_type_ids']))

In [261]:
split_tokens['input_ids']

tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
          6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
          5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
          6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
          5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
         10822,     3]])

In [262]:
entities = inference_model(split_tokens['input_ids'], split_tokens['token_type_ids'])#, split_tokens['attention_mask'])

Graph compilation: 100%|██████████| 100/100 [00:02<00:00]


In [263]:
ignore_labels=["O"]
ignore_special_tokens=True

word_offset_mapping = word_offset_mapping[0].cpu().numpy()
entities = entities[0].cpu().numpy()
input_ids = split_tokens["input_ids"].numpy()[0]

In [264]:
word_offset_mapping

array([[0, 0],
       [0, 5],
       [0, 2],
       [0, 2],
       [2, 4],
       [4, 5],
       [0, 2],
       [0, 4],
       [0, 6],
       [0, 1],
       [1, 2],
       [0, 1],
       [1, 3],
       [3, 5],
       [0, 2],
       [2, 3],
       [0, 1],
       [1, 2],
       [2, 4],
       [0, 4],
       [0, 1],
       [1, 3],
       [3, 4],
       [0, 2],
       [0, 1],
       [1, 3],
       [3, 5],
       [5, 6],
       [0, 1],
       [1, 2],
       [2, 3],
       [0, 1],
       [1, 2],
       [2, 3],
       [0, 2],
       [0, 2],
       [0, 2],
       [0, 3],
       [3, 4],
       [0, 1],
       [1, 2],
       [2, 4],
       [4, 5],
       [0, 1],
       [1, 2],
       [2, 4],
       [0, 2],
       [2, 3],
       [0, 2],
       [0, 2],
       [2, 5],
       [0, 0]])

In [285]:
score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
labels_idx = score.argmax(axis=-1)

token_level_answer = []
for idx, label_idx in enumerate(labels_idx):
    if model.config.id2label[label_idx] in ignore_labels:
        token_answer = []
        continue
    
    elif word_offset_mapping[idx][0] == 0:
        if token_answer:
            token_answer["word"] = tokenizer.decode(token_answer["word"])
            token_level_answer.append(token_answer)
        token_answer = {
                "word": [int(input_ids[idx])],
                "score": score[idx][label_idx].item(),
                "entity": model.config.id2label[label_idx],
        }
        
        
    else:
        token_answer['word'].append(int(input_ids[idx]))

In [286]:
token_level_answer

[{'word': '2009년', 'score': 0.9996205568313599, 'entity': 'DAT-B'},
 {'word': '7월', 'score': 0.9350952506065369, 'entity': 'DAT-I'},
 {'word': '잉글랜드', 'score': 0.9983227849006653, 'entity': 'LOC-B'},
 {'word': '프리미어리그', 'score': 0.9989780187606812, 'entity': 'ORG-B'},
 {'word': '볼턴', 'score': 0.9300729632377625, 'entity': 'ORG-B'},
 {'word': '이청용은', 'score': 0.9994910359382629, 'entity': 'PER-B'},
 {'word': '크리스탈', 'score': 0.9994640350341797, 'entity': 'ORG-B'},
 {'word': '독일', 'score': 0.9977171421051025, 'entity': 'LOC-B'},
 {'word': '분데스리가2', 'score': 0.9814788699150085, 'entity': 'ORG-B'},
 {'word': 'VfL', 'score': 0.8719519376754761, 'entity': 'ORG-B'},
 {'word': '지난', 'score': 0.9963383078575134, 'entity': 'DAT-B'},
 {'word': '3월', 'score': 0.9909418225288391, 'entity': 'DAT-I'}]

In [287]:
tokens = []
for i, word_offset in enumerate(word_offset_mapping):
    if (word_offset[1] - word_offset[0]) == 0:
        token = []
        continue
    elif word_offset[0] == 0:
        if token:
            tokens.append(token)
        token = []
        token.append(split_tokens['input_ids'][0][i])
    else:
        token.append(split_tokens['input_ids'][0][i])


In [294]:
word_list

['2009년',
 '7월',
 'FC서울을',
 '떠나',
 '잉글랜드',
 '프리미어리그',
 '볼턴',
 '원더러스로',
 '이적한',
 '이청용은',
 '크리스탈',
 '팰리스와',
 '독일',
 '분데스리가2',
 'VfL',
 '보훔을',
 '거쳐',
 '지난',
 '3월',
 'K리그로',
 '컴백했다.',
 '행선지는',
 '서울이',
 '아닌',
 '울산이었다']

In [293]:
word_offset_mapping

array([[0, 0],
       [0, 5],
       [0, 2],
       [0, 2],
       [2, 4],
       [4, 5],
       [0, 2],
       [0, 4],
       [0, 6],
       [0, 1],
       [1, 2],
       [0, 1],
       [1, 3],
       [3, 5],
       [0, 2],
       [2, 3],
       [0, 1],
       [1, 2],
       [2, 4],
       [0, 4],
       [0, 1],
       [1, 3],
       [3, 4],
       [0, 2],
       [0, 1],
       [1, 3],
       [3, 5],
       [5, 6],
       [0, 1],
       [1, 2],
       [2, 3],
       [0, 1],
       [1, 2],
       [2, 3],
       [0, 2],
       [0, 2],
       [0, 2],
       [0, 3],
       [3, 4],
       [0, 1],
       [1, 2],
       [2, 4],
       [4, 5],
       [0, 1],
       [1, 2],
       [2, 4],
       [0, 2],
       [2, 3],
       [0, 2],
       [0, 2],
       [2, 5],
       [0, 0]])

In [164]:
tokenizer.decode(tokens[0])

'2009년'

In [291]:
for token in tokens:
    print(tokenizer.decode(token))

2009년
7월
FC서울을
떠나
잉글랜드
프리미어리그
볼턴
원더러스로
이적한
이청용은
크리스탈
팰리스와
독일
분데스리가2
VfL
보훔을
거쳐
지난
3월
K리그로
컴백했다.
행선지는
서울이
아닌


In [73]:
#cur_offset_mapping = offset_mapping[0]
for i, offset in enumerate(cur_offset_mapping[1:-1]):
    #print(offset)
    if (cur_offset_mapping[i+1][0] - offset[1]) > 0:
        continue
    else:
        print(token_level_answer[i])

In [145]:
token_level_answer

[{'word': '7월', 'score': 0.9350952506065369, 'entity': 'DAT-I'},
 {'word': 'FC', 'score': 0.9994584321975708, 'entity': 'ORG-B'},
 {'word': '##서울', 'score': 0.9809041023254395, 'entity': 'ORG-I'},
 {'word': '##을', 'score': 0.9081450700759888, 'entity': 'O'},
 {'word': '떠나', 'score': 0.9999575614929199, 'entity': 'O'},
 {'word': '잉글랜드', 'score': 0.9983227849006653, 'entity': 'LOC-B'},
 {'word': '프리미어리그', 'score': 0.9989780187606812, 'entity': 'ORG-B'},
 {'word': '볼', 'score': 0.9300729632377625, 'entity': 'ORG-B'},
 {'word': '##턴', 'score': 0.9710206389427185, 'entity': 'ORG-I'},
 {'word': '원', 'score': 0.999349057674408, 'entity': 'ORG-I'},
 {'word': '##더러', 'score': 0.9925868511199951, 'entity': 'ORG-I'},
 {'word': '##스로', 'score': 0.9959636926651001, 'entity': 'ORG-I'},
 {'word': '이적', 'score': 0.9999213814735413, 'entity': 'O'},
 {'word': '##한', 'score': 0.9999334216117859, 'entity': 'O'},
 {'word': '이', 'score': 0.9994910359382629, 'entity': 'PER-B'},
 {'word': '##청', 'score': 0.70

In [171]:
ignore_labels=["O"]
ignore_special_tokens=True

In [172]:
# [FIX] Now let's change it to word-level NER
word_idx = 0
word_level_answer = []

# NOTE: Might not be safe. BERT, ELECTRA etc. won't make issues.
if ignore_special_tokens:
    words = words[1:-1]
    tokens_mask = tokens_mask[1:-1]
    token_level_answer = token_level_answer[1:-1]

for mask, ans in zip(tokens_mask, token_level_answer):
    if mask == 1:
        ans["word"] = words[word_idx]
        word_idx += 1
        if ans["entity"] not in ignore_labels:
            word_level_answer.append(ans)

In [202]:
words

['[CLS]',
 '2009년',
 '7월',
 'FC서울을',
 '떠나',
 '잉글랜드',
 '프리미어리그',
 '볼턴',
 '원더러스로',
 '이적한',
 '이청용은',
 '크리스탈',
 '팰리스와',
 '독일',
 '분데스리가2',
 'VfL',
 '보훔을',
 '거쳐',
 '지난',
 '3월',
 'K리그로',
 '컴백했다.',
 '행선지는',
 '서울이',
 '아닌',
 '울산이었다',
 '[SEP]']

In [199]:
token_level_answer

[{'word': '2009년', 'score': 0.9996205568313599, 'entity': 'DAT-B'},
 {'word': '7월', 'score': 0.9350952506065369, 'entity': 'DAT-I'},
 {'word': 'FC', 'score': 0.9994584321975708, 'entity': 'ORG-B'},
 {'word': '##서울', 'score': 0.9809041023254395, 'entity': 'ORG-I'},
 {'word': '잉글랜드', 'score': 0.9983227849006653, 'entity': 'LOC-B'},
 {'word': '프리미어리그', 'score': 0.9989780187606812, 'entity': 'ORG-B'},
 {'word': '볼', 'score': 0.9300729632377625, 'entity': 'ORG-B'},
 {'word': '##턴', 'score': 0.9710206389427185, 'entity': 'ORG-I'},
 {'word': '원', 'score': 0.999349057674408, 'entity': 'ORG-I'},
 {'word': '##더러', 'score': 0.9925868511199951, 'entity': 'ORG-I'},
 {'word': '##스로', 'score': 0.9959636926651001, 'entity': 'ORG-I'},
 {'word': '이', 'score': 0.9994910359382629, 'entity': 'PER-B'},
 {'word': '##청', 'score': 0.7056900262832642, 'entity': 'PER-B'},
 {'word': '##용은', 'score': 0.5724880695343018, 'entity': 'PER-B'},
 {'word': '크리스탈', 'score': 0.9994640350341797, 'entity': 'ORG-B'},
 {'word'