In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

from transformers import ElectraTokenizerFast, ElectraForTokenClassification
from pprint import pprint

tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-small-finetuned-naver-ner")
model = ElectraForTokenClassification.from_pretrained("monologg/koelectra-small-finetuned-naver-ner")


In [3]:
import logging
from typing import Optional, Union

import torch
import numpy as np

from transformers import (
    BasicTokenizer,
    PreTrainedTokenizer,
    Pipeline
)


logger = logging.getLogger(__name__)


def custom_encode_plus(sentence,
                       tokenizer,
                       return_tensors=None):
    # {'input_ids': [2, 10841, 10966, 10832, 10541, 21509, 27660, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]}
    words = sentence.split()

    tokens = []
    tokens_mask = []

    for word in words:
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            word_tokens = [tokenizer.unk_token]  # For handling the bad-encoded word
        tokens.extend(word_tokens)
        tokens_mask.extend([1] + [0] * (len(word_tokens) - 1))

    ids = tokenizer.convert_tokens_to_ids(tokens)
    len_ids = len(ids)
    total_len = len_ids + tokenizer.num_special_tokens_to_add()
    if tokenizer.model_max_length and total_len > tokenizer.model_max_length:
        ids, _, _ = tokenizer.truncate_sequences(
            ids,
            pair_ids=None,
            num_tokens_to_remove=total_len - tokenizer.model_max_length,
            truncation_strategy="longest_first",
            stride=0,
        )

    sequence = tokenizer.build_inputs_with_special_tokens(ids)
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids)
    # HARD-CODED: As I know, most of the transformers architecture will be `[CLS] + text + [SEP]``
    #             Only way to safely cover all the cases is to integrate `token mask builder` in internal library.
    tokens_mask = [1] + tokens_mask + [1]
    words = [tokenizer.cls_token] + words + [tokenizer.sep_token]

    encoded_inputs = {}
    encoded_inputs["input_ids"] = sequence
    encoded_inputs["token_type_ids"] = token_type_ids

    encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])

    if "token_type_ids" in encoded_inputs:
        encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])

    if "attention_mask" in encoded_inputs:
        encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])

    elif return_tensors is not None:
        logger.warning(
            "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
                return_tensors
            )
        )

    return encoded_inputs, words, tokens_mask


In [4]:
sentence = "2009년 7월 FC서울을 떠나 잉글랜드 프리미어리그 볼턴 원더러스로 이적한 이청용은 크리스탈 팰리스와 독일 분데스리가2 VfL 보훔을 거쳐 지난 3월 K리그로 컴백했다. 행선지는 서울이 아닌 울산이었다"

In [5]:
tokens, words, tokens_mask = custom_encode_plus(sentence, tokenizer)

In [6]:
tokens

{'input_ids': tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
           6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
           5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
           6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
           5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
          10822,     3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]])}

In [7]:
tokens = tokenizer.encode_plus(sentence,
    padding='longest',
    max_length=512,
    truncation=True, 
    return_tensors='pt')

In [8]:
entities = model(**tokens)

In [9]:
result = entities[0][0].detach().numpy()

In [10]:
score = np.exp(result) / np.exp(result).sum(-1, keepdims=True)
labels_idx = score.argmax(axis=-1)

In [11]:
token_level_answer = []
input_ids = tokens["input_ids"].numpy()[0]

for idx, label_idx in enumerate(labels_idx):
    # NOTE Append every answer even though the `entity` is in `ignore_labels`
    token_level_answer += [
        {
            "word": tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
            "score": score[idx][label_idx].item(),
            "entity": model.config.id2label[label_idx],
        }
    ]

In [7]:
import pandas as pd
from torch.utils.data import Dataset
from dataclasses import dataclass
from torch.utils.data import Dataset, DataLoader, RandomSampler, SubsetRandomSampler

In [8]:
class NERDataset(Dataset):
    def __init__(self, data_path):
        data = pd.read_csv(data_path, sep='\t', names = ['text', 'label'])
        
        self.texts = []
        self.labels = []
        for i in range(len(data)):
            self.texts.append(data.iloc[i].text.split())
            self.labels.append(data.iloc[i].label)

    def __getitem__(self, index):
        return {'text':self.texts[index], 'label':self.labels[index]}
        
    def __len__(self):
        return len(self.texts)

In [9]:
@dataclass
class NERCollator:
    def __init__(self, tokenizer, sequence_length=512, mapping=None):
        self.tokenizer = tokenizer
        self.text = 'text'
        self.label = 'label'
        self.pad_token_label_id = -100

        labels_lst = ["O",
                "PER-B", "PER-I", "FLD-B", "FLD-I", "AFW-B", "AFW-I", "ORG-B", "ORG-I",
                "LOC-B", "LOC-I", "CVL-B", "CVL-I", "DAT-B", "DAT-I", "TIM-B", "TIM-I",
                "NUM-B", "NUM-I", "EVT-B", "EVT-I", "ANM-B", "ANM-I", "PLT-B", "PLT-I",
                "MAT-B", "MAT-I", "TRM-B", "TRM-I"]
        self.labels_dict = {label:i for i, label in enumerate(labels_lst)}


    def __call__(self, batch):
        if self.text not in batch[0] or self.label not in batch[0]:
            raise Exception("Error: Undefined data keys")

        sentence = [item[self.text] for item in batch]

        source_batch = self.tokenizer.batch_encode_plus(sentence,
                    padding='max_length',
                    is_split_into_words=True,
                    max_length=512,
                    truncation=True, 
                    return_offsets_mapping=True,
                    return_tensors='pt')

        labels = [[self.labels_dict[la] for la in item[self.label].split()] for item in batch]
        sequence_length = source_batch.input_ids.shape[1]
        labels = [label + [self.pad_token_label_id] * (sequence_length-len(label)) for label in labels]
        

        return {'input_ids':source_batch.input_ids,
                 'attention_mask':source_batch.attention_mask,
                 'token_type_ids':source_batch.token_type_ids,
                 'labels': torch.tensor(labels),
                 'offset_mapping': source_batch.offset_mapping,
                 'sentence': sentence
                 }


In [10]:
#dataset = NERDataset(data_path)

In [11]:
collator = NERCollator(tokenizer)

In [15]:
#dl = DataLoader(dataset, batch_size=4, collate_fn=collator)

In [None]:
batch = next(iter(dl))

In [None]:
# from transformers import ElectraTokenizer, ElectraForTokenClassification
# import torch

# tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
# model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1

# outputs = model(**inputs, labels=labels)
# loss, scores = outputs[:2]

# IPU model Training

In [12]:
!ls

Fine-tuning-Electra.ipynb  evaluate_v1_0.py	     ner_configurations.yaml
__init__.py		   exe_cache		     run_korquad_ipu.py
__pycache__		   koelectra_korquad.ipynb   run_ner_ipu.py
checkpoints		   korquad_preprocessing.py  squad_configurations.yaml
dataloader_ner.py	   naver_ner.ipynb	     squad_preprocessing.py


In [13]:
import os, sys
os.chdir('../')

In [14]:
from easydict import EasyDict
import yaml
from pipeline_electra import PipelinedElectraForTokenClassification
import poptorch
import torch

In [15]:
config_file = "finetune/ner_configurations.yaml"
config = EasyDict(yaml.load(open(config_file).read(), Loader=yaml.Loader))


In [16]:
dataset = NERDataset(config.train_data_path)
collator = NERCollator(tokenizer)

In [17]:
dl = DataLoader(dataset, batch_size=4, collate_fn=collator)

In [18]:
batch = next(iter(dl))

In [19]:
#data_path
train_ipu_config = {
    "layer_per_ipu": config.train_config.train_layers_per_ipu,
    "recompute_checkpoint_every_layer": config.train_config.train_recompute_checkpoint_every_layer,
    "embedding_serialization_factor": config.train_config.train_embedding_serialization_factor
}

train_ipu_config = EasyDict(train_ipu_config)
model = PipelinedElectraForTokenClassification.from_pretrained_transformers(config.train_config.model_name_or_path, train_ipu_config, num_labels=29)

# model.parallelize().half().train()

train_global_batch_size = config.train_config.train_global_batch_size
train_micro_batch_size = config.train_config.train_micro_batch_size
train_replication_factor = config.train_config.train_replication_factor
gradient_accumulation = int(train_global_batch_size / train_micro_batch_size / train_replication_factor)
train_device_iterations = config.train_config.train_device_iterations
train_samples_per_iteration = train_global_batch_size * train_device_iterations
num_epochs = config.train_config.num_epochs

from finetune.run_ner_ipu import ipu_options
train_opts = ipu_options(gradient_accumulation, train_replication_factor, train_device_iterations, train_option=True)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing PipelinedElectraForTokenClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing PipelinedElectraForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PipelinedElectraForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of PipelinedElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and

In [20]:
regularized_params = []
non_regularized_params = []
for param in model.parameters():
    if param.requires_grad:
        if len(param.shape) == 1:
            non_regularized_params.append(param)
        else:
            regularized_params.append(param)

params = [
    {"params": regularized_params, "weight_decay": 0.01},
    {"params": non_regularized_params, "weight_decay": 0}
]
optimizer = poptorch.optim.AdamW(params,
                                 lr=1e-4,
                                 weight_decay=0,
                                 eps=1e-6,
                                 bias_correction=True,
                                 loss_scaling=64,
                                 first_order_momentum_accum_type=torch.float16,
                                 accum_type=torch.float16)

In [21]:
#train_model = poptorch.trainingModel(model, train_opts, optimizer)

In [22]:
#model

In [23]:
nerdataset = NERDataset(config.train_data_path)

In [24]:
train_dl = poptorch.DataLoader(train_opts,
                               nerdataset,
                               batch_size=train_micro_batch_size,
                               collate_fn=collator,
                               shuffle=True,
                               drop_last=False)



In [25]:
import transformers

In [26]:
batch_one = next(iter(train_dl))

In [27]:
num_steps = num_epochs * len(train_dl)
lr_scheduler = transformers.get_scheduler("cosine", optimizer, 0.1 * num_steps, num_steps)

# Wrap the pytorch model with poptorch.trainingModel
training_model = poptorch.trainingModel(model, train_opts, optimizer)

In [28]:
# Compile model or load from executable cache
batch = next(iter(train_dl))
outputs = training_model.compile(batch["input_ids"],
                                 batch["attention_mask"],
                                 batch["token_type_ids"],
                                batch["labels"])

Error: In poptorch/source/RemoveSurplusIdentityLosses.cpp:103: 'poptorch_cpp_error': Couldn't find a loss in graph!

In [None]:
outputs

In [None]:
# Training Loop
for epoch in trange(num_epochs, desc="Epochs"):
    train_iter = tqdm(train_dl)
    for step, batch in enumerate(train_iter):
        start_step = time.perf_counter()

        # This completes a forward+backward+weight update step
        outputs = training_model(batch["input_ids"],
                                 batch["attention_mask"],
                                 batch["token_type_ids"])

        # Update the LR and update the poptorch optimizer
        lr_scheduler.step()
        training_model.setOptimizer(optimizer)
        step_length = time.perf_counter() - start_step
        step_throughput = samples_per_iteration / step_length
        loss = outputs[0].mean().item()
        train_iter.set_description(
            f"Epoch: {epoch} - "
            f"Step: {step} - "
            f"Loss: {loss:3.3f} - "
            f"Throughput: {step_throughput:3.3f} seq/s")

# Detach the model from the device once training is over so the device is free to be reused for validation
training_model.detachFromDevice()

# IPU model inference

In [67]:
import os, sys
os.chdir('../')

In [68]:
from pipeline_electra import PipelinedElectraForTokenClassification
from easydict import EasyDict
import yaml

In [134]:
config_file = 'finetune/squad_configurations.yaml'
config = EasyDict(yaml.load(open(config_file).read(), Loader=yaml.Loader))


In [135]:
train_ipu_config = {
    "layers_per_ipu": config.train_config.train_layers_per_ipu,
    "recompute_checkpoint_every_layer": config.train_config.train_recompute_checkpoint_every_layer,
    "embedding_serialization_factor": config.train_config.train_embedding_serialization_factor
}

In [99]:
model = PipelinedElectraForTokenClassification.from_pretrained_transformers("monologg/koelectra-small-finetuned-naver-ner", train_ipu_config, num_labels=29)

# BatchTest

In [110]:
outputs = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'], torch.tensor(batch['labels']))

In [137]:
import poptorch
from finetune.run_squad_ipu import ipu_options

In [138]:
valid_micro_batch_size = 1
valid_replication_factor = 1
valid_global_batch_size = valid_micro_batch_size * valid_replication_factor
valid_device_iterations = 1
valid_samples_per_iteration = valid_global_batch_size * valid_device_iterations

In [139]:
valid_samples_per_iteration

1

In [140]:
val_opts = ipu_options(1, valid_replication_factor, valid_device_iterations, train_option=False)

In [141]:
inference_model = poptorch.inferenceModel(model, val_opts)

In [36]:
#token = tokenizer.encode_plus(sentence, return_tensors='pt', return_offsets_mapping=True)

In [143]:
word_list = sentence.split()
split_tokens = tokenizer.encode_plus(word_list, is_split_into_words=True, return_tensors='pt', return_offsets_mapping=True)

In [144]:
word_offset_mapping = split_tokens['offset_mapping']

In [145]:
#input_ids = torch.vstack((token['input_ids'],token['input_ids']))

In [146]:
#token_type_ids = torch.vstack((token['token_type_ids'],token['token_type_ids']))

In [147]:
split_tokens['input_ids']

tensor([[    2, 11909, 11243, 16810, 15953,  5703, 12584, 17085, 30548,  2355,
          6134,  3658, 21851, 11184, 15334,  5696,  3777,  5736, 19686, 26772,
          5129, 10856,  5813, 11276,  2428, 17804, 10731,  5920,    58,  5988,
          6240,  2348,  7462,  5703, 12103, 10563, 11207, 21444,  5699,  4675,
          5821, 10542,    18,  5386,  5843, 10639, 10602,  5706, 10940, 13125,
         10822,     3]])

In [148]:
entities = inference_model(split_tokens['input_ids'], split_tokens['token_type_ids'])#, split_tokens['attention_mask'])

Graph compilation: 100%|██████████| 100/100 [00:01<00:00]


In [2]:
ignore_labels=["O"]
ignore_special_tokens=True

word_offset_mapping = word_offset_mapping[0].cpu().numpy()
entities = entities[0].cpu().numpy()
input_ids = split_tokens["input_ids"].numpy()[0]

NameError: name 'word_offset_mapping' is not defined

In [285]:
score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
labels_idx = score.argmax(axis=-1)

token_level_answer = []
for idx, label_idx in enumerate(labels_idx):
    if model.config.id2label[label_idx] in ignore_labels:
        token_answer = []
        continue
    
    elif word_offset_mapping[idx][0] == 0:
        if token_answer:
            token_answer["word"] = tokenizer.decode(token_answer["word"])
            token_level_answer.append(token_answer)
        token_answer = {
                "word": [int(input_ids[idx])],
                "score": score[idx][label_idx].item(),
                "entity": model.config.id2label[label_idx],
        }
        
    else:
        token_answer['word'].append(int(input_ids[idx]))

In [287]:
tokens = []
for i, word_offset in enumerate(word_offset_mapping):
    if (word_offset[1] - word_offset[0]) == 0:
        token = []
        continue
    elif word_offset[0] == 0:
        if token:
            tokens.append(token)
        token = []
        token.append(split_tokens['input_ids'][0][i])
    else:
        token.append(split_tokens['input_ids'][0][i])


In [293]:
word_offset_mapping

array([[0, 0],
       [0, 5],
       [0, 2],
       [0, 2],
       [2, 4],
       [4, 5],
       [0, 2],
       [0, 4],
       [0, 6],
       [0, 1],
       [1, 2],
       [0, 1],
       [1, 3],
       [3, 5],
       [0, 2],
       [2, 3],
       [0, 1],
       [1, 2],
       [2, 4],
       [0, 4],
       [0, 1],
       [1, 3],
       [3, 4],
       [0, 2],
       [0, 1],
       [1, 3],
       [3, 5],
       [5, 6],
       [0, 1],
       [1, 2],
       [2, 3],
       [0, 1],
       [1, 2],
       [2, 3],
       [0, 2],
       [0, 2],
       [0, 2],
       [0, 3],
       [3, 4],
       [0, 1],
       [1, 2],
       [2, 4],
       [4, 5],
       [0, 1],
       [1, 2],
       [2, 4],
       [0, 2],
       [2, 3],
       [0, 2],
       [0, 2],
       [2, 5],
       [0, 0]])

In [164]:
tokenizer.decode(tokens[0])

'2009년'

In [291]:
for token in tokens:
    print(tokenizer.decode(token))

2009년
7월
FC서울을
떠나
잉글랜드
프리미어리그
볼턴
원더러스로
이적한
이청용은
크리스탈
팰리스와
독일
분데스리가2
VfL
보훔을
거쳐
지난
3월
K리그로
컴백했다.
행선지는
서울이
아닌


In [73]:
#cur_offset_mapping = offset_mapping[0]
for i, offset in enumerate(cur_offset_mapping[1:-1]):
    #print(offset)
    if (cur_offset_mapping[i+1][0] - offset[1]) > 0:
        continue
    else:
        print(token_level_answer[i])

In [145]:
token_level_answer

[{'word': '7월', 'score': 0.9350952506065369, 'entity': 'DAT-I'},
 {'word': 'FC', 'score': 0.9994584321975708, 'entity': 'ORG-B'},
 {'word': '##서울', 'score': 0.9809041023254395, 'entity': 'ORG-I'},
 {'word': '##을', 'score': 0.9081450700759888, 'entity': 'O'},
 {'word': '떠나', 'score': 0.9999575614929199, 'entity': 'O'},
 {'word': '잉글랜드', 'score': 0.9983227849006653, 'entity': 'LOC-B'},
 {'word': '프리미어리그', 'score': 0.9989780187606812, 'entity': 'ORG-B'},
 {'word': '볼', 'score': 0.9300729632377625, 'entity': 'ORG-B'},
 {'word': '##턴', 'score': 0.9710206389427185, 'entity': 'ORG-I'},
 {'word': '원', 'score': 0.999349057674408, 'entity': 'ORG-I'},
 {'word': '##더러', 'score': 0.9925868511199951, 'entity': 'ORG-I'},
 {'word': '##스로', 'score': 0.9959636926651001, 'entity': 'ORG-I'},
 {'word': '이적', 'score': 0.9999213814735413, 'entity': 'O'},
 {'word': '##한', 'score': 0.9999334216117859, 'entity': 'O'},
 {'word': '이', 'score': 0.9994910359382629, 'entity': 'PER-B'},
 {'word': '##청', 'score': 0.70

In [171]:
ignore_labels=["O"]
ignore_special_tokens=True

In [172]:
# [FIX] Now let's change it to word-level NER
word_idx = 0
word_level_answer = []

# NOTE: Might not be safe. BERT, ELECTRA etc. won't make issues.
if ignore_special_tokens:
    words = words[1:-1]
    tokens_mask = tokens_mask[1:-1]
    token_level_answer = token_level_answer[1:-1]

for mask, ans in zip(tokens_mask, token_level_answer):
    if mask == 1:
        ans["word"] = words[word_idx]
        word_idx += 1
        if ans["entity"] not in ignore_labels:
            word_level_answer.append(ans)