In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from transformers import ElectraTokenizerFast
# Load the fast tokenizer published as "monologg/koelectra-base-v3-discriminator".
tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-base-v3-discriminator")
# Sanity check: show how a Korean sentence with explicit special tokens is split into word pieces.
tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]")


['[CLS]', '한국어', 'EL', '##EC', '##TRA', '##를', '공유', '##합니다', '.', '[SEP]']

In [5]:
# Map the word pieces shown above to their vocabulary ids.
sample_tokens = [
    '[CLS]', '한국어', 'EL', '##EC', '##TRA',
    '##를', '공유', '##합니다', '.', '[SEP]',
]
tokenizer.convert_tokens_to_ids(sample_tokens)

[2, 11229, 29173, 13352, 25541, 4110, 7824, 17788, 18, 3]

In [6]:
import os, sys

# Move up to the project root so the local modules imported further down
# (e.g. pipeline_electra.py) are found in the working directory.
# Guarded on the presence of that module so re-running this cell does not climb
# one directory higher on every execution.
if not os.path.exists('pipeline_electra.py'):
    os.chdir('../')

In [7]:
!ls

__init__.py		  configurations  ipu_configuration.py	server.py
__pycache__		  exe_cache	  ipu_electra.ipynb	test.py
bert_test.ipynb		  finetune	  ipu_train.py
checkpoints		  inference.py	  modeling_electra.py
configuration_electra.py  input.txt	  pipeline_electra.py


# KoELECTRA training

In [34]:
import yaml
from easydict import EasyDict

In [35]:
# Path (relative to the working directory) of the YAML file holding the SQuAD fine-tuning settings.
config_file = 'finetune/squad_configurations.yaml'

In [36]:
# Parse the YAML run configuration into an attribute-accessible EasyDict.
# The file is opened in a `with` block so the handle is closed as soon as parsing ends.
# NOTE(review): yaml.Loader can construct arbitrary Python objects; prefer
# yaml.safe_load if this file could ever come from an untrusted source.
with open(config_file) as config_stream:
    config = EasyDict(yaml.load(config_stream, Loader=yaml.Loader))

In [37]:
# Display the parsed configuration (train_config / valid_config sections).
config

{'tokenizer_name': 'monologg/koelectra-base-v3-discriminator',
 'train_config': {'model_name_or_path': 'monologg/koelectra-base-v3-discriminator',
  'train_global_batch_size': 128,
  'train_micro_batch_size': 2,
  'train_replication_factor': 1,
  'train_device_iterations': 1,
  'num_epochs': 3,
  'sequence_length': 384,
  'train_layers_per_ipu': [2, 3, 4, 3],
  'train_recompute_checkpoint_every_layer': True,
  'train_embedding_serialization_factor': 1,
  'saved_model_name': 'checkpoints/squad_base_4x2'},
 'valid_config': {'valid_micro_batch_size': 4,
  'valid_replication_factor': 4,
  'valid_device_iterations': 2,
  'valid_layer_per_ipu': [11, 13],
  'valid_recompute_checkpoint_every_layer': False,
  'valid_embedding_serialization_factor': 1}}

In [14]:
# Pipeline layout for training. All three values are read from the YAML
# training section so this dict cannot drift from the configured settings
# (the validation layout is built from the config in the same way).
ipu_config = {
    "layers_per_ipu": config.train_config.train_layers_per_ipu,
    "recompute_checkpoint_every_layer": config.train_config.train_recompute_checkpoint_every_layer,
    "embedding_serialization_factor": config.train_config.train_embedding_serialization_factor,
}

In [43]:
from datasets import load_dataset, load_metric
from pathlib import Path
from easydict import EasyDict 

import transformers
import popart
import poptorch
import torch

from pipeline_electra import PipelinedElectraForQuestionAnswering

from finetune.squad_preprocessing import prepare_train_features, \
                                prepare_validation_features, \
                                tokenizer, PadCollate, postprocess_qa_predictions

from finetune.run_squad_ipu import train, valid, get_optimizer, ipu_options

# Load Datasets

In [17]:
# Load the "squad_kor_v1" dataset, caching it under the user's home directory.
# NOTE(review): the cache directory name ".torch./dataset" has a dot before the slash,
# which looks like a typo for ".torch/dataset"; left unchanged so the existing cache
# keeps being reused -- confirm before renaming.
datasets = load_dataset("squad_kor_v1", cache_dir=Path.home() / ".torch./dataset")

Reusing dataset squad_kor_v1 (/root/.torch./dataset/squad_kor_v1/squad_kor_v1/1.0.0/18d4f44736b8ee85671f63cb84965bfb583fa0a4ff2df3c2e10eee9693796725)


  0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
# Turn the raw training examples into tokenized features; the original columns
# are removed so only the output of the preprocessing function is kept.
train_split = datasets["train"]
train_dataset = train_split.map(
    prepare_train_features,
    remove_columns=train_split.column_names,
    batched=True,
    num_proc=1,
    load_from_cache_file=True,
)

# Same treatment for the validation split, using the validation-specific preprocessing.
validation_split = datasets["validation"]
validation_features = validation_split.map(
    prepare_validation_features,
    remove_columns=validation_split.column_names,
    batched=True,
    num_proc=1,
    load_from_cache_file=True,
)


# Electra-base architecture hyper-parameters.
model_config = transformers.ElectraConfig(
    embedding_size=768,
    hidden_size=768,
    intermediate_size=1024 * 3,
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    layer_norm_eps=1e-12,
    vocab_size=35000,
)

# Training pipeline layout, read from the YAML training section instead of
# repeating the values as literals, so the notebook and the configuration file
# cannot disagree.
ipu_config = {
    "layers_per_ipu": config.train_config.train_layers_per_ipu,
    "recompute_checkpoint_every_layer": config.train_config.train_recompute_checkpoint_every_layer,
    "embedding_serialization_factor": config.train_config.train_embedding_serialization_factor,
}

Loading cached processed dataset at /root/.torch./dataset/squad_kor_v1/squad_kor_v1/1.0.0/18d4f44736b8ee85671f63cb84965bfb583fa0a4ff2df3c2e10eee9693796725/cache-eee16f8e2531e26c.arrow
Loading cached processed dataset at /root/.torch./dataset/squad_kor_v1/squad_kor_v1/1.0.0/18d4f44736b8ee85671f63cb84965bfb583fa0a4ff2df3c2e10eee9693796725/cache-0bccab8830f6fc91.arrow


In [19]:
# Wrap the plain dict in an EasyDict so its keys can be read as attributes.
ipu_config = EasyDict(ipu_config)
# Build the pipelined question-answering model from the KoELECTRA discriminator checkpoint,
# using the explicit Electra-base configuration defined earlier.
model = PipelinedElectraForQuestionAnswering.from_pretrained_transformers("monologg/koelectra-base-v3-discriminator", ipu_config, config=model_config)

# parallelize() prints the layer-to-IPU allocation shown below; the model is then
# converted to half precision and switched to training mode.
model.parallelize().half().train()

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing PipelinedElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing PipelinedElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PipelinedElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of PipelinedElectraForQuestionAnswering were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are new

--- Device Allocation ---
Embedding --> IPU 0
Encoder 0  --> IPU 0
Encoder 1  --> IPU 0
Encoder 2  --> IPU 1
Encoder 3  --> IPU 1
Encoder 4  --> IPU 1
Encoder 5  --> IPU 2
Encoder 6  --> IPU 2
Encoder 7  --> IPU 2
Encoder 8  --> IPU 2
Encoder 9  --> IPU 3
Encoder 10 --> IPU 3
Encoder 11 --> IPU 3
QA Outputs --> IPU 3


PipelinedElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [22]:
# Runtime configuration, read from the YAML training section so the notebook
# and the configuration file stay in sync.
# replication_factor == 1 means a single copy of the pipelined model is used;
# how many IPUs that copy spans is determined by the layers_per_ipu layout.
train_cfg = config.train_config
global_batch_size = train_cfg.train_global_batch_size
micro_batch_size = train_cfg.train_micro_batch_size
replication_factor = train_cfg.train_replication_factor
device_iterations = train_cfg.train_device_iterations
num_epochs = train_cfg.num_epochs

# Gradient accumulation must make up the global batch exactly; fail loudly
# instead of silently truncating to a smaller effective batch size.
samples_per_step = micro_batch_size * replication_factor
if global_batch_size % samples_per_step != 0:
    raise ValueError(
        f"train_global_batch_size ({global_batch_size}) must be a multiple of "
        f"micro_batch_size * replication_factor ({samples_per_step})"
    )
gradient_accumulation = global_batch_size // samples_per_step
train_samples_per_iteration = global_batch_size * device_iterations

train_opts = ipu_options(gradient_accumulation, replication_factor, device_iterations, train_option=True)

# Training
sequence_length = train_cfg.sequence_length
# Fill values used by PadCollate for each field when padding.
# NOTE(review): start/end positions are filled with sequence_length, presumably an
# out-of-range index that the loss ignores -- confirm against PadCollate and the model.
train_pad_values = {
    "input_ids": 0,
    "attention_mask": 0,
    "token_type_ids": 0,
    "start_positions": sequence_length,
    "end_positions": sequence_length,
}
train_dl = poptorch.DataLoader(
    train_opts,
    train_dataset,
    batch_size=micro_batch_size,
    shuffle=True,
    drop_last=False,
    collate_fn=PadCollate(global_batch_size, train_pad_values),
)

optimizer = get_optimizer(model)
train(model, train_opts, optimizer, train_dl, num_epochs, train_samples_per_iteration)

# Save the fine-tuned weights where the validation cell expects to load them from.
model.save_pretrained(train_cfg.saved_model_name)


Graph compilation: 100%|██████████| 100/100 [00:18<00:00]


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/546 [00:00<?, ?it/s]



  0%|          | 0/546 [00:00<?, ?it/s]

  0%|          | 0/546 [00:00<?, ?it/s]

In [41]:
# Validation runtime settings, all taken from the YAML validation section.
valid_cfg = config.valid_config
valid_micro_batch_size = valid_cfg.valid_micro_batch_size
valid_replication_factor = valid_cfg.valid_replication_factor
valid_device_iterations = valid_cfg.valid_device_iterations
valid_global_batch_size = valid_micro_batch_size * valid_replication_factor
valid_samples_per_iteration = valid_global_batch_size * valid_device_iterations

# Inference options: the first argument (gradient accumulation) is fixed to 1.
val_opts = ipu_options(
    1,
    valid_replication_factor,
    valid_device_iterations,
    train_option=False,
)

# Pipeline layout used for validation.
valid_ipu_config = EasyDict({
    "layers_per_ipu": valid_cfg.valid_layer_per_ipu,
    "recompute_checkpoint_every_layer": valid_cfg.valid_recompute_checkpoint_every_layer,
    "embedding_serialization_factor": valid_cfg.valid_embedding_serialization_factor,
})

# Reload the fine-tuned checkpoint with the validation layout and switch to eval mode.
model = PipelinedElectraForQuestionAnswering.from_pretrained_transformers(
    config.train_config.saved_model_name,
    valid_ipu_config,
)
model.parallelize().half().eval()



--- Device Allocation ---
Embedding --> IPU 0
Encoder 0  --> IPU 0
Encoder 1  --> IPU 0
Encoder 2  --> IPU 0
Encoder 3  --> IPU 0
Encoder 4  --> IPU 0
Encoder 5  --> IPU 0
Encoder 6  --> IPU 0
Encoder 7  --> IPU 0
Encoder 8  --> IPU 0
Encoder 9  --> IPU 0
Encoder 10 --> IPU 0
Encoder 11 --> IPU 1
QA Outputs --> IPU 1


PipelinedElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [44]:
# Feed the loader a view of the validation features without the 'example_id' and
# 'offset_mapping' columns; validation_features itself is still used afterwards
# for post-processing.
val_inputs = validation_features.remove_columns(['example_id', 'offset_mapping'])
val_collate = PadCollate(
    valid_global_batch_size,
    {"input_ids": 0, "attention_mask": 0, "token_type_ids": 0},
)
val_dl = poptorch.DataLoader(
    val_opts,
    val_inputs,
    batch_size=valid_micro_batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=val_collate,
)

# Run the model over the whole validation loader and collect the raw outputs.
raw_predictions = valid(model, val_opts, val_dl, valid_samples_per_iteration)



validation:   0%|          | 0/216 [00:00<?, ?it/s]


Graph compilation:   0%|          | 0/100 [00:00<?][A
Graph compilation: 100%|██████████| 100/100 [00:09<00:00][A


In [45]:
# Convert the raw model outputs into one answer text per validation example.
validation_examples = datasets["validation"]
final_predictions = postprocess_qa_predictions(
    validation_examples,
    validation_features,
    raw_predictions,
)

# Score the predictions with the "squad" metric (exact match and F1).
metric = load_metric("squad")
formatted_predictions = [
    {"id": example_id, "prediction_text": answer_text}
    for example_id, answer_text in final_predictions.items()
]
references = [
    {"id": example["id"], "answers": example["answers"]}
    for example in validation_examples
]
metrics = metric.compute(
    predictions=formatted_predictions,
    references=references,
)
print(metrics)

Post-processing 5774 example predictions split into 6892 features.


  0%|          | 0/5774 [00:00<?, ?it/s]

{'exact_match': 85.45202632490475, 'f1': 90.83352067245401}


In [55]:
# Rebuild the validation options (same arguments as in the validation setup cell above).
val_opts = ipu_options(1, valid_replication_factor, valid_device_iterations, train_option=False)

In [61]:
# Take the first batch from the validation loader. Despite its name, data_dl holds a
# single batch (indexed by field name below), not a data loader.
data_dl = next(iter(val_dl))

In [59]:
# Wrap the evaluation model with poptorch for inference, using the validation options.
inference_model = poptorch.inferenceModel(model, val_opts)

In [65]:
# Run the single batch through the wrapped model; the result shown below is a tuple of
# two float16 tensors (presumably start and end logits -- confirm against the model's forward).
outputs = inference_model(data_dl["input_ids"], data_dl["attention_mask"], data_dl["token_type_ids"])

In [66]:
# Display the raw outputs of the inference call.
outputs

(tensor([[-5.7227, -6.4648, -6.9102,  ..., -6.9102, -6.9258, -6.8828],
         [-5.7148, -6.6680, -6.9219,  ..., -6.9609, -6.9531, -6.9609],
         [-5.5977, -6.5039, -6.9258,  ..., -6.9531, -6.9766, -6.9727],
         ...,
         [-5.0156, -6.6914, -6.8086,  ..., -6.9688, -6.9766, -6.9805],
         [-5.4258, -6.7812, -6.8125,  ..., -6.8516, -6.8828, -6.8945],
         [-5.1758, -6.4961, -6.7383,  ..., -6.8906, -6.9336, -6.9414]],
        dtype=torch.float16),
 tensor([[-5.8828, -6.8906, -6.5195,  ..., -6.7578, -6.7617, -6.7344],
         [-5.7969, -6.8633, -6.6367,  ..., -6.7227, -6.7578, -6.7344],
         [-5.6680, -6.8164, -6.4844,  ..., -6.7070, -6.6914, -6.6367],
         ...,
         [-4.9883, -6.7852, -6.6992,  ..., -5.9375, -6.1133, -6.2305],
         [-5.5898, -6.7578, -6.7617,  ..., -6.7695, -6.7422, -6.7461],
         [-5.2812, -7.0156, -6.8086,  ..., -6.7422, -6.6484, -6.6641]],
        dtype=torch.float16))

# Inference

In [72]:
# Inspect the first validation example (id, title, context, question, answers).
datasets['validation'][0]

{'id': '6548850-0-0',
 'title': '임종석',
 'context': '1989년 2월 15일 여의도 농민 폭력 시위를 주도한 혐의(폭력행위등처벌에관한법률위반)으로 지명수배되었다. 1989년 3월 12일 서울지방검찰청 공안부는 임종석의 사전구속영장을 발부받았다. 같은 해 6월 30일 평양축전에 임수경을 대표로 파견하여 국가보안법위반 혐의가 추가되었다. 경찰은 12월 18일~20일 사이 서울 경희대학교에서 임종석이 성명 발표를 추진하고 있다는 첩보를 입수했고, 12월 18일 오전 7시 40분 경 가스총과 전자봉으로 무장한 특공조 및 대공과 직원 12명 등 22명의 사복 경찰을 승용차 8대에 나누어 경희대학교에 투입했다. 1989년 12월 18일 오전 8시 15분 경 서울청량리경찰서는 호위 학생 5명과 함께 경희대학교 학생회관 건물 계단을 내려오는 임종석을 발견, 검거해 구속을 집행했다. 임종석은 청량리경찰서에서 약 1시간 동안 조사를 받은 뒤 오전 9시 50분 경 서울 장안동의 서울지방경찰청 공안분실로 인계되었다.',
 'question': '임종석이 여의도 농민 폭력 시위를 주도한 혐의로 지명수배 된 날은?',
 'answers': {'text': ['1989년 2월 15일'], 'answer_start': [0]}}

In [73]:
# Use the first validation example as the question/passage pair for the demo.
example = datasets['validation'][0]
question = example['question']
passage = example['context']

In [74]:
# Encode the question and passage as a text pair. The two texts are passed as the
# separate `text` / `text_pair` arguments (the documented pair form) rather than as
# one tuple, which not every tokenizer implementation interprets as a pair.
input_encoding = tokenizer.encode_plus(question, passage)

# Extract inputs, add batch dimension
input_tensor = torch.tensor(input_encoding["input_ids"]).unsqueeze(0)
attention_tensor = torch.tensor(input_encoding["attention_mask"]).unsqueeze(0)
token_types = torch.tensor(input_encoding["token_type_ids"]).unsqueeze(0)

# Get model and load the fine-tuned weights
model = transformers.ElectraForQuestionAnswering.from_pretrained("checkpoints/squad_base_4x2")

In [75]:
# Solve task: this is a pure forward pass, so gradient tracking is switched off to
# avoid building an autograd graph that is never used.
with torch.no_grad():
    outputs = model(
        input_ids=input_tensor,
        attention_mask=attention_tensor,
        token_type_ids=token_types,
    )

# Extract answer: take the highest-scoring start and end token positions
# (the end position is inclusive, hence the +1 in the slice).
answer_start, answer_stop = outputs.start_logits.argmax(), outputs.end_logits.argmax()
answer_ids = input_tensor.squeeze()[answer_start:answer_stop + 1]
# Hand plain Python ints to the tokenizer instead of a tensor.
answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids.tolist(), skip_special_tokens=True)
answer = tokenizer.convert_tokens_to_string(answer_tokens)

# Print results
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: 임종석이 여의도 농민 폭력 시위를 주도한 혐의로 지명수배 된 날은?
Answer: 1989년 2월 15일
