In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import yaml
from easydict import EasyDict

import torch
from transformers import ElectraTokenizer#, ElectraModel
from optimum.graphcore import IPUSeq2SeqTrainingArguments as Seq2SeqTrainingArguments

from modeling_electra import ElectraModel, ElectraForMaskedLM
from ipu_configuration import IPUConfig, ipu_options
import inspect


tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]

outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]

In [3]:
input_value = tokenizer.encode_plus("Hello, my dog is cute", return_tensors="pt", padding="max_length")

In [4]:
model(**input_value)

MaskedLMOutput(loss=None, logits=tensor([[[ 11.1353,  -7.4241,  -7.4119,  ...,  -7.4380,  -7.4108,  -7.3961],
         [ -9.8389, -10.2289, -10.2246,  ..., -10.2524, -10.2351, -10.0815],
         [  3.9714, -10.9872, -10.9838,  ..., -10.9641, -10.9766, -10.8934],
         ...,
         [ 50.3298,   1.1282,   1.1599,  ...,   1.0819,   1.2214,   1.1251],
         [ 49.8866,   0.8877,   0.9184,  ...,   0.8425,   0.9800,   0.8855],
         [ 11.1366,  -7.4236,  -7.4115,  ...,  -7.4376,  -7.4104,  -7.3956]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [5]:
# from transformers import BertConfig, ElectraConfig
# bert_config = BertConfig.from_pretrained('bert-base-uncased')
# electra_config = ElectraConfig.from_pretrained('google/electra-small-discriminator')
# print(bert_config)
# print(electra_config)

In [6]:
signature = inspect.signature(model.forward)
signature

<Signature (input_ids:Union[torch.Tensor, NoneType]=None, attention_mask:Union[torch.Tensor, NoneType]=None, token_type_ids:Union[torch.Tensor, NoneType]=None, position_ids:Union[torch.Tensor, NoneType]=None, head_mask:Union[torch.Tensor, NoneType]=None, inputs_embeds:Union[torch.Tensor, NoneType]=None, labels:Union[torch.Tensor, NoneType]=None, output_attentions:Union[bool, NoneType]=None, output_hidden_states:Union[bool, NoneType]=None, return_dict:Union[bool, NoneType]=None) -> Union[Tuple, transformers.modeling_outputs.MaskedLMOutput]>

In [7]:
!ls

__init__.py		  data			ipu_electra.ipynb
__pycache__		  exe_cache		ipu_train.py
bert_test.ipynb		  finetune		modeling_electra.py
checkpoints		  inference.py		pipeline_electra.py
configuration_electra.py  input.txt		server.py
configurations		  ipu_configuration.py	test.py


In [1]:
# model error
#pipelinedElectraForMaskedLM

# PipelinedElectraForPreTraining

# PipelinedElectraFor

In [51]:
from pipeline_electra import PipelinedElectraForMaskedLM
import poptorch

In [14]:
train_ipu_config = {
    "layers_per_ipu": [2,3,4,3],
    "recompute_checkpoint_every_layer":True,
    "embedding_serialization_factor": 1
}

train_ipu_config = EasyDict(train_ipu_config)

In [15]:
ipu_options

<function ipu_configuration.ipu_options(gradient_accumulation, replication_factor, device_iterations, train_option, seed=42)>

In [18]:
#opts = ipu_config.to_options()

optimizer_class = torch.optim.AdamW
optimizer = optimizer_class(model.parameters(), lr=0.004)

In [19]:
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 128, padding_idx=0)

In [21]:
#ipu_model = PipelinedElectraForMaskedLM.from_transformers(model, ipu_config)

In [43]:
#opts.toDict()

In [48]:
ipu_model = PipelinedElectraForMaskedLM.from_pretrained_transformers("google/electra-small-generator", train_ipu_config)

In [49]:
val_opts = ipu_options(1, 1, 1, train_option=False)

In [52]:
inference_model = poptorch.inferenceModel(ipu_model, val_opts)

In [55]:
entities = inference_model(input_value['input_ids'], input_value['attention_mask'], input_value['token_type_ids'])

Graph compilation: 100%|██████████| 100/100 [01:15<00:00]


In [56]:
entities

(tensor([[[ 11.1311,  -7.4237,  -7.4116,  ...,  -7.4377,  -7.4105,  -7.3957],
          [ -9.8425, -10.2310, -10.2267,  ..., -10.2546, -10.2372, -10.0839],
          [  3.9734, -10.9868, -10.9833,  ..., -10.9637, -10.9761, -10.8932],
          ...,
          [ 50.3746,   1.1534,   1.1852,  ...,   1.1069,   1.2467,   1.1503],
          [ 49.9352,   0.9090,   0.9397,  ...,   0.8637,   1.0014,   0.9068],
          [ 11.1324,  -7.4233,  -7.4112,  ...,  -7.4373,  -7.4101,  -7.3953]]]),)

In [46]:
tuple_data = (input_value['input_ids'], input_value['attention_mask'], input_value['token_type_ids'])

In [47]:
ipu_model(*tuple_data)#**input_value)

(tensor([[[ 11.1353,  -7.4241,  -7.4119,  ...,  -7.4380,  -7.4108,  -7.3961],
          [ -9.8389, -10.2289, -10.2246,  ..., -10.2524, -10.2351, -10.0815],
          [  3.9714, -10.9872, -10.9838,  ..., -10.9641, -10.9766, -10.8934],
          ...,
          [ 50.3298,   1.1282,   1.1599,  ...,   1.0819,   1.2214,   1.1251],
          [ 49.8866,   0.8877,   0.9184,  ...,   0.8425,   0.9800,   0.8855],
          [ 11.1366,  -7.4236,  -7.4115,  ...,  -7.4376,  -7.4104,  -7.3956]]],
        grad_fn=<AddBackward0>),)

--- Device Allocation ---
Embedding --> IPU 0
Encoder 0  --> IPU 0
Encoder 1  --> IPU 0
Encoder 2  --> IPU 1
Encoder 3  --> IPU 1
Encoder 4  --> IPU 1
Encoder 5  --> IPU 2
Encoder 6  --> IPU 2
Encoder 7  --> IPU 2
Encoder 8  --> IPU 2
Encoder 9  --> IPU 3
Encoder 10 --> IPU 3
Encoder 11 --> IPU 3
Generator Predictions --> IPU 3
Generator LM Head --> IPU 3


PipelinedElectraForMaskedLM(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_featu

In [32]:
from finetune.run_squad_ipu import ipu_options
val_opts = ipu_options(1, 1, 1, train_option=False)

In [33]:
t_model = poptorch.trainingModel(ipu_model.train(), options=opts, optimizer=optimizer)

In [38]:
t_model.compile(input_value['input_ids'])

RuntimeError: Tracer cannot infer type of (tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]]), <class 'inspect._empty'>)
:Only tensors and (possibly nested) tuples of tensors, lists, or dictsare supported as inputs or outputs of traced functions, but instead got value of type type.

In [37]:
input_value['input_ids'].shape

torch.Size([1, 512])

In [65]:
model.electra.encoder.layer[2]

ElectraLayer(
  (attention): ElectraAttention(
    (self): ElectraSelfAttention(
      (query): Linear(in_features=256, out_features=256, bias=True)
      (key): Linear(in_features=256, out_features=256, bias=True)
      (value): Linear(in_features=256, out_features=256, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): ElectraSelfOutput(
      (dense): Linear(in_features=256, out_features=256, bias=True)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): ElectraIntermediate(
    (dense): Linear(in_features=256, out_features=1024, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): ElectraOutput(
    (dense): Linear(in_features=1024, out_features=256, bias=True)
    (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [22]:
from torch.utils.data import Dataset
import os

In [24]:
class CharDataset(Dataset):
    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y


block_size = 128  # spatial extent of the model for its context
if not os.path.isfile('input.txt'):
    os.system('wget https://github.com/karpathy/char-rnn/raw/master/data/tinyshakespeare/input.txt')
text = open('input.txt', 'r').read()  # don't worry we won't run out of file handles
train_dataset = CharDataset(text, block_size)  # one line of poem is roughly 50 characters

data has 1115394 characters, 65 unique.


In [26]:
train_dataset[0]

(tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
         53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
          1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
         57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
          6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
         58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
          1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
         53,  1]),
 tensor([47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44, 53,
         56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,  1,
         44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1, 57,
         54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,  6,
          1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47, 58,
         

In [30]:
tuple_data

(tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1]]))