# Treinando um modelo POS-Tagger com o corpus GENIA

Baseado no código de Thiago Castro (https://www.youtube.com/user/Thicasfer)

Vamos começar baixando as dependências

In [1]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 26.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 66.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3


In [2]:
from google.colab import drive
import os
drive.mount('/content/gdrive')
!ls

Mounted at /content/gdrive
gdrive	sample_data


In [4]:
cd /content/gdrive/My\ Drive/postagger_genia

/content/gdrive/My Drive/postagger_genia


In [7]:
!ls

GENIAtest.pos  GENIAtrain.pos


**Treinamento**

No caso, vamos treinar um *part-of-speech tagger*, i.e. um modelo que ache as classes gramaticais dos tokens de um texto.

Lendo o córpus

In [336]:
with open('GENIAtrain.pos') as f:
  traindata = [[tuple(w.split('/'))for w in snt.split()] for snt in f.read().split('\n')]

with open('GENIAtest.pos') as f:
  devdata = [[tuple(w.split('/'))for w in snt.split()] for snt in f.read().split('\n')]

devdata[2]

[('AB', 'LS'),
 ('-', ':'),
 ('The', 'DT'),
 ('involvement', 'NN'),
 ('of', 'IN'),
 ('ion', 'NN'),
 ('channels', 'NNS'),
 ('in', 'IN'),
 ('B', 'NN'),
 ('and', 'CC'),
 ('T', 'NN'),
 ('lymphocyte', 'NN'),
 ('activation', 'NN'),
 ('is', 'VBZ'),
 ('supported', 'VBN'),
 ('by', 'IN'),
 ('many', 'JJ'),
 ('reports', 'NNS'),
 ('of', 'IN'),
 ('changes', 'NNS'),
 ('in', 'IN'),
 ('ion', 'NN'),
 ('fluxes', 'NNS'),
 ('and', 'CC'),
 ('membrane', 'NN'),
 ('potential', 'NN'),
 ('after', 'IN'),
 ('mitogen', 'NN'),
 ('binding', 'NN'),
 ('.', '.')]

In [306]:
traindata[:2]

[[('UI', 'LS'), ('-', ':'), ('95369245', 'CD')],
 [('TI', 'LS'),
  ('-', ':'),
  ('IL-2', 'NN'),
  ('gene', 'NN'),
  ('expression', 'NN'),
  ('and', 'CC'),
  ('NF-kappa', 'NN'),
  ('B', 'NN'),
  ('activation', 'NN'),
  ('through', 'IN'),
  ('CD28', 'NN'),
  ('requires', 'VBZ'),
  ('reactive', 'JJ'),
  ('oxygen', 'NN'),
  ('production', 'NN'),
  ('by', 'IN'),
  ('5-lipoxygenase', 'NN'),
  ('.', '.')]]

In [337]:
def parse(data):
  X = list()
  temp=''
  for snt in data:
    for w in snt:
      if len(w)>1:
        temp = temp+' '+w[0].strip()
    if temp.strip():
      X.append(temp.strip())
    temp=''

  temp=list()
  y=list()
  y2=list()
  for snt in data:
    for w in snt:
      if len(w)>1:
        a=w[-1]
        if a=='' or a=='(' or a==')' or a==',' or a=='.' or a=='"' or a=='\'' or a=="''" or a=='``' or a==':' or a=='-':
          a='PUNCT'
        if '|' in a:
          a = a.split('|')[0]
        temp.append(a)
        y2.append(a)
    if len(temp)>0:
      y.append(temp)
    temp=list()
       
  y2 = list(set(y2))

  return X, y, y2

train_X, train_y, y2_train = parse(traindata)
dev_X, dev_y, y2_dev = parse(devdata)

In [338]:
def id2tag(y2_train, y2_dev):
  y2=y2_train + y2_dev
  y2 = list(set(y2))
  tags2 = [w.split('|')[0] for w in y2]
  tags3=list()
  for w in tags2:
    tags3.append(w)
  tags = list(set(tags3))
  tags.append('<pad>')
  tag2id = { tag:i for i, tag in enumerate(tags) }
  id2tag = { i:tag for i, tag in enumerate(tags) }

  return tag2id, id2tag

tag2id, id2tag = id2tag(y2_train, y2_dev)


In [339]:
tag2id

{'VBD': 0,
 'N': 1,
 'XT': 2,
 'JJS': 3,
 'E2A': 4,
 'WRB': 5,
 'VB': 6,
 'TO': 7,
 'VBP': 8,
 'FW': 9,
 'EX': 10,
 'VBN': 11,
 'VBZ': 12,
 'NNS': 13,
 'VBG': 14,
 'RBR': 15,
 'WP': 16,
 'CT': 17,
 'PRP': 18,
 'JJR': 19,
 'CC': 20,
 'NNPS': 21,
 'CD': 22,
 'DT': 23,
 'NNP': 24,
 'PDT': 25,
 'LS': 26,
 'PP': 27,
 'PRP$': 28,
 'NN': 29,
 'JJ': 30,
 'RP': 31,
 'RBS': 32,
 'MD': 33,
 'WP$': 34,
 'RB': 35,
 'SYM': 36,
 'IN': 37,
 'PUNCT': 38,
 'WDT': 39,
 'POS': 40,
 '<pad>': 41}

In [340]:
train_X[20]

'TI - E1A gene expression induces susceptibility to killing by NK cells following immortalization but not adenovirus infection of human cells .'

In [341]:
train_y[20]


['LS',
 'PUNCT',
 'NN',
 'NN',
 'NN',
 'VBZ',
 'NN',
 'TO',
 'NN',
 'IN',
 'NN',
 'NNS',
 'VBG',
 'NN',
 'CC',
 'RB',
 'NN',
 'NN',
 'IN',
 'JJ',
 'NNS',
 'PUNCT']

In [342]:
dev_X[1]

'TI - Charybdotoxin-sensitive , Ca(2+)-dependent membrane potential changes are not involved in human T or B cell activation and proliferation .'

In [343]:
dev_y[1]

['LS',
 'PUNCT',
 'JJ',
 'PUNCT',
 'JJ',
 'NN',
 'JJ',
 'NNS',
 'VBP',
 'RB',
 'VBN',
 'IN',
 'JJ',
 'NN',
 'CC',
 'NN',
 'NN',
 'NN',
 'CC',
 'NN',
 'PUNCT']

In [345]:
id2tag 

{0: 'VBD',
 1: 'N',
 2: 'XT',
 3: 'JJS',
 4: 'E2A',
 5: 'WRB',
 6: 'VB',
 7: 'TO',
 8: 'VBP',
 9: 'FW',
 10: 'EX',
 11: 'VBN',
 12: 'VBZ',
 13: 'NNS',
 14: 'VBG',
 15: 'RBR',
 16: 'WP',
 17: 'CT',
 18: 'PRP',
 19: 'JJR',
 20: 'CC',
 21: 'NNPS',
 22: 'CD',
 23: 'DT',
 24: 'NNP',
 25: 'PDT',
 26: 'LS',
 27: 'PP',
 28: 'PRP$',
 29: 'NN',
 30: 'JJ',
 31: 'RP',
 32: 'RBS',
 33: 'MD',
 34: 'WP$',
 35: 'RB',
 36: 'SYM',
 37: 'IN',
 38: 'PUNCT',
 39: 'WDT',
 40: 'POS',
 41: '<pad>'}

In [372]:
tag2id

{'VBD': 0,
 'N': 1,
 'XT': 2,
 'JJS': 3,
 'E2A': 4,
 'WRB': 5,
 'VB': 6,
 'TO': 7,
 'VBP': 8,
 'FW': 9,
 'EX': 10,
 'VBN': 11,
 'VBZ': 12,
 'NNS': 13,
 'VBG': 14,
 'RBR': 15,
 'WP': 16,
 'CT': 17,
 'PRP': 18,
 'JJR': 19,
 'CC': 20,
 'NNPS': 21,
 'CD': 22,
 'DT': 23,
 'NNP': 24,
 'PDT': 25,
 'LS': 26,
 'PP': 27,
 'PRP$': 28,
 'NN': 29,
 'JJ': 30,
 'RP': 31,
 'RBS': 32,
 'MD': 33,
 'WP$': 34,
 'RB': 35,
 'SYM': 36,
 'IN': 37,
 'PUNCT': 38,
 'WDT': 39,
 'POS': 40,
 '<pad>': 41}

In [373]:
print(train_X[:10])
print(train_y[:10])

['UI - 95369245', 'TI - IL-2 gene expression and NF-kappa B activation through CD28 requires reactive oxygen production by 5-lipoxygenase .', 'AB - Activation of the CD28 surface receptor provides a major costimulatory signal for T cell activation resulting in enhanced production of interleukin-2 ( IL-2 ) and cell proliferation .', 'In primary T lymphocytes we show that CD28 ligation leads to the rapid intracellular formation of reactive oxygen intermediates ( ROIs ) which are required for CD28-mediated activation of the NF-kappa B complex and IL-2 expression .', 'Delineation of the CD28 signaling cascade was found to involve protein tyrosine kinase activity , followed by the activation of phospholipase A2 and 5-lipoxygenase .', 'Our data suggest that lipoxygenase metabolites activate ROI formation which then induce IL-2 expression via NF-kappa B activation .', 'These findings should be useful for therapeutic strategies and the development of immunosuppressants targeting the CD28 costi

In [347]:
print(dev_X[:10])
print(dev_y[:10])

['UI - 92043714', 'TI - Charybdotoxin-sensitive , Ca(2+)-dependent membrane potential changes are not involved in human T or B cell activation and proliferation .', 'AB - The involvement of ion channels in B and T lymphocyte activation is supported by many reports of changes in ion fluxes and membrane potential after mitogen binding .', 'Human T and B lymphocytes demonstrate an early and transient hyperpolarization after ligand binding .', 'Inasmuch as the change in membrane potential is dependent on elevation of free cytosolic calcium , the hyperpolarization is presumably through opening of Ca(2+)-stimulated K+ channels .', 'We have used charybdotoxin , a known inhibitor of Ca(2+)-dependent K+ channels , to study the role of these channels in lymphocyte activation and mitogenesis .', 'We demonstrate that charybdotoxin inhibits the ligand-induced transient membrane hyperpolarization in B and T cells in a dose-dependent fashion , without affecting changes in cytosolic Ca2+ .', 'However 

In [348]:
# align
from transformers import AutoTokenizer

def align(X, y):
  tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2', do_lower_case=False)
  
  procdata = []
  for (X_, y_) in zip(X, y):
    inputs = tokenizer(X_, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    try:
      new_tags = ['<pad>']
      pos = 0
      for token in tokens[1:-1]:
        if '##' in token:
          new_tags.append(y_[pos-1])
        else:
          new_tags.append(y_[pos])
          pos += 1
      new_tags.append('<pad>')

      procdata.append({ 'X': X_, 'y': ' '.join(new_tags) })
    except:
      pass
  return procdata

trainset = align(train_X, train_y)
devset = align(dev_X, dev_y)

len(trainset), len(devset) 

(6229, 727)

Importando dependências. Veja que utilizaremos os métodos `AutoTokenizer` e `AutoModelForTokenClassification` para instanciar o tokenizador e o modelo de classificação de tokens.

In [349]:
import os
import torch
import torch.nn as nn
from torch import optim
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [350]:
devset[:4]

[{'X': 'UI - 92043714', 'y': '<pad> LS LS PUNCT CD CD CD CD CD <pad>'},
 {'X': 'AB - The involvement of ion channels in B and T lymphocyte activation is supported by many reports of changes in ion fluxes and membrane potential after mitogen binding .',
  'y': '<pad> LS PUNCT DT NN IN NN NNS IN NN CC NN NN NN NN NN NN VBZ VBN IN JJ NNS IN NNS IN NN NNS NNS CC NN NN IN NN NN NN PUNCT <pad>'},
 {'X': 'Human T and B lymphocytes demonstrate an early and transient hyperpolarization after ligand binding .',
  'y': '<pad> JJ NN CC NN NNS NNS NNS VBP DT JJ CC JJ JJ NN NN NN NN NN IN JJ JJ JJ NN PUNCT <pad>'},
 {'X': 'UI - 92135145', 'y': '<pad> LS LS PUNCT CD CD CD CD <pad>'}]

In [351]:
trainset[:8]

[{'X': 'UI - 95369245', 'y': '<pad> LS LS PUNCT CD CD CD CD CD <pad>'},
 {'X': 'These findings should be useful for therapeutic strategies and the development of immunosuppressants targeting the CD28 costimulatory pathway .',
  'y': '<pad> DT NNS MD VB JJ IN JJ NNS CC DT NN IN NNS NNS NNS NNS NNS NNS VBG DT NN NN NN NN NN NN PUNCT <pad>'},
 {'X': 'UI - 95333264', 'y': '<pad> LS LS PUNCT CD CD CD CD <pad>'},
 {'X': 'UI - 95343554', 'y': '<pad> LS LS PUNCT CD CD CD CD CD <pad>'},
 {'X': 'TI - E1A gene expression induces susceptibility to killing by NK cells following immortalization but not adenovirus infection of human cells .',
  'y': '<pad> LS LS PUNCT NN NN NN NN NN VBZ VBZ NN NN NN NN NN TO NN IN NN NN NNS VBG NN NN CC RB NN NN NN NN IN JJ NNS PUNCT <pad>'},
 {'X': 'AB - Adenovirus ( Ad ) infection and E1A transfection were used to model changes in susceptibility to NK cell killing caused by transient vs stable E1A expression in human cells .',
  'y': '<pad> LS PUNCT NN NN NN NN PUN

Definindo parâmetros do modelo e treinamento

In [352]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nclasses = len(tag2id)
nepochs = 30
batch_size = 32
batch_status = 32
learning_rate = 1e-5

early_stop = 3
max_length = 200
write_path = 'model'

Separando os dados em batches

In [353]:
from torch.utils.data import DataLoader

traindata = DataLoader(trainset, batch_size=batch_size, shuffle=True)
devdata = DataLoader(devset, batch_size=batch_size, shuffle=True)

In [354]:
devdata

<torch.utils.data.dataloader.DataLoader at 0x7f3c04382410>

Inicializando tokenizador, modelo, função de erro e otimizador

In [355]:
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2', do_lower_case=False)
model = AutoModelForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.2', num_labels=nclasses).to(device)

optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [356]:
nclasses

42

Método de Avaliação

In [357]:
def evaluate(model, testdata):
  model.eval()
  y_real, y_pred = [], []
  for batch_idx, inp in enumerate(testdata):
    texts = inp['X']

    labels = []
    for tags in inp['y']:
      tag_idxs = [tag2id[tag] for tag in tags.split()]
      labels.append(tag_idxs)
    
    # classifying
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
    output = model(**inputs)
                
    pred_labels = torch.argmax(output.logits, 2).tolist()

    for i in range(len(labels)):
      y_real.extend(labels[i][1:-1])
      seq_size = len(labels[i][1:-1])
      y_pred.extend(pred_labels[i][1:seq_size+1])
    
    if (batch_idx+1) % batch_status == 0:
      print('Progress:', round(batch_idx / len(testdata), 2), batch_idx)
  
  print(classification_report(y_real, y_pred))
  f1 = f1_score(y_real, y_pred, average='weighted')
  acc = accuracy_score(y_real, y_pred)
  return f1, acc

Treinamento

In [358]:
from torch.nn.utils.rnn import pad_sequence

max_f1, repeat = 0, 0
num=-1
for epoch in range(nepochs):
  model.train()
  losses = []
  for batch_idx, inp in enumerate(traindata):
    num=num+1
    texts = inp['X']
    
    labels = []
    for tags in inp['y']:
      try:
        tag_idxs = [tag2id[tag] for tag in tags.split()]
        labels.append(torch.tensor(tag_idxs[:max_length]))
      except:
        print('inp[y]:', inp['y'])
        print('texts:', texts)
        print('num:', num)
        raise
    
    labels= pad_sequence(labels, padding_value=tag2id['<pad>']).transpose(0, 1).unsqueeze(0).contiguous()

    # classifying
    try:
      inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
      output = model(**inputs, labels=labels.to(device))
    except:
        print('inp[y]:', inp['y'])
        print('texts:', texts)
        print('inputs:', inputs)
        print('batch_idx:', batch_idx)
        raise

    # Calculate loss
    loss = output.loss
    losses.append(float(loss))

    # Backpropagation
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Display
    if (batch_idx+1) % batch_status == 0:
      print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tTotal Loss: {:.6f}'.format(epoch, \
        batch_idx+1, len(traindata), 100. * batch_idx / len(traindata), 
        float(loss), round(sum(losses) / len(losses), 5)))
  
  f1, acc = evaluate(model, devdata)
  print('F1: ', f1, 'Accuracy: ', acc)
  if f1 > max_f1:
    model.save_pretrained(os.path.join(write_path, 'model'))
    max_f1 = f1
    repeat = 0
    print('Saving best model...')
  else:
    repeat += 1
  
  if repeat == early_stop:
    print('FIM!!! early_stop')
    break



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.92      0.91      0.91       263
           3       0.00      0.00      0.00        14
           5       0.00      0.00      0.00         8
           6       0.94      0.91      0.92       169
           7       1.00      0.91      0.95       203
           8       0.89      0.94      0.92       195
           9       1.00      0.11      0.20        98
          10       0.00      0.00      0.00         5
          11       0.73      0.81      0.77       532
          12       0.88      1.00      0.93       252
          13       0.97      0.97      0.97      1575
          14       0.00      0.00      0.00       133
          15       0.00      0.00      0.00         9
          16       0.00      0.00      0.00         3
          18       1.00      0.65      0.79        69
          19       0.00      0.00      0.00        22
          20       0.98      0.91      0.95       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      0.97      0.97       263
           3       0.00      0.00      0.00        14
           5       0.00      0.00      0.00         8
           6       0.98      0.98      0.98       169
           7       1.00      1.00      1.00       203
           8       0.98      0.97      0.98       195
           9       1.00      0.56      0.72        98
          10       0.00      0.00      0.00         5
          11       0.91      0.97      0.94       532
          12       0.98      1.00      0.99       252
          13       0.99      0.97      0.98      1575
          14       0.90      0.89      0.90       133
          15       0.00      0.00      0.00         9
          16       0.00      0.00      0.00         3
          18       0.92      1.00      0.96        69
          19       0.00      0.00      0.00        22
          20       0.99      0.98      0.99       395
          22       0.99    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       263
           3       1.00      0.21      0.35        14
           5       0.00      0.00      0.00         8
           6       0.98      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      0.99      0.99       195
           9       0.92      0.89      0.90        98
          10       1.00      0.60      0.75         5
          11       0.92      0.98      0.95       532
          12       0.99      1.00      0.99       252
          13       0.99      0.98      0.98      1575
          14       0.88      0.94      0.91       133
          15       0.00      0.00      0.00         9
          16       0.00      0.00      0.00         3
          18       0.93      1.00      0.97        69
          19       0.91      0.45      0.61        22
          20       0.99      1.00      0.99       395
          22       0.99    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99       263
           3       1.00      0.71      0.83        14
           5       1.00      1.00      1.00         8
           6       1.00      0.98      0.99       169
           7       1.00      1.00      1.00       203
           8       0.98      1.00      0.99       195
           9       0.94      0.85      0.89        98
          10       0.83      1.00      0.91         5
          11       0.95      0.97      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.94      0.91      0.92       133
          15       0.00      0.00      0.00         9
          16       0.00      0.00      0.00         3
          18       0.99      1.00      0.99        69
          19       0.66      0.95      0.78        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99       263
           3       0.92      0.79      0.85        14
           5       1.00      1.00      1.00         8
           6       1.00      0.99      1.00       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      1.00       195
           9       0.99      0.78      0.87        98
          10       0.83      1.00      0.91         5
          11       0.94      0.98      0.96       532
          12       1.00      1.00      1.00       252
          13       1.00      0.98      0.99      1575
          14       0.93      0.94      0.93       133
          15       0.75      0.33      0.46         9
          16       0.00      0.00      0.00         3
          18       0.99      1.00      0.99        69
          19       0.72      0.95      0.82        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      0.98      0.99       263
           3       0.92      0.86      0.89        14
           5       1.00      1.00      1.00         8
           6       1.00      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      0.99       195
           9       0.94      0.89      0.91        98
          10       0.83      1.00      0.91         5
          11       0.94      0.97      0.96       532
          12       1.00      1.00      1.00       252
          13       1.00      0.98      0.99      1575
          14       0.93      0.94      0.93       133
          15       0.89      0.89      0.89         9
          16       0.00      0.00      0.00         3
          18       0.99      1.00      0.99        69
          19       0.91      0.95      0.93        22
          20       0.99      1.00      1.00       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       0.99      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      1.00       195
           9       0.95      0.88      0.91        98
          10       0.83      1.00      0.91         5
          11       0.95      0.98      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.99      1575
          14       0.95      0.95      0.95       133
          15       0.89      0.89      0.89         9
          16       1.00      0.67      0.80         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      0.99      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       0.99      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      0.99      0.99       195
           9       0.94      0.86      0.90        98
          10       0.83      1.00      0.91         5
          11       0.96      0.98      0.97       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.93      0.95      0.94       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       0.99      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      0.99      0.99       195
           9       0.93      0.78      0.84        98
          10       0.83      1.00      0.91         5
          11       0.97      0.96      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.97      0.98      1575
          14       0.94      0.95      0.95       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.98      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      0.99       195
           9       1.00      0.64      0.78        98
          10       0.83      1.00      0.91         5
          11       0.97      0.96      0.96       532
          12       1.00      1.00      1.00       252
          13       1.00      0.98      0.99      1575
          14       0.97      0.92      0.95       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      1.00       195
           9       0.94      0.92      0.93        98
          10       0.83      1.00      0.91         5
          11       0.96      0.98      0.97       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.96      0.95      0.96       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.98      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      0.99       195
           9       0.94      0.90      0.92        98
          10       0.83      1.00      0.91         5
          11       0.96      0.97      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.97      0.95      0.96       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      0.99      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      1.00       195
           9       0.93      0.92      0.92        98
          10       0.83      1.00      0.91         5
          11       0.97      0.95      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.98      0.93      0.96       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      0.99       195
           9       0.95      0.84      0.89        98
          10       0.83      1.00      0.91         5
          11       0.95      0.98      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.96      0.97      0.96       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      0.99      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       0.98      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      0.99      0.99       195
           9       0.96      0.91      0.93        98
          10       0.83      1.00      0.91         5
          11       0.97      0.95      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.96      0.95      0.95       133
          15       0.80      0.89      0.84         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      0.99      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      1.00       195
           9       0.95      0.92      0.93        98
          10       0.83      1.00      0.91         5
          11       0.96      0.97      0.97       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.99      1575
          14       0.96      0.95      0.96       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      0.99      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.98      0.99       169
           7       1.00      1.00      1.00       203
           8       0.98      1.00      0.99       195
           9       0.95      0.92      0.93        98
          10       0.83      1.00      0.91         5
          11       0.96      0.97      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.96      0.96      0.96       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      1.00      0.99       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       263
           3       0.93      1.00      0.97        14
           5       1.00      1.00      1.00         8
           6       1.00      0.99      0.99       169
           7       1.00      1.00      1.00       203
           8       0.99      1.00      1.00       195
           9       0.94      0.90      0.92        98
          10       0.83      1.00      0.91         5
          11       0.95      0.98      0.96       532
          12       1.00      1.00      1.00       252
          13       0.99      0.98      0.98      1575
          14       0.97      0.95      0.96       133
          15       0.89      0.89      0.89         9
          16       1.00      1.00      1.00         3
          18       0.99      1.00      0.99        69
          19       1.00      0.95      0.98        22
          20       0.99      1.00      1.00       395
          22       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [363]:
id2tag #atual

{0: 'VBD',
 1: 'N',
 2: 'XT',
 3: 'JJS',
 4: 'E2A',
 5: 'WRB',
 6: 'VB',
 7: 'TO',
 8: 'VBP',
 9: 'FW',
 10: 'EX',
 11: 'VBN',
 12: 'VBZ',
 13: 'NNS',
 14: 'VBG',
 15: 'RBR',
 16: 'WP',
 17: 'CT',
 18: 'PRP',
 19: 'JJR',
 20: 'CC',
 21: 'NNPS',
 22: 'CD',
 23: 'DT',
 24: 'NNP',
 25: 'PDT',
 26: 'LS',
 27: 'PP',
 28: 'PRP$',
 29: 'NN',
 30: 'JJ',
 31: 'RP',
 32: 'RBS',
 33: 'MD',
 34: 'WP$',
 35: 'RB',
 36: 'SYM',
 37: 'IN',
 38: 'PUNCT',
 39: 'WDT',
 40: 'POS',
 41: '<pad>'}

In [360]:
!ls model/model

config.json  pytorch_model.bin


In [361]:
tokenizer.save_pretrained(os.path.join(write_path, 'model'))

('model/model/tokenizer_config.json',
 'model/model/special_tokens_map.json',
 'model/model/vocab.txt',
 'model/model/added_tokens.json',
 'model/model/tokenizer.json')

In [367]:
from transformers import pipeline
model.to('cpu')
nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)

nlp_token_class('AB - T cell receptor ( TCR ) stimulation induces rapid tyrosine phosphorylation of cellular proteins , including Cbl , a protooncogene product whose function remains unclear .')

  "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"


[{'entity_group': 'LABEL_26',
  'score': 0.99950683,
  'word': 'AB',
  'start': 0,
  'end': 2},
 {'entity_group': 'LABEL_38',
  'score': 0.99941576,
  'word': '-',
  'start': 3,
  'end': 4},
 {'entity_group': 'LABEL_29',
  'score': 0.8576849,
  'word': 'T cell receptor',
  'start': 5,
  'end': 20},
 {'entity_group': 'LABEL_38',
  'score': 0.47593167,
  'word': '(',
  'start': 21,
  'end': 22},
 {'entity_group': 'LABEL_29',
  'score': 0.6770358,
  'word': 'TCR',
  'start': 23,
  'end': 26},
 {'entity_group': 'LABEL_38',
  'score': 0.5545236,
  'word': ')',
  'start': 27,
  'end': 28},
 {'entity_group': 'LABEL_29',
  'score': 0.9354885,
  'word': 'stimulation',
  'start': 29,
  'end': 40},
 {'entity_group': 'LABEL_12',
  'score': 0.99714565,
  'word': 'induces',
  'start': 41,
  'end': 48},
 {'entity_group': 'LABEL_30',
  'score': 0.99940205,
  'word': 'rapid',
  'start': 49,
  'end': 54},
 {'entity_group': 'LABEL_29',
  'score': 0.9977628,
  'word': 'tyrosine phosphorylation',
  'start'

In [None]:
!ls

macmorpho-dev.txt   macmorpho-train.txt  model-bertimbau
macmorpho-test.txt  model


In [None]:
from torch.nn.utils.rnn import pad_sequence

max_f1, repeat = 0, 0
for epoch in range(nepochs):
  model.train()
  losses = []
  for batch_idx, inp in enumerate(traindata):
    texts = inp['X']
    
    labels = []
    for tags in inp['y']:
      tag_idxs = [tag2id[tag] for tag in tags.split()]
      labels.append(torch.tensor(tag_idxs[:max_length]))
    
    labels= pad_sequence(labels, padding_value=tag2id['<pad>']).transpose(0, 1).unsqueeze(0).contiguous()

    # classifying
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
    output = model(**inputs, labels=labels.to(device))

    # Calculate loss
    loss = output.loss
    losses.append(float(loss))

    # Backpropagation
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Display
    if (batch_idx+1) % batch_status == 0:
      print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tTotal Loss: {:.6f}'.format(epoch, \
        batch_idx+1, len(traindata), 100. * batch_idx / len(traindata), 
        float(loss), round(sum(losses) / len(losses), 5)))
  
  f1, acc = evaluate(model, devdata)
  print('F1: ', f1, 'Accuracy: ', acc)
  if f1 > max_f1:
    model.save_pretrained(os.path.join(write_path, 'model'))
    max_f1 = f1
    repeat = 0
    print('Saving best model...')
  else:
    repeat += 1
  
  if repeat == early_stop:
    print('FIM!!! early_stop')
    break

Progress: 0.62 31


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.95      0.96      0.96       512
           1       0.88      0.88      0.88       340
           2       0.84      0.89      0.86        61
           3       0.86      0.48      0.62        25
           4       0.96      0.98      0.97      4234
           5       0.90      0.90      0.90       200
           6       0.96      0.94      0.95       519
           7       0.97      0.95      0.96      1340
           8       0.99      0.98      0.99       701
           9       1.00      1.00      1.00      4127
          10       0.98      0.98      0.98      2777
          11       0.93      0.90      0.91       989
          12       0.87      0.94      0.90       355
          13       0.99      0.99      0.99      2273
          14       0.98      0.97      0.98      9937
          15       0.95      1.00      0.98       306
          16       0.00      0.00      0.00        24
          18       0.94    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      0.96      0.97       512
           1       0.87      0.93      0.90       340
           2       0.96      0.90      0.93        61
           3       0.88      0.56      0.68        25
           4       0.97      0.98      0.97      4234
           5       0.88      0.92      0.90       200
           6       0.97      0.97      0.97       519
           7       0.98      0.97      0.98      1340
           8       0.97      0.99      0.98       701
           9       1.00      1.00      1.00      4127
          10       0.98      0.99      0.98      2777
          11       0.96      0.90      0.93       989
          12       0.91      0.94      0.93       355
          13       0.99      1.00      0.99      2273
          14       0.98      0.98      0.98      9937
          15       0.97      0.99      0.98       306
          16       1.00      0.71      0.83        24
          18       0.96    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      0.98      0.97       512
           1       0.91      0.91      0.91       340
           2       0.98      0.90      0.94        61
           3       0.91      0.80      0.85        25
           4       0.97      0.98      0.98      4234
           5       0.93      0.93      0.93       200
           6       0.96      0.98      0.97       519
           7       0.98      0.98      0.98      1340
           8       0.98      0.99      0.99       701
           9       1.00      1.00      1.00      4127
          10       0.98      0.99      0.98      2777
          11       0.95      0.92      0.94       989
          12       0.91      0.95      0.93       355
          13       0.99      0.99      0.99      2273
          14       0.99      0.97      0.98      9937
          15       0.98      0.99      0.99       306
          16       1.00      0.88      0.93        24
          18       0.96    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      0.96      0.97       512
           1       0.91      0.91      0.91       340
           2       0.98      0.90      0.94        61
           3       0.81      0.88      0.85        25
           4       0.97      0.99      0.98      4234
           5       0.97      0.90      0.93       200
           6       0.97      0.98      0.98       519
           7       0.99      0.98      0.98      1340
           8       0.99      0.99      0.99       701
           9       1.00      1.00      1.00      4127
          10       0.98      0.99      0.98      2777
          11       0.94      0.93      0.94       989
          12       0.92      0.94      0.93       355
          13       0.99      0.99      0.99      2273
          14       0.98      0.98      0.98      9937
          15       0.98      1.00      0.99       306
          16       0.91      0.88      0.89        24
          18       0.97    

In [370]:
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
tokenizer.save_pretrained(os.path.join(write_path, 'model'))


('model/model/tokenizer_config.json',
 'model/model/special_tokens_map.json',
 'model/model/vocab.txt',
 'model/model/added_tokens.json',
 'model/model/tokenizer.json')

In [None]:
id2tag

{0: 'NUM',
 1: 'KS',
 2: 'PREP+PROADJ',
 3: 'ADV-KS',
 4: 'NPROP',
 5: 'PDEN',
 6: 'PROADJ',
 7: 'PCP',
 8: 'KC',
 9: 'PU',
 10: 'PREP',
 11: 'ADV',
 12: 'PRO-KS',
 13: 'ART',
 14: 'N',
 15: 'PROPESS',
 16: 'PREP+PROPESS',
 17: 'CUR',
 18: 'ADJ',
 19: 'IN',
 20: 'PREP+ART',
 21: 'PROSUB',
 22: 'PREP+PRO-KS',
 23: 'PREP+PROSUB',
 24: 'V',
 25: 'PREP+ADV',
 26: '<pad>'}

In [None]:
tag2id

{'<pad>': 26,
 'ADJ': 18,
 'ADV': 11,
 'ADV-KS': 3,
 'ART': 13,
 'CUR': 17,
 'IN': 19,
 'KC': 8,
 'KS': 1,
 'N': 14,
 'NPROP': 4,
 'NUM': 0,
 'PCP': 7,
 'PDEN': 5,
 'PREP': 10,
 'PREP+ADV': 25,
 'PREP+ART': 20,
 'PREP+PRO-KS': 22,
 'PREP+PROADJ': 2,
 'PREP+PROPESS': 16,
 'PREP+PROSUB': 23,
 'PRO-KS': 12,
 'PROADJ': 6,
 'PROPESS': 15,
 'PROSUB': 21,
 'PU': 9,
 'V': 24}

In [368]:
model = AutoModelForTokenClassification.from_pretrained("model/model")

from transformers import pipeline

nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)

nlp_token_class('On the other hand , a decline of the arsenic content in hair and nail was observed after withdrawal of the drug .')

[{'entity_group': 'LABEL_37',
  'score': 0.999373,
  'word': 'On',
  'start': 0,
  'end': 2},
 {'entity_group': 'LABEL_23',
  'score': 0.99949944,
  'word': 'the',
  'start': 3,
  'end': 6},
 {'entity_group': 'LABEL_30',
  'score': 0.9989857,
  'word': 'other',
  'start': 7,
  'end': 12},
 {'entity_group': 'LABEL_29',
  'score': 0.99937445,
  'word': 'hand',
  'start': 13,
  'end': 17},
 {'entity_group': 'LABEL_38',
  'score': 0.9996381,
  'word': ',',
  'start': 18,
  'end': 19},
 {'entity_group': 'LABEL_23',
  'score': 0.9994572,
  'word': 'a',
  'start': 20,
  'end': 21},
 {'entity_group': 'LABEL_29',
  'score': 0.99954236,
  'word': 'decline',
  'start': 22,
  'end': 29},
 {'entity_group': 'LABEL_37',
  'score': 0.99928683,
  'word': 'of',
  'start': 30,
  'end': 32},
 {'entity_group': 'LABEL_23',
  'score': 0.9995347,
  'word': 'the',
  'start': 33,
  'end': 36},
 {'entity_group': 'LABEL_30',
  'score': 0.99941427,
  'word': 'arsenic',
  'start': 37,
  'end': 44},
 {'entity_group'

In [369]:
frase = 'The SM protein derived from the spliced RNA joining BSLF2 to BMLF1 is much the most abundant protein .'
doc = nlp_token_class(frase)
for d in doc:
  print(d)
  tag = d['entity_group']
  text = frase[d['start']:d['end']]
  print(tag)
  print(text)


{'entity_group': 'LABEL_23', 'score': 0.999495, 'word': 'The', 'start': 0, 'end': 3}
LABEL_23
The
{'entity_group': 'LABEL_29', 'score': 0.99592054, 'word': 'SM protein', 'start': 4, 'end': 14}
LABEL_29
SM protein
{'entity_group': 'LABEL_11', 'score': 0.998184, 'word': 'derived', 'start': 15, 'end': 22}
LABEL_11
derived
{'entity_group': 'LABEL_37', 'score': 0.99927515, 'word': 'from', 'start': 23, 'end': 27}
LABEL_37
from
{'entity_group': 'LABEL_23', 'score': 0.99948335, 'word': 'the', 'start': 28, 'end': 31}
LABEL_23
the
{'entity_group': 'LABEL_11', 'score': 0.9971042, 'word': 'spliced', 'start': 32, 'end': 39}
LABEL_11
spliced
{'entity_group': 'LABEL_29', 'score': 0.9980102, 'word': 'RNA', 'start': 40, 'end': 43}
LABEL_29
RNA
{'entity_group': 'LABEL_14', 'score': 0.9972887, 'word': 'joining', 'start': 44, 'end': 51}
LABEL_14
joining
{'entity_group': 'LABEL_29', 'score': 0.9994783, 'word': 'BSLF2', 'start': 52, 'end': 57}
LABEL_29
BSLF2
{'entity_group': 'LABEL_7', 'score': 0.99905723, 