In [1]:
from transformers import BertTokenizer
import tensorflow as tf
from data_preparation import Example, convert_examples_to_tf_dataset
from conllu import parse

In [2]:
# Read the whole CoNLL-U test file; the context manager closes the handle
# as soon as the text has been read.
with open("../data/ud/en/en_pud-ud-test.conllu", "r", encoding="utf-8") as conllu_file:
    en_pos = conllu_file.read()

In [3]:
sentences = parse(en_pos)

In [4]:
example = sentences[0]

In [5]:
example

TokenList<“, While, much, of, the, digital, transition, is, unprecedented, in, the, United, States, ,, the, peaceful, transition, of, power, is, not, ,, ”, Obama, special, assistant, Kori, Schulman, wrote, in, a, blog, post, Monday, .>

In [6]:
# Surface form of each word in the example sentence, in order.
tokens = [word["form"] for word in example]

In [7]:
tokens[:5]

['“', 'While', 'much', 'of', 'the']

In [8]:
# Universal POS tag of each word, aligned index-for-index with `tokens`.
labels = [word["upos"] for word in example]

In [9]:
labels[:5]

['PUNCT', 'SCONJ', 'ADJ', 'ADP', 'DET']

In [10]:
class ABSATokenizer(BertTokenizer):
    """BertTokenizer with a helper for labelled, pre-tokenized input."""

    def subword_tokenize(self, tokens, labels):
        """Split each word into WordPiece pieces and copy its label onto every piece.

        Args:
            tokens: Sequence of word strings (already split into words).
            labels: Sequence of labels; ``labels[i]`` belongs to ``tokens[i]``.

        Returns:
            A ``(split_tokens, split_labels, idx_map)`` tuple of three lists of
            equal length: the word pieces, the label repeated for each piece,
            and the index into ``tokens`` of the word each piece came from.

        Note:
            The label duplication is meant for training. At test time it has
            to be undone: use ``idx_map`` to merge the pieces (and their
            predictions) back into one entry per original word.
        """
        split_tokens, split_labels, idx_map = [], [], []
        for ix, token in enumerate(tokens):
            sub_tokens = self.wordpiece_tokenizer.tokenize(token)
            if not sub_tokens:
                # Word produced no pieces: nothing to align, and its label is
                # deliberately not looked up.
                continue
            split_tokens.extend(sub_tokens)
            split_labels.extend([labels[ix]] * len(sub_tokens))
            idx_map.extend([ix] * len(sub_tokens))
        return split_tokens, split_labels, idx_map

In [11]:
tokenizer = ABSATokenizer.from_pretrained('bert-base-multilingual-cased')

Some words will be broken into subwords

In [12]:
tokens[8]

'unprecedented'

In [13]:
tokenizer(tokens[8])

{'input_ids': [101, 10119, 30619, 104101, 10336, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [14]:
# Print every id of the encoded word on its own line.
# NOTE(review): the pieces print with a space between every character
# (e.g. "# # p r e"); presumably decode() expects a list of ids and is being
# handed a bare int here -- confirm, or use convert_ids_to_tokens instead.
for subword in tokenizer(tokens[8])["input_ids"]:
    print(tokenizer.decode(subword))

[ C L S ]
u n
# # p r e
# # c e d e n t
# # e d
[ S E P ]


We need to assign the original word's label to all subwords

In [15]:
split_tokens, split_labels, idx_map = tokenizer.subword_tokenize(tokens, labels) # [CLS] and [SEP] won't be added

In [16]:
split_tokens[8:12], split_labels[8:12]

(['un', '##pre', '##cedent', '##ed'], ['ADJ', 'ADJ', 'ADJ', 'ADJ'])

Next, we create the input-id and attention-mask lists and pad them to a fixed length.

In [17]:
input_ids = tokenizer.convert_tokens_to_ids(split_tokens)

In [18]:
input_ids[8:12]

[10119, 30619, 104101, 10336]

In [28]:
tokenizer.decode(input_ids[10])

'# # c e d e n t'

In [19]:
attention_mask = [1] * len(input_ids)

In [20]:
max_length = 64
# A sentence with more word pieces than max_length would get no padding and
# stay over-long, so fail loudly here instead of carrying it forward.
assert len(input_ids) <= max_length, "sentence has more word pieces than max_length"
padding = [0] * (max_length - len(input_ids))

In [21]:
# Pad each list from its own current length so that re-running this cell is a
# no-op instead of appending a second round of zeros.
input_ids += [0] * (max_length - len(input_ids))
attention_mask += [0] * (max_length - len(attention_mask))
print(len(input_ids), len(attention_mask), input_ids[-3:])

64 64 [0, 0, 0]


In [22]:
token_type_ids = [0] * max_length

Map each tag in the tagset to an integer index.

In [77]:
# Label inventory; a label's position in this list is its integer id.
tagset = [
    "O",
    "ADJ", "ADP", "ADV", "AUX",
    "CCONJ", "DET", "INTJ", "NOUN",
    "NUM", "PART", "PRON", "PROPN",
    "PUNCT", "SCONJ", "SYM", "VERB",
    "X",
]

In [78]:
# Tag string -> integer id, numbered by position in `tagset`.
label_map = dict(zip(tagset, range(len(tagset))))

In [79]:
label_map

{'O': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'AUX': 4,
 'CCONJ': 5,
 'DET': 6,
 'INTJ': 7,
 'NOUN': 8,
 'NUM': 9,
 'PART': 10,
 'PRON': 11,
 'PROPN': 12,
 'PUNCT': 13,
 'SCONJ': 14,
 'SYM': 15,
 'VERB': 16,
 'X': 17}

Label ids and mask

In [80]:
label_ids = [label_map[label] for label in split_labels]

In [81]:
label_ids[:2], split_labels[:2]

([13, 14], ['PUNCT', 'SCONJ'])

In [82]:
label_mask = [1] * len(label_ids)

In [83]:
# Pad from the label lists' own length rather than reusing `padding`, which was
# sized from input_ids in an earlier cell and may be stale by the time this
# runs. This also makes re-running the cell a no-op.
label_ids += [0] * (max_length - len(label_ids))
label_mask += [0] * (max_length - len(label_mask))
print(len(label_ids), len(label_mask), label_ids[-3:])

64 64 [0, 0, 0]


In [85]:
from transformers import TFBertForTokenClassification

In [86]:
# Size the token-classification head to the tagset; without num_labels the head
# is built with the config default (2 labels) instead of one output per tag.
model = TFBertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased', num_labels=len(tagset)
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForTokenClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
