In [2]:
from transformers import BertTokenizer
import transformers
import tensorflow as tf
from data_preparation import Example, convert_examples_to_tf_dataset
from conllu import parse
import numpy as np

In [3]:
# Read the whole CoNLL-U test file. The context manager closes the file
# handle right away instead of leaving it open until garbage collection.
with open("../data/ud/en/en_pud-ud-test.conllu", "r", encoding="utf-8") as conllu_file:
    en_pos = conllu_file.read()

In [4]:
# Parse the raw CoNLL-U text into an indexable collection of sentences.
sentences = parse(en_pos)

In [5]:
# Use the first sentence as the running example for the rest of the notebook.
example = sentences[0]

In [6]:
example

TokenList<“, While, much, of, the, digital, transition, is, unprecedented, in, the, United, States, ,, the, peaceful, transition, of, power, is, not, ,, ”, Obama, special, assistant, Kori, Schulman, wrote, in, a, blog, post, Monday, .>

In [7]:
# Surface form ("form" field) of every token in the example sentence.
tokens = [token["form"] for token in example]

In [8]:
tokens[:5]

['“', 'While', 'much', 'of', 'the']

In [9]:
# POS tag ("upos" field) of every token, index-aligned with `tokens`.
labels = [token["upos"] for token in example]

In [10]:
labels[:5]

['PUNCT', 'SCONJ', 'ADJ', 'ADP', 'DET']

In [11]:
class ABSATokenizer(BertTokenizer):
    """BertTokenizer with an extra helper that splits words and their labels together."""

    def subword_tokenize(self, tokens, labels):
        """Split each word into sub-tokens and repeat the word's label for every piece.

        Only the pieces returned by ``self.wordpiece_tokenizer`` are emitted;
        no special tokens are inserted by this method.

        Args:
            tokens: Sequence of word strings.
            labels: Sequence of labels, indexed in parallel with ``tokens``.

        Returns:
            A tuple ``(split_tokens, split_labels, idx_map)`` of three lists with
            one entry per sub-token: the sub-token itself, the label of the word
            it came from, and the index of that word in ``tokens``. ``idx_map``
            is what allows sub-token predictions to be merged back into one
            prediction per original word at evaluation time.
        """
        split_tokens, split_labels, idx_map = [], [], []
        for word_index, word in enumerate(tokens):
            pieces = list(self.wordpiece_tokenizer.tokenize(word))
            split_tokens.extend(pieces)
            # Generator form: labels[word_index] is only looked up when the
            # word actually produced at least one piece.
            split_labels.extend(labels[word_index] for _ in pieces)
            idx_map.extend(word_index for _ in pieces)
        return split_tokens, split_labels, idx_map

In [12]:
# Instantiate the custom tokenizer from the 'bert-base-multilingual-cased' checkpoint.
tokenizer = ABSATokenizer.from_pretrained('bert-base-multilingual-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




Some words will be broken into subwords

In [13]:
tokens[8]

'unprecedented'

In [14]:
tokenizer(tokens[8])

{'input_ids': [101, 10119, 30619, 104101, 10336, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [15]:
# Print the decoded text of each id produced for the single word tokens[8].
# NOTE(review): decode() is given one int at a time here and the recorded
# output shows the characters separated by spaces (e.g. "# # p r e").
# Passing a list (decode([subword])) or using convert_ids_to_tokens may
# print the sub-tokens cleanly -- confirm against the installed version.
for subword in tokenizer(tokens[8])["input_ids"]:
    print(tokenizer.decode(subword))

[ C L S ]
u n
# # p r e
# # c e d e n t
# # e d
[ S E P ]


We need to assign the original word's label to all subwords

In [16]:
# Expand the sentence into sub-tokens, with one label and one source-word index per sub-token.
split_tokens, split_labels, idx_map = tokenizer.subword_tokenize(tokens, labels) # [CLS] and [SEP] won't be added

In [17]:
split_tokens[8:12], split_labels[8:12]

(['un', '##pre', '##cedent', '##ed'], ['ADJ', 'ADJ', 'ADJ', 'ADJ'])

Next we create the id and mask lists

In [18]:
# Convert each sub-token string to its vocabulary id.
input_ids = tokenizer.convert_tokens_to_ids(split_tokens)

In [19]:
input_ids[8:12]

[10119, 30619, 104101, 10336]

In [20]:
tokenizer.decode(input_ids[10])

'# # c e d e n t'

In [21]:
# One 1 per id currently in input_ids (padding has not been added yet at this point).
attention_mask = [1] * len(input_ids)

In [22]:
# Target sequence length, and the list of zeros needed to reach it.
# Note: if input_ids is already max_length or longer, the multiplication
# yields an empty list, so nothing is truncated.
max_length = 64
padding = [0] * (max_length - len(input_ids))

In [23]:
# Right-pad ids and mask with zeros up to max_length.
# The pad amount is recomputed from the current length and the names are
# rebound (rather than `+=` with a precomputed list), so the cell is
# idempotent: re-running it does not append a second round of padding.
input_ids = input_ids + [0] * max(0, max_length - len(input_ids))
attention_mask = attention_mask + [0] * max(0, max_length - len(attention_mask))
print(len(input_ids), len(attention_mask), input_ids[-3:])

64 64 [0, 0, 0]


In [24]:
# Every position, padding included, gets token type id 0.
token_type_ids = [0] * max_length

Index the tagset

In [25]:
# Tag inventory; a tag's position in this list becomes its integer label id.
# "O" sits at index 0, the same value that is used to pad label_ids.
tagset = ["O", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", 
          "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]

In [26]:
# Number of classes; passed to the model config as num_labels.
num_labels = len(tagset)

In [48]:
# Tag string -> integer id (the tag's position in tagset).
label_map = {label: i for i, label in enumerate(tagset)}

In [49]:
label_map

{'O': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'AUX': 4,
 'CCONJ': 5,
 'DET': 6,
 'INTJ': 7,
 'NOUN': 8,
 'NUM': 9,
 'PART': 10,
 'PRON': 11,
 'PROPN': 12,
 'PUNCT': 13,
 'SCONJ': 14,
 'SYM': 15,
 'VERB': 16,
 'X': 17}

Label ids and mask

In [29]:
# Integer label id for every sub-token; raises KeyError for a tag missing from label_map.
label_ids = [label_map[label] for label in split_labels]

In [30]:
label_ids[:2], split_labels[:2]

([13, 14], ['PUNCT', 'SCONJ'])

In [31]:
# One 1 per real (unpadded) label position.
# NOTE(review): label_mask is not passed to the dataset, loss, or metric in
# the cells shown here -- confirm whether padded positions should be masked.
label_mask = [1] * len(label_ids)

In [32]:
# Right-pad label ids and label mask with zeros up to max_length.
# Id 0 is also the id of the "O" tag in label_map, so padded label
# positions are indistinguishable from "O" unless label_mask is consulted.
# Rebinding with a freshly computed pad amount keeps the cell idempotent:
# re-running it does not append a second round of padding.
label_ids = label_ids + [0] * max(0, max_length - len(label_ids))
label_mask = label_mask + [0] * max(0, max_length - len(label_mask))
print(len(label_ids), len(label_mask), label_ids[-3:])

64 64 [0, 0, 0]


In [33]:
from transformers import TFBertForTokenClassification

In [34]:
# Configuration of the pretrained checkpoint, with num_labels set to the tagset size.
config = transformers.BertConfig.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




In [35]:
# Load the pretrained encoder weights. Per the warning printed below, the
# 'classifier' layer is newly initialized and must be trained before use.
model = TFBertForTokenClassification.from_pretrained('bert-base-multilingual-cased',
                                                     config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForTokenClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
from transformers.data.processors.utils import InputFeatures

In [53]:
def convert_examples_to_tf_dataset(examples, tokenizer=None):
    """Build a ``tf.data.Dataset`` of (features, labels) pairs from pre-encoded examples.

    Args:
        examples: Iterable of mappings with the keys ``"input_ids"``,
            ``"attention_mask"``, ``"token_type_ids"`` and ``"labels"``.
        tokenizer: Not used by this function. It is kept in the signature for
            callers that pass it, and defaults to ``None`` so that the
            single-argument call made in this notebook does not raise
            ``TypeError``.

    Returns:
        An unbatched dataset whose elements are
        ``({"input_ids", "attention_mask", "token_type_ids"}, labels)``:
        three int32 vectors and one int64 label vector, each declared with
        an unknown length.

    Raises:
        KeyError: If an example lacks one of the four expected keys.
    """
    # Materialize the features up front so the generator can be iterated
    # repeatedly (e.g. once per epoch / repeat).
    features = [
        InputFeatures(
            input_ids=e["input_ids"],
            attention_mask=e["attention_mask"],
            token_type_ids=e["token_type_ids"],
            label=e["labels"],
        )
        for e in examples
    ]

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([None]),
        ),
    )

In [38]:
# Toy dataset: the same padded sentence is supplied twice.
toy_example = {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "token_type_ids": token_type_ids,
    "labels": label_ids,
}
dataset = convert_examples_to_tf_dataset([toy_example, toy_example])
# Two examples -> one batch of size 2, repeated 5 times.
dataset = dataset.shuffle(100).batch(2).repeat(5)

In [39]:
list(iter(dataset))[0]

({'input_ids': <tf.Tensor: shape=(2, 64), dtype=int32, numpy=
  array([[   100,  14600,  13172,  10108,  10105,  16924,  35959,  10124,
           10119,  30619, 104101,  10336,  10106,  10105,  10609,  10859,
             117,  10105,  28101,  14446,  35959,  10108,  13183,  10124,
           10472,    117,    100,  28353,  14478,  20999,  30186,  10401,
           55260,  45624,  10589,  13954,  10106,    169,  31907,  11841,
           40714,    119,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0],
         [   100,  14600,  13172,  10108,  10105,  16924,  35959,  10124,
           10119,  30619, 104101,  10336,  10106,  10105,  10609,  10859,
             117,  10105,  28101,  14446,  35959,  10108,  13183,  10124,
           10472,    117,    100,  28353,  14478,  20999,  30186,  10401,
           55260,  45624,  10589,  13954,  10106,

In [40]:
# Adam with a small learning rate; the loss consumes raw logits (from_logits=True)
# and integer class ids (sparse).
# NOTE(review): no mask or sample weights are supplied, so loss and accuracy
# appear to be computed over padded positions as well -- confirm this is intended.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy()
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [42]:
# One step per epoch for 5 epochs, matching the 5 batches produced by
# .batch(2).repeat(5) on the two-example dataset.
model.fit(dataset, epochs=5, steps_per_epoch=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x24d80097688>

In [64]:
# Run inference on a single batch taken from the training dataset.
preds = model.predict(dataset.take(1))

In [68]:
preds[0].shape

(2, 64, 18)

In [74]:
# preds[0] holds per-position class scores for the batch. Normalize them,
# keep the first sequence, and take the most likely tag id at each position.
first_sequence_probs = tf.nn.softmax(preds[0], axis=-1)[0]
final = np.array(tf.math.argmax(first_sequence_probs, axis=-1))

In [77]:
final

array([ 1,  0,  1,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,  0, 12, 12,  0,
        1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0, 12, 12, 12, 12,
       12,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int64)

In [76]:
np.array(label_ids)

array([13, 14,  1,  2,  6,  1,  8,  4,  1,  1,  1,  1,  2,  6, 12, 12, 13,
        6,  1,  1,  8,  2,  8,  4,  3, 13, 13, 12,  1,  8, 12, 12, 12, 12,
       12, 16,  2,  6,  8,  8, 12, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [79]:
# Fraction of positions where the predicted id equals the gold id.
# NOTE(review): the mean runs over all max_length positions, including the
# zero-padded tail of label_ids, so it is not a per-token accuracy; restrict
# the comparison with label_mask to score real tokens only.
(final == np.array(label_ids)).mean()

0.578125