In [1]:
import transformers
from transformers import TFBertForTokenClassification
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
from data_preparation_pos import ABSATokenizer, convert_examples_to_tf_dataset, read_conll

In [2]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpu_devices[0], True)

In [3]:
train_data = read_conll("../data/ud/vi/vi_vtb-ud-train.conllu")
train_examples = [{"id": sent_id, "tokens": tokens, "tags": tags} for sent_id, tokens, tags in zip(train_data[0], 
                                                                                                   train_data[1],
                                                                                                   train_data[2])]
dev_data = read_conll("../data/ud/vi/vi_vtb-ud-train.conllu")
dev_examples = [{"id": sent_id, "tokens": tokens, "tags": tags} for sent_id, tokens, tags in zip(dev_data[0], 
                                                                                                 dev_data[1],
                                                                                                 dev_data[2])]
tagset = ["O", "_", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", 
          "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]
num_labels = len(tagset)

In [4]:
tokenizer = ABSATokenizer.from_pretrained('bert-base-multilingual-cased')
config = transformers.BertConfig.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)
model = TFBertForTokenClassification.from_pretrained('bert-base-multilingual-cased',
                                                     config=config)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForTokenClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
batch_size = 8
epochs = 20
train_dataset = convert_examples_to_tf_dataset(examples=train_examples, tokenizer=tokenizer, tagset=tagset, max_length=256)
train_dataset = train_dataset.shuffle(100000).batch(batch_size).repeat(epochs)
dev_dataset = convert_examples_to_tf_dataset(examples=dev_examples, tokenizer=tokenizer, tagset=tagset, max_length=256)
dev_dataset = dev_dataset.shuffle(100000).batch(batch_size).repeat(1)

In [6]:
example_batch = train_dataset.as_numpy_iterator().next()

for token, label in zip(example_batch[0]["input_ids"][0], example_batch[1][0]):
    if token == 0:
        break
    print("{:<25}{:<20}".format(tokenizer.decode(int(token)), tagset[label]))

H ọ                      PROPN               
c ù n g                  ADJ                 
p h ả i                  VERB                
k ý                      VERB                
v à                      SCONJ               
đ i ể m                  VERB                
c h ỉ                    VERB                
v à o                    ADP                 
b ả n                    NOUN                
n h ậ n                  VERB                
t ộ i                    NOUN                
.                        PUNCT               


In [7]:
checkpoint = ModelCheckpoint('../checkpoints_vi/multibert_pos_checkpoint.hdf5', 
                             verbose=1, monitor='val_ignore_acc',
                             save_best_only=True, mode='max', save_weights_only=True)

In [8]:
import tensorflow.keras.backend as K
def ignore_acc(y_true_class, y_pred_class, class_to_ignore=0):
    y_pred_class = K.cast(K.argmax(y_pred_class, axis=-1), 'int32')
    y_true_class = K.cast(y_true_class, 'int32')
    ignore_mask = K.cast(K.not_equal(y_true_class, class_to_ignore), 'int32')
    matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
    accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
    return accuracy

In [9]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=[ignore_acc])

In [11]:
model.fit(train_dataset, epochs=epochs, steps_per_epoch=np.ceil(len(train_examples) / batch_size),
          validation_data=dev_dataset, validation_steps=np.ceil(len(dev_examples) / batch_size),
          callbacks=[checkpoint])

Epoch 1/20
Epoch 00001: val_ignore_acc did not improve from 0.99866
Epoch 2/20
Epoch 00002: val_ignore_acc improved from 0.99866 to 0.99942, saving model to ../checkpoints_vi/multibert_pos_checkpoint.hdf5
Epoch 3/20
Epoch 00003: val_ignore_acc did not improve from 0.99942
Epoch 4/20
Epoch 00004: val_ignore_acc did not improve from 0.99942
Epoch 5/20
Epoch 00005: val_ignore_acc did not improve from 0.99942
Epoch 6/20
Epoch 00006: val_ignore_acc did not improve from 0.99942
Epoch 7/20
Epoch 00007: val_ignore_acc improved from 0.99942 to 0.99951, saving model to ../checkpoints_vi/multibert_pos_checkpoint.hdf5
Epoch 8/20
Epoch 00008: val_ignore_acc did not improve from 0.99951
Epoch 9/20
Epoch 00009: val_ignore_acc improved from 0.99951 to 0.99967, saving model to ../checkpoints_vi/multibert_pos_checkpoint.hdf5
Epoch 10/20
Epoch 00010: val_ignore_acc did not improve from 0.99967
Epoch 11/20
Epoch 00011: val_ignore_acc did not improve from 0.99967
Epoch 12/20

KeyboardInterrupt: 