In [1]:
import transformers
from transformers import TFBertForTokenClassification, TFXLMRobertaForTokenClassification
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
import glob
import os
import IPython

import sys
sys.path.append("..")
from data_preparation.data_preparation_pos import MBERT_Tokenizer, XLMR_Tokenizer, read_conll
import utils.utils as utils
import utils.pos_utils as pos_utils
import utils.fine_tuning_utils as fine_tune_utils
import utils.model_utils as model_utils
import data_preparation.data_preparation_pos as data_preparation

### Training language and general setup

In [2]:
# Experiment configuration: data directory, model family, experiment tag and task.
data_path = "../data/ud/"
short_model_name = "xlm-roberta"
experiment = "tfm"
task = "pos"

# The checkpoint root was a hard-coded, machine-specific absolute path.
# It can now be overridden with the TFM_CHECKPOINTS_PATH environment variable;
# when the variable is unset the original location is used unchanged.
checkpoints_path = os.environ.get("TFM_CHECKPOINTS_PATH", "E:/TFM_CCIL/checkpoints/")

# NOTE(review): the recorded output of this cell shows an interactive prompt
# ("Retrain language?"), so the selected language depends on manual input --
# presumably raised inside get_global_training_state; confirm before relying
# on Restart & Run All.
training_lang = fine_tune_utils.get_global_training_state(
    data_path, short_model_name, experiment, checkpoints_path
)
trainer = fine_tune_utils.Trainer(training_lang, data_path, task, short_model_name)

No languages remaining 

Already trained:     Bulgarian  English  Russian  Slovak  Croatian  Chinese  Vietnamese  Finnish  Basque  Japanese  Korean  Turkish  Arabic  Hebrew

Cannot train:        Thai

Retrain language? y
Language to re-train: English


### Model setup

In [3]:
# Label inventory: one output class per tag returned by pos_utils.get_ud_tags()
tagset = pos_utils.get_ud_tags()
num_labels = len(tagset)

# Training hyperparameters
epochs = 20
learning_rate = 2e-5
batch_size = 8
max_length = 256

# Create the model from the settings above
trainer.build_model(
    max_length,
    batch_size,
    learning_rate,
    epochs,
    num_labels,
    tagset=tagset,
    eval_batch_size=64,
)

# Set up the checkpoint (for the best model weights) under checkpoints_path
trainer.setup_checkpoint(checkpoints_path)

Some weights of the model checkpoint at jplu/tf-xlm-roberta-base were not used when initializing TFXLMRobertaForTokenClassification: ['lm_head']
- This IS expected if you are initializing TFXLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFXLMRobertaForTokenClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-base and are newly initialized: ['dropout_38', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully built tf-xlm-roberta-base
Checkpoint file: E:/TFM_CCIL/checkpoints/en/tf-xlm-roberta-base_pos_checkpoint.hdf5


### Data preparation

In [4]:
trainer.prepare_data()

print("Train examples:", len(trainer.train_data))

# Sanity check: print the first tokens of the first sentence in the first
# training batch, next to the tag assigned to each of them.
max_preview_tokens = 10
example_batch = next(trainer.train_dataset.as_numpy_iterator())
example_input_ids = example_batch[0]["input_ids"][0]
example_labels = example_batch[1][0]
for position, (token, label) in enumerate(zip(example_input_ids, example_labels)):
    if not token:
        # A zero token id ends the preview (presumably padding -- TODO confirm
        # against the tokenizer used by trainer.prepare_data).
        break
    if position >= max_preview_tokens:
        # Cut off by position. The previous check compared the token *value*
        # with the value stored at index 10, so any earlier token sharing that
        # id truncated the preview prematurely.
        print("...")
        break
    print("{:<25}{:<20}".format(trainer.tokenizer.decode(int(token)), tagset[label]))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Train examples: 4287
"                        PUNCT               
Get                      VERB                
out                      ADP                 
of                       ADP                 
here                     ADV                 
"                        PUNCT               
meant                    VERB                
to                       PART                
make                     VERB                
someone                  NOUN                
...


### Training

In [6]:
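# Optional: restore previously saved weights before training (left disabled).
# NOTE(review): `model`, `checkpoint_dir` and `model_name` are not defined in the cells shown above -- update these names before re-enabling.
# model.load_weights(checkpoint_dir + model_name + "_pos_checkpoint.hdf5")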

In [7]:
# NOTE(review): `model`, `train_dataset`, `train_examples`, `dev_dataset`,
# `dev_examples` and `checkpoint` are not defined in any cell shown in this
# notebook, so this cell relies on hidden kernel state. The equivalents
# presumably live on `trainer` now -- TODO confirm against
# fine_tune_utils.Trainer and switch to them.
#
# Keras documents steps_per_epoch / validation_steps as integers, while np.ceil
# returns a float, so the step counts are converted explicitly.
model.fit(
    train_dataset,
    epochs=epochs,
    steps_per_epoch=int(np.ceil(len(train_examples) / batch_size)),
    validation_data=dev_dataset,
    validation_steps=int(np.ceil(len(dev_examples) / batch_size)),
    callbacks=[checkpoint],
)

Epoch 1/20
Epoch 00001: val_ignore_acc improved from -inf to 0.93807, saving model to E:/TFM_CCIL/checkpoints/he/tf-xlm-roberta-base_pos_checkpoint.hdf5
Epoch 2/20
Epoch 00002: val_ignore_acc improved from 0.93807 to 0.95742, saving model to E:/TFM_CCIL/checkpoints/he/tf-xlm-roberta-base_pos_checkpoint.hdf5
Epoch 3/20
Epoch 00003: val_ignore_acc improved from 0.95742 to 0.96128, saving model to E:/TFM_CCIL/checkpoints/he/tf-xlm-roberta-base_pos_checkpoint.hdf5
Epoch 4/20
Epoch 00004: val_ignore_acc improved from 0.96128 to 0.96267, saving model to E:/TFM_CCIL/checkpoints/he/tf-xlm-roberta-base_pos_checkpoint.hdf5
Epoch 5/20
Epoch 00005: val_ignore_acc improved from 0.96267 to 0.96349, saving model to E:/TFM_CCIL/checkpoints/he/tf-xlm-roberta-base_pos_checkpoint.hdf5
Epoch 6/20
Epoch 00006: val_ignore_acc improved from 0.96349 to 0.96696, saving model to E:/TFM_CCIL/checkpoints/he/tf-xlm-roberta-base_pos_checkpoint.hdf5
Epoch 7/20
Epoch 00007: val_ignore_acc improved from 0.96696 to 0.9

<tensorflow.python.keras.callbacks.History at 0xc6b49c74e0>