In [1]:
import transformers
from transformers import TFBertForTokenClassification
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
import glob
import os
import IPython

import sys
sys.path.append("..")
from data_preparation.data_preparation_pos import ABSATokenizer, convert_examples_to_tf_dataset, read_conll
import utils.utils as utils
from utils.pos_utils import ignore_acc

### Training language setup

In [2]:
path = "../data/ud/"

code_dicts = utils.make_lang_code_dicts("../utils/lang_codes.xlsx")
code_to_name = code_dicts["code_to_name"]
name_to_code = code_dicts["name_to_code"]

file = open("../data_exploration/pos_table.txt", "r")
all_langs = [line.split("&")[1].strip() for line in file.readlines()]
trained_langs = [code_to_name[x.split("\\")[1]] for x in glob.glob("E:/TFM_CCIL/checkpoints/*/*pos.hdf5")]
cannot_train_langs = []
remaining_langs = []
for lang in all_langs:
    # Check if there are train and dev sets available
    if glob.glob(path + name_to_code[lang] + "/*train.conllu") and glob.glob(path + name_to_code[lang] + "/*dev.conllu"):
        if lang not in trained_langs:
            remaining_langs.append(lang)
    else:
        cannot_train_langs.append(lang)

if remaining_langs:
    training_lang = remaining_langs[0]
    print("{:<20}".format("Training language:"), training_lang, "\n")
    training_lang = name_to_code[training_lang]
    print(IPython.utils.text.columnize(["Already trained:   "] + trained_langs, displaywidth=150))
    print(IPython.utils.text.columnize(["Not yet trained:   "] + remaining_langs[1:], displaywidth=150))
    print(IPython.utils.text.columnize(["Cannot train:      "] + cannot_train_langs, displaywidth=150))
else:
    print("No languages remaining", "\n")
    print(IPython.utils.text.columnize(["Cannot train:      "] + cannot_train_langs, displaywidth=150))

No languages remaining 

Cannot train:        Thai



### Model setup

In [3]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpu_devices[0], True)

In [4]:
# Model parameters
max_length = 256
batch_size = 8
learning_rate = 2e-5
epochs = 20
model_name = "bert-base-multilingual-cased"
tagset = ["O", "_", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", 
          "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]
num_labels = len(tagset)

# Model creation
tokenizer = ABSATokenizer.from_pretrained(model_name)
config = transformers.BertConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFBertForTokenClassification.from_pretrained(model_name,
                                                     config=config)

# Checkpoint for best model weights
checkpoint_dir = "E:/TFM_CCIL/checkpoints/" + training_lang + "/"
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)
checkpoint = ModelCheckpoint(checkpoint_dir + model_name + "_pos_checkpoint.hdf5", 
                             verbose=1, monitor="val_ignore_acc",
                             save_best_only=True, mode="max", save_weights_only=True)

# Model compilation
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=[ignore_acc])

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForTokenClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data preparation

In [5]:
datasets = {}

for dataset_name in ["train", "dev"]:
    # look for the data file
    try:
        file_path = glob.glob(path + training_lang + "/*-{}.conllu".format(dataset_name))[0]
    except IndexError:
        raise Exception("Could not find " + dataset_name + " data file")
        
    # Load and extract info
    conllu_data = read_conll(file_path)
    examples = [{"id": sent_id, "tokens": tokens, "tags": tags} for sent_id, tokens, tags in zip(conllu_data[0], 
                                                                                                 conllu_data[1],
                                                                                                 conllu_data[2])]
    # In case some example is over max length
    examples = [example for example in examples if len(tokenizer.subword_tokenize(example["tokens"], 
                                                                                  example["tags"])[0]) <= max_length]
    
    # Convert to TF dataset
    dataset = convert_examples_to_tf_dataset(examples=examples, tokenizer=tokenizer, tagset=tagset, max_length=max_length)
    if dataset_name == "train":
        dataset = dataset.shuffle(100000, reshuffle_each_iteration=True).batch(batch_size).repeat(epochs)
    else:
        dataset = dataset.batch(batch_size)
        
    datasets[dataset_name] = (examples, dataset)
    
train_examples, train_dataset = datasets["train"]
dev_examples, dev_dataset = datasets["dev"]

# Print an example sentence for sanity
example_batch = train_dataset.as_numpy_iterator().next()
for token, label in zip(example_batch[0]["input_ids"][0], example_batch[1][0]):
    if not token:
        break
    elif token == example_batch[0]["input_ids"][0][10]:
        print("...")
        break
    print("{:<25}{:<20}".format(tokenizer.decode(int(token)), tagset[label]))

K a p                    NOUN                
# # ı                    NOUN                
d                        NOUN                
# # ı ş                  NOUN                
# # a r ı                NOUN                
# # d a n                NOUN                
v u                      VERB                
# # r u l                VERB                
# # u y                  VERB                
# # o r                  VERB                
...


### Training

In [6]:
model.fit(train_dataset, epochs=epochs, steps_per_epoch=np.ceil(len(train_examples) / batch_size),
          validation_data=dev_dataset, validation_steps=np.ceil(len(dev_examples) / batch_size),
          callbacks=[checkpoint])

Epoch 1/20
Epoch 00001: val_ignore_acc improved from -inf to 0.95012, saving model to E:/TFM_CCIL/checkpoints/ar/bert-base-multilingual-cased_pos_checkpoint.hdf5
Epoch 2/20
Epoch 00002: val_ignore_acc improved from 0.95012 to 0.95754, saving model to E:/TFM_CCIL/checkpoints/ar/bert-base-multilingual-cased_pos_checkpoint.hdf5
Epoch 3/20
Epoch 00003: val_ignore_acc improved from 0.95754 to 0.95812, saving model to E:/TFM_CCIL/checkpoints/ar/bert-base-multilingual-cased_pos_checkpoint.hdf5
Epoch 4/20
Epoch 00004: val_ignore_acc improved from 0.95812 to 0.96214, saving model to E:/TFM_CCIL/checkpoints/ar/bert-base-multilingual-cased_pos_checkpoint.hdf5
Epoch 5/20
Epoch 00005: val_ignore_acc did not improve from 0.96214
Epoch 6/20
Epoch 00006: val_ignore_acc improved from 0.96214 to 0.96280, saving model to E:/TFM_CCIL/checkpoints/ar/bert-base-multilingual-cased_pos_checkpoint.hdf5
Epoch 7/20
Epoch 00007: val_ignore_acc improved from 0.96280 to 0.96367, saving model to E:/TFM_CCIL/checkpoin

<tensorflow.python.keras.callbacks.History at 0x601c586908>