In [1]:
import numpy as np
import pandas as pd
import glob
import os
import tensorflow as tf
import transformers
from transformers import TFBertForTokenClassification
from tqdm.notebook import tqdm
import IPython

import sys
sys.path.append("..")
from data_preparation.data_preparation_pos import ABSATokenizer, convert_examples_to_tf_dataset, read_conll
import utils.utils as utils
import utils.pos_utils as pos_utils

### Training language setup

In [8]:
# Pick the next training language whose PoS checkpoint exists but whose
# results have not been written to the results workbook yet.
code_dicts = utils.make_lang_code_dicts("../utils/lang_codes.xlsx")
code_to_name = code_dicts["code_to_name"]
name_to_code = code_dicts["name_to_code"]

results_path = "../results/results_pos.xlsx"

# Look for languages that have PoS weights but are not in the results file
# (the context manager closes the handle once the names have been read).
with open("../data_exploration/pos_table.txt", "r") as file:
    all_langs = [line.split("&")[1].strip() for line in file]

# The language code is the name of the directory holding the checkpoint.
# Using os.path instead of splitting on "\\" keeps this independent of the
# path separator that glob returns on the current platform.
trained_langs = [
    code_to_name[os.path.basename(os.path.dirname(checkpoint))]
    for checkpoint in glob.glob("E:/TFM_CCIL/checkpoints/*/*pos.hdf5")
]

if os.path.isfile(results_path):
    results = pd.read_excel(results_path, sheet_name=None)
    remaining_langs = [lang for lang in trained_langs if lang not in results["Accuracy"].columns]
else:
    remaining_langs = trained_langs

untrained_langs = [lang for lang in all_langs if lang not in trained_langs]
evaluated_langs = [lang for lang in trained_langs if lang not in remaining_langs]

if remaining_langs:
    training_lang = remaining_langs[0]
    print("Evaluating with:   ", training_lang, "\n")
    # From here on `training_lang` holds the language code, not the full name.
    training_lang = name_to_code[training_lang]
    print(IPython.utils.text.columnize(["Already evaluated:"] + evaluated_langs, displaywidth=150))
    print(IPython.utils.text.columnize(["Not yet evaluated:"] + remaining_langs[1:], displaywidth=150))
    print(IPython.utils.text.columnize(["Still to train:   "] + untrained_langs, displaywidth=150))
else:
    # NOTE: `training_lang` is not assigned in this branch, so the cells below
    # that use it cannot run when nothing is left to evaluate.
    print("No languages remaining", "\n")
    print(IPython.utils.text.columnize(["Already evaluated:"] + evaluated_langs, displaywidth=150))
    print(IPython.utils.text.columnize(["Still to train:   "] + untrained_langs, displaywidth=150))

Evaluating with:    Bulgarian 

Already evaluated:  English

Not yet evaluated:  Basque  Hebrew  Croatian  Russian  Slovak  Vietnamese

Still to train:     Chinese  Thai  Finnish  Japanese  Korean  Turkish  Arabic



### Model setup

In [3]:
# Enable on-demand memory allocation on every visible GPU. Looping (instead of
# indexing gpu_devices[0]) also makes this a no-op on machines without a GPU,
# where the original indexing would raise IndexError.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [4]:
# --- Hyperparameters and tag inventory ---
max_length = 256
batch_size = 256
model_name = "bert-base-multilingual-cased"
tagset = [
    "O", "_", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN",
    "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
]
num_labels = len(tagset)
# Each tag maps to its position in `tagset`.
label_map = dict(zip(tagset, range(num_labels)))

# --- Tokenizer, config and model built from the pretrained checkpoint ---
tokenizer = ABSATokenizer.from_pretrained(model_name)
config = transformers.BertConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFBertForTokenClassification.from_pretrained(model_name, config=config)

# --- Load the PoS weights saved for the selected training language ---
weights_filename = model_name + "_pos.hdf5"
weights_path = "E:/TFM_CCIL/checkpoints/" + training_lang + "/"
model.load_weights(weights_path + weights_filename)
print("Using weights from", weights_path + weights_filename)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForTokenClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using weights from E:/TFM_CCIL/checkpoints/en/bert-base-multilingual-cased_pos.hdf5


### Evaluation

In [7]:
# Evaluate the loaded model on every dataset directory under `data_dir` and
# collect one (directory name, accuracy) pair per dataset in `pos_eval`.
data_dir = "../data/ud/"
pos_eval = []

# sorted() makes the evaluation order deterministic; os.listdir returns
# entries in arbitrary order.
for directory in tqdm(sorted(os.listdir(data_dir))):
    # Load and preprocess
    path = os.path.join(data_dir, directory)
    test_examples, test_dataset = pos_utils.load_data(path, batch_size, tokenizer, tagset)

    # Predict (np.ceil returns a float, but the step count must be an integer)
    n_steps = int(np.ceil(len(test_examples) / batch_size))
    preds = model.predict(test_dataset, steps=n_steps, verbose=1)

    # Postprocessing
    tokens, labels, filtered_preds, logits = pos_utils.filter_padding_tokens(test_examples, preds, label_map, tokenizer)
    subword_locations = pos_utils.find_subword_locations(tokens)
    new_tokens, new_labels, new_preds = pos_utils.reconstruct_subwords(subword_locations, tokens, labels,
                                                                       filtered_preds, logits)

    # Metrics: fraction of positions where the reconstructed prediction equals the label
    accuracy = (np.array(new_labels) == np.array(new_preds)).mean()
    pos_eval.append((directory, accuracy))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




Build the table for this training language

In [10]:
# Turn the list of (directory, accuracy) pairs into a two-column frame and
# replace each directory code with its full language name.
pos_eval = np.array(pos_eval, dtype=object)
lang_codes = pos_eval[:, 0]
accuracies = pos_eval[:, 1]
table = pd.DataFrame({"Language": lang_codes, "Accuracy": accuracies})
table["Language"] = table["Language"].apply(lambda code: code_to_name[code])

Reorder so that language types are grouped

In [11]:
# Read the reference language order (second "&"-separated field of each line);
# the context manager closes the file once the names have been read.
with open("../data_exploration/pos_table.txt", "r") as file:
    lang_order = [line.split("&")[1].strip() for line in file]

# Sort rows by each language's position in `lang_order`. `assign` adds the
# temporary key on a copy, so the existing `table` object is not mutated.
table = (
    table
    .assign(sort=table["Language"].apply(lambda x: lang_order.index(x)))
    .sort_values(by=["sort"])
    .drop("sort", axis=1)
    .reset_index(drop=True)
)

In [12]:
# Display the accuracy table as the cell output.
table

Unnamed: 0,Language,Accuracy
0,Bulgarian,0.847113
1,English,0.959244
2,Russian,0.847445
3,Slovak,0.835124
4,Croatian,0.848186
5,Chinese,0.573477
6,Vietnamese,0.583711
7,Thai,0.3743
8,Finnish,0.856917
9,Basque,0.695659


Update the results Excel file

In [13]:
# Load every sheet of the existing results workbook, or start a fresh set of
# sheets (one per metric column of `table`) when no workbook exists yet.
results_path = "../results/results_pos.xlsx"

if os.path.isfile(results_path):
    results = pd.read_excel(results_path, sheet_name=None)
else:
    # Build a separate DataFrame per metric. dict.fromkeys(keys, frame) would
    # store the *same* DataFrame object under every key, so a column added to
    # one sheet would show up in (and be overwritten through) all of them.
    results = {
        metric: pd.DataFrame({"Language": table["Language"].values})
        for metric in table.columns[1:].values
    }

In [14]:
# Add this training language's scores to each metric sheet, then write the
# whole workbook back to disk.
full_training_lang = code_to_name[training_lang]

# Prepare all columns before opening the writer, so a failed lookup cannot
# leave a partially written workbook behind.
scores_by_language = table.set_index("Language")
for sheet_name, df in results.items():
    # Add the column for this metric to the corresponding sheet, matching
    # rows by language name rather than by row position, so a different row
    # order or row count in the stored sheet cannot misattribute scores.
    df[full_training_lang] = df["Language"].map(scores_by_language[sheet_name])

with pd.ExcelWriter(results_path) as writer:
    for sheet_name, df in results.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)

In [4]:
# Dictionary-based variant of the evaluation loop: maps each dataset
# directory name to the accuracy obtained on it.
data_dir = "../data/ud/"
pos_eval = {}
batch_size = 256 # Doesn't really matter here
for directory in tqdm(os.listdir(data_dir)):
    path = os.path.join(data_dir, directory)
    # The helpers are only in scope through the `pos_utils` module imported at
    # the top of the notebook, so they are called with the module prefix.
    test_examples, test_dataset = pos_utils.load_data(path, batch_size, tokenizer, tagset)
    # np.ceil returns a float; the step count must be an integer.
    n_steps = int(np.ceil(len(test_examples) / batch_size))
    preds = model.predict(test_dataset, steps=n_steps)
    tokens, labels, filtered_preds, logits = pos_utils.filter_padding_tokens(test_examples, preds, label_map, tokenizer)
    subword_locations = pos_utils.find_subword_locations(tokens)
    new_tokens, new_labels, new_preds = pos_utils.reconstruct_subwords(subword_locations, tokens, labels, filtered_preds, logits)
    pos_eval[directory] = (np.array(new_labels) == np.array(new_preds)).mean()

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [5]:
# Display the raw per-directory accuracies as the cell output.
pos_eval  

{'ar': 0.7342296756782806,
 'bg': 0.8471126939709998,
 'en': 0.959243908565687,
 'eu': 0.6956593090998605,
 'fi': 0.8569173454568416,
 'he': 0.5951641672722469,
 'hr': 0.8481863149216817,
 'ja': 0.4850788182873404,
 'ko': 0.5937047756874095,
 'ru': 0.8474453686005062,
 'sl': 0.8351243475591035,
 'th': 0.37430017467640075,
 'tr': 0.6696060799443059,
 'vi': 0.5837113545955754,
 'zh': 0.5734765351389213}

In [6]:
# Accuracies as percentage strings, listed from best to worst.
{
    lang: str(round(acc * 100, 2)) + "%"
    for lang, acc in sorted(pos_eval.items(), key=lambda pair: pair[1], reverse=True)
}

{'en': '95.92%',
 'fi': '85.69%',
 'hr': '84.82%',
 'ru': '84.74%',
 'bg': '84.71%',
 'sl': '83.51%',
 'ar': '73.42%',
 'eu': '69.57%',
 'tr': '66.96%',
 'he': '59.52%',
 'ko': '59.37%',
 'vi': '58.37%',
 'zh': '57.35%',
 'ja': '48.51%',
 'th': '37.43%'}

Load the results Excel file

In [None]:
# Read all sheets of the results workbook if it exists; otherwise start empty.
results_path = "../results/results_pos.xlsx"

results = pd.read_excel(results_path, sheet_name=None) if os.path.isfile(results_path) else {}

Check if the sheet already exists

In [None]:
# Name of the sheet that holds the results for the current training language.
sheet = "results_pos_" + training_lang

# Guard clause: refuse to replace a sheet that is already present.
if sheet in results:
    raise Exception("Sheet already exists and would be overwritten, aborting")

results[sheet] = pd.DataFrame({
    "Language": list(pos_eval.keys()),
    "Test_acc": list(pos_eval.values()),
})

Save all sheets into excel file

In [None]:
# Write every sheet held in `results` back to the workbook, without the index.
with pd.ExcelWriter("../results/results_pos.xlsx") as excel_writer:
    for name, frame in results.items():
        frame.to_excel(excel_writer, sheet_name=name, index=False)