In [1]:
import numpy as np
import pandas as pd
import glob
import os
import tensorflow as tf
import transformers
from transformers import TFBertForTokenClassification
from tqdm.notebook import tqdm
import IPython

import sys
sys.path.append("..")
from data_preparation.data_preparation_pos import ABSATokenizer, convert_examples_to_tf_dataset, read_conll
import utils.utils as utils
import utils.pos_utils as pos_utils

In [2]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpu_devices[0], True)

In [3]:
data_dir = "../data/ud/"
results_path = "../results/balanced_length/results_pos_balanced_length.xlsx"
basic_stats = pd.read_excel("../data_exploration/pos_basic_stats.xlsx")
en_ref = basic_stats.loc[basic_stats["language"] == "English", "test_avg_tokens"].values[0]

code_dicts = utils.make_lang_code_dicts()
code_to_name = code_dicts["code_to_name"]
name_to_code = code_dicts["name_to_code"]

# Model parameters
max_length = 256
batch_size = 256
model_name = "bert-base-multilingual-cased"
tagset = ["O", "_", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", 
          "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]
num_labels = len(tagset)
label_map = {label: i for i, label in enumerate(tagset)}

# Model creation
tokenizer = ABSATokenizer.from_pretrained(model_name)
config = transformers.BertConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFBertForTokenClassification.from_pretrained(model_name,
                                                     config=config)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForTokenClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def balance_lengths(test_data, target_mean, tokenizer):
    test_examples = [{"id": sent_id, "tokens": tokens, "tags": tags} for sent_id, tokens, tags in zip(test_data[0],
                                                                                                      test_data[1],
                                                                                                      test_data[2])]
    lengths = [len(tokenizer.subword_tokenize(e["tokens"], e["tags"])[0]) for e in test_examples]
    df = pd.DataFrame({"id": test_data[0], "length": lengths})
    df = df.sort_values("length").reset_index(drop=True)
    
    if target_mean < df["length"].mean():
        while df["length"].mean() > target_mean:
            df = df.drop(df.index[-1])
    else:
        while df["length"].mean() < target_mean:
            df = df.drop(df.index[0])
        
    lost = len(test_examples) - df.shape[0]
    print("Examples lost:", lost)
    print("Examples remaining:", df.shape[0])
    test_examples = [example for example in test_examples if example["id"] in df["id"].values]
    return test_examples, lost

In [6]:
rel_lengths = pd.read_excel("../data_exploration/relative_lengths.xlsx")

In [8]:
langs = ["Bulgarian", "Slovak", "Croatian", "Vietnamese", "Basque", "Hebrew"]


for weights_filepath in tqdm(glob.glob("E:/TFM_CCIL/checkpoints/*/*_pos.hdf5")):
    training_lang = weights_filepath.split("\\")[1]
    
    # Load weights
    model.load_weights(weights_filepath)
    print("\nUsing weights from", weights_filepath)
    
    
    # Evaluation
    pos_eval = []

    for directory in tqdm(langs):
        print(directory)
        directory = name_to_code[directory]
        
        # Load and preprocess
        path = os.path.join(data_dir, directory)
        test_data = read_conll(glob.glob(path + "/*-{}.conllu".format("test"))[0])
        rel_dif = rel_lengths.loc[rel_lengths["Language"] == code_to_name[directory], 
                                      "Relative Difference (%)"].values[0] / 100
        target_mean = (1 + rel_dif) * en_ref
        test_examples, lost = balance_lengths(test_data, target_mean, tokenizer)
        if len(test_examples) == 0 or lost == 0:
            continue
        test_dataset = pos_utils.convert_examples_to_tf_dataset(examples=test_examples, tokenizer=tokenizer, 
                                                                tagset=tagset, max_length=256)
        test_dataset = test_dataset.batch(batch_size)

        # Predict
        preds = model.predict(test_dataset, steps=np.ceil(len(test_examples) / batch_size), verbose=1)

        # Postprocessing
        tokens, labels, filtered_preds, logits = pos_utils.filter_padding_tokens(test_examples, preds, label_map, tokenizer)
        subword_locations = pos_utils.find_subword_locations(tokens)
        new_tokens, new_labels, new_preds = pos_utils.reconstruct_subwords(subword_locations, tokens, labels, 
                                                                           filtered_preds, logits)

        # Metrics
        accuracy = (np.array(new_labels) == np.array(new_preds)).mean()
        pos_eval.append((directory, accuracy))
        
        
    # Build table
    pos_eval = np.array(pos_eval, dtype=object)
    table = pd.DataFrame({"Language": pos_eval[:,0],
                          "Accuracy": pos_eval[:,1]})
    table["Language"] = table["Language"].apply(lambda x: code_to_name[x])
    file = open("../data_exploration/pos_table.txt", "r")
    lang_order = [line.split("&")[1].strip() for line in file.readlines()]
    table["sort"] = table["Language"].apply(lambda x: lang_order.index(x))
    table = table.sort_values(by=["sort"]).drop("sort", axis=1).reset_index(drop=True)
    
    # Update results file
    if os.path.isfile(results_path):
        results = pd.read_excel(results_path, sheet_name=None)
    else:
        results = dict.fromkeys(table.columns[1:].values, pd.DataFrame({"Language": table["Language"].values}))
        
    with pd.ExcelWriter(results_path) as writer:
        full_training_lang = code_to_name[training_lang]
        for sheet_name, df in results.items():
            # Add each the column for each metric in the corresponding sheet
            df[full_training_lang] = table[sheet_name]
            df.to_excel(writer, index=False, sheet_name=sheet_name)

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


Using weights from E:/TFM_CCIL/checkpoints\ar\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\bg\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\en\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\eu\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\fi\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\he\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\hr\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\ja\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\ko\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\ru\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\sk\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\tr\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\vi\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214


Using weights from E:/TFM_CCIL/checkpoints\zh\bert-base-multilingual-cased_pos.hdf5


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Bulgarian
Examples lost: 299
Examples remaining: 817
Slovak
Examples lost: 322
Examples remaining: 739
Croatian
Examples lost: 336
Examples remaining: 800
Vietnamese
Examples lost: 514
Examples remaining: 286
Basque
Examples lost: 277
Examples remaining: 1522
Hebrew
Examples lost: 277
Examples remaining: 214




In [9]:
results = pd.read_excel(results_path, sheet_name=None)

In [10]:
unbalanced_results = pd.read_excel("../results/results_pos.xlsx", sheet_name=None)

In [11]:
new_results = {"Accuracy": pd.concat([results["Accuracy"], 
                                     unbalanced_results["Accuracy"].loc[~unbalanced_results["Accuracy"]["Language"].isin(
                                         results["Accuracy"]["Language"].values
                                     )]], ignore_index=True)}

In [12]:
new_results["Accuracy"]

Unnamed: 0,Language,Arabic,Bulgarian,English,Basque,Finnish,Hebrew,Croatian,Japanese,Korean,Russian,Slovak,Turkish,Vietnamese,Chinese
0,Bulgarian,0.74236,0.989163,0.856586,0.66368,0.765407,0.815187,0.908171,0.657973,0.582978,0.892855,0.879994,0.683766,0.629868,0.616791
1,Slovak,0.749185,0.884192,0.839894,0.725583,0.797534,0.780889,0.940731,0.638573,0.604227,0.883135,0.969881,0.709555,0.595421,0.599824
2,Croatian,0.716129,0.898679,0.844265,0.709327,0.791103,0.776405,0.969353,0.610273,0.589555,0.889766,0.890157,0.697365,0.587053,0.577125
3,Vietnamese,0.5496,0.589497,0.585018,0.57674,0.558556,0.589225,0.584204,0.553264,0.510788,0.582847,0.578097,0.545664,0.87556,0.549735
4,Basque,0.573987,0.640736,0.69974,0.950939,0.716373,0.69292,0.683152,0.700884,0.631452,0.679808,0.624235,0.675892,0.555815,0.495798
5,Hebrew,0.641962,0.594037,0.62998,0.477292,0.527166,0.972137,0.612984,0.480635,0.437448,0.622179,0.570632,0.509891,0.483422,0.452215
6,English,0.610965,0.789626,0.959244,0.591309,0.671063,0.746986,0.732102,0.566378,0.5395,0.735117,0.712195,0.609081,0.556895,0.510613
7,Russian,0.734463,0.910213,0.847445,0.692359,0.79377,0.795578,0.905512,0.674278,0.652425,0.963269,0.876427,0.722839,0.624425,0.652064
8,Chinese,0.373243,0.563297,0.573477,0.52183,0.532524,0.530376,0.548307,0.560915,0.483166,0.51618,0.491104,0.523185,0.511697,0.919729
9,Thai,0.446455,0.49738,0.3743,0.465356,0.41358,0.576029,0.478479,0.454607,0.392126,0.496708,0.446097,0.408295,0.510637,0.491781


In [13]:
with pd.ExcelWriter(results_path) as writer:
        for sheet_name, df in new_results.items():
            df.to_excel(writer, index=False, sheet_name=sheet_name)

Recalculate baselines

In [34]:
baselines = []

for lang in tqdm(langs):
    lang = name_to_code[lang]
    
    # Load and preprocess
    path = os.path.join(data_dir, lang)
    test_data = read_conll(glob.glob(path + "/*-{}.conllu".format("test"))[0])
    rel_dif = rel_lengths.loc[rel_lengths["Language"] == code_to_name[lang], 
                                  "Relative Difference (%)"].values[0] / 100
    target_mean = (1 + rel_dif) * en_ref
    test_examples, lost = balance_lengths(test_data, target_mean, tokenizer)
    
    # Metrics
    tags = np.array([example["tags"] for example in test_examples]).sum()
    acc = tags.count(max(set(tags), key=tags.count)) / len(tags)
    baselines.append((code_to_name[lang], acc))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Examples lost: 299
Examples remaining: 817
Examples lost: 322
Examples remaining: 739
Examples lost: 336
Examples remaining: 800
Examples lost: 514
Examples remaining: 286
Examples lost: 277
Examples remaining: 1522
Examples lost: 277
Examples remaining: 214



In [35]:
baselines = pd.DataFrame(np.array(baselines), columns=["Language", "Accuracy"])
baselines["Accuracy"] = pd.to_numeric(baselines["Accuracy"])
baselines = utils.order_table(baselines)
baselines

Unnamed: 0,Language,Accuracy
0,Bulgarian,0.229535
1,Slovak,0.250462
2,Croatian,0.254007
3,Vietnamese,0.333667
4,Basque,0.254235
5,Hebrew,0.191697


In [36]:
old_baselines = pd.read_excel("../results/baselines_pos.xlsx").rename(columns={"Baseline": "Accuracy"})

In [37]:
baselines = utils.order_table(pd.concat([baselines, old_baselines[~old_baselines["Language"].isin(langs)]]))

In [38]:
baselines

Unnamed: 0,Language,Accuracy
0,Bulgarian,0.229535
1,English,0.190719
2,Russian,0.253061
3,Slovak,0.250462
4,Croatian,0.254007
5,Chinese,0.252627
6,Vietnamese,0.333667
7,Thai,0.271123
8,Finnish,0.26689
9,Basque,0.254235


In [39]:
pos_baselines_path = "../results/balanced_length/baselines_pos_balanced_length.xlsx"

with pd.ExcelWriter(pos_baselines_path) as writer:
    for metric in baselines.columns[1:]:
        baselines[["Language", metric]].rename(columns={metric: "Baseline"}).to_excel(writer, index=False, sheet_name=metric)