In [1]:
import numpy as np
import pandas as pd
import glob
import os
import tensorflow as tf
import transformers
from transformers import TFBertForTokenClassification
from tqdm.notebook import tqdm
from pos_utils import load_data, filter_padding_tokens, find_subword_locations, reconstruct_subwords

import sys
sys.path.append("..")
from data_preparation.data_preparation_pos import ABSATokenizer, convert_examples_to_tf_dataset, read_conll

## Zero-shot

In [2]:
import tensorflow.keras.backend as K
def ignore_acc(y_true_class, y_pred_class, class_to_ignore=0):
    """Accuracy restricted to positions whose true class is not ignored.

    Args:
        y_true_class: Tensor of true class ids (cast to int32 here).
        y_pred_class: Tensor of per-class scores; the predicted class is the
            argmax over its last axis.
        class_to_ignore: Class id excluded from both the numerator and the
            denominator (default 0).

    Returns:
        Number of non-ignored positions predicted correctly divided by the
        number of non-ignored positions. The denominator is clamped to at
        least 1, so an input with only ignored positions yields 0.
    """
    true_ids = K.cast(y_true_class, 'int32')
    pred_ids = K.cast(K.argmax(y_pred_class, axis=-1), 'int32')

    # 1 where the position counts towards the metric, 0 where it is ignored.
    keep = K.cast(K.not_equal(true_ids, class_to_ignore), 'int32')
    hits = K.cast(K.equal(true_ids, pred_ids), 'int32')

    n_correct = K.sum(hits * keep)
    n_counted = K.maximum(K.sum(keep), 1)  # avoid division by zero
    return n_correct / n_counted

In [3]:
# Language the checkpoint was fine-tuned on; selects ../checkpoints_<lang>/.
training_lang = "en"
model_name = "bert-base-multilingual-cased"

# Tag inventory: a tag's position in this list is its integer class id.
tagset = ["O", "_", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
          "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]
num_labels = len(tagset)
label_map = {label: i for i, label in enumerate(tagset)}

tokenizer = ABSATokenizer.from_pretrained(model_name)
config = transformers.BertConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFBertForTokenClassification.from_pretrained(model_name,
                                                     config=config)

# Pick the weights file from the checkpoint directory, skipping any entry
# whose name contains "checkpoint". os.listdir returns entries in arbitrary
# order, so sort to make the choice deterministic, and fail with a clear
# message instead of an IndexError when nothing is found.
weights_path = "../checkpoints_" + training_lang + "/"
weight_files = sorted(file for file in os.listdir(weights_path) if "checkpoint" not in file)
if not weight_files:
    raise FileNotFoundError("No weights file found in " + weights_path)
model.load_weights(os.path.join(weights_path, weight_files[0]))

# The loss is given from_logits=True, i.e. it treats the model outputs as
# unnormalised scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, metrics=[ignore_acc])

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertForTokenClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Evaluate the loaded model on every language directory under data_dir and
# store one word-level accuracy per directory name in pos_eval.
data_dir = "../data/ud/"
batch_size = 256  # Doesn't really matter here
pos_eval = {}

# Only real, non-hidden sub-directories are treated as languages; sorting
# makes the evaluation order independent of the file system's listing order.
language_dirs = sorted(
    entry for entry in os.listdir(data_dir)
    if not entry.startswith(".") and os.path.isdir(os.path.join(data_dir, entry))
)

for directory in tqdm(language_dirs):
    path = os.path.join(data_dir, directory)
    test_examples, test_dataset = load_data(path, batch_size, tokenizer, tagset)

    # Keras expects an integer step count; np.ceil alone returns a float.
    n_steps = int(np.ceil(len(test_examples) / batch_size))
    preds = model.predict(test_dataset, steps=n_steps)

    # NOTE(review): the three helpers below live in pos_utils; judging by the
    # "Example" section of this notebook they drop padding positions, locate
    # runs of "##" word pieces and merge them back into words - confirm there.
    tokens, labels, filtered_preds, logits = filter_padding_tokens(test_examples, preds, label_map, tokenizer)
    subword_locations = find_subword_locations(tokens)
    new_tokens, new_labels, new_preds = reconstruct_subwords(subword_locations, tokens, labels, filtered_preds, logits)

    # Fraction of reconstructed positions whose prediction equals the label.
    pos_eval[directory] = (np.array(new_labels) == np.array(new_preds)).mean()

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [5]:
# Display the raw accuracy stored for each data directory (values in [0, 1]).
pos_eval

{'ar': 0.7342296756782806,
 'bg': 0.8471126939709998,
 'en': 0.959243908565687,
 'eu': 0.6956593090998605,
 'fi': 0.8569173454568416,
 'he': 0.5951641672722469,
 'hr': 0.8481863149216817,
 'ja': 0.4850788182873404,
 'ko': 0.5937047756874095,
 'ru': 0.8474453686005062,
 'sl': 0.8351243475591035,
 'th': 0.37430017467640075,
 'tr': 0.6696060799443059,
 'vi': 0.5837113545955754,
 'zh': 0.5734765351389213}

In [6]:
# Rank the directories from best to worst accuracy, then render each accuracy
# as a percentage string rounded to two decimals.
ranked = sorted(pos_eval.items(), key=lambda pair: pair[1], reverse=True)
{lang: str(round(acc * 100, 2)) + "%" for lang, acc in ranked}

{'en': '95.92%',
 'fi': '85.69%',
 'hr': '84.82%',
 'ru': '84.74%',
 'bg': '84.71%',
 'sl': '83.51%',
 'ar': '73.42%',
 'eu': '69.57%',
 'tr': '66.96%',
 'he': '59.52%',
 'ko': '59.37%',
 'vi': '58.37%',
 'zh': '57.35%',
 'ja': '48.51%',
 'th': '37.43%'}

Load the results Excel file (if it already exists)

In [None]:
results_path = "../results/results_pos.xlsx"

# sheet_name=None makes read_excel return every sheet as a dict keyed by
# sheet name; start from an empty dict when the workbook does not exist yet.
results = pd.read_excel(results_path, sheet_name=None) if os.path.isfile(results_path) else {}

Check if the sheet already exists

In [None]:
sheet = "results_pos_" + training_lang

# Refuse to replace a sheet that was loaded from the existing workbook.
if sheet in results:
    raise Exception("Sheet already exists and would be overwritten, aborting")

# One row per evaluated directory: its name and its accuracy.
results[sheet] = pd.DataFrame({"Language": list(pos_eval.keys()), "Test_acc": list(pos_eval.values())})

Save all sheets to the Excel file

In [None]:
# Write every sheet held in `results` (previously loaded ones plus the new
# one) to the workbook at results_path, the same path the sheets were loaded
# from, so the load and save locations cannot drift apart.
os.makedirs(os.path.dirname(results_path), exist_ok=True)  # first run: folder may not exist
with pd.ExcelWriter(results_path) as writer:
    for sheet_name, df in results.items():
        df.to_excel(writer, index=False, sheet_name=sheet_name)

## Example

In [None]:
# read_conll's result is indexed positionally: [0] sentence ids, [1] token
# lists and [2] tag lists (as implied by how they are zipped below).
test_data = read_conll("../data/ud/fi/fi_pud-ud-test.conllu")
sentence_ids, sentence_tokens, sentence_tags = test_data[0], test_data[1], test_data[2]

# One dict per sentence.
test_examples = [
    {"id": sent_id, "tokens": tokens, "tags": tags}
    for sent_id, tokens, tags in zip(sentence_ids, sentence_tokens, sentence_tags)
]

In [None]:
batch_size = 256

# Convert the examples into a dataset and batch it in a single expression.
test_dataset = convert_examples_to_tf_dataset(
    examples=test_examples,
    tokenizer=tokenizer,
    tagset=tagset,
    max_length=256,
).batch(batch_size)

In [None]:
# Take the first batch as numpy arrays: element [0] holds the model inputs
# (a mapping with "input_ids"), element [1] the label ids.
example_batch = next(test_dataset.as_numpy_iterator())

first_input_ids = example_batch[0]["input_ids"][0]
first_label_ids = example_batch[1][0]

# Print the first sequence piece by piece next to its tag.
for token, label in zip(first_input_ids, first_label_ids):
    if token == 0:
        # Stop at the first id 0 - presumably the padding id; confirm against the tokenizer.
        break
    print("{:<25}{:<20}".format(tokenizer.decode(int(token)), tagset[label]))

In [None]:
# Keras evaluation on the batched test set: reports the compiled loss and the
# ignore_acc metric (computed per subword position, before word reconstruction).
model.evaluate(test_dataset)

In [None]:
# Keras expects an integer step count; np.ceil alone returns a float.
preds = model.predict(test_dataset, steps=int(np.ceil(len(test_examples) / batch_size)), verbose=1)

In [None]:
# Flat, corpus-wide lists with one entry per subword piece.
filtered_preds = []
labels = []
tokens = []
logits = []

# preds[0] is indexed below as [example, position, class]. The argmax over the
# class axis does not depend on the example, so compute it once instead of
# once per loop iteration over the full array.
all_logits = preds[0]
all_pred_ids = all_logits.argmax(axis=-1)

for i, example in enumerate(test_examples):
    example_tokens, example_labels, _ = tokenizer.subword_tokenize(example["tokens"], example["tags"])
    example_labels = [label_map[label] for label in example_labels]

    # Keep only the leading positions that correspond to real subword pieces;
    # everything after them in the row is dropped.
    n_pieces = len(example_labels)
    example_preds = all_pred_ids[i, :n_pieces]
    example_logits = all_logits[i, :n_pieces]

    filtered_preds.extend(example_preds)
    labels.extend(example_labels)
    tokens.extend(example_tokens)
    logits.extend(example_logits)

In [None]:
# Subword-level accuracy: share of pieces whose predicted id equals the label id.
(np.array(labels) == np.array(filtered_preds)).mean()

In [None]:
# Cross-check: walk the dataset batch by batch, keep the labels that are not 0
# and compare them with the matching slice of filtered_preds.
flattened = []
accuracies = []
cursor = 0  # index in filtered_preds of the first prediction of the current batch

for batch in test_dataset.as_numpy_iterator():
    batch_labels = batch[1][batch[1] != 0]
    flattened.extend(batch_labels)

    upto = cursor + len(batch_labels)
    batch_preds = np.array(filtered_preds[cursor:upto])
    accuracies.append((np.array(batch_labels) == batch_preds).mean())
    cursor = upto

In [None]:
# Unweighted mean of the per-batch accuracies (each batch counts equally,
# regardless of how many labels it contributed).
np.mean(accuracies)

In [None]:
# Sanity check: share of positions where the labels rebuilt with the tokenizer
# equal the non-zero labels read back from the dataset.
(np.array(labels) == np.array(flattened)).mean()

In [None]:
# Collect (start, end) index pairs, end exclusive, for every word that was
# split into several pieces: the first piece followed by its "##" continuations.
subword_locations = []
start = None  # index of the first piece of the word currently being assembled

for i, piece in enumerate(tokens):
    if piece.startswith("##"):
        if start is None:
            # The word begins at the piece just before the first "##" piece.
            # max() keeps index 0 from wrapping around to tokens[-1].
            start = max(i - 1, 0)
    elif start is not None:
        # First piece that is not a continuation closes the open word.
        subword_locations.append((start, i))
        start = None

# A word whose continuation pieces run to the very end of the list has no
# following piece to close it, so close it here.
if start is not None:
    subword_locations.append((start, len(tokens)))

In [None]:
# For every split word whose pieces disagree, record the true label and the
# label each candidate merging strategy would pick.
truths = []
final_most_voted = []
final_avg = []
final_first = []
final_random = []
final_max_prob = []
final_random_equi = []

for start, end in subword_locations:
    span_preds = filtered_preds[start:end]
    if len(set(span_preds)) <= 1:
        continue  # all pieces agree, nothing to compare
    span_logits = logits[start:end]

    print(start, end)
    print("Tokens:", tokens[start:end])
    print("Predictions:", span_preds)
    print("Truth:", labels[start])
    truths.append(labels[start])  # the label of the first piece is taken as the word's label

    # Majority vote over the piece predictions.
    final_most_voted.append(max(set(span_preds), key=span_preds.count))
    # Argmax of the summed logits (same winner as averaging them).
    final_avg.append(sum(span_logits).argmax())
    # Prediction of the first piece.
    final_first.append(filtered_preds[start])
    # Random piece prediction (frequent predictions are more likely).
    final_random.append(np.random.choice(span_preds))
    # Class of the piece holding the single largest logit; taken from a float
    # array, so the stored id is a float.
    peaks = np.array([(M.max(), M.argmax()) for M in span_logits])
    final_max_prob.append(peaks[peaks[:, 0].argmax(), 1])
    # Random choice among the distinct predictions, each equally likely.
    final_random_equi.append(np.random.choice(list(set(span_preds))))

In [None]:
# Accuracy of each merging strategy on the words whose pieces disagreed.
truth_arr = np.array(truths)
strategy_picks = [
    ("Most voted:", final_most_voted),
    ("Logit average:", final_avg),
    ("Always first:", final_first),
    ("Random choice:", final_random),
    ("Highest probability:", final_max_prob),
    ("Equiprobable random:", final_random_equi),
]
for caption, picks in strategy_picks:
    print(caption, (truth_arr == np.array(picks)).mean())

In [None]:
# Merge every split word back into a single token with one prediction and one
# label; pieces outside the recorded spans are copied through unchanged.
new_tokens = []
new_preds = []
new_labels = []
prev_end = 0  # first index not yet copied to the output lists

for start, end in subword_locations:
    piece_preds = filtered_preds[start:end]
    # When the pieces disagree, take the argmax of their summed logits;
    # otherwise any piece's prediction will do.
    prediction = sum(logits[start:end]).argmax() if len(set(piece_preds)) > 1 else filtered_preds[start]

    # Copy the untouched stretch before this word, then add the merged entry.
    new_preds.extend(filtered_preds[prev_end:start])
    new_preds.append(prediction)

    merged = "".join(tokens[start:end]).replace("##", "")
    new_tokens.extend(tokens[prev_end:start])
    new_tokens.append(merged)

    new_labels.extend(labels[prev_end:start])
    new_labels.append(labels[start])  # the first piece's label stands for the word

    prev_end = end

# Everything after the last recorded span.
new_preds.extend(filtered_preds[prev_end:])
new_tokens.extend(tokens[prev_end:])
new_labels.extend(labels[prev_end:])

In [None]:
# Preview the first 20 reconstructed tokens with their tag names.
for token, label in zip(new_tokens[:20], new_labels[:20]):
    print(token, tagset[label])

In [None]:
# For comparison: the first 30 subword pieces, before merging, with their tag names.
for token, label in zip(tokens[:30], labels[:30]):
    print(token, tagset[label])

In [None]:
# Accuracy after merging split words: share of entries whose prediction equals the label.
(np.array(new_labels) == np.array(new_preds)).mean()