In [9]:
from transformers import AutoTokenizer
import pandas as pd
import tqdm
import json
import re
import numpy as np

In [17]:
# Example input: a word, the word-separator marker, then the same characters
# again with a morph-boundary marker inserted after the first one.
# NOTE(review): word_separator_token and morph_boundary_token are assigned in
# the "# Tokenizer" cell further down the notebook, so on a fresh kernel this
# cell fails unless that cell has been run first.
prompt1 = f"ᑭᓯᐊᓂᖅ{word_separator_token}ᑭ{morph_boundary_token}ᓯᐊᓂᖅ"

In [18]:
# Display the assembled string to confirm both markers were interpolated.
prompt1

'ᑭᓯᐊᓂᖅ<__word-separator>ᑭ<__morph-boundary>ᓯᐊᓂᖅ'

In [19]:
# Inspect the pieces produced for the example string; in the recorded output
# each of the two markers comes back as a single piece.
# NOTE(review): `tokenizer` is created in the "# Tokenizer" cell further down.
tokenizer.tokenize(prompt1)

['▁ᑭᓯᐊᓂ',
 'ᖅ',
 '<__word-separator>',
 '▁ᑭ',
 '<__morph-boundary>',
 '▁',
 'ᓯᐊ',
 'ᓂᖅ']

In [3]:
# Tokenizer
# Model identifier handed to AutoTokenizer.from_pretrained below.
model_name = 'cis-lmu/glot500-base'
# Marker strings; the following cell registers both with the tokenizer through
# add_tokens, and the prompt1 cell interpolates them into its example string.
word_separator_token = "<__word-separator>"
morph_boundary_token = "<__morph-boundary>"
# The tokenizer object used by every tokenize/encode call in this notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Register the two marker strings as tokens of their own.
# The call is kept outside the `assert` so it still executes when assertions
# are disabled (`python -O`). The check inspects the vocabulary instead of the
# "number of tokens added" return value, which is 0 when the cell is re-run
# and would make the original assertion fail on an otherwise healthy kernel.
tokenizer.add_tokens([morph_boundary_token, word_separator_token])
assert all(
    marker in tokenizer.get_vocab()
    for marker in (morph_boundary_token, word_separator_token)
), "special tokens were not added to the tokenizer vocabulary"

In [8]:
# Print the id sequence each marker encodes to, one list per line
# (word separator first, then morph boundary).
print(
    *(
        tokenizer.encode(marker)
        for marker in (word_separator_token, morph_boundary_token)
    ),
    sep="\n",
)

[0, 401146, 2]
[0, 401145, 2]


In [32]:
# Load the test data as pandas dataframe
# NOTE(review): both paths are absolute and machine-specific, and despite its
# name `path2data_dir` points at a single file rather than a directory.
path2data_dir = '/home/mathias/Desktop/HI/hpc/inuktitut/llm_segm/reimplementation/data/data_full/test.iu.csv'
# Directory into which to_csv() writes preds.<lang>.csv and results.<lang>.json.
path2data_out = '/home/mathias/Desktop/HI/hpc/inuktitut/llm_segm/reimplementation/tokenizer_eval'
# Language code; to_csv() puts it in the output file names and the "LANG" field.
lang = 'iu'
# Tab-separated with no header row, so columns are addressed by position:
# column 0 is written out as "word" by to_csv(), column 1 is what gets passed
# to test_tokenizer() as the "@"-delimited reference segmentations.
data_test = pd.read_csv(path2data_dir, sep="\t", header=None)

In [33]:
def eval_word(ground, predicted):
    """Count boundary hits and misses between two segmentations of one word.

    Both arguments are strings over the same underlying characters in which
    "@" marks a segment boundary. The two strings are walked in parallel:

    * "@" at the same position in both      -> true positive
    * "@" only in ``ground``                -> false negative
    * "@" only in ``predicted``             -> false positive

    Boundary markers left over at the end of either string once the other
    is exhausted are counted too (the original loop stopped as soon as one
    side ran out and then failed its final assertion on such input).

    Args:
        ground: Reference segmentation.
        predicted: Segmentation to score.

    Returns:
        Tuple ``(tp, fp, fn)`` of integer counts.

    Raises:
        AssertionError: If the two strings differ in anything other than
            "@" markers. The error carries ``(ground, predicted)``.
    """
    tp = fp = fn = 0
    gi = pi = 0
    n_ground, n_pred = len(ground), len(predicted)
    # `or` (not `and`) so that the tail of the longer string is still examined.
    while gi < n_ground or pi < n_pred:
        # None stands for "this side is exhausted"; it never equals a character.
        g = ground[gi] if gi < n_ground else None
        p = predicted[pi] if pi < n_pred else None
        if g == p:
            if g == "@":
                tp += 1
            gi += 1
            pi += 1
        elif g == "@":
            fn += 1
            gi += 1
        elif p == "@":
            fp += 1
            pi += 1
        else:
            # Raised explicitly instead of `assert False` so the check is not
            # stripped when Python runs with assertions disabled (-O).
            raise AssertionError((ground, predicted))
    return tp, fp, fn

In [34]:
def test_tokenizer(tokenizer, test_data):
    """Score a tokenizer's sub-word splits against reference segmentations.

    For every reference string the "@" markers are removed to recover the
    plain word, the word is tokenized, and the resulting pieces are joined
    with "@" to form the predicted segmentation, which is then compared with
    the reference via ``eval_word``.

    The leading "▁" marker is stripped from each piece and pieces that
    consisted of nothing but that marker are dropped. The original code
    instead sliced off the first character of the joined string, which left
    a spurious leading "@" (scored as a false positive) whenever the
    tokenizer emitted "▁" as a piece of its own, and would have removed a
    real character for a tokenizer that does not emit the marker at all.

    Args:
        tokenizer: Object exposing ``tokenize(str) -> list[str]``.
        test_data: Iterable of reference segmentations using "@" as the
            boundary marker.

    Returns:
        Tuple ``(ACC, TP, FP, FN, predictions)`` where ``ACC`` is the number
        (not the ratio) of words whose predicted segmentation equals the
        reference exactly, ``TP``/``FP``/``FN`` are boundary counts summed
        over all words, and ``predictions`` lists the predicted
        segmentations in input order.

    Raises:
        AssertionError: Propagated from ``eval_word`` when the tokenizer's
            pieces do not spell out the original word.
    """
    TP = FP = FN = 0
    ACC = 0
    predictions = []
    for ground in tqdm.tqdm(test_data):
        word = ground.replace("@", "")
        # Remove the word-start marker per piece, then discard empty pieces so
        # a standalone "▁" does not turn into a boundary.
        pieces = [piece.lstrip("▁") for piece in tokenizer.tokenize(word)]
        predicted = "@".join(piece for piece in pieces if piece)
        predictions.append(predicted)
        if ground == predicted:
            ACC += 1
        tp, fp, fn = eval_word(ground, predicted)
        TP += tp
        FP += fp
        FN += fn
    return ACC, TP, FP, FN, predictions

In [35]:
# Score the tokenizer on column 1 of the test frame (the "@"-delimited
# reference segmentations); the results feed the to_csv() call further down.
ACC, TP, FP, FN, predictions = test_tokenizer(tokenizer, data_test[1].tolist())

100%|████████████████████████████████████| 5124/5124 [00:00<00:00, 21733.95it/s]


In [48]:
def to_csv(ACC, TP, FP, FN, predictions):
    """Write per-word predictions and aggregate scores to ``path2data_out``.

    Reads the notebook globals ``data_test`` (column 0 supplies the "word"
    column and its length the test-set size), ``path2data_out`` and ``lang``.

    Two files are produced:

    * ``preds.<lang>.csv``   - tab-separated, no header: word, prediction
      (with "@" boundaries rendered as spaces).
    * ``results.<lang>.json`` - accuracy, precision, recall and F1 as
      percentages rounded to two decimals, plus test-set size and language.

    A score whose denominator is zero is reported as 0.0 rather than raising
    ``ZeroDivisionError`` (which previously happened after the predictions
    file had already been written, leaving the outputs inconsistent).

    Args:
        ACC: Number of words segmented exactly right.
        TP: True-positive boundary count.
        FP: False-positive boundary count.
        FN: False-negative boundary count.
        predictions: Predicted segmentations, one per row of ``data_test``.
    """
    # Scores are computed before anything is written to disk.
    n_test = len(data_test)
    accuracy = ACC / n_test if n_test else 0.0
    P = TP / (TP + FP) if (TP + FP) else 0.0
    R = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) else 0.0

    # "@" is a plain literal, so str.replace does the job of the regex.
    predictions = [p.replace("@", " ") for p in predictions]
    df = pd.DataFrame({"word": data_test[0].tolist(), "predictions": predictions})
    df.to_csv(f"{path2data_out}/preds.{lang}.csv", header=None, index=False, sep="\t")

    results_map = {
        "ACC": np.round(100 * accuracy, 2),
        "Prec": np.round(100 * P, 2),
        "Rcl": np.round(100 * R, 2),
        "F1": np.round(100 * F1, 2),
        "test-len": n_test,
        "LANG": lang,
    }
    with open(f"{path2data_out}/results.{lang}.json", "w", encoding="utf-8") as results_f:
        json.dump(results_map, results_f, indent=4)

In [49]:
# Spot-check: show the pieces the tokenizer produces for one sample string.
string1 = "ᐱᓕᕆᖃᑎᖃᕐᓂᖃᖅᑐᑦ"
tokenizer.tokenize(string1)

['▁ᐱᓕᕆᖃᑎ', 'ᖃᕐᓂ', 'ᖃ', 'ᖅᑐᑦ']

In [50]:
# Persist the predictions and the aggregate scores computed by the
# test_tokenizer() cell above.
to_csv(ACC, TP, FP, FN, predictions)