In [1]:
from pprint import pprint

import evaluate
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
)

from src.data.dataset import GlossSeq2PolishDataset
from src.settings import MODELS_DIR

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. `src.*`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Base T5

In [24]:
# Device shared by the notebook: use the GPU when one is available, otherwise
# fall back to the CPU so the notebook still runs on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Ask PyTorch's CUDA caching allocator to release memory it is holding but not using.
torch.cuda.empty_cache()

In [4]:
# Instantiate the project's gloss-sequence -> Polish dataset with its default arguments.
pt_dataset = GlossSeq2PolishDataset()

In [5]:
# Display the first record to inspect the record structure.
pt_dataset[0]

GlossSeq2PolishRecord(gloss_sequence=[GlossAnnotationRecord(start=39040, end=39440, text='MYŚLEĆ 2.1 P:Z;L:Z (NA PRZEMIAN)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39440, end=39640, text='JAK 1.2 P:I;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39640, end=39920, text='POMYSŁ 1.3 P:AZ;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39920, end=40200, text='WSKAZ-JA 1.1 P:L;L;Ø (Z WIDOCZNYM KCIUKIEM)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=40200, end=41120, text='MOŻNA 1.1 P:B;L:B  (MOŻE/MOŻLIWE)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='

## HuggingFace dataset creation

In [6]:
# Keys used inside each example's "translation" dict: the gloss side and the Polish side.
SOURCE_LANG = "gloss_pl"
TARGET_LANG = "pl"

# Keyword arguments forwarded to `gen` by `Dataset.from_generator`.
gen_kwargs = {"pt_dataset": pt_dataset, "source_lang": SOURCE_LANG, "target_lang": TARGET_LANG}

def gen(pt_dataset, source_lang, target_lang):
    """Yield one translation example per record of ``pt_dataset``.

    For every record, the ``text`` of each entry in ``gloss_sequence`` is
    joined with single spaces to form the source sentence, while the record's
    ``polish_annotation`` provides the target sentence and the task label.

    Args:
        pt_dataset: iterable of records exposing ``gloss_sequence`` and
            ``polish_annotation``.
        source_lang: key under which the joined gloss text is stored.
        target_lang: key under which the Polish text is stored.

    Yields:
        dict with ``"task_label"`` and a ``"translation"`` mapping holding the
        source and target sentences.
    """
    for item in pt_dataset:
        source_text = " ".join(gloss.text for gloss in item.gloss_sequence)
        label = item.polish_annotation.task_label
        sentence_pair = {}
        sentence_pair[source_lang] = source_text
        sentence_pair[target_lang] = item.polish_annotation.text
        yield {"task_label": label, "translation": sentence_pair}

# Materialize the generator output as a HuggingFace `Dataset`.
hf_dataset = Dataset.from_generator(gen, gen_kwargs=gen_kwargs)

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Encode `task_label` as a class-label column; it is used for the stratified split below.
hf_dataset = hf_dataset.class_encode_column("task_label")

Casting to class labels:   0%|          | 0/39623 [00:00<?, ? examples/s]

In [8]:
hf_dataset

Dataset({
    features: ['task_label', 'translation'],
    num_rows: 39623
})

In [9]:
# Hold out 20% of the examples as a test set, stratified by task label.
# A fixed seed keeps the split (and the test file exported from it) identical
# across kernel restarts, so results remain comparable between runs.
hf_dataset = hf_dataset.train_test_split(test_size=0.2, stratify_by_column="task_label", seed=42)
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['task_label', 'translation'],
        num_rows: 31698
    })
    test: Dataset({
        features: ['task_label', 'translation'],
        num_rows: 7925
    })
})

In [10]:
# Export the held-out test split to a JSON file in the current working directory.
hf_dataset["test"].to_json("hf_dataset_test.json")

Creating json from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

2434762

## T5 model setup

In [26]:
# Pretrained checkpoint used for both the tokenizer and the model.
CHECKPOINT = "allegro/plt5-small"

# Tokenizers are not placed on a device, so no `device` argument is passed;
# an unknown keyword would only be stored alongside the tokenizer's init arguments.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

## Dataset preparation

In [27]:
# Task prefix prepended to every source sentence before tokenization.
PREFIX = "translate Gloss to Polish: "

# Keyword arguments forwarded to `preprocess_function` by `Dataset.map`.
prepro_fn_kwargs = {"source_lang": SOURCE_LANG, "target_lang": TARGET_LANG, "prefix": PREFIX}

def preprocess_function(examples, source_lang, target_lang, prefix, max_length=128):
    """Tokenize a batch of translation examples for seq2seq training.

    Args:
        examples: batch whose ``"translation"`` entry is a list of dicts
            holding the source and target sentences.
        source_lang: key of the source sentence in each translation dict.
        target_lang: key of the target sentence in each translation dict.
        prefix: text prepended to every source sentence.
        max_length: truncation length passed to the tokenizer; defaults to
            128, the value previously hard-coded here.

    Returns:
        The output of the module-level ``tokenizer`` called on the prefixed
        sources with the targets passed as ``text_target``.
    """
    translations = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in translations]
    targets = [pair[target_lang] for pair in translations]
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

# Tokenize every split in batches; `fn_kwargs` supplies the language keys and prefix.
tokenized_hf_dataset = hf_dataset.map(preprocess_function, batched=True, fn_kwargs=prepro_fn_kwargs)

Map:   0%|          | 0/31698 [00:00<?, ? examples/s]

Map:   0%|          | 0/7925 [00:00<?, ? examples/s]

In [28]:
# Collator that assembles tokenized seq2seq examples into batches for the trainer.
# NOTE(review): `model` receives the checkpoint name (a string), not a model
# instance — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=CHECKPOINT)

## Evaluation metrics setup

In [29]:
# Load the sacreBLEU metric used by `compute_metrics`.
metric = evaluate.load("sacrebleu")

In [30]:
def postprocess_text(preds, labels):
    """Strip decoded texts and nest the references for the metric call.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        Tuple ``(predictions, references)``: predictions as a flat list of
        stripped strings, references as a list of one-element lists, each
        holding the stripped reference.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    nested_labels = []
    for text in labels:
        nested_labels.append([text.strip()])

    return stripped_preds, nested_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation run.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the ``"score"`` field returned by ``metric``)
        and ``"gen_len"`` (mean number of non-padding tokens per prediction),
        both rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a real token id and cannot be decoded. Labels use it as the
    # ignore index, and predictions gathered across batches may be padded with
    # it as well, so both arrays are mapped back to the pad token first.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {name: round(value, 4) for name, value in result.items()}

In [32]:
# Load the pretrained seq2seq model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)

In [38]:
# Sanity check: generate from the freshly loaded model for an arbitrary Polish sentence.
# NOTE(review): the name is presumably a typo for "PROMPT", and PREFIX is not
# prepended here, unlike in `preprocess_function`.
PROMT = "nie wiem co robię powinno działać bez odbioru"

# Tokenize the prompt and keep only the input-id tensor.
inputs = tokenizer(PROMT, return_tensors="pt").input_ids
print(inputs)

# Sampling (do_sample=True) is stochastic and no seed is set in this cell,
# so the generated ids can differ between runs.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
print(outputs)

# Decode the generated ids back to text, dropping special tokens.
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(result)

tensor([[  279,  4067,   358,  3032,   313,  3060,  8135,   485, 25666,     1]])
tensor([[    0, 49999,     1]])
['規']


In [17]:
# NOTE(review): this assertion can never pass; presumably a deliberate stop so
# that "Run All" halts before the training cells below — confirm.
assert "we" == "stop"

AssertionError: 

## T5-based translator training

In [None]:
# Training hyperparameters.
EPOCHS = 2
BATCH_SIZE = 16

training_args = Seq2SeqTrainingArguments(
    # `output_dir` is documented as a string, so the joined path is converted explicitly.
    output_dir=str(MODELS_DIR / f"gloss2polish-translator-{CHECKPOINT.replace('/', '-')}"),
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=EPOCHS,
    # Generate full sequences during evaluation so `compute_metrics` can decode them.
    predict_with_generate=True,
    # fp16 mixed precision is rejected on machines without a GPU, so enable it
    # only when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    # push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hf_dataset["train"],
    eval_dataset=tokenized_hf_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Wrap the model and tokenizer in a translation pipeline on the notebook-wide
# DEVICE instead of a second hard-coded device string.
translator = pipeline("translation", model=model, tokenizer=tokenizer, device=DEVICE)

In [None]:
# (concatenated gloss text, reference Polish sentence) pairs used for a qualitative spot check.
examples = [
    ("NUM: OSIEMNAŚCIE 1.1 P:3;L:B", "Osiemnastego?"),
    ("NIE*MIEĆ (= %) 1.4 P:B;L:B", "Nie bardzo."),
    ("WSKAZ: B (JA) NIE*WIEDZIEĆ/NIE*UMIEĆ 1.1 P:B;L:B", "Nie wiem."),
    ("PODOBNY 5.3 P:B;L:Ø WOŁAĆ 1.2 P:B;L:B (POLICZEK) POMAGAĆ/POMOC 1.5 ® P:B;L:B (MNIE)/ DO SIEBIE", "To chyba wołanie o pomoc."),
    ("UWAGA 1.2 P:Z;L:Ø KREW 1.1 P:G;L:A ###", "Na jego kolanie widać krew."),
    ("MUSIEĆ 2.1 P:L;L:Ø SZUKAĆ 2.5 P:V;L:V NAUCZYCIELKA P:E;L:E JĘZYK 1.1 P:Z;L:Ø ANGLIA/ANGIELSKI 1.1 P:B;L:Ø", "Musimy szukać nauczyciela języka angielskiego."),
    ("WSKAZ: Z WSZYSTKIE KIERUNKI SPAĆ/ZASNĄĆ 2.1 P:LP;L:Ø GŁUCHY 1.1 P:N;L:Ø G: A+B (MŁOTEK/WALIĆ/UDERZAĆ) ROBIĆ 1.1 P:C;L:C WSKAZ: Z (WSZYSTKIE KIERUNKI/STRZELAĆ/JEŹDZIĆ/WŁAŚNIE/TO/W-TYM-ROKU) IDENTYF: (CHARLIE CHAPLIN)+WĄS+KSZTAŁT-C DOBRZE 1.1 P:O;L:O ROBIĆ 1.1 P:C;L:C WSKAZ: Z (WSZYSTKIE KIERUNKI/STRZELAĆ/JEŹDZIĆ/WŁAŚNIE/TO/W-TYM-ROKU)", "Tam spał głuchy, nadal uderzali w drzwi.")
]
# Translate each example and pretty-print it alongside its reference sentence.
for gloss_pl_concatenated_text, polish_text in examples:
    model_input = PREFIX + gloss_pl_concatenated_text
    translation_result = translator(model_input)
    comparison = dict(
        gloss_pl=gloss_pl_concatenated_text,
        translation_result=translation_result,
        ground_truth=polish_text,
    )
    print()
    pprint(comparison)