In [1]:
%load_ext autoreload
%autoreload 2

## `sdadas/mt5-base-translator-en-pl`

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

sdadas_translator = pipeline("translation", model="sdadas/mt5-base-translator-en-pl")

In [3]:
sdadas_translator("hi, how are you doin, where you at? oh you got plans? don't say that")

Your input_length: 24 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


[{'translation_text': 'Cześć, jak się masz, gdzie jesteś? Masz plany? Nie'}]

# Base T5

In [4]:
import torch

torch.cuda.empty_cache()

In [5]:
from src.data.dataset import GlossSeq2PolishDataset

pt_dataset = GlossSeq2PolishDataset()

In [6]:
pt_dataset[0]

GlossSeq2PolishRecord(gloss_sequence=[GlossAnnotationRecord(start=39040, end=39440, text='MYŚLEĆ 2.1 P:Z;L:Z (NA PRZEMIAN)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39440, end=39640, text='JAK 1.2 P:I;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39640, end=39920, text='POMYSŁ 1.3 P:AZ;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39920, end=40200, text='WSKAZ-JA 1.1 P:L;L;Ø (Z WIDOCZNYM KCIUKIEM)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=40200, end=41120, text='MOŻNA 1.1 P:B;L:B  (MOŻE/MOŻLIWE)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='

## HuggingFace dataset creation

In [7]:
from datasets import Dataset

SOURCE_LANG = "gloss_pl"
TARGET_LANG = "pl"

gen_kwargs = {"pt_dataset": pt_dataset, "source_lang": SOURCE_LANG, "target_lang": TARGET_LANG}

def gen(pt_dataset, source_lang, target_lang):
    for record in pt_dataset:
        gloss_concatenated_text = " ".join(
            [g.text for g in record.gloss_sequence]
        )
        yield {
            "task_label": record.polish_annotation.task_label,
            "translation": {
                source_lang: gloss_concatenated_text,
                target_lang: record.polish_annotation.text
            }
        }

hf_dataset = Dataset.from_generator(gen, gen_kwargs=gen_kwargs)

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
hf_dataset = hf_dataset.class_encode_column("task_label")

Casting to class labels:   0%|          | 0/39623 [00:00<?, ? examples/s]

In [9]:
hf_dataset

Dataset({
    features: ['task_label', 'translation'],
    num_rows: 39623
})

In [10]:
hf_dataset = hf_dataset.train_test_split(test_size=0.2, stratify_by_column="task_label")
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['task_label', 'translation'],
        num_rows: 31698
    })
    test: Dataset({
        features: ['task_label', 'translation'],
        num_rows: 7925
    })
})

## T5 model setup

In [11]:
from transformers import AutoTokenizer

CHECKPOINT = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

## Dataset preparation

In [12]:
PREFIX = "translate Gloss to Polish: "

prepro_fn_kwargs = {"source_lang": SOURCE_LANG, "target_lang": TARGET_LANG, "prefix": PREFIX}

def preprocess_function(examples, source_lang, target_lang, prefix):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

tokenized_hf_dataset = hf_dataset.map(preprocess_function, batched=True, fn_kwargs=prepro_fn_kwargs)

Map:   0%|          | 0/31698 [00:00<?, ? examples/s]

Map:   0%|          | 0/7925 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=CHECKPOINT)

## Evaluation metrics setup

In [14]:
import evaluate

metric = evaluate.load("sacrebleu")

In [15]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## T5 based translator training

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)

In [17]:
EPOCHS = 50
BATCH_SIZE = 16

training_args = Seq2SeqTrainingArguments(
    output_dir=f"gloss2polish-translator-{CHECKPOINT.replace('/', '-')}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    fp16=True,
    # push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hf_dataset["train"],
    eval_dataset=tokenized_hf_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m254290[0m ([33mmig-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,3.0134,2.606991,0.3499,13.7263
2,2.7174,2.382024,0.6532,14.195
3,2.559,2.256695,0.9635,14.1717
4,2.4546,2.169906,1.1446,14.2083
5,2.372,2.096344,1.2942,14.1192
6,2.2817,2.045133,1.578,13.851
7,2.2495,1.996534,1.5398,13.8607
8,2.1862,1.962043,1.8611,13.8019
9,2.15,1.928662,1.8514,13.9379
10,2.1011,1.898373,2.1423,13.9028




TrainOutput(global_step=99100, training_loss=1.9512445812052122, metrics={'train_runtime': 19156.228, 'train_samples_per_second': 82.735, 'train_steps_per_second': 5.173, 'total_flos': 5.360509799261798e+16, 'train_loss': 1.9512445812052122, 'epoch': 50.0})

In [18]:
translator = pipeline("translation", model=model, tokenizer=tokenizer, device="cuda")



In [19]:
from pprint import pprint

examples = [
    ("NUM: OSIEMNAŚCIE 1.1 P:3;L:B", "Osiemnastego?"),
    ("NIE*MIEĆ (= %) 1.4 P:B;L:B", "Nie bardzo."),
    ("WSKAZ: B (JA) NIE*WIEDZIEĆ/NIE*UMIEĆ 1.1 P:B;L:B", "Nie wiem."),
    ("PODOBNY 5.3 P:B;L:Ø WOŁAĆ 1.2 P:B;L:B (POLICZEK) POMAGAĆ/POMOC 1.5 ® P:B;L:B (MNIE)/ DO SIEBIE", "To chyba wołanie o pomoc."),
    ("UWAGA 1.2 P:Z;L:Ø KREW 1.1 P:G;L:A ###", "Na jego kolanie widać krew."),
    ("MUSIEĆ 2.1 P:L;L:Ø SZUKAĆ 2.5 P:V;L:V NAUCZYCIELKA P:E;L:E JĘZYK 1.1 P:Z;L:Ø ANGLIA/ANGIELSKI 1.1 P:B;L:Ø", "Musimy szukać nauczyciela języka angielskiego."),
    ("WSKAZ: Z WSZYSTKIE KIERUNKI SPAĆ/ZASNĄĆ 2.1 P:LP;L:Ø GŁUCHY 1.1 P:N;L:Ø G: A+B (MŁOTEK/WALIĆ/UDERZAĆ) ROBIĆ 1.1 P:C;L:C WSKAZ: Z (WSZYSTKIE KIERUNKI/STRZELAĆ/JEŹDZIĆ/WŁAŚNIE/TO/W-TYM-ROKU) IDENTYF: (CHARLIE CHAPLIN)+WĄS+KSZTAŁT-C DOBRZE 1.1 P:O;L:O ROBIĆ 1.1 P:C;L:C WSKAZ: Z (WSZYSTKIE KIERUNKI/STRZELAĆ/JEŹDZIĆ/WŁAŚNIE/TO/W-TYM-ROKU)", "Tam spał głuchy, nadal uderzali w drzwi.")
]
for gloss_pl_concatenated_text, polish_text in examples:
    translation_result = translator(PREFIX + gloss_pl_concatenated_text)
    print()
    pprint({
        "gloss_pl": gloss_pl_concatenated_text,
        "translation_result": translation_result,
        "ground_truth": polish_text,
    })


{'gloss_pl': 'NUM: OSIEMNAŚCIE 1.1 P:3;L:B',
 'ground_truth': 'Osiemnastego?',
 'translation_result': [{'translation_text': 'Osiemnastego.'}]}

{'gloss_pl': 'NIE*MIEĆ (= %) 1.4 P:B;L:B',
 'ground_truth': 'Nie bardzo.',
 'translation_result': [{'translation_text': 'Nie ma.'}]}

{'gloss_pl': 'WSKAZ: B (JA) NIE*WIEDZIEĆ/NIE*UMIEĆ 1.1 P:B;L:B',
 'ground_truth': 'Nie wiem.',
 'translation_result': [{'translation_text': 'Ja nie wiem.'}]}

{'gloss_pl': 'PODOBNY 5.3 P:B;L:Ø WOŁAĆ 1.2 P:B;L:B (POLICZEK) POMAGAĆ/POMOC '
             '1.5 ® P:B;L:B (MNIE)/ DO SIEBIE',
 'ground_truth': 'To chyba wołanie o pomoc.',
 'translation_result': [{'translation_text': 'Woa, pomaga.'}]}

{'gloss_pl': 'UWAGA 1.2 P:Z;L:Ø KREW 1.1 P:G;L:A ###',
 'ground_truth': 'Na jego kolanie widać krew.',
 'translation_result': [{'translation_text': 'Uwaga, krew.'}]}

{'gloss_pl': 'MUSIEĆ 2.1 P:L;L:Ø SZUKAĆ 2.5 P:V;L:V NAUCZYCIELKA P:E;L:E JĘZYK '
             '1.1 P:Z;L:Ø ANGLIA/ANGIELSKI 1.1 P:B;L:Ø',
 'ground_truth': 'Mu