# Add Fassa Ladin to NLLB-200

The training set consists of 862 parallel Fassa Ladin–Italian–English sentence triples. Due to memory limits, we use a batch size of 16 sentences. Since Ladin is not included in the pre-trained NLLB-200 model, we reuse the language code of Friulian (`fur_Latn`) for it, in order to leverage the similarities between the two languages. We investigate two main approaches to adding Fassa Ladin:
- Pivot-based Transfer Learning
- Multilingual Translation

## Requirements

In [None]:
!pip install sentencepiece transformers sacrebleu bert-score -q

In [None]:
import pandas as pd
import csv
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import trange
import sacrebleu
from bert_score import BERTScorer
import gc
import random
import numpy as np
import torch
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup

## Data

In [None]:
!wget https://raw.githubusercontent.com/jo-valer/machine-translation-ladin-fascian/main/data/train.tsv
!wget https://raw.githubusercontent.com/jo-valer/machine-translation-ladin-fascian/main/data/dev.tsv
!wget https://raw.githubusercontent.com/jo-valer/machine-translation-ladin-fascian/main/data/test_id.tsv
!wget https://raw.githubusercontent.com/jo-valer/machine-translation-ladin-fascian/main/data/test_ood.tsv

In [None]:
def _read_split(path):
    """Load one tab-separated data split; quote characters are kept as literal text."""
    return pd.read_csv(path, sep="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')


# Training, development, in-domain test and out-of-domain test splits.
df_train = _read_split('train.tsv')
df_dev = _read_split('dev.tsv')
df_test = _read_split('test_id.tsv')
df_test_ood = _read_split('test_ood.tsv')

## Metrics

In [None]:
# Embedding-based metric, backed by multilingual BERT.
scorer = BERTScorer(model_type='bert-base-multilingual-cased')
# Character n-gram F-score; word_order=2 also counts word uni/bigrams (chrF++).
chrf_calc = sacrebleu.CHRF(word_order=2)
# Corpus-level BLEU with sacrebleu's default settings; used for model selection in eval_loop.
bleu_calc = sacrebleu.BLEU()

## Model

In [None]:
# Tokenizer and weights must come from the same checkpoint, so name it once.
CHECKPOINT = 'facebook/nllb-200-distilled-600M'

tokenizer = NllbTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
# Move the weights to the GPU (this raises if no CUDA device is available).
model = model.cuda()

Hyperparameters, optimizer and scheduler.

In [None]:
# TRANSFER LEARNING APPROACH
# 'en' = Pivot-based Transfer Learning
# 'multi' = Multilingual Translation
training_lang = 'multi'

# Fail fast on a typo: the batch sampler and the evaluation loop only
# understand these two modes.
if training_lang not in ('en', 'multi'):
    raise ValueError(f"training_lang must be 'en' or 'multi', got {training_lang!r}")

# NOTE: this is an absolute path at the filesystem root (e.g. '/nllb-multi').
MODEL_SAVE_PATH = '/nllb-' + training_lang

batch_size = 16      # sentences per training batch
max_length = 128     # truncation length (in tokens) for training inputs and targets
warmup_steps = 500   # steps of linear learning-rate warmup
training_steps = 1800

# Use just a subset of dev set, for faster validation.
# The size is capped at the number of available rows (DataFrame.sample raises
# when asked for more rows than exist without replacement), and the seed is
# fixed so that dev BLEU is comparable across runs.
df_dev = df_dev.sample(n=min(48, len(df_dev)), random_state=0)

# Adafactor with an externally supplied, fixed learning rate
# (relative_step=False and scale_parameter=False disable its built-in schedule).
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=1.5e-4,
    clip_threshold=1.0,
    weight_decay=1e-3,
)

# Learning rate ramps up linearly for `warmup_steps`, then stays constant.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)

### Evaluation loop

In [None]:
def translate(text, src_lang='fur_Latn', tgt_lang='eng_Latn', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
    """
    Translate `text` from `src_lang` to `tgt_lang` with the global model.

    Args:
        text: Input passed straight to the tokenizer (callers here pass one sentence).
        src_lang: NLLB code of the source language.
        tgt_lang: NLLB code of the target language; forced as first generated token.
        a, b: The generation budget is `a + b * input_length` new tokens.
        max_input_length: Inputs longer than this many tokens are truncated.
        num_beams: Beam width for generation.
        **kwargs: Extra arguments forwarded to `model.generate`.

    Returns:
        List of decoded translations (special tokens removed).

    Note:
        Sets `tokenizer.src_lang` and `tokenizer.tgt_lang` as a side effect.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
    encoded = encoded.to(model.device)

    target_bos = tokenizer.convert_tokens_to_ids(tgt_lang)
    # Output length budget grows linearly with the (padded) input length.
    token_budget = int(a + b * encoded.input_ids.shape[1])

    generated = model.generate(
        **encoded,
        forced_bos_token_id=target_bos,
        max_new_tokens=token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def eval_loop(data=None, lang='multi'):
    """
    Evaluate the model on dev sentences and return the average corpus BLEU.

    Args:
        data: DataFrame with 'ladin' and 'english' columns (and 'italian' when
            lang='multi'). Defaults to the current global `df_dev`, looked up
            at call time so that re-running the data cells is picked up.
        lang: 'multi' scores Ladin<->English and Ladin<->Italian (4 directions);
            'en' scores Ladin<->English only (2 directions).

    Returns:
        Mean of the corpus-level BLEU scores over the evaluated directions.

    Raises:
        ValueError: If `lang` is neither 'multi' nor 'en'.
    """
    if data is None:
        data = df_dev

    # Each entry: (source column, source code, target code, reference column).
    if lang == 'multi':
        directions = [
            ('ladin', 'fur_Latn', 'eng_Latn', 'english'),
            ('ladin', 'fur_Latn', 'ita_Latn', 'italian'),
            ('english', 'eng_Latn', 'fur_Latn', 'ladin'),
            ('italian', 'ita_Latn', 'fur_Latn', 'ladin'),
        ]
    elif lang == 'en':
        directions = [
            ('ladin', 'fur_Latn', 'eng_Latn', 'english'),
            ('english', 'eng_Latn', 'fur_Latn', 'ladin'),
        ]
    else:
        # Previously this fell through and failed with UnboundLocalError at return.
        raise ValueError(f"lang must be 'multi' or 'en', got {lang!r}")

    model.eval()
    scores = []
    with torch.no_grad():
        for src_col, src_code, tgt_code, ref_col in directions:
            # Sentences are translated one at a time; translate() returns a list.
            hypotheses = [translate(t, src_code, tgt_code)[0] for t in data[src_col]]
            references = [data[ref_col].tolist()]
            scores.append(bleu_calc.corpus_score(hypotheses, references).score)
    return sum(scores) / len(scores)

### Training

The function below draws random batches from the training data. Each batch uses one randomly selected language pair, in a single translation direction (source → target). The code for training has been adapted from https://github.com/adaptNMT/adaptMLLM and https://github.com/slone-nlp/myv-nmt.

In [None]:
# (column name, NLLB language code) for the pivot-based setting.
# Ladin reuses the Friulian code 'fur_Latn'.
LANGS_EN = [('english', 'eng_Latn'), ('ladin', 'fur_Latn')]

# (source, target) directions for the multilingual setting; every direction
# has Ladin on one side.
PAIRS = [(('italian', 'ita_Latn'), ('ladin', 'fur_Latn')),
         (('english', 'eng_Latn'), ('ladin', 'fur_Latn')),
         (('ladin', 'fur_Latn'), ('italian', 'ita_Latn')),
         (('ladin', 'fur_Latn'), ('english', 'eng_Latn'))]

def get_batch_pairs(batch_size, data=None, lang='multi'):
    """
    Sample one training batch in a single, randomly chosen translation direction.

    Args:
        batch_size: Number of sentence pairs to draw (rows are drawn with replacement).
        data: DataFrame holding one column per language. Defaults to the current
            global `df_train`, looked up at call time.
        lang: 'multi' picks one of the four directions in PAIRS;
            'en' picks English->Ladin or Ladin->English.

    Returns:
        (xx, yy, long1, long2): source sentences, target sentences taken from the
        same rows, source language code, target language code.

    Raises:
        ValueError: If `lang` is unknown, or if a non-empty batch is requested
            from an empty dataset.
    """
    if data is None:
        data = df_train

    if lang == 'multi':
        (l1, long1), (l2, long2) = random.choice(PAIRS)
    elif lang == 'en':
        # Sampling both entries without replacement yields a random direction.
        (l1, long1), (l2, long2) = random.sample(LANGS_EN, 2)
    else:
        # Previously this fell through and failed on the unbound name `l1`.
        raise ValueError(f"lang must be 'multi' or 'en', got {lang!r}")

    n_rows = len(data)
    if batch_size > 0 and n_rows == 0:
        raise ValueError("cannot sample a batch from an empty dataset")

    # Draw row positions once and index the two columns directly, instead of
    # materialising a full row object for every sampled sentence.
    positions = [random.randrange(n_rows) for _ in range(batch_size)]
    xx = data[l1].iloc[positions].tolist()
    yy = data[l2].iloc[positions].tolist()
    return xx, yy, long1, long2

Training loop.

In [None]:
def cleanup():
    """
    Free memory to reduce the risk of out-of-memory errors.

    Runs the Python garbage collector first, then empties PyTorch's CUDA cache.
    """
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

cleanup()
losses = []        # training loss of every successful step
dev_scores = []    # average dev BLEU, one entry per evaluation
best_score = 0
x, y, loss = None, None, None

eval_every = 100      # evaluate on the dev subset every this many steps
min_save_step = 500   # no checkpoint is written at or before this step

# Starting from len(losses) lets the loop continue from where it stopped if
# `losses` is kept from a previous run of this cell's body.
tq = trange(len(losses), training_steps+1)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size, lang=training_lang)
    try:
        model.train()
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # Targets are tokenized with src_lang set to the target code, so that
        # they carry lang2's language tag.
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100 # Ignore the pad_token_id when computing the loss

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:
        # Typically an out-of-memory error: drop the batch, release memory and
        # move on. Note that this also skips the evaluation for this step.
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % eval_every == 0:
        avg_bleu_score = eval_loop(lang=training_lang)
        # `i` counts optimizer steps (one random batch each), not epochs.
        print(f"step {i}: avg Loss = {np.mean(losses[-100:]):.3f} / dev BLEU = {avg_bleu_score:.2f}")
        dev_scores.append(avg_bleu_score)

        # Save the model if the eval BLEU score is better than the previous best
        if i > min_save_step and avg_bleu_score > best_score:
            best_score = avg_bleu_score
            model.save_pretrained(MODEL_SAVE_PATH)
            tokenizer.save_pretrained(MODEL_SAVE_PATH)

Plots of the training.

In [None]:
import matplotlib.pyplot as plt

# Raw per-step loss, overlaid with an exponentially weighted moving average
# (the positional argument of ewm is the centre of mass, here 100).
loss_curve = pd.Series(losses)
smoothed_curve = loss_curve.ewm(100).mean()

for curve in (loss_curve, smoothed_curve):
    plt.plot(curve)

plt.show()

In [None]:
# Dev BLEU, one value per evaluation.
dev_scores_series = pd.Series(dev_scores)

# Every 100th recorded loss, re-indexed from 0 so that it shares the x-axis
# with the dev scores.
sampled_losses = pd.Series(losses).iloc[::100].reset_index(drop=True)

plt.plot(sampled_losses)
plt.plot(dev_scores_series)
plt.show()