# Preliminary Experiments
First, we evaluate the pre-trained models on Italian-to-English translation, to obtain a reference point for the subsequent experiments. We also evaluate how well the models translate from Ladin to English when the Ladin sentences are treated as if they were written in Italian, French, or Friulian.

## Requirements

In [None]:
# Install the tokenizer/model and metric packages used below; -q keeps pip's output short.
!pip install sentencepiece transformers sacrebleu bert-score -q

In [None]:
import pandas as pd
import csv
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer, M2M100ForConditionalGeneration, M2M100Tokenizer
from tqdm.auto import tqdm
import sacrebleu
from bert_score import BERTScorer

## Data

In [None]:
# Download the test split (test_id.tsv) from the project repository into the working directory.
!wget https://raw.githubusercontent.com/jo-valer/machine-translation-ladin-fascian/main/data/test_id.tsv

In [None]:
# Load the tab-separated test set. QUOTE_NONE makes the parser treat quote
# characters as ordinary text, so sentences containing quotes are read verbatim.
df_test = pd.read_csv('test_id.tsv', sep="\t", quoting=csv.QUOTE_NONE)

## Metrics

We use three metrics: BLEU, chrF++ and BERTScore F1.

In [None]:
# Corpus-level BLEU with sacreBLEU's default settings.
bleu_calc = sacrebleu.BLEU()
# word_order=2 is the chrF++ configuration (word n-grams on top of character n-grams).
chrf_calc = sacrebleu.CHRF(word_order=2)
# BERTScore backed by multilingual BERT; the same scorer is reused for every evaluation below.
scorer = BERTScorer(model_type='bert-base-multilingual-cased')

## OPUS-MT

### Italian-to-English model

In [None]:
# Load the pre-trained OPUS-MT Italian-to-English checkpoint and its tokenizer.
# Both are bound to the global names `tokenizer` and `model` that translate() reads.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-it-en")
# .cuda() places the model on the GPU, so this cell requires a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-it-en").cuda()
# Switch to evaluation mode before generating translations.
model.eval()

In [None]:
# Maps a short language key to [DataFrame column name, language code passed to translate()].
# test_loop() uses element 0 to select columns and element 1 as the src/tgt code.
lang_codes = {
    "it": ["italian", "it"],
    "en": ["english", "en"],
    "lld": ["ladin", "lld"]
}

def translate(text, src_lang='lld_Latn', tgt_lang='eng_Latn', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
    """Translate `text` with the global `tokenizer` and `model` (OPUS-MT cell).

    Args:
        text: Input handed directly to the tokenizer (the evaluation loop
            passes one sentence per call).
        src_lang: Value stored on `tokenizer.src_lang` before encoding.
        tgt_lang: Value stored on `tokenizer.tgt_lang` before encoding.
        a: Constant term of the output-length budget.
        b: Multiplier applied to the encoded input length in the budget.
        max_input_length: Inputs longer than this many tokens are truncated.
        num_beams: Beam width used by `model.generate`.
        **kwargs: Additional keyword arguments forwarded to `model.generate`.

    Returns:
        The result of `tokenizer.batch_decode` on the generated ids, with
        special tokens skipped; callers in this notebook take element 0.

    NOTE(review): the default language codes do not match the two-letter
    codes in `lang_codes` for this model, and it is unclear whether the
    OPUS-MT tokenizer consults these attributes at all - confirm.
    """
    tokenizer.src_lang, tokenizer.tgt_lang = src_lang, tgt_lang
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    encoded = encoded.to(model.device)
    # The generation budget grows linearly with the encoded input length.
    token_budget = int(a + b * encoded.input_ids.shape[1])
    generated = model.generate(
        **encoded,
        max_new_tokens=token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def test_loop(data=df_test, column='en_translated', src='lld', tgt='en', data_src='lld'):
    """Translate one column of `data`, store the output, and print corpus metrics.

    Args:
        data: DataFrame with the source and reference columns. It is modified
            in place: `column` is added or overwritten. The default is the
            `df_test` object bound when this function was defined.
        column: Name of the column that receives the translations.
        src: `lang_codes` key whose code is passed to translate() as the
            source language.
        tgt: `lang_codes` key selecting both the target-language code and the
            reference column.
        data_src: `lang_codes` key selecting the column to translate. It may
            differ from `src` so that text in one language is fed to the model
            under another language's code.

    Returns:
        None. BLEU, chrF++ and the mean BERTScore F1 (scaled by 100) are printed.
    """
    # Resolve codes and column names once instead of on every loop iteration.
    src_code, tgt_code = lang_codes[src][1], lang_codes[tgt][1]
    source_column = lang_codes[data_src][0]
    reference_column = lang_codes[tgt][0]

    data[column] = [translate(sentence, src_code, tgt_code)[0] for sentence in tqdm(data[source_column])]

    # Read both lists back once, after the assignment, and share them across all three metrics.
    hypotheses = data[column].tolist()
    references = data[reference_column].tolist()

    bleu_score = bleu_calc.corpus_score(hypotheses, [references]).score
    chrf_score = chrf_calc.corpus_score(hypotheses, [references]).score
    # Only F1 is reported; precision and recall are discarded.
    _, _, f1 = scorer.score(hypotheses, references)
    print(f"BLEU = {bleu_score:.2f} / chrF++ = {chrf_score:.2f} / BERTscoreF1 = {(f1.mean()*100):.2f}")

Test Italian to English translation.

In [None]:
# Baseline: translate the 'italian' column and score against the 'english' references.
test_loop(column='en_translated', src='it', tgt='en', data_src='it')

Test Ladin to English translation, with Ladin sentences considered as if they were written in Italian.

In [None]:
# Feed the 'ladin' column to the model under the Italian code; output goes to 'en_translated_it'.
test_loop(column='en_translated_it', src='it', tgt='en', data_src='lld')

### English-to-Italian model

In [None]:
# Replace the global `tokenizer` and `model` with the OPUS-MT English-to-Italian checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-it")
# .cuda() places the model on the GPU, so this cell requires a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-it").cuda()
# Switch to evaluation mode before generating translations.
model.eval()

In [None]:
# Maps a short language key to [DataFrame column name, language code passed to translate()].
# test_loop() uses element 0 to select columns and element 1 as the src/tgt code.
lang_codes = {
    "it": ["italian", "it"],
    "en": ["english", "en"],
    "lld": ["ladin", "lld"]
}

def translate(text, src_lang='lld_Latn', tgt_lang='eng_Latn', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
    """Translate `text` with the global `tokenizer` and `model` (OPUS-MT cell).

    Args:
        text: Input handed directly to the tokenizer (the evaluation loop
            passes one sentence per call).
        src_lang: Value stored on `tokenizer.src_lang` before encoding.
        tgt_lang: Value stored on `tokenizer.tgt_lang` before encoding.
        a: Constant term of the output-length budget.
        b: Multiplier applied to the encoded input length in the budget.
        max_input_length: Inputs longer than this many tokens are truncated.
        num_beams: Beam width used by `model.generate`.
        **kwargs: Additional keyword arguments forwarded to `model.generate`.

    Returns:
        The result of `tokenizer.batch_decode` on the generated ids, with
        special tokens skipped; callers in this notebook take element 0.

    NOTE(review): the default language codes do not match the two-letter
    codes in `lang_codes` for this model, and it is unclear whether the
    OPUS-MT tokenizer consults these attributes at all - confirm.
    """
    tokenizer.src_lang, tokenizer.tgt_lang = src_lang, tgt_lang
    batch = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    batch = batch.to(model.device)
    # Allow `a` new tokens plus `b` per encoded input position.
    input_length = batch.input_ids.shape[1]
    output_ids = model.generate(
        **batch,
        max_new_tokens=int(a + b * input_length),
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

def test_loop(data=df_test, column='en_translated', src='lld', tgt='en', data_src='lld'):
    """Translate one column of `data`, store the output, and print corpus metrics.

    Args:
        data: DataFrame with the source and reference columns. It is modified
            in place: `column` is added or overwritten. The default is the
            `df_test` object bound when this function was defined.
        column: Name of the column that receives the translations.
        src: `lang_codes` key whose code is passed to translate() as the
            source language.
        tgt: `lang_codes` key selecting both the target-language code and the
            reference column.
        data_src: `lang_codes` key selecting the column to translate. It may
            differ from `src` so that text in one language is fed to the model
            under another language's code.

    Returns:
        None. BLEU, chrF++ and the mean BERTScore F1 (scaled by 100) are printed.
    """
    # Resolve codes and column names once instead of on every loop iteration.
    src_code, tgt_code = lang_codes[src][1], lang_codes[tgt][1]
    source_column = lang_codes[data_src][0]
    reference_column = lang_codes[tgt][0]

    data[column] = [translate(sentence, src_code, tgt_code)[0] for sentence in tqdm(data[source_column])]

    # Read both lists back once, after the assignment, and share them across all three metrics.
    hypotheses = data[column].tolist()
    references = data[reference_column].tolist()

    bleu_score = bleu_calc.corpus_score(hypotheses, [references]).score
    chrf_score = chrf_calc.corpus_score(hypotheses, [references]).score
    # Only F1 is reported; precision and recall are discarded.
    _, _, f1 = scorer.score(hypotheses, references)
    print(f"BLEU = {bleu_score:.2f} / chrF++ = {chrf_score:.2f} / BERTscoreF1 = {(f1.mean()*100):.2f}")

Test English to Italian translation.

In [None]:
# Translate the 'english' column and score against the 'italian' references; output goes to 'it_translated'.
test_loop(column='it_translated', src='en', tgt='it', data_src='en')

## M2M-100

In [None]:
# Replace the global `tokenizer` and `model` with the multilingual M2M-100 (418M) checkpoint.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
# .cuda() places the model on the GPU, so this cell requires a CUDA device.
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").cuda()
# Switch to evaluation mode before generating translations.
model.eval()

In [None]:
# Maps a short language key to [DataFrame column name, language code passed to translate()].
# test_loop() uses element 0 to select columns and element 1 as the src/tgt code.
# "fr" is listed only for its code: the evaluations below use it as `src`, never as a data column.
lang_codes = {
    "it": ["italian", "it"],
    "en": ["english", "en"],
    "fr": ["french", "fr"],
    "lld": ["ladin", "lld"]
}

def translate(text, src_lang='lld_Latn', tgt_lang='eng_Latn', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
    """Translate `text` with the global M2M-100 `tokenizer` and `model`.

    Args:
        text: Input handed directly to the tokenizer (the evaluation loop
            passes one sentence per call).
        src_lang: Source language code assigned to `tokenizer.src_lang`.
        tgt_lang: Target language code; assigned to `tokenizer.tgt_lang` and
            looked up with `tokenizer.get_lang_id` to force the first
            generated token.
        a: Constant term of the output-length budget.
        b: Multiplier applied to the encoded input length in the budget.
        max_input_length: Inputs longer than this many tokens are truncated.
        num_beams: Beam width used by `model.generate`.
        **kwargs: Additional keyword arguments forwarded to `model.generate`.

    Returns:
        The result of `tokenizer.batch_decode` on the generated ids, with
        special tokens skipped; callers in this notebook take element 0.

    NOTE(review): the default codes ('lld_Latn', 'eng_Latn') differ from the
    two-letter codes in `lang_codes` for this model; every call in this
    notebook passes explicit codes - confirm the defaults are usable.
    """
    tokenizer.src_lang, tokenizer.tgt_lang = src_lang, tgt_lang
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    encoded = encoded.to(model.device)
    # Force decoding to start with the target-language token.
    target_lang_id = tokenizer.get_lang_id(tgt_lang)
    # The generation budget grows linearly with the encoded input length.
    token_budget = int(a + b * encoded.input_ids.shape[1])
    generated = model.generate(
        **encoded,
        forced_bos_token_id=target_lang_id,
        max_new_tokens=token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def test_loop(data=df_test, column='en_translated', src='lld', tgt='en', data_src='lld'):
    """Translate one column of `data`, store the output, and print corpus metrics.

    Args:
        data: DataFrame with the source and reference columns. It is modified
            in place: `column` is added or overwritten. The default is the
            `df_test` object bound when this function was defined.
        column: Name of the column that receives the translations.
        src: `lang_codes` key whose code is passed to translate() as the
            source language.
        tgt: `lang_codes` key selecting both the target-language code and the
            reference column.
        data_src: `lang_codes` key selecting the column to translate. It may
            differ from `src` so that text in one language is fed to the model
            under another language's code.

    Returns:
        None. BLEU, chrF++ and the mean BERTScore F1 (scaled by 100) are printed.
    """
    # Resolve codes and column names once instead of on every loop iteration.
    src_code, tgt_code = lang_codes[src][1], lang_codes[tgt][1]
    source_column = lang_codes[data_src][0]
    reference_column = lang_codes[tgt][0]

    data[column] = [translate(sentence, src_code, tgt_code)[0] for sentence in tqdm(data[source_column])]

    # Read both lists back once, after the assignment, and share them across all three metrics.
    hypotheses = data[column].tolist()
    references = data[reference_column].tolist()

    bleu_score = bleu_calc.corpus_score(hypotheses, [references]).score
    chrf_score = chrf_calc.corpus_score(hypotheses, [references]).score
    # Only F1 is reported; precision and recall are discarded.
    _, _, f1 = scorer.score(hypotheses, references)
    print(f"BLEU = {bleu_score:.2f} / chrF++ = {chrf_score:.2f} / BERTscoreF1 = {(f1.mean()*100):.2f}")

Test Italian to English translation.

In [None]:
# Baseline: translate the 'italian' column and score against the 'english' references.
# Writes to 'en_translated', replacing any values already stored under that column name.
test_loop(column='en_translated', src='it', tgt='en', data_src='it')

Test English to Italian translation.

In [None]:
# Translate the 'english' column and score against the 'italian' references; output goes to 'it_translated'.
test_loop(column='it_translated', src='en', tgt='it', data_src='en')

Test Ladin to English translation, with Ladin sentences considered as if they were written in Italian.

In [None]:
# Feed the 'ladin' column to the model under the Italian code; output goes to 'en_translated_it'.
test_loop(column='en_translated_it', src='it', tgt='en', data_src='lld')

Test Ladin to English translation, with Ladin sentences considered as if they were written in French.

In [None]:
# Feed the 'ladin' column to the model under the French code; output goes to 'en_translated_fr'.
test_loop(column='en_translated_fr', src='fr', tgt='en', data_src='lld')

## NLLB-200

In [None]:
# Replace the global `tokenizer` and `model` with the NLLB-200 distilled 600M checkpoint.
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
# .cuda() places the model on the GPU, so this cell requires a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/nllb-200-distilled-600M').cuda()
# Switch to evaluation mode before generating translations.
model.eval()

In [None]:
# Maps a short language key to [DataFrame column name, language code passed to translate()].
# test_loop() uses element 0 to select columns and element 1 as the src/tgt code.
# "fr" and "fur" are listed only for their codes: the evaluations below use them as `src`,
# never as data columns.
# NOTE(review): confirm that 'lld_Latn' is a token this pre-trained tokenizer knows; no call
# below passes it as a source or target code.
lang_codes = {
    "it": ["italian", "ita_Latn"],
    "en": ["english", "eng_Latn"],
    "fr": ["french", "fra_Latn"],
    "lld": ["ladin", "lld_Latn"],
    "fur": ["friulian", "fur_Latn"]
}

def translate(text, src_lang='lld_Latn', tgt_lang='eng_Latn', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
    """Translate `text` with the global NLLB-200 `tokenizer` and `model`.

    Args:
        text: Input handed directly to the tokenizer (the evaluation loop
            passes one sentence per call).
        src_lang: Source language code assigned to `tokenizer.src_lang`.
        tgt_lang: Target language code; assigned to `tokenizer.tgt_lang` and
            converted to a token id to force the first generated token.
        a: Constant term of the output-length budget.
        b: Multiplier applied to the encoded input length in the budget.
        max_input_length: Inputs longer than this many tokens are truncated.
        num_beams: Beam width used by `model.generate`.
        **kwargs: Additional keyword arguments forwarded to `model.generate`.

    Returns:
        The result of `tokenizer.batch_decode` on the generated ids, with
        special tokens skipped; callers in this notebook take element 0.
    """
    tokenizer.src_lang, tokenizer.tgt_lang = src_lang, tgt_lang
    batch = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    batch = batch.to(model.device)
    # Force decoding to start with the target-language token.
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    # Allow `a` new tokens plus `b` per encoded input position.
    input_length = batch.input_ids.shape[1]
    output_ids = model.generate(
        **batch,
        forced_bos_token_id=target_token_id,
        max_new_tokens=int(a + b * input_length),
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

def test_loop(data=df_test, column='en_translated', src='lld', tgt='en', data_src='lld'):
    """Translate one column of `data`, store the output, and print corpus metrics.

    Args:
        data: DataFrame with the source and reference columns. It is modified
            in place: `column` is added or overwritten. The default is the
            `df_test` object bound when this function was defined.
        column: Name of the column that receives the translations.
        src: `lang_codes` key whose code is passed to translate() as the
            source language.
        tgt: `lang_codes` key selecting both the target-language code and the
            reference column.
        data_src: `lang_codes` key selecting the column to translate. It may
            differ from `src` so that text in one language is fed to the model
            under another language's code.

    Returns:
        None. BLEU, chrF++ and the mean BERTScore F1 (scaled by 100) are printed.
    """
    # Resolve codes and column names once instead of on every loop iteration.
    src_code, tgt_code = lang_codes[src][1], lang_codes[tgt][1]
    source_column = lang_codes[data_src][0]
    reference_column = lang_codes[tgt][0]

    data[column] = [translate(sentence, src_code, tgt_code)[0] for sentence in tqdm(data[source_column])]

    # Read both lists back once, after the assignment, and share them across all three metrics.
    hypotheses = data[column].tolist()
    references = data[reference_column].tolist()

    bleu_score = bleu_calc.corpus_score(hypotheses, [references]).score
    chrf_score = chrf_calc.corpus_score(hypotheses, [references]).score
    # Only F1 is reported; precision and recall are discarded.
    _, _, f1 = scorer.score(hypotheses, references)
    print(f"BLEU = {bleu_score:.2f} / chrF++ = {chrf_score:.2f} / BERTscoreF1 = {(f1.mean()*100):.2f}")

Test Italian to English translation.

In [None]:
# Baseline: translate the 'italian' column and score against the 'english' references.
# Writes to 'en_translated', replacing any values already stored under that column name.
test_loop(column='en_translated', src='it', tgt='en', data_src='it')

Test English to Italian translation.

In [None]:
# Translate the 'english' column and score against the 'italian' references; output goes to 'it_translated'.
test_loop(column='it_translated', src='en', tgt='it', data_src='en')

Test Ladin to English translation, with Ladin sentences considered as if they were written in Italian.

In [None]:
# Feed the 'ladin' column to the model under the Italian code; output goes to 'en_translated_it'.
test_loop(column='en_translated_it', src='it', tgt='en', data_src='lld')

Test Ladin to English translation, with Ladin sentences considered as if they were written in French.

In [None]:
# Feed the 'ladin' column to the model under the French code; output goes to 'en_translated_fr'.
test_loop(column='en_translated_fr', src='fr', tgt='en', data_src='lld')

Test Ladin to English translation, with Ladin sentences considered as if they were written in Friulian.

In [None]:
# Feed the 'ladin' column to the model under the Friulian code; output goes to 'en_translated_fur'.
test_loop(column='en_translated_fur', src='fur', tgt='en', data_src='lld')