## Fine-tuning the Romance Model with 80k train_dataset (20k per language: en, fr, pt, it)

## Also : Romance model baseline with the french test_dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Configuring the environment

In [2]:
!pip install datasets evaluate sacremoses sentencepiece transformers wandb

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses, evaluate
Successfully installed evaluate-0.4.3 sacremoses-0.1.1


In [None]:
! pip uninstall wandb
import os
os.environ["WANDB_DISABLED"] = "true"

Found existing installation: wandb 0.19.11
Uninstalling wandb-0.19.11:
  Would remove:
    /usr/local/bin/wandb
    /usr/local/bin/wb
    /usr/local/lib/python3.11/dist-packages/package_readme.md
    /usr/local/lib/python3.11/dist-packages/wandb-0.19.11.dist-info/*
    /usr/local/lib/python3.11/dist-packages/wandb/*
Proceed (Y/n)? Y
  Successfully uninstalled wandb-0.19.11


### Imports

In [3]:
import os
import random
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer
from transformers import MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from transformers import EarlyStoppingCallback
import evaluate

In [4]:
# FRENCH DATASET

# Configuration
en_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-fr.clean.en'
fr_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-fr.clean.fr'
model_name =  'Helsinki-NLP/opus-mt-en-ROMANCE'
max_length = 128
random.seed(42)

# Read and clean the files
with open(en_path, 'r', encoding='utf-8') as f_en, open(fr_path, 'r', encoding='utf-8') as f_fr:
    en_lines = f_en.readlines()
    fr_lines = f_fr.readlines()

pairs = [
    (f">>fr<< {en.strip()}", fr.strip())
    for en, fr in zip(en_lines, fr_lines)
    if en.strip() and fr.strip()
]

random.shuffle(pairs)

# Split into train/dev/test
train_pairs = pairs[:20_000]
dev_pairs = pairs[20_000:22_500]
test_pairs = pairs[90_000:100_000]

# Create Hugging Face Datasets
def make_dataset(pairs):
    return Dataset.from_list([{"translation": {"en": en, "fr": fr}} for en, fr in pairs])

dataset = DatasetDict({
    "train": make_dataset(train_pairs),
    "validation": make_dataset(dev_pairs),
    "test": make_dataset(test_pairs)
})

raw_test_dataset = dataset["test"]

# Tokenization
tokenizer = MarianTokenizer.from_pretrained(model_name)


def preprocess(examples):
    src_texts = [ex["en"] for ex in examples["translation"]]
    tgt_texts = [ex["fr"] for ex in examples["translation"]]

    model_inputs = tokenizer(src_texts, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset_fr = dataset.map(preprocess, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/799k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
print(tokenized_dataset_fr['test'][100:200])

{'translation': [{'en': '>>fr<< For financial year 2002 the monetary reserve shall be reduced to EUR 250 million.', 'fr': "Pour l'exercice budgétaire 2002, le montant de la réserve monétaire est ramené à 250 millions d'euros."}, {'en': '>>fr<< amending Decision 2001/76/EC in respect of export credits for ships', 'fr': "modifiant la décision 2001/76/CE en ce qui concerne les crédits à l'exportation de navires"}, {'en': '>>fr<< Having regard to Council Regulation (EEC) No 804/68 of 27 June 1968 on the common organization of the market in milk and milk products (1), as last amended by Regulation (EEC) No 1557/84 (2), and in particular Articles 13 (3) and 17 (4) thereof,', 'fr': 'vu le règlement (CEE) no 804/68 du Conseil, du 27 juin 1968, portant organisation commune des marchés dans le secteur du lait et des produits laitiers (1), modifié en dernier lieu par le règlement (CEE) no 1557/84 (2), et notamment son article 13 para- graphe 3 et son article 17 paragraphe 4,'}, {'en': '>>fr<< Whe

In [5]:
# SPANISH DATASET

# Configuration

en_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-es.clean.en'
es_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-es.clean.es'
model_name =  'Helsinki-NLP/opus-mt-en-ROMANCE'
max_length = 128
random.seed(42)

# Read and clean the files
with open(en_path, 'r', encoding='utf-8') as f_en, open(es_path, 'r', encoding='utf-8') as f_es:
    en_lines = f_en.readlines()
    es_lines = f_es.readlines()

pairs = [
    (f">>es<< {en.strip()}", es.strip())
    for en, es in zip(en_lines, es_lines)
    if en.strip() and es.strip()
]

random.shuffle(pairs)

# Split into train/dev/test
train_pairs = pairs[:20_000]
dev_pairs = pairs[20_000:22_500]


# Create Hugging Face Datasets
def make_dataset(pairs):
    return Dataset.from_list([{"translation": {"en": en, "es": es}} for en, es in pairs])

dataset = DatasetDict({
    "train": make_dataset(train_pairs),
    "validation": make_dataset(dev_pairs),
})

# Tokenization
tokenizer = MarianTokenizer.from_pretrained(model_name)


def preprocess(examples):
    src_texts = [ex["en"] for ex in examples["translation"]]
    tgt_texts = [ex["es"] for ex in examples["translation"]]

    model_inputs = tokenizer(src_texts, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset_es = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [None]:
print(tokenized_dataset_es['train'][8059])
print(tokenized_dataset_es['validation'][68])

{'translation': {'en': '>>es<< Any formulator may identify an existing active substance in accordance with the first subparagraph, except for the requirements in points 5 and 6 of Annex I.', 'es': 'Cualquier formulador podrá identificar una sustancia activa existente de conformidad con el párrafo primero, excepto respecto de los requisitos que figuran en los puntos 5 y 6 del anexo I.'}, 'input_ids': [16, 4299, 1024, 20210, 265, 4167, 90, 1961, 1773, 4826, 13, 867, 44, 4, 313, 8765, 2, 4371, 24, 4, 1199, 13, 1298, 253, 10, 315, 8, 1669, 22, 3, 0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 

In [6]:
# ITALIAN DATASET


# Configuration
en_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-it.clean.en'
it_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-it.clean.it'
model_name =  'Helsinki-NLP/opus-mt-en-ROMANCE'
max_length = 128
random.seed(42)

# Read and clean the files
with open(en_path, 'r', encoding='utf-8') as f_en, open(it_path, 'r', encoding='utf-8') as f_it:
    en_lines = f_en.readlines()
    it_lines = f_it.readlines()

pairs = [
    (f">>it<< {en.strip()}", it.strip())
    for en, it in zip(en_lines, it_lines)
    if en.strip() and it.strip()
]

random.shuffle(pairs)

# Split into train/dev/test
train_pairs = pairs[:20_000]
dev_pairs = pairs[20_000:22_500]


# Create Hugging Face Datasets
def make_dataset(pairs):
    return Dataset.from_list([{"translation": {"en": en, "it": it}} for en, it in pairs])

dataset = DatasetDict({
    "train": make_dataset(train_pairs),
    "validation": make_dataset(dev_pairs),
})

# Tokenization
tokenizer = MarianTokenizer.from_pretrained(model_name)


def preprocess(examples):
    src_texts = [ex["en"] for ex in examples["translation"]]
    tgt_texts = [ex["it"] for ex in examples["translation"]]

    model_inputs = tokenizer(src_texts, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset_it = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [None]:
print(tokenized_dataset_it['train'][8059])
print(tokenized_dataset_it['validation'][68])

{'translation': {'en': '>>it<< (4) For the exclusion of interbank liabilities from the reserve base, any standard deduction to be applied to liabilities with a maturity of up to two years within the debt securities category should be based on the euro area-wide macro ratio between (i) the stock of relevant instruments issued by credit institutions and held by other credit institutions and by the ECB and participating NCBs and (ii) the total amount outstanding of such instruments issued by credit institutions.', 'it': "(4) Al fine di escludere le passività interbancarie dall'aggregato soggetto a riserva, ogni detrazione forfettaria applicabile alle passività con scadenza fino a due anni rientranti nella categoria dei titoli di debito deve essere determinata sulla base di un macrocoefficiente relativo all'intera area dell'euro, ottenuto come rapporto tra i) l'ammontare dei corrispondenti strumenti emessi dagli enti creditizi e detenuti da altri enti creditizi, nonché dalla BCE e dalle BC

In [7]:
# PORTUGUESE DATASET


# Configuration
en_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-pt.clean.en'
pt_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-pt.clean.pt'
model_name =  'Helsinki-NLP/opus-mt-en-ROMANCE'
max_length = 128
random.seed(42)

# Read and clean the files
with open(en_path, 'r', encoding='utf-8') as f_en, open(pt_path, 'r', encoding='utf-8') as f_pt:
    en_lines = f_en.readlines()
    pt_lines = f_pt.readlines()

pairs = [
    (f">>pt<< {en.strip()}", pt.strip())
    for en, pt in zip(en_lines, pt_lines)
    if en.strip() and pt.strip()
]

random.shuffle(pairs)

# Split into train/dev/test
train_pairs = pairs[:20_000]
dev_pairs = pairs[20_000:22_500]


# Create Hugging Face Datasets
def make_dataset(pairs):
    return Dataset.from_list([{"translation": {"en": en, "pt": pt}} for en, pt in pairs])

dataset = DatasetDict({
    "train": make_dataset(train_pairs),
    "validation": make_dataset(dev_pairs),
})


tokenizer = MarianTokenizer.from_pretrained(model_name)


def preprocess(examples):
    src_texts = [ex["en"] for ex in examples["translation"]]
    tgt_texts = [ex["pt"] for ex in examples["translation"]]

    model_inputs = tokenizer(src_texts, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset_pt = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [None]:
print(tokenized_dataset_pt['train'][8059])
print(tokenized_dataset_pt['validation'][68])

{'translation': {'en': '>>pt<< [6] As an indication, the number of projects/programmes approved under MEDA-I (support to EIB-interventions excluded) for Egypt was 4, for Jordan 6, and for Morocco 26.', 'pt': '[6] A título indicativo, o número de projectos/programas aprovados no âmbito do MEDA I (excluindo o apoio às intervenções do BEI) elevou-se a 4 para o Egipto, 6 para a Jordânia e 26 para Marrocos.'}, 'input_ids': [45, 25655, 351, 90, 10738, 2, 4, 467, 8, 1339, 97, 10708, 6, 2628, 294, 20108, 156, 15, 404, 23, 40357, 12, 19314, 15, 2796, 10568, 6, 12080, 39, 24, 7299, 77, 2493, 24, 11545, 3664, 10, 24, 12322, 20714, 0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000

In [None]:
from datasets import concatenate_datasets

# Rename the translation fields
def standardize_translation(example, tgt_lang):
    return {
        "translation": {
            "en": example["translation"]["en"],
            "tgt": example["translation"][tgt_lang]
        },
        "input_ids": example["input_ids"],
        "attention_mask": example["attention_mask"],
        "labels": example["labels"]
    }


langs = {
    "fr": tokenized_dataset_fr["train"],
    "it": tokenized_dataset_it["train"],
    "es": tokenized_dataset_es["train"],
    "pt": tokenized_dataset_pt["train"]
}

standardized_datasets = []

for lang_code, dataset in langs.items():
    standardized = dataset.map(lambda x: standardize_translation(x, tgt_lang=lang_code))
    standardized_datasets.append(standardized)

# Concatenate the separate datasets into a multilingual training set
multilingual_train = concatenate_datasets(standardized_datasets)

print(f"Dataset size: {len(multilingual_train)}")

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset size: 80000


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
print(multilingual_train[500])

{'translation': {'en': '>>fr<< Water shall be considered to be "cold" when its temperature is in the range 0º to 30º C.', 'tgt': "L'eau est dite froide lorsque sa température est comprise entre 0 ºC et 30 ºC."}, 'input_ids': [14, 5331, 255, 48, 1233, 12, 48, 58, 46069, 138, 271, 124, 6194, 28, 13, 4, 2007, 708, 76, 12, 388, 76, 141, 3, 0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 6

In [None]:
# Standardization function
def standardize_translation(example, tgt_lang):
    return {
        "translation": {
            "en": example["translation"]["en"],
            "tgt": example["translation"][tgt_lang]
        },
        "input_ids": example["input_ids"],
        "attention_mask": example["attention_mask"],
        "labels": example["labels"]
    }

# Language-specific validation datasets
langs_val = {
    "fr": tokenized_dataset_fr["validation"],
    "it": tokenized_dataset_it["validation"],
    "es": tokenized_dataset_es["validation"],
    "pt": tokenized_dataset_pt["validation"]
}

standardized_val_datasets = []

for lang_code, dataset in langs_val.items():
    standardized = dataset.map(lambda x: standardize_translation(x, tgt_lang=lang_code))
    standardized_val_datasets.append(standardized)

# Concatenate them into a multilingual validation set
multilingual_validation = concatenate_datasets(standardized_val_datasets)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [None]:
print(multilingual_validation[6000])

{'translation': {'en': '>>es<< For each category of contracting authority which is not given in Annex IV, the statistical report shall detail at least:', 'tgt': 'Por cada categoría de poderes adjudicadores distintos de los que figuran en el anexo IV, el informe estadístico precisará como mínimo:'}, 'input_ids': [16, 412, 539, 3236, 8, 17677, 2080, 106, 28, 78, 877, 13, 1669, 3132, 2, 4, 7511, 416, 255, 6973, 86, 1138, 37, 0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 6500

### **TRAINING**

In [None]:
# Load pretrained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./marianmt-MULTILING-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    load_best_model_at_end=True,
    report_to = None
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=multilingual_train,
    eval_dataset=multilingual_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

In [None]:
# Fine-tuning
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2405,0.206413
2,0.1839,0.199874
3,0.1552,0.196652


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=15000, training_loss=0.19320848388671874, metrics={'train_runtime': 2655.3273, 'train_samples_per_second': 90.384, 'train_steps_per_second': 5.649, 'total_flos': 8135607582720000.0, 'train_loss': 0.19320848388671874, 'epoch': 3.0})

In [None]:
print(raw_test_dataset[100])

{'translation': {'en': '>>fr<< For financial year 2002 the monetary reserve shall be reduced to EUR 250 million.', 'fr': "Pour l'exercice budgétaire 2002, le montant de la réserve monétaire est ramené à 250 millions d'euros."}}


In [None]:
def rename_fr_to_tgt(example):
    example["translation"]["tgt"] = example["translation"].pop("fr")
    return example

test_dataset = raw_test_dataset.map(rename_fr_to_tgt)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
print(test_dataset[100])

{'translation': {'en': '>>fr<< For financial year 2002 the monetary reserve shall be reduced to EUR 250 million.', 'tgt': "Pour l'exercice budgétaire 2002, le montant de la réserve monétaire est ramené à 250 millions d'euros."}}


Testing

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch
from tqdm import tqdm

# Load the fine-tuned model and tokenizer
model_dir = "/content/marianmt-MULTILING-finetuned/checkpoint-15000"
model = MarianMTModel.from_pretrained(model_dir)
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Extracting raw English and French sentences
raw_test_en = [ex["translation"]["en"] for ex in test_dataset]
raw_test_fr = [ex["translation"]["tgt"] for ex in test_dataset]

# Generation of the translations with progress bar
translated_fr = []

for sentence in tqdm(raw_test_en, desc="Translating"):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, num_beams=4)
    translated = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    translated_fr.append(translated)


# Save the translated sentences to a file
with open("/content/FT-Multiling_Model_test_generated.fr", "w", encoding="utf-8") as f:
    for line in translated_fr:
        f.write(line + "\n")



Translating: 100%|██████████| 10000/10000 [58:26<00:00,  2.85it/s]


In [None]:
print(translated_fr[100:200])

["Pour l'exercice 2002, la réserve monétaire est ramenée à 250 millions d'euros.", "modifiant la décision 2001/76/CE en ce qui concerne les crédits à l'exportation pour les navires", 'vu le règlement (CEE) no 804/68 du Conseil, du 27 juin 1968, portant organisation commune des marchés dans le secteur du lait et des produits laitiers (1), modifié en dernier lieu par le règlement (CEE) no 1557/84 (2), et notamment son article 13 paragraphe 3 et son article 17 paragraphe 4,', "considérant que des renseignements complémentaires ont été demandés concernant certaines dénominations notifiées par les États membres en vertu de l'article 17 du règlement (CEE) n° 2081/92 du Conseil afin de s'assurer qu'elles satisfont aux dispositions des articles 2 et 4 dudit règlement; que ces renseignements complémentaires montrent que les dénominations sont conformes aux dispositions desdits articles; qu'il convient, dès lors, d'enregistrer et d'en ajouter à l'annexe du règlement (CE) n° 1107/96 de la Commiss

In [None]:
bleu = evaluate.load("sacrebleu")
references = [[ex["translation"]["tgt"]] for ex in test_dataset]
results = bleu.compute(predictions=translated_fr, references=references)
print("BLEU score on test set:", results["score"])

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

BLEU score on test set: 59.72078409406448


## 2. **Testing MarianMT ROMANCE Base**

In [None]:
!pip install -q sacremoses
from transformers import MarianMTModel, MarianTokenizer
import torch
from tqdm import tqdm

In [None]:
model_dir = 'Helsinki-NLP/opus-mt-en-ROMANCE'
model = MarianMTModel.from_pretrained(model_dir)
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Extracting raw English and French sentences
raw_test_en = [ex["translation"]["en"] for ex in test_dataset]
raw_test_fr = [ex["translation"]["tgt"] for ex in test_dataset]

# Generation of the translations with progress bar
translated_fr = []

for sentence in tqdm(raw_test_en, desc="Translating"):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, num_beams=4)
    translated = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    translated_fr.append(translated)


# Save the translated sentences to a file
with open("/content/BASE-Multiling_Model_test_generated.fr", "w", encoding="utf-8") as f:
    for line in translated_fr:
        f.write(line + "\n")


Translating: 100%|██████████| 10000/10000 [1:00:41<00:00,  2.75it/s]


In [None]:
print(translated_fr[100:200])

["Pour l'exercice 2002, la réserve monétaire est ramenée à 250 millions d'euros.", "modifiant la décision 2001/76/CE en ce qui concerne les crédits à l'exportation pour les navires", 'vu le règlement (CEE) n° 804/68 du Conseil, du 27 juin 1968, portant organisation commune des marchés dans le secteur du lait et des produits laitiers (1), modifié en dernier lieu par le règlement (CEE) n° 1557/84 (2), et notamment son article 13 paragraphe 3 et son article 17 paragraphe 4,', "considérant que des informations complémentaires ont été demandées concernant certaines désignations notifiées par les États membres en vertu de l'article 17 du règlement (CEE) n° 2081/92 du Conseil afin de s'assurer qu'elles sont conformes aux articles 2 et 4 dudit règlement; que ces informations complémentaires montrent que les désignations sont conformes auxdits articles; qu'il convient dès lors de les enregistrer et de les ajouter à l'annexe du règlement (CE) n° 1107/96 de la Commission (2), modifié par le règle

In [None]:
!pip install sacrebleu
!pip install evaluate
import evaluate
bleu = evaluate.load("sacrebleu")
references = [[ex["translation"]["tgt"]] for ex in test_dataset]
results = bleu.compute(predictions=translated_fr, references=references)
print("BLEU score on test set:", results["score"])

BLEU score on test set: 58.06101865925001
