# Fine-tuning the ROMANCE model with the 80k training dataset only in french

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets evaluate wandb sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: p

In [3]:
!pip install -q sacremoses

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m604.2/897.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

Found existing installation: wandb 0.19.11
Uninstalling wandb-0.19.11:
  Would remove:
    /usr/local/bin/wandb
    /usr/local/bin/wb
    /usr/local/lib/python3.11/dist-packages/package_readme.md
    /usr/local/lib/python3.11/dist-packages/wandb-0.19.11.dist-info/*
    /usr/local/lib/python3.11/dist-packages/wandb/*
Proceed (Y/n)? Y
  Successfully uninstalled wandb-0.19.11


In [4]:
import os
import random
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer
from transformers import MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from transformers import EarlyStoppingCallback
import evaluate
import torch
from tqdm import tqdm

In [6]:
# Configuration
en_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-fr.clean.en'
fr_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-fr.clean.fr'
model_name = "Helsinki-NLP/opus-mt-en-fr"
max_length = 128
random.seed(42)

# Read and clean the files
with open(en_path, 'r', encoding='utf-8') as f_en, open(fr_path, 'r', encoding='utf-8') as f_fr:
    en_lines = f_en.readlines()
    fr_lines = f_fr.readlines()

pairs = [(en.strip(), fr.strip()) for en, fr in zip(en_lines, fr_lines) if en.strip() and fr.strip()]
random.shuffle(pairs)

# Split into train/dev/test
train_pairs = pairs[:80_000]
dev_pairs = pairs[80_000:90_000]
test_pairs = pairs[90_000:100_000]

# Add >>fr<< tag to test set only
test_pairs_tagged = [(f'>>fr<< {en}', fr) for en, fr in test_pairs]

# Create Hugging Face Datasets
def make_dataset(pairs):
    return Dataset.from_list([{"translation": {"en": en, "fr": fr}} for en, fr in pairs])

dataset = DatasetDict({
    "train": make_dataset(train_pairs),
    "validation": make_dataset(dev_pairs),
    "test": make_dataset(test_pairs_tagged)
})

# Tokenization
tokenizer = MarianTokenizer.from_pretrained(model_name)


def preprocess(examples):
    src_texts = [ex["en"] for ex in examples["translation"]]
    tgt_texts = [ex["fr"] for ex in examples["translation"]]

    model_inputs = tokenizer(src_texts, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(preprocess, batched=True)

print("Done: `tokenized_dataset['train']`, `['validation']`, `['test']` are ready.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Done: `tokenized_dataset['train']`, `['validation']`, `['test']` are ready.


In [None]:
test = dataset["test"]

In [None]:
print(test[100])

{'translation': {'en': '>>fr<< For financial year 2002 the monetary reserve shall be reduced to EUR 250 million.', 'fr': "Pour l'exercice budgétaire 2002, le montant de la réserve monétaire est ramené à 250 millions d'euros."}}


In [None]:
# Load pretrained model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./marianmt-en-fr-finetuned-ROMmodel",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    load_best_model_at_end=True,
    report_to = None
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/799k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [None]:
# Fine-tuning
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8481,0.522841
2,0.483,0.426235
3,0.403,0.39997


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=15000, training_loss=0.5780505615234375, metrics={'train_runtime': 2652.0511, 'train_samples_per_second': 90.496, 'train_steps_per_second': 5.656, 'total_flos': 8135607582720000.0, 'train_loss': 0.5780505615234375, 'epoch': 3.0})

Testing

In [None]:
# Load the fine-tuned model and tokenizer
model_dir = "/content/marianmt-en-fr-finetuned-ROMmodel/checkpoint-15000"
model = MarianMTModel.from_pretrained(model_dir)
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Extracting raw English and French sentences
raw_test_en = [ex["translation"]["en"] for ex in dataset["test"]]
raw_test_fr = [ex["translation"]["fr"] for ex in dataset["test"]]

# Generation of the translations with progress bar
translated_fr = []

for sentence in tqdm(raw_test_en, desc="Translating"):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, num_beams=4)
    translated = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    translated_fr.append(translated)


Translating: 100%|██████████| 10000/10000 [1:23:10<00:00,  2.00it/s]


In [None]:
# Save the translated sentences to a file
with open("/content/FTModel_test_generated.fr", "w", encoding="utf-8") as f:
    for line in translated_fr:
        f.write(line + "\n")


In [None]:
import evaluate

bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=translated_fr, references=[[ref] for ref in raw_test_fr])
print("BLEU score on test set:", results["score"])

BLEU score on test set: 1.226575953823729


In [None]:
print(translated_fr[100:200])

['for For l Or fairly " fonds de ourselves Nous>>es<< em quem que EUR 250 million.', 'los wanted 2001 l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l sua l introduce l sua l sua l introduce l sua l introduce l sua l introduce l introduce l sua l introduce l introduce l introduce l sua l introduce l introduce l introduce l introduce l introduce l introduce l introduce l introduce l introduce l introduce l sua l introduce l introduce l introduce l introduce l introduce l introduce l introduce l introduce l introduce l sua l introduce l introduce l', 'vista les? through I Council Regulation (CEE) No 804/68 de quand June 68, el of necessary système, of principal (á de products la de products la de products (1),>>es<< commeated por Regulation (CEE) No 15 5784, la les? through de their article 13 (3) la de their article,', 'si con with home>>fr<<s evaluated ets can ainsi ets can ainsi ets can ainsi et

In [None]:
print(test[100:200])

{'translation': [{'en': '>>fr<< For financial year 2002 the monetary reserve shall be reduced to EUR 250 million.', 'fr': "Pour l'exercice budgétaire 2002, le montant de la réserve monétaire est ramené à 250 millions d'euros."}, {'en': '>>fr<< amending Decision 2001/76/EC in respect of export credits for ships', 'fr': "modifiant la décision 2001/76/CE en ce qui concerne les crédits à l'exportation de navires"}, {'en': '>>fr<< Having regard to Council Regulation (EEC) No 804/68 of 27 June 1968 on the common organization of the market in milk and milk products (1), as last amended by Regulation (EEC) No 1557/84 (2), and in particular Articles 13 (3) and 17 (4) thereof,', 'fr': 'vu le règlement (CEE) no 804/68 du Conseil, du 27 juin 1968, portant organisation commune des marchés dans le secteur du lait et des produits laitiers (1), modifié en dernier lieu par le règlement (CEE) no 1557/84 (2), et notamment son article 13 para- graphe 3 et son article 17 paragraphe 4,'}, {'en': '>>fr<< Whe

In [None]:
print(dataset["test"][100])

{'translation': {'en': '>>fr<< For financial year 2002 the monetary reserve shall be reduced to EUR 250 million.', 'fr': "Pour l'exercice budgétaire 2002, le montant de la réserve monétaire est ramené à 250 millions d'euros."}}
