# Testing the base Model (en-fr) in the test_dataset (fr)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Configuring the environment

In [2]:
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
C

In [3]:
!pip install -q sacremoses

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
os.environ["WANDB_DISABLED"]="true"

In [5]:
import os
import random
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer
from transformers import MarianMTModel, MarianTokenizer
import torch
from tqdm import tqdm

In [8]:
# Configuration
en_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-fr.clean.en'
fr_path = '/content/drive/MyDrive/Machine_Translation/POSTER/JRC-Acquis.en-fr.clean.fr'
model_name = "Helsinki-NLP/opus-mt-en-fr"
max_length = 128
random.seed(42)

# Read and clean the files
with open(en_path, 'r', encoding='utf-8') as f_en, open(fr_path, 'r', encoding='utf-8') as f_fr:
    en_lines = f_en.readlines()
    fr_lines = f_fr.readlines()

pairs = [(en.strip(), fr.strip()) for en, fr in zip(en_lines, fr_lines) if en.strip() and fr.strip()]
random.shuffle(pairs)

# Split into train/dev/test
train_pairs = pairs[:80_000]
dev_pairs = pairs[80_000:90_000]
test_pairs = pairs[90_000:100_000]

# Create Hugging Face Datasets
def make_dataset(pairs):
    return Dataset.from_list([{"translation": {"en": en, "fr": fr}} for en, fr in pairs])

dataset = DatasetDict({
    "train": make_dataset(train_pairs),
    "validation": make_dataset(dev_pairs),
    "test": make_dataset(test_pairs)
})

# Tokenization
tokenizer = MarianTokenizer.from_pretrained(model_name)


def preprocess(examples):
    src_texts = [ex["en"] for ex in examples["translation"]]
    tgt_texts = [ex["fr"] for ex in examples["translation"]]

    model_inputs = tokenizer(src_texts, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(preprocess, batched=True)


print("Done: tokenized_dataset['train'], ['validation'], ['test'] are ready.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Done: tokenized_dataset['train'], ['validation'], ['test'] are ready.


In [None]:
# Load the fine-tuned model and tokenizer
model_dir = "Helsinki-NLP/opus-mt-en-fr"
model = MarianMTModel.from_pretrained(model_dir)
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Extracting raw English and French sentences
raw_test_en = [ex["translation"]["en"] for ex in dataset["test"]]
raw_test_fr = [ex["translation"]["fr"] for ex in dataset["test"]]

# Generation of the translations with progress bar
translated_fr = []

for sentence in tqdm(raw_test_en, desc="Translating"):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, num_beam=4)
    translated = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    translated_fr.append(translated)

Translating: 100%|██████████| 10000/10000 [54:27<00:00,  3.06it/s]


In [None]:
import evaluate

bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=translated_fr, references=[[ref] for ref in raw_test_fr])
print("BLEU score on test set:", results["score"])

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

BLEU score on test set: 58.52396979581673
