In [1]:
import pandas as pd # Not strictly used, but good for general data manipulation
import os
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM, # For sequence-to-sequence models like translation
    Seq2SeqTrainingArguments, # Specific training args for seq2seq
    Seq2SeqTrainer, # Specific trainer for seq2seq
    DataCollatorForSeq2Seq # Handles padding and shifting labels for seq2seq
)
import evaluate # For metrics like BLEU
import numpy as np

# --- Crucial for OOM issues: Clear CUDA cache at the very start ---
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared at script start!")

  from .autonotebook import tqdm as notebook_tqdm


CUDA cache cleared at script start!


In [2]:
print("--- 1. Configuration Section ---")

# --- Language Pair ---
source_lang = "en" # English
target_lang = "fr" # French
# Note: Ensure the model and dataset support these languages.

# --- Model & Tokenizer Configuration ---
# Using a MarianMT model specifically pre-trained for English to French.
# These models are typically named like "Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
model_checkpoint = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

# Directory to save your fine-tuned model and tokenizer
output_model_dir = "./en-fr-translator-model"

# --- Training Hyperparameters ---
# <<< CRITICAL FOR GPU MEMORY >>>
# Start with a small batch size for translation models due to high memory usage.
# If OOM, reduce further (e.g., 4, then 2).
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3 # Start with a few epochs. Translation can take time.

# Maximum token lengths for input (source) and output (target) sentences.
# Long sentences consume more memory. Keep these reasonable.
max_input_length = 128
max_target_length = 128

# Determine the device for training (GPU if available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

--- 1. Configuration Section ---
Using device: cuda


In [5]:
# --- 2. Data Loading & Initial Inspection ---
print("\n--- 2. Data Loading & Initial Inspection ---")

print(f"Loading 'opus_books' dataset for {source_lang}-{target_lang}...")
raw_datasets = load_dataset("opus_books", f"{source_lang}-{target_lang}")

print(f"Dataset '{source_lang}-{target_lang}' loaded. Available splits: {raw_datasets.keys()}")

# --- IMPORTANT: Select a subset for faster training/testing ---
# This section is made robust to handle missing 'validation' or 'test' splits.
print(f"Selecting subsets from available splits (max train: 50k, max val/test: 5k)...")

subset_raw_datasets = DatasetDict()

# Process 'train' split
if "train" in raw_datasets:
    subset_raw_datasets["train"] = raw_datasets["train"].select(range(min(len(raw_datasets["train"]), 50000)))
else:
    print("Warning: 'train' split not found in dataset. Cannot proceed without training data.")
    # You might want to exit or raise an error here if 'train' is absolutely necessary.

# Process 'validation' split
if "validation" in raw_datasets:
    subset_raw_datasets["validation"] = raw_datasets["validation"].select(range(min(len(raw_datasets["validation"]), 5000)))
else:
    print("Info: 'validation' split not found in dataset.")

# Process 'test' split
if "test" in raw_datasets:
    subset_raw_datasets["test"] = raw_datasets["test"].select(range(min(len(raw_datasets["test"]), 5000)))
else:
    print("Info: 'test' split not found in dataset.")

# Fallback: If 'validation' is missing but 'test' exists, use 'test' as validation
if "train" in subset_raw_datasets and "validation" not in subset_raw_datasets and "test" in subset_raw_datasets:
    print("Using 'test' split as 'validation' split for training as 'validation' was not found.")
    subset_raw_datasets["validation"] = subset_raw_datasets["test"]
elif "train" in subset_raw_datasets and "validation" not in subset_raw_datasets and "test" not in subset_raw_datasets:
    # If neither validation nor test exists, create a small validation set from train
    print("No 'validation' or 'test' split found. Creating a small validation set from 'train'.")
    train_size = len(subset_raw_datasets["train"])
    val_size = min(int(train_size * 0.1), 5000) # Take 10% of train, max 5000
    if train_size > val_size:
        subset_raw_datasets["validation"] = subset_raw_datasets["train"].select(range(train_size - val_size, train_size))
        subset_raw_datasets["train"] = subset_raw_datasets["train"].select(range(train_size - val_size))
    else:
        print("Warning: Training set too small to create a separate validation split.")


# Assign the subsetted datasets back to raw_datasets for the rest of the script
raw_datasets = subset_raw_datasets

print(f"Dataset subset loaded with splits: {raw_datasets.keys()}")
print(raw_datasets) # Show the final structure of the subsetted dataset

# Example inspection of the first training sample (make robust to structure)
if "train" in raw_datasets and len(raw_datasets["train"]) > 0:
    print(f"\nExample from train split:")
    # Check if the structure is like 'translation' dictionary (opus_books) or direct lang keys (NeelB)
    first_example = raw_datasets['train'][0]
    if "translation" in first_example:
        print(f"  {source_lang.upper()}: {first_example['translation'][source_lang]}")
        print(f"  {target_lang.upper()}: {first_example['translation'][target_lang]}")
    else: # Assume direct language keys like in NeelB/nepali_parallel_corpus
        print(f"  {source_lang.upper()}: {first_example[source_lang]}")
        print(f"  {target_lang.upper()}: {first_example[target_lang]}")
else:
    print("\nNo 'train' split or empty 'train' split in subsetted dataset for example inspection.")


--- 2. Data Loading & Initial Inspection ---
Loading 'opus_books' dataset for en-fr...
Dataset 'en-fr' loaded. Available splits: dict_keys(['train'])
Selecting subsets from available splits (max train: 50k, max val/test: 5k)...
Info: 'validation' split not found in dataset.
Info: 'test' split not found in dataset.
No 'validation' or 'test' split found. Creating a small validation set from 'train'.
Dataset subset loaded with splits: dict_keys(['train', 'validation'])
DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 45000
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 5000
    })
})

Example from train split:
  EN: The Wanderer
  FR: Le grand Meaulnes


In [6]:
print(f"\n--- 3. Tokenizer & Model Initialization ---")

# Load AutoTokenizer specific to the chosen model.
# This tokenizer knows how to handle both source and target languages for MarianMT.
print(f"Loading tokenizer for '{model_checkpoint}'...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load AutoModelForSeq2SeqLM: This is the Encoder-Decoder model architecture.
print(f"Loading model '{model_checkpoint}'...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.to(device) # Move model to GPU/CPU
print("Model loaded and moved to device.")


--- 3. Tokenizer & Model Initialization ---
Loading tokenizer for 'Helsinki-NLP/opus-mt-en-fr'...




Loading model 'Helsinki-NLP/opus-mt-en-fr'...
Model loaded and moved to device.


In [7]:
print("\n--- 4. Data Preprocessing (Tokenization & Formatting) ---")

# Preprocessing function to tokenize both source and target sentences.
def preprocess_function(examples):
    # Tokenize source language sentences
    inputs = [ex[source_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length" # Pad to max_input_length
    )

    # Tokenize target language sentences
    labels = [ex[target_lang] for ex in examples["translation"]]
    # The tokenizer used for target should often be the same as for the model.
    # We also add decoder_input_ids for training.
    with tokenizer.as_target_tokenizer(): # Context manager for target language tokenization
        labels = tokenizer(
            labels,
            max_length=max_target_length,
            truncation=True,
            padding="max_length" # Pad to max_target_length
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Preprocessing dataset (tokenizing English and French sentences)...")
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names, # Remove original columns to save memory
    desc=f"Tokenizing {source_lang}-{target_lang} dataset"
)

# Set the format to PyTorch tensors for DataLoader
tokenized_datasets.set_format("torch")

print("Dataset preprocessing complete:")
print(tokenized_datasets)


--- 4. Data Preprocessing (Tokenization & Formatting) ---
Preprocessing dataset (tokenizing English and French sentences)...


Tokenizing en-fr dataset: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45000/45000 [00:11<00:00, 3947.97 examples/s]
Tokenizing en-fr dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 4274.75 examples/s]

Dataset preprocessing complete:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 45000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})





In [8]:
print("\n--- 5. Data Collator & Metrics ---")

# Data collator for sequence-to-sequence tasks.
# It performs dynamic padding and shifts labels for the decoder.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define Evaluation Metrics (BLEU score)
print("Defining evaluation metrics (BLEU)...")
metric = evaluate.load("sacrebleu") # sacrebleu is a robust BLEU implementation

# Function to compute metrics during evaluation.
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # If the model outputs tuples, take the first element (logits)
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 (Hugging Face's default for padding in labels) with tokenizer's pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process the decoded texts (e.g., remove extra whitespace)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels] # sacrebleu expects list of lists for references

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]} # 'score' key contains the BLEU score


--- 5. Data Collator & Metrics ---
Defining evaluation metrics (BLEU)...


In [11]:
print("\n--- 6. Training Setup & Execution ---")

# Define Training Arguments for the Hugging Face Trainer.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_model_dir,
    eval_strategy="epoch",  # Evaluate at the end of each training epoch
    save_strategy="epoch",        # Save model checkpoint at the end of each epoch
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1, #NUM_EPOCHS,
    weight_decay=0.01,
    predict_with_generate=True,   # IMPORTANT: Enables generation during evaluation steps
    fp16=torch.cuda.is_available(), # Enable mixed precision training if GPU available (faster, less VRAM)
    report_to="none",             # Disables integration with external logging tools
    load_best_model_at_end=True,  # Load the best model based on metric_for_best_model
    metric_for_best_model="bleu", # Monitor BLEU score for selecting the best model
    greater_is_better=True,       # For BLEU, a higher score is better
    gradient_checkpointing=True,  # <<< IMPORTANT FOR GPU MEMORY >>> Reduces memory by recomputing activations.
)

# Initialize the Hugging Face Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start the Training Process.
print("\n--- Training Process Initiated ---")
try:
    train_result = trainer.train()
    print("\nTraining complete!")
    # Save the fine-tuned model and its tokenizer
    trainer.save_model(output_model_dir)
    tokenizer.save_pretrained(output_model_dir) # Save the tokenizer too!
    print(f"Fine-tuned model and tokenizer saved to: {output_model_dir}")
except RuntimeError as e:
    print(f"\nERROR: Training failed due to a RuntimeError: {e}")
    print("This often indicates a CUDA out-of-memory error. Try reducing BATCH_SIZE or max_input/target_length.")
except Exception as e:
    print(f"\nERROR: An unexpected error occurred during training: {e}")


--- 6. Training Setup & Execution ---

--- Training Process Initiated ---


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu
1,0.51,0.322316,37.106666


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].



Training complete!
Fine-tuned model and tokenizer saved to: ./en-fr-translator-model


In [12]:
print("\n--- 7. Final Evaluation & Inference ---")

# Evaluate the model on the held-out test set (if available).
if "test" in tokenized_datasets:
    print("Evaluating on the held-out test set...")
    test_results = trainer.evaluate(tokenized_datasets["test"])
    print(f"Test Set Evaluation Results: {test_results}")
else:
    print("No 'test' split available for final evaluation.")

print("\n--- Quick Inference Test with Pipeline ---")
from transformers import pipeline

try:
    # Load the fine-tuned model and tokenizer into a Hugging Face pipeline.
    # The 'translation_en_to_fr' pipeline automatically handles tokenization,
    # model inference, and text decoding for translation.
    translator = pipeline(
        f"translation_{source_lang}_to_{target_lang}",
        model=output_model_dir, # Loads your saved model
        tokenizer=output_model_dir, # Loads your saved tokenizer
        device=0 if torch.cuda.is_available() else -1 # Use GPU 0 if available, else CPU
    )

    sample_texts = [
        "Hello, how are you today?",
        "This is a great example of machine translation.",
        "The quick brown fox jumps over the lazy dog."
    ]

    print("\nTranslating sample texts:")
    for i, text in enumerate(sample_texts):
        translation = translator(text)
        print(f"  Original {source_lang}: {text}")
        print(f"  Translated {target_lang}: {translation[0]['translation_text']}")
        print("-" * 30)

except Exception as e:
    print(f"\nERROR during quick inference test: {e}")
    print("Please ensure the model and tokenizer were saved correctly and `pipeline` can be initialized.")


--- 7. Final Evaluation & Inference ---
No 'test' split available for final evaluation.

--- Quick Inference Test with Pipeline ---


Device set to use cuda:0



Translating sample texts:
  Original en: Hello, how are you today?
  Translated fr: Bonjour, comment allez-vous aujourd'hui?
------------------------------
  Original en: This is a great example of machine translation.
  Translated fr: Cest un grand exemple de traduction automatique.
------------------------------
  Original en: The quick brown fox jumps over the lazy dog.
  Translated fr: Le rapide renard brun saute sur le chien paresseux.
------------------------------
