Install Libraries

In [None]:
!pip install -U transformers datasets sentencepiece sacrebleu evaluate accelerate
!pip install torch --index-url https://download.pytorch.org/whl/cu121


Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading datasets-4.5.0-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 

Dataset Load & Cleaning (EXACT file)

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the parallel corpus from the Excel workbook.
df = pd.read_excel("Marma dataset.xlsx")

# Rename columns to the generic source/target names used by the rest of the notebook.
df = df.rename(columns={
    "Marma": "source",
    "Bangla": "target"
})

# Drop missing rows BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", which a later dropna() can no longer detect.
# assign() works on a fresh copy, so no chained-assignment warning is raised.
df = df.dropna().assign(
    source=lambda frame: frame["source"].astype(str).str.strip(),
    target=lambda frame: frame["target"].astype(str).str.strip(),
)

# Discard pairs where either side is empty once whitespace has been stripped.
df = df[(df["source"] != "") & (df["target"] != "")].reset_index(drop=True)

print(df.head())
print("Total sentences:", len(df))

                            target  \
0                আমি সকালে চা খাই।   
1             তুমি দুপুরে ভাত খাও।   
2                 সে  জল পান করছে।   
3  আমরা সন্ধ্যায় বাড়িতে ফিরে আসি।   
4            তারা রাতে ঘুমাতে যায়।   

                                              source  
0                       ငါ မနက်မှာ လက်ဖက်ရည် စားတယ်။  
1                       နင် မနေ့လည်မှာ ထမင်း စားတယ်။  
2                                  သူ ရေ သောက်နေတယ်။  
3  ကျွန်ုပ်တို့သည် ညနေခင်းတွင် အိမ်သို့ ပြန်လာကြသည်။  
4                      သူတို့ ညမှာ အိပ်ဖို့ သွားတယ်။  
Total sentences: 2099


Train / Validation / Test Split (80 / 10 / 10)

In [None]:
# Hold out 10% of the corpus as the test split.
full_ds = Dataset.from_pandas(df)
holdout_split = full_ds.train_test_split(test_size=0.1, seed=42)

# Carve the validation split out of the remaining 90%:
# 0.1111 of 90% ≈ 10% of the whole corpus.
train_val_split = holdout_split["train"].train_test_split(test_size=0.1111, seed=42)

dataset = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
    "test": holdout_split["test"],
})

dataset


DatasetDict({
    train: Dataset({
        features: ['target', 'source'],
        num_rows: 1679
    })
    validation: Dataset({
        features: ['target', 'source'],
        num_rows: 210
    })
    test: Dataset({
        features: ['target', 'source'],
        num_rows: 210
    })
})

mT5 Tokenizer + Prefix

In [None]:
from transformers import MT5Tokenizer, MT5ForConditionalGeneration

# One checkpoint id shared by the tokenizer and the model.
MT5_CHECKPOINT = "google/mt5-small"

tokenizer_mt5 = MT5Tokenizer.from_pretrained(MT5_CHECKPOINT)
model_mt5 = MT5ForConditionalGeneration.from_pretrained(MT5_CHECKPOINT)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def preprocess_mt5(batch):
    """Tokenize a batch of source/target sentence pairs for mT5 fine-tuning.

    Args:
        batch: mapping with a "source" list and a "target" list of strings.

    Returns:
        The tokenizer output for the prefixed source sentences, with an added
        "labels" entry holding the target token ids. Every sequence is padded
        or truncated to 128 tokens; padding positions in "labels" are set to
        -100 so they are excluded from the training loss.
    """
    inputs = ["translate Marma to Bangla: " + s for s in batch["source"]]

    model_inputs = tokenizer_mt5(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer_mt5(
        batch["target"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Labels are padded with the pad token id; left as-is, the loss would be
    # computed over padding too. -100 marks those positions as ignored.
    pad_id = tokenizer_mt5.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Apply preprocess_mt5 to the dataset in batched mode.
tokenized_mt5 = dataset.map(preprocess_mt5, batched=True)


Map:   0%|          | 0/1679 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Data Collator (mT5)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the mT5 trainer.
data_collator_mt5 = DataCollatorForSeq2Seq(tokenizer=tokenizer_mt5, model=model_mt5)


mT5 Trainer

In [None]:
from transformers import (
    GenerationConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainingArguments,
)

# Seq2SeqTrainingArguments is the arguments class Seq2SeqTrainer is designed
# for: it accepts predict_with_generate and the generation_* options directly,
# so no attributes need to be patched onto a plain TrainingArguments object.
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # fp16 mixed precision is a known source of overflow/NaN losses with
    # T5-family models, so train in full precision.
    fp16=False,
    predict_with_generate=True,   # generate sequences during evaluation/prediction
    generation_max_length=128,    # same cap as the tokenization max_length
    generation_num_beams=1,
    push_to_hub=False,
    report_to=["none"]
)

trainer_mt5 = Seq2SeqTrainer(
    model=model_mt5,
    args=training_args,
    train_dataset=tokenized_mt5["train"],
    eval_dataset=tokenized_mt5["validation"],
    tokenizer=tokenizer_mt5,
    data_collator=data_collator_mt5
)

# No compute_metrics is attached to this trainer.

  trainer_mt5 = Seq2SeqTrainer(


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer_mt5.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'pad_token_id': 0}.


Step,Training Loss


TrainOutput(global_step=315, training_loss=0.0, metrics={'train_runtime': 142.7535, 'train_samples_per_second': 35.285, 'train_steps_per_second': 2.207, 'total_flos': 665828048240640.0, 'train_loss': 0.0, 'epoch': 3.0})

Evaluate on the Test Set

In [None]:
# Evaluate on the held-out test split (num_beams=1, at most 128 generated tokens).
mt5_results = trainer_mt5.evaluate(
    eval_dataset=tokenized_mt5["test"],
    max_length=128,
    num_beams=1,
)
print(mt5_results)

{'eval_loss': nan, 'eval_runtime': 1.5451, 'eval_samples_per_second': 135.91, 'eval_steps_per_second': 9.061, 'epoch': 3.0}


Inference Test (mT5)

In [None]:
text = "ဒီနေ့ အရမ်းပူတယ်"

# Put the model in inference mode explicitly: if this cell runs straight after
# training, the model may still be in training mode with dropout active.
model_mt5.eval()

# Same prefix and length cap as in preprocess_mt5.
inputs = tokenizer_mt5(
    "translate Marma to Bangla: " + text,
    max_length=128,
    truncation=True,
    return_tensors="pt"
).to(model_mt5.device)

# The decoder start token is passed explicitly and set to the tokenizer's pad
# token id, so generation does not depend on the model's generation config.
outputs = model_mt5.generate(
    **inputs,
    max_length=100,
    decoder_start_token_id=tokenizer_mt5.pad_token_id
)

print(tokenizer_mt5.decode(outputs[0], skip_special_tokens=True))




mT5 Metrics

In [None]:
!pip install evaluate nltk



In [None]:
import nltk
# Download the "wordnet" and "omw-1.4" NLTK packages; the return value of the
# last call is what the cell displays.
nltk.download("wordnet")
nltk.download("omw-1.4")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Load All Metrics




In [None]:
import evaluate

# Load the four metrics in order; the unpacking order must match the id tuple.
bleu, ter, chrf, meteor = (
    evaluate.load(metric_id)
    for metric_id in ("sacrebleu", "ter", "chrf", "meteor")
)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def compute_metrics_mt5(eval_preds):
    """Decode predictions and labels, then score them with the loaded metrics.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id sequences.
            ``predictions`` may itself be a tuple, in which case its first
            element is used. Both may contain -100 placeholders.

    Returns:
        dict with the keys "BLEU", "TER", "chrF", "chrF++" and "METEOR".
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer_mt5.pad_token_id

    def _unmask(sequences):
        # -100 is a masking placeholder, not a vocabulary id; swap it for the
        # pad id so the tokenizer can decode (and then skip) it.
        return [
            [int(tok) if tok != -100 else pad_id for tok in seq]
            for seq in sequences
        ]

    # Predictions can carry -100 as well as labels, so both are unmasked.
    decoded_preds = [
        text.strip()
        for text in tokenizer_mt5.batch_decode(
            _unmask(preds), skip_special_tokens=True
        )
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer_mt5.batch_decode(
            _unmask(labels), skip_special_tokens=True
        )
    ]

    bleu_score = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )["score"]

    ter_score = ter.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["score"]

    chrf_score = chrf.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        word_order=0     # chrF
    )["score"]

    chrfpp_score = chrf.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        word_order=2     # chrF++
    )["score"]

    meteor_score = meteor.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["meteor"]

    return {
        "BLEU": bleu_score,
        "TER": ter_score,
        "chrF": chrf_score,
        "chrF++": chrfpp_score,
        "METEOR": meteor_score
    }


In [None]:
from transformers import Seq2SeqTrainingArguments, GenerationConfig

# A GenerationConfig built from scratch carries no special-token ids, so the
# EOS/PAD ids are taken from the tokenizer alongside the decoder start token;
# without an EOS id, generation cannot stop before max_length.
generation_config = GenerationConfig(
    max_length=128,
    num_beams=1,
    decoder_start_token_id=tokenizer_mt5.pad_token_id,
    eos_token_id=tokenizer_mt5.eos_token_id,
    pad_token_id=tokenizer_mt5.pad_token_id,
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_debug",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,

    predict_with_generate=True,          # required so evaluation generates text
    generation_max_length=128,           # same cap as the tokenization max_length
    generation_config=generation_config, # pass the GenerationConfig object
    # fp16 mixed precision is a known source of overflow/NaN with T5-family
    # models, so evaluate/train in full precision.
    fp16=False,
    report_to="none"
)

In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Rebuild the collator and wrap the same model in a trainer that reports metrics.
data_collator_mt5 = DataCollatorForSeq2Seq(tokenizer=tokenizer_mt5, model=model_mt5)

trainer_mt5 = Seq2SeqTrainer(
    args=training_args,
    model=model_mt5,
    tokenizer=tokenizer_mt5,
    data_collator=data_collator_mt5,
    train_dataset=tokenized_mt5["train"],
    eval_dataset=tokenized_mt5["test"],    # evaluate on the test split directly
    compute_metrics=compute_metrics_mt5,   # metrics are attached here
)


  trainer_mt5 = Seq2SeqTrainer(


In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Collator for the metrics-enabled trainer.
data_collator_mt5 = DataCollatorForSeq2Seq(tokenizer=tokenizer_mt5, model=model_mt5)

# Trainer whose default evaluation set is the test split and which scores
# generations with compute_metrics_mt5.
trainer_mt5 = Seq2SeqTrainer(
    model=model_mt5,
    args=training_args,
    data_collator=data_collator_mt5,
    tokenizer=tokenizer_mt5,
    compute_metrics=compute_metrics_mt5,   # metrics are attached here
    train_dataset=tokenized_mt5["train"],
    eval_dataset=tokenized_mt5["test"],    # test split directly
)

# Evaluate, passing the decoder start token through to generation.
results = trainer_mt5.evaluate(
    decoder_start_token_id=tokenizer_mt5.pad_token_id
)
print(results)

  trainer_mt5 = Seq2SeqTrainer(


{'eval_loss': nan, 'eval_model_preparation_time': 0.0031, 'eval_BLEU': 0.0, 'eval_TER': 100.0, 'eval_chrF': 0.0, 'eval_chrF++': 0.0, 'eval_METEOR': 0.0, 'eval_runtime': 142.3757, 'eval_samples_per_second': 1.475, 'eval_steps_per_second': 0.372}
