Install Libraries (Colab)

In [None]:
!pip install -U transformers datasets sentencepiece sacrebleu evaluate accelerate
!pip install torch --index-url https://download.pytorch.org/whl/cu121


Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.3.5-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading transformers-5.0.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownlo

Dataset Load & Cleaning (EXACT file)

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the Marma-Bangla parallel corpus from the uploaded spreadsheet.
raw_df = pd.read_excel("/content/Marma dataset.xlsx")

# Rename to the standard column names, drop pairs with a missing side, then
# normalise whitespace. Missing cells must be dropped BEFORE the cast to str:
# astype(str) turns NaN into the literal text "nan", which dropna() can no
# longer detect, so such rows would otherwise end up in the training data.
df = (
    raw_df
    .rename(columns={"Marma": "source", "Bangla": "target"})
    .dropna(subset=["source", "target"])
    .assign(
        source=lambda frame: frame["source"].astype(str).str.strip(),
        target=lambda frame: frame["target"].astype(str).str.strip(),
    )
)

# A cell that held only whitespace is empty after stripping; such a pair
# carries no translation signal, so it is removed as well.
df = df[(df["source"] != "") & (df["target"] != "")].reset_index(drop=True)

print(df.head())
print("Total sentences:", len(df))


                            target  \
0                আমি সকালে চা খাই।   
1             তুমি দুপুরে ভাত খাও।   
2                 সে  জল পান করছে।   
3  আমরা সন্ধ্যায় বাড়িতে ফিরে আসি।   
4            তারা রাতে ঘুমাতে যায়।   

                                              source  
0                       ငါ မနက်မှာ လက်ဖက်ရည် စားတယ်။  
1                       နင် မနေ့လည်မှာ ထမင်း စားတယ်။  
2                                  သူ ရေ သောက်နေတယ်။  
3  ကျွန်ုပ်တို့သည် ညနေခင်းတွင် အိမ်သို့ ပြန်လာကြသည်။  
4                      သူတို့ ညမှာ အိပ်ဖို့ သွားတယ်။  
Total sentences: 2099


Train / Validation / Test Split (80 / 10 / 10)

In [None]:
# Build the 80 / 10 / 10 split in two passes, both with the same fixed seed.
corpus = Dataset.from_pandas(df)

# Pass 1: hold out 10% of all rows as the test set.
outer_split = corpus.train_test_split(test_size=0.1, seed=42)

# Pass 2: carve the validation set out of the remaining 90%.
# 0.1111 of 90% is roughly 10% of the full corpus.
temp = outer_split["train"].train_test_split(test_size=0.1111, seed=42)

dataset = DatasetDict(
    {
        "train": temp["train"],
        "validation": temp["test"],
        "test": outer_split["test"],
    }
)

dataset


DatasetDict({
    train: Dataset({
        features: ['target', 'source'],
        num_rows: 1679
    })
    validation: Dataset({
        features: ['target', 'source'],
        num_rows: 210
    })
    test: Dataset({
        features: ['target', 'source'],
        num_rows: 210
    })
})

In [None]:
# Write each split to its own CSV file (paths are relative to the working directory).
# The value of the last call is what the cell displays.
dataset["train"].to_csv("train.csv")
dataset["validation"].to_csv("valid.csv")
dataset["test"].to_csv("test.csv")

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

33197

MODEL–1: NLLB

NLLB Tokenizer

In [None]:
from transformers import AutoTokenizer

model_name = "facebook/nllb-200-distilled-600M"

# NLLB-200 identifies languages by FLORES-200 codes (language_Script), not by
# mBART-style locale codes such as "my_MM" / "bn_IN". A code that is not in the
# vocabulary resolves to the unknown token, so the model would never see a
# valid language tag.
# Marma has no code of its own in NLLB-200; the corpus rows shown above are in
# Burmese script, so Burmese ("mya_Mymr") is used as the source tag.
# NOTE(review): confirm this is the intended proxy language.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="mya_Mymr",
    tgt_lang="ben_Beng",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

NLLB Preprocessing

In [None]:
max_len = 128

def preprocess_nllb(batch):
    """Tokenize one batch of sentence pairs for seq2seq fine-tuning.

    Args:
        batch: mapping with a "source" list (Marma sentences) and a "target"
            list (Bangla sentences) of equal length.

    Returns:
        The tokenizer output for the batch; with `text_target` supplied it
        carries the encoded targets under the "labels" key next to the
        encoded source inputs.

    The targets are passed through `text_target` so that the tokenizer encodes
    them in target-language mode in the same call. Language tags come from the
    tokenizer's own `src_lang` / `tgt_lang` settings rather than from per-call
    keywords.

    NOTE(review): labels are still padded to `max_len` with the pad token id,
    so padded positions take part in the loss. Masking them with -100 needs
    every compute_metrics function in this notebook to map -100 back to the
    pad id before decoding.
    """
    return tokenizer(
        batch["source"],
        text_target=batch["target"],
        max_length=max_len,
        truncation=True,
        padding="max_length",
    )

# Apply preprocess_nllb to every split of the DatasetDict in batched mode.
tokenized_nllb = dataset.map(preprocess_nllb, batched=True)

Map:   0%|          | 0/1679 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

NLLB Model Load

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq checkpoint identified by model_name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)




pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/512 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
import evaluate

# sacreBLEU metric object used by compute_metrics_nllb below.
bleu = evaluate.load("sacrebleu")

def compute_metrics_nllb(eval_preds):
    """Decode generated and reference ids and score them with sacreBLEU.

    Args:
        eval_preds: a (predictions, labels) pair of token-id sequences as
            handed over by the trainer. Predictions may also arrive as a
            tuple whose first element holds the sequences.

    Returns:
        dict with a single "BLEU" entry holding the corpus-level score.
    """
    preds, labels = eval_preds
    # Only the generated sequences are decoded if extra outputs come along.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(sequences):
        # -100 is the ignore marker used for masked positions; it is not a
        # vocabulary id, so it is swapped for the pad id, which decoding then
        # drops as a special token. Predictions get the same treatment as
        # labels because gathered batches can be padded with -100 as well.
        return [
            [(tok if tok != -100 else pad_id) for tok in seq]
            for seq in sequences
        ]

    decoded_preds = tokenizer.batch_decode(
        _restore_padding(preds), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _restore_padding(labels), skip_special_tokens=True
    )

    bleu_result = bleu.compute(
        predictions=decoded_preds,
        # sacreBLEU expects a list of references per prediction.
        references=[[ref] for ref in decoded_labels],
    )
    return {"BLEU": bleu_result["score"]}


Downloading builder script: 0.00B [00:00, ?B/s]

Training Setup

In [None]:
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    IntervalStrategy
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb_marma_bn",
    # The keyword is `eval_strategy` in current transformers releases; the old
    # `evaluation_strategy` spelling is what raised the TypeError. Without a
    # strategy the `eval_steps` setting below is never used.
    eval_strategy=IntervalStrategy.STEPS,
    per_device_train_batch_size=4,  # small batch to fit GPU memory
    gradient_accumulation_steps=2,  # effective train batch size of 8 per device
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    num_train_epochs=10,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=200,
    save_total_limit=2,
    fp16=True,
    # Score evaluation batches on generated text instead of teacher-forced logits.
    predict_with_generate=True,
    report_to="none"
)

# Batches examples and hands the model to the collator for seq2seq label handling.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

trainer_nllb = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_nllb["train"],
    # NOTE(review): the held-out test split is used for periodic evaluation
    # while the "validation" split goes unused. Consider monitoring on
    # tokenized_nllb["validation"] and passing the test split explicitly to
    # every evaluate() call.
    eval_dataset=tokenized_nllb["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_nllb
)



In [None]:
# Run fine-tuning with the arguments and datasets configured on trainer_nllb.
trainer_nllb.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 10312 has 14.74 GiB memory in use. Of the allocated memory 14.53 GiB is allocated by PyTorch, and 76.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# NLLB language tokens are the bare FLORES-200 codes ("ben_Beng"). A spelling
# wrapped in double underscores is not in the vocabulary, so looking it up
# yields the unknown-token id and generation would not be steered to Bangla.
bangla_token_id = tokenizer.convert_tokens_to_ids("ben_Beng")
if bangla_token_id == tokenizer.unk_token_id:
    raise ValueError("Language token 'ben_Beng' is not in the tokenizer vocabulary")

model.config.forced_bos_token_id = bangla_token_id
# generate() takes its defaults from the generation config, so set it there too.
model.generation_config.forced_bos_token_id = bangla_token_id

In [None]:
# Sanity check: show the token id that generation is forced to start with.
print("forced_bos_token_id =", model.config.forced_bos_token_id)


In [None]:
# Debug aid: show which metric function is currently attached to the trainer.
print(trainer_nllb.compute_metrics)

In [None]:
# Evaluate on the held-out test split with the trainer's current metric function.
results = trainer_nllb.evaluate(tokenized_nllb["test"])
print(results)

Evaluation (BLEU + TER)

In [None]:
import evaluate

# Metric objects used by compute_metrics below: sacreBLEU and TER.
bleu = evaluate.load("sacrebleu")
ter = evaluate.load("ter")

def compute_metrics(eval_preds):
    """Decode generated and reference ids and report BLEU and TER.

    Args:
        eval_preds: a (predictions, labels) pair of token-id sequences as
            handed over by the trainer. Predictions may also arrive as a
            tuple whose first element holds the sequences.

    Returns:
        dict with "bleu" and "ter" corpus-level scores.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _decode(sequences):
        # -100 marks ignored positions and is not a vocabulary id; map it to
        # the pad id first (as the other metric functions in this notebook
        # do) so decoding cannot fail on it.
        cleaned = [
            [(tok if tok != -100 else pad_id) for tok in seq]
            for seq in sequences
        ]
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    decoded_preds = _decode(preds)
    decoded_labels = _decode(labels)

    bleu_score = bleu.compute(
        predictions=decoded_preds,
        # sacreBLEU expects a list of references per prediction.
        references=[[ref] for ref in decoded_labels]
    )

    ter_score = ter.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    return {
        "bleu": bleu_score["score"],
        "ter": ter_score["score"]
    }

# Swap the trainer's metric function for the BLEU + TER version and re-score the test split.
trainer_nllb.compute_metrics = compute_metrics
trainer_nllb.evaluate(tokenized_nllb["test"])

METEOR

In [None]:
# nltk supplies the WordNet data downloaded in the next cell for the METEOR metric.
!pip install nltk


In [None]:
import evaluate
import nltk
# Fetch the WordNet corpora used for METEOR.
# NOTE(review): WordNet synonym matching presumably only helps English text;
# for Bangla output METEOR likely reduces to exact matching — confirm before
# reporting it.
nltk.download("wordnet")
nltk.download("omw-1.4")


Inference

In [None]:
text = "ငါ "

# The model may still be in training mode after fine-tuning; switch dropout
# off so that generation is deterministic.
model.eval()

inputs = tokenizer(text, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    # Force the first generated token to the Bangla language tag so the output
    # language does not depend on what an earlier cell stored in the config.
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("ben_Beng"),
    max_length=100
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Load All Metrics

In [None]:
import evaluate

# Metric objects used by the unified compute_metrics_nllb below.
bleu = evaluate.load("sacrebleu")
ter = evaluate.load("ter")
chrf = evaluate.load("chrf")  # computes chrF and, with word_order=2, chrF++
meteor = evaluate.load("meteor")


Unified compute_metrics Function

In [None]:
def compute_metrics_nllb(eval_preds):
    """Decode generated and reference ids and report all translation metrics.

    Args:
        eval_preds: a (predictions, labels) pair of token-id sequences as
            handed over by the trainer. Predictions may also arrive as a
            tuple whose first element holds the sequences.

    Returns:
        dict with "BLEU", "TER", "chrF", "chrF++" and "METEOR" scores.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _decode(sequences):
        # -100 marks ignored positions and is not a vocabulary id; replace it
        # with the pad id, which decoding then drops as a special token.
        # Applied to predictions as well as labels, since gathered batches can
        # be padded with -100 too.
        cleaned = [
            [(tok if tok != -100 else pad_id) for tok in seq]
            for seq in sequences
        ]
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    decoded_preds = _decode(preds)
    decoded_labels = _decode(labels)

    # BLEU (sacreBLEU expects a list of references per prediction)
    bleu_score = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )["score"]

    # TER
    ter_score = ter.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["score"]

    # chrF: character n-grams only
    chrf_score = chrf.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        word_order=0
    )["score"]

    # chrF++: character n-grams plus word n-grams up to order 2
    chrfpp_score = chrf.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        word_order=2
    )["score"]

    # METEOR
    meteor_score = meteor.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["meteor"]

    return {
        "BLEU": bleu_score,
        "TER": ter_score,
        "chrF": chrf_score,
        "chrF++": chrfpp_score,
        "METEOR": meteor_score
    }



In [None]:
# Attach the unified metric function and evaluate.
# evaluate() is called without an argument, so it scores the eval_dataset the
# trainer was constructed with.
trainer_nllb.compute_metrics = compute_metrics_nllb
results = trainer_nllb.evaluate()
print(results)