Libraries

In [None]:
pip install torch transformers sentencepiece pandas numpy tqdm sacremoses


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


STEP 2: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from tqdm import tqdm

STEP 3: Load Excel Dataset

In [None]:
df = pd.read_excel("Marma dataset.xlsx")

df = df[["Marma", "Bangla"]]
df = df.dropna().reset_index(drop=True)

print(df.head())
print("Total samples:", len(df))


                                               Marma  \
0                       ငါ မနက်မှာ လက်ဖက်ရည် စားတယ်။   
1                       နင် မနေ့လည်မှာ ထမင်း စားတယ်။   
2                                  သူ ရေ သောက်နေတယ်။   
3  ကျွန်ုပ်တို့သည် ညနေခင်းတွင် အိမ်သို့ ပြန်လာကြသည်။   
4                      သူတို့ ညမှာ အိပ်ဖို့ သွားတယ်။   

                             Bangla  
0                 আমি সকালে চা খাই।  
1              তুমি দুপুরে ভাত খাও।  
2                  সে  জল পান করছে।  
3   আমরা সন্ধ্যায় বাড়িতে ফিরে আসি।  
4             তারা রাতে ঘুমাতে যায়।  
Total samples: 2099


STEP 3.1: Dataset Split

In [None]:
from datasets import Dataset

hf_dataset = Dataset.from_pandas(df)
dataset = hf_dataset.train_test_split(test_size=0.2)
temp = dataset["test"].train_test_split(test_size=0.5)

train_dataset = dataset["train"]
valid_dataset = temp["train"]
test_dataset = temp["test"]


In [None]:
train_dataset.to_csv("train.csv")
valid_dataset.to_csv("valid.csv")
test_dataset.to_csv("test.csv")


Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

34302

STEP 4: Text Normalization

In [None]:
import re
import pandas as pd

# ================================
# Text Normalization Function
# ================================
def normalize_text(text):
    """
    This function cleans and normalizes text by:
    - Converting to string
    - Removing extra whitespace
    - Fixing punctuation spacing
    - Removing unnecessary symbols
    """
    text = str(text)

    # Remove extra spaces, tabs, newlines
    text = re.sub(r"\s+", " ", text).strip()

    # Normalize Bangla punctuation
    text = text.replace(" ।", "।")
    text = text.replace(" ,", ",")
    text = text.replace(" ?", "?")
    text = text.replace(" !", "!")

    # Remove unwanted characters (keep language + punctuation)
    text = re.sub(r"[^\u0980-\u09FF\u1000-\u109F.,?!। ]", "", text)

    return text


# ================================
# Apply Preprocessing
# ================================
df["Marma"] = df["Marma"].apply(normalize_text)
df["Bangla"] = df["Bangla"].apply(normalize_text)

# ================================
# Remove empty rows
# ================================
df = df.dropna()
df = df[(df["Marma"] != "") & (df["Bangla"] != "")]

# ================================
# Remove duplicate sentence pairs
# ================================
df = df.drop_duplicates()

# ================================
# Length Filtering (important for MT)
# ================================
MAX_TOKENS = 100

df = df[df["Marma"].str.split().str.len() <= MAX_TOKENS]
df = df[df["Bangla"].str.split().str.len() <= MAX_TOKENS]

# ================================
# Reset index after cleaning
# ================================
df = df.reset_index(drop=True)

print("Final dataset size after preprocessing:", len(df))



Final dataset size after preprocessing: 2061


STEP 5: Load mBART Tokenizer

In [None]:
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

SRC_LANG = "my_MM"   # Marma placeholder
TGT_LANG = "bn_IN"   # Bangla


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

STEP 6: Tokenization Function

In [None]:
MAX_LEN = 128

def tokenize_data(src_text, tgt_text):
    tokenizer.src_lang = SRC_LANG

    model_inputs = tokenizer(
        src_text,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    tokenizer.tgt_lang = TGT_LANG
    labels = tokenizer(
        tgt_text,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    model_inputs["labels"] = labels["input_ids"]

    return {k: v.squeeze() for k, v in model_inputs.items()}


STEP 7: Dataset Class

In [None]:
class MarmaBanglaDataset(Dataset):
    def __init__(self, data_hf):
        self.data_hf = data_hf

    def __len__(self):
        return len(self.data_hf)

    def __getitem__(self, idx):
        item = self.data_hf[idx]
        src = item["Marma"]
        tgt = item["Bangla"]
        return tokenize_data(src, tgt)

STEP 8: DataLoader

In [None]:
BATCH_SIZE = 4   # GPU ছোট হলে 2 করো

train_colab_dataset = MarmaBanglaDataset(train_dataset)
dataloader = DataLoader(train_colab_dataset, batch_size=BATCH_SIZE, shuffle=True)

STEP 9: Load mBART Model

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to(device)

print("Using device:", device)


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Using device: cuda


STEP 10: Optimizer

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)


STEP 11: Training Loop

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

Epoch 1: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.22]


Epoch 1 Average Loss: 1.3875


Epoch 2: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.13]


Epoch 2 Average Loss: 0.1499


Epoch 3: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.0476]


Epoch 3 Average Loss: 0.0965


Epoch 4: 100%|██████████| 420/420 [05:35<00:00,  1.25it/s, loss=0.1]


Epoch 4 Average Loss: 0.0637


Epoch 5: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.0374]


Epoch 5 Average Loss: 0.0433


Epoch 6: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.00328]


Epoch 6 Average Loss: 0.0317


Epoch 7: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.0173]


Epoch 7 Average Loss: 0.0241


Epoch 8: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.00693]


Epoch 8 Average Loss: 0.0195


Epoch 9: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.00545]


Epoch 9 Average Loss: 0.0152


Epoch 10: 100%|██████████| 420/420 [05:36<00:00,  1.25it/s, loss=0.00408]

Epoch 10 Average Loss: 0.0118





In [None]:
generated_tokens = model.generate(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    max_length=128
)


In [None]:
preds = tokenizer.batch_decode(
    generated_tokens, skip_special_tokens=True
)

refs = tokenizer.batch_decode(
    batch["labels"], skip_special_tokens=True
)
refs = [[r] for r in refs]  # BLEU format


In [None]:
print("TRAIN sample:", train_dataset[0]["Marma"])
print("TEST sample :", test_dataset[0]["Marma"])


TRAIN sample: ဘယ်ဟာ ပိုပြီး ဈေးကြီးလဲ။
TEST sample : ၄၄၅) နင့်မာ အကြိုက်ဆုံး လက်ဖက် နည်းလမ်း ဇာလေး။


STEP 12: Save Model

In [None]:
model.save_pretrained("marma_bn_mbart_model")
tokenizer.save_pretrained("marma_bn_mbart_model")




('marma_bn_mbart_model/tokenizer_config.json',
 'marma_bn_mbart_model/special_tokens_map.json',
 'marma_bn_mbart_model/sentencepiece.bpe.model',
 'marma_bn_mbart_model/added_tokens.json',
 'marma_bn_mbart_model/tokenizer.json')

STEP 13: Inference (Translation)

In [None]:
def translate_marma_to_bangla(sentence):
    model.eval()
    tokenizer.src_lang = SRC_LANG

    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True
    ).to(device)

    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[TGT_LANG],
        max_length=128
    )

    return tokenizer.decode(
        generated_tokens[0],
        skip_special_tokens=True
    )


Test

In [None]:
print(translate_marma_to_bangla("ငါ နောက် အခါလေမာ စာအုပ် ကလပ် အစည်းအဝေး တစ်ခုကို သွားမယ်။ "))


আমি পরের সপ্তাহে একটি বই ক্লাবে যাচ্ছি।


MT Evaluation Metrics

STEP 1: Install Required Libraries

In [None]:
pip install sacrebleu nltk evaluate




In [None]:
import sacrebleu

# The variables test_preds and test_refs are not yet defined.
# They will be generated by the 'generate_predictions_for_dataset' function.
# Please execute the subsequent cells to define this function and run the full evaluation.
# Leaving this cell as is, will result in a NameError until test_preds and test_refs are defined.

In [None]:
def generate_predictions_for_dataset(model, tokenizer, dataset, device, src_lang, tgt_lang, max_len):
    model.eval()
    predictions = []
    references = []

    # Create a DataLoader for the dataset
    eval_dataloader = DataLoader(MarmaBanglaDataset(dataset), batch_size=BATCH_SIZE, shuffle=False)

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Generating predictions"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate tokens
            tokenizer.src_lang = src_lang
            generated_tokens = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
                max_length=max_len
            )

            # Decode predictions
            preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            predictions.extend(preds)

            # Decode references (labels)
            # sacrebleu expects references as a list of lists (each inner list contains one reference translation)
            refs = tokenizer.batch_decode(
                labels, skip_special_tokens=True
            )
            references.extend([[r] for r in refs])
    return predictions, references

In [None]:
sample_sentence = "မင်္ဂလာနံနက်စောစော အိပ်ယာထပြီးပါပြီ။" # Example sentence, replace with your desired input

tokenizer.src_lang = SRC_LANG
inputs = tokenizer(
    sample_sentence,
    return_tensors="pt",
    padding=True
).to(device)

generated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id[TGT_LANG],
    max_length=128,
    num_beams=5,
    length_penalty=1.0,
    early_stopping=True
)

In [None]:
import evaluate

# Generate predictions
test_preds, test_refs = generate_predictions_for_dataset(
    model, tokenizer, test_dataset, device, SRC_LANG, TGT_LANG, MAX_LEN
)

# BLEU (Bangla-safe)
bleu = sacrebleu.corpus_bleu(
    test_preds,
    test_refs,
    tokenize="intl"
)
print("Test BLEU:", bleu.score)

# TER
ter = sacrebleu.corpus_ter(test_preds, test_refs)
print("Test TER:", ter.score)

# chrF++
chrf_pp = sacrebleu.corpus_chrf(
    test_preds,
    test_refs,
    word_order=2
)
print("Test chrF++:", chrf_pp.score)

# METEOR
meteor = evaluate.load("meteor")
test_refs_meteor = [ref[0] for ref in test_refs]

meteor_score = meteor.compute(
    predictions=test_preds,
    references=test_refs_meteor
)
print("Test METEOR:", meteor_score["meteor"])

Generating predictions: 100%|██████████| 53/53 [00:21<00:00,  2.51it/s]


Test BLEU: 37.99178428257963
Test TER: 22.411953041622198
Test chrF++: 79.16485514705697


Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Test METEOR: 0.5301894582333108
