Install libraries


In [None]:
!pip install pandas openpyxl torch transformers datasets evaluate sacrebleu sentencepiece accelerate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu, evaluate
Successfully installed colorama-0.4.6 evaluate-0.4.6 portalocker-3.2.0 sacreble

Load + clean script

In [None]:
import re
import unicodedata
import numpy as np
import pandas as pd


# ----------------------------
# 1) CONFIG
# ----------------------------
INPUT_XLSX = "Marma dataset.xlsx"   # <-- change path if needed
SHEET_NAME = 0                     # 0 means first sheet; or use "Sheet1"

SRC_COL = "Bangla"
TGT_COL = "Marma"

OUT_CLEAN_CSV = "marma_bn_mr_clean.csv"

# Optional filters
MIN_CHARS = 1
MAX_CHARS = 300

# Optional split ratios (set SPLIT=False if you don't need splits)
SPLIT = True
TRAIN_FRAC = 0.90
VALID_FRAC = 0.05
TEST_FRAC  = 0.05
RANDOM_SEED = 42


# ----------------------------
# 2) CLEANING HELPERS
# ----------------------------
# common bullet characters seen in your Bangla column
BULLET_RE = re.compile(r"^\s*[·•●◦▪▫\-–—*]+\s*")

# Digit sets (ASCII + Bengali + Myanmar digits)
MYANMAR_DIGITS = "၀၁၂၃၄၅၆၇၈၉"
BENGALI_DIGITS = "০১২৩৪৫৬৭৮৯"
ASCII_DIGITS = "0123456789"

# Leading number pattern examples it removes:
# "৫၈. "  "၁၅၉. "  "၃၀၇ "  "290 "  etc.
LEADING_NUM_RE = re.compile(
    rf"^\s*([{ASCII_DIGITS}{BENGALI_DIGITS}{MYANMAR_DIGITS}]{{1,5}})\s*(?:[\.၊။:)\]]\s*|\s+)"
)

def normalize_text(x: object) -> str:
    """Unicode normalize + remove weird spaces + collapse whitespace."""
    if x is None:
        return ""
    if isinstance(x, float) and np.isnan(x):
        return ""

    s = str(x)

    # Normalize unicode (helps standardize some combined characters)
    s = unicodedata.normalize("NFKC", s)

    # Replace non-breaking space, zero-width space, BOM
    s = s.replace("\u00A0", " ").replace("\u200B", "").replace("\uFEFF", "")

    # Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

def clean_sentence(x: object,
                   strip_bullets: bool = True,
                   strip_leading_numbers: bool = True,
                   collapse_punct: bool = True) -> str:
    """Clean one sentence string."""
    s = normalize_text(x)

    # remove starting bullets like "· "
    if strip_bullets:
        s = BULLET_RE.sub("", s)

    # remove starting row-number-like prefixes: "၅၈. " or "၃၀၇ "
    if strip_leading_numbers:
        s = LEADING_NUM_RE.sub("", s)

    # Optional: collapse repeated punctuation at end (your data has '।।' sometimes)
    if collapse_punct:
        s = re.sub(r"[।]{2,}", "।", s)  # collapse repeated danda
        s = re.sub(r"[!]{2,}", "!", s)
        s = re.sub(r"[?]{2,}", "?", s)

    return s.strip()


# ----------------------------
# 3) LOAD DATA
# ----------------------------
df = pd.read_excel(INPUT_XLSX, sheet_name=SHEET_NAME, engine="openpyxl")

# basic validation
missing = [c for c in [SRC_COL, TGT_COL] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}. Found columns: {list(df.columns)}")

df = df[[SRC_COL, TGT_COL]].rename(columns={SRC_COL: "src", TGT_COL: "tgt"})


# ----------------------------
# 4) CLEAN DATA
# ----------------------------
df["src"] = df["src"].map(clean_sentence)
df["tgt"] = df["tgt"].map(clean_sentence)

# Drop empty rows
df = df[(df["src"] != "") & (df["tgt"] != "")].copy()

# Optional length filtering (characters)
df["src_len"] = df["src"].str.len()
df["tgt_len"] = df["tgt"].str.len()
df = df[
    (df["src_len"].between(MIN_CHARS, MAX_CHARS)) &
    (df["tgt_len"].between(MIN_CHARS, MAX_CHARS))
].copy()

# Remove duplicates (exact duplicate pairs)
before = len(df)
df = df.drop_duplicates(subset=["src", "tgt"]).reset_index(drop=True)
after = len(df)

# Drop helper cols if you want a clean 2-col file
df_clean = df[["src", "tgt"]].copy()


# ----------------------------
# 5) SAVE CLEAN CSV
# ----------------------------
df_clean.to_csv(OUT_CLEAN_CSV, index=False, encoding="utf-8")
print(f"Saved: {OUT_CLEAN_CSV}")
print(f"Rows before dedup: {before}, after dedup: {after}")
print("Sample rows:")
print(df_clean.sample(5, random_state=RANDOM_SEED))


# ----------------------------
# 6) OPTIONAL: TRAIN/VALID/TEST SPLIT
# ----------------------------
if SPLIT:
    if not np.isclose(TRAIN_FRAC + VALID_FRAC + TEST_FRAC, 1.0):
        raise ValueError("TRAIN_FRAC + VALID_FRAC + TEST_FRAC must equal 1.0")

    df_shuf = df_clean.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)
    n = len(df_shuf)
    n_train = int(n * TRAIN_FRAC)
    n_valid = int(n * VALID_FRAC)

    train_df = df_shuf.iloc[:n_train]
    valid_df = df_shuf.iloc[n_train:n_train + n_valid]
    test_df  = df_shuf.iloc[n_train + n_valid:]

    train_df.to_csv("train.csv", index=False, encoding="utf-8")
    valid_df.to_csv("valid.csv", index=False, encoding="utf-8")
    test_df.to_csv("test.csv", index=False, encoding="utf-8")

    print(f"Split saved: train.csv ({len(train_df)}), valid.csv ({len(valid_df)}), test.csv ({len(test_df)})")


Saved: marma_bn_mr_clean.csv
Rows before dedup: 2099, after dedup: 2058
Sample rows:
                                src  \
1298  আমার একটি নতুন ফোন কিনতে হবে।   
591                   তুমি চা খাবে?   
1318    আমার কিছু কেনাকাটা করতে হবে   
1067     আমার একটি নতুন জুতা দরকার।   
29    তারা লাঞ্চ বিরতিতে বেরিয়েছে।   

                                                tgt  
1298                ငမာ ဖုန်း အသစ် တစ်လုံး ဝယ်ရမယ်။  
591                        နင် လက်ဖက်ရည် သောက်မလား။  
1318                       ငမာ တကျေ့ ဝယ်ရဖို့ ဟိရေ॥  
1067                       ငါ့မှာ ဖိနပ်အသစ် လိုတယ်။  
29    ယက်သူရို မွန်တည့်စားပြီး အနားမာ အပြင်ထွက်တယ်။  
Split saved: train.csv (1852), valid.csv (102), test.csv (104)


Sanity-check script

In [None]:
import pandas as pd
import re

# =========================
# CONFIG
# =========================
DATA_FILE = "marma_bn_mr_clean.csv"   # or train.csv
SRC_COL = "src"
TGT_COL = "tgt"
MAX_LEN_RATIO = 3.0   # src/tgt length ratio threshold
SAMPLE_N = 10


# =========================
# LOAD
# =========================
df = pd.read_csv(DATA_FILE)

print("=" * 60)
print("DATASET SANITY CHECK")
print("=" * 60)

# =========================
# BASIC CHECKS
# =========================
print("\n[1] BASIC INFO")
print(f"Total rows: {len(df)}")
print("Columns:", list(df.columns))

assert SRC_COL in df.columns, f"Missing column: {SRC_COL}"
assert TGT_COL in df.columns, f"Missing column: {TGT_COL}"


# =========================
# EMPTY / NULL CHECK
# =========================
print("\n[2] EMPTY / NULL CHECK")
null_src = df[SRC_COL].isna().sum()
null_tgt = df[TGT_COL].isna().sum()
empty_src = (df[SRC_COL].str.strip() == "").sum()
empty_tgt = (df[TGT_COL].str.strip() == "").sum()

print(f"Null src: {null_src}")
print(f"Null tgt: {null_tgt}")
print(f"Empty src: {empty_src}")
print(f"Empty tgt: {empty_tgt}")


# =========================
# DUPLICATES
# =========================
print("\n[3] DUPLICATE CHECK")
dup_pairs = df.duplicated(subset=[SRC_COL, TGT_COL]).sum()
dup_src = df[SRC_COL].duplicated().sum()

print(f"Duplicate (src,tgt) pairs: {dup_pairs}")
print(f"Duplicate src only: {dup_src}")


# =========================
# LENGTH STATS
# =========================
print("\n[4] LENGTH STATISTICS")
df["src_len"] = df[SRC_COL].str.len()
df["tgt_len"] = df[TGT_COL].str.len()

print("SRC length:")
print(df["src_len"].describe())

print("\nTGT length:")
print(df["tgt_len"].describe())


# =========================
# LENGTH RATIO (ALIGNMENT)
# =========================
print("\n[5] LENGTH RATIO CHECK")
df["len_ratio"] = df["src_len"] / (df["tgt_len"] + 1e-6)

bad_ratio = df[
    (df["len_ratio"] > MAX_LEN_RATIO) |
    (df["len_ratio"] < 1 / MAX_LEN_RATIO)
]

print(f"Highly misaligned pairs (ratio > {MAX_LEN_RATIO}): {len(bad_ratio)}")

if len(bad_ratio) > 0:
    print("\nExample misaligned rows:")
    print(bad_ratio[[SRC_COL, TGT_COL, "len_ratio"]].head(5))


# =========================
# WEIRD CHARACTER CHECK
# =========================
print("\n[6] WEIRD CHARACTER CHECK")

def find_weird_chars(text):
    # catches leftover bullets, control chars, boxes, etc.
    return bool(re.search(r"[·•●◦▪▫\uFFFD]", text))

weird_src = df[SRC_COL].apply(find_weird_chars).sum()
weird_tgt = df[TGT_COL].apply(find_weird_chars).sum()

print(f"Weird chars in src: {weird_src}")
print(f"Weird chars in tgt: {weird_tgt}")


# =========================
# RANDOM SAMPLES (HUMAN CHECK)
# =========================
print("\n[7] RANDOM SAMPLE CHECK")
samples = df.sample(min(SAMPLE_N, len(df)), random_state=42)

for i, row in samples.iterrows():
    print("-" * 40)
    print("SRC:", row[SRC_COL])
    print("TGT:", row[TGT_COL])


# =========================
# FINAL VERDICT
# =========================
print("\n[8] FINAL VERDICT")
if (
    null_src == 0 and null_tgt == 0 and
    empty_src == 0 and empty_tgt == 0 and
    dup_pairs == 0
):
    print("✅ Dataset looks CLEAN and READY for training.")
else:
    print("⚠️ Issues detected. Fix before training.")


DATASET SANITY CHECK

[1] BASIC INFO
Total rows: 2058
Columns: ['src', 'tgt']

[2] EMPTY / NULL CHECK
Null src: 0
Null tgt: 0
Empty src: 0
Empty tgt: 0

[3] DUPLICATE CHECK
Duplicate (src,tgt) pairs: 0
Duplicate src only: 278

[4] LENGTH STATISTICS
SRC length:
count    2058.000000
mean       24.312925
std        11.259749
min         4.000000
25%        17.000000
50%        21.000000
75%        28.000000
max        83.000000
Name: src_len, dtype: float64

TGT length:
count    2058.000000
mean       30.393586
std        13.089483
min         6.000000
25%        21.000000
50%        27.000000
75%        36.000000
max        96.000000
Name: tgt_len, dtype: float64

[5] LENGTH RATIO CHECK
Highly misaligned pairs (ratio > 3.0): 1

Example misaligned rows:
                 src                                                tgt  \
788  আমরা ধ্যান করি।  ကျွန်တော်တို့ ဗုဒ္ဓဘာသာ ငြိမ်းချမ်း စိတ်ထိန်းတယ်။   

     len_ratio  
788   0.306122  

[6] WEIRD CHARACTER CHECK
Weird chars in src: 0
Weird

In [None]:
from google.colab import userdata
userdata.get('machinetranslation11')

'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'

Tokenization code

In [None]:
from datasets import load_dataset
from transformers import MBart50TokenizerFast
import numpy as np

# =========================
# CONFIG
# =========================
DATA_FILES = {
    "train": "train.csv",
    "validation": "valid.csv",
    "test": "test.csv",
}

SRC_COL = "src"
TGT_COL = "tgt"

SRC_LANG = "bn_IN"   # Bangla
TGT_LANG = "my_MM"   # Marma (use closest supported code; Marma is not native in mBART)

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

MAX_SRC_LEN = 256
MAX_TGT_LEN = 256


# =========================
# LOAD DATASET
# =========================
dataset = load_dataset("csv", data_files=DATA_FILES)

print(dataset)


# =========================
# LOAD TOKENIZER
# =========================
tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang=SRC_LANG,
    tgt_lang=TGT_LANG,
)

print("\nTokenizer loaded")
print("Pad token id:", tokenizer.pad_token_id)
print("SRC lang token id:", tokenizer.lang_code_to_id[SRC_LANG])
print("TGT lang token id:", tokenizer.lang_code_to_id[TGT_LANG])


# =========================
# TOKENIZATION FUNCTION
# =========================
def tokenize_batch(batch):
    # 1) Tokenize source sentences
    model_inputs = tokenizer(
        batch[SRC_COL],
        max_length=MAX_SRC_LEN,
        truncation=True,
    )

    # 2) Tokenize target sentences
    labels = tokenizer(
        text_target=batch[TGT_COL],
        max_length=MAX_TGT_LEN,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# =========================
# APPLY TOKENIZATION
# =========================
tokenized_dataset = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing dataset",
)

print("\nTokenization complete")
print(tokenized_dataset)


# =========================
# SANITY CHECK TOKENS
# =========================
sample = tokenized_dataset["train"][0]

print("\n--- TOKENIZATION SANITY CHECK ---")
print("Input IDs (src):", sample["input_ids"][:20])
print("Label IDs (tgt):", sample["labels"][:20])

print("\nDecoded src:")
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))

print("\nDecoded tgt:")
print(tokenizer.decode(
    [i for i in sample["labels"] if i != -100],
    skip_special_tokens=False
))


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 1852
    })
    validation: Dataset({
        features: ['src', 'tgt'],
        num_rows: 102
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 104
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]


Tokenizer loaded
Pad token id: 1
SRC lang token id: 250028
TGT lang token id: 250017


Tokenizing dataset:   0%|          | 0/1852 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/102 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/104 [00:00<?, ? examples/s]


Tokenization complete
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1852
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 104
    })
})

--- TOKENIZATION SANITY CHECK ---
Input IDs (src): [250028, 29388, 14536, 26557, 59354, 6, 208995, 3495, 9591, 125, 2]
Label IDs (tgt): [250017, 110408, 27501, 6, 110244, 6, 167708, 53069, 26504, 6, 134292, 2742, 185958, 2]

Decoded src:
bn_IN আমার একটি নতুন ফোন কিনতে হবে।</s>

Decoded tgt:
my_MM ငမာ ဖုန်း အသစ် တစ်လုံး ဝယ်ရမယ်။</s>


Tokenize + SAVE

In [None]:
from datasets import load_dataset
from transformers import MBart50TokenizerFast

MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
SRC_LANG = "bn_IN"
TGT_LANG = "my_MM"

DATA_FILES = {
    "train": "train.csv",
    "validation": "valid.csv",
    "test": "test.csv",
}

tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang=SRC_LANG,
    tgt_lang=TGT_LANG,
)

dataset = load_dataset("csv", data_files=DATA_FILES)

def tokenize(batch):
    inputs = tokenizer(
        batch["src"],
        truncation=True,
        max_length=256,
    )
    labels = tokenizer(
        text_target=batch["tgt"],
        truncation=True,
        max_length=256,
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# 🔥 SAVE TO DISK (THIS IS THE FIX)
tokenized_dataset.save_to_disk("tokenized_dataset")

print("✅ Tokenized dataset saved to disk")


Map:   0%|          | 0/1852 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1852 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/102 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/104 [00:00<?, ? examples/s]

✅ Tokenized dataset saved to disk


Fine-tuning code

In [None]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import torch # Import torch for cuda.empty_cache

# =====================
# CONFIG
# =====================
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

SRC_LANG = "bn_IN"
TGT_LANG = "my_MM"

DATA_FILES = {
    "train": "train.csv",
    "validation": "valid.csv",
    "test": "test.csv",
}

OUTPUT_DIR = "outputs/mbart-bn-marma"

MAX_LEN = 256
BATCH_SIZE = 1  # Reduced batch size
GRAD_ACCUM = 16 # Increased gradient accumulation steps
EPOCHS = 15
LEARNING_RATE = 3e-5

USE_FP16 = True   # set True ONLY if GPU is enabled


# =====================
# LOAD DATASET
# =====================
dataset = load_dataset("csv", data_files=DATA_FILES)
print(dataset)


# =====================
# TOKENIZER
# =====================
tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang=SRC_LANG,
    tgt_lang=TGT_LANG,
)


# =====================
# TOKENIZATION
# =====================
def tokenize(batch):
    inputs = tokenizer(
        batch["src"],
        truncation=True,
        max_length=MAX_LEN,
    )
    targets = tokenizer(
        text_target=batch["tgt"],
        truncation=True,
        max_length=MAX_LEN,
    )
    inputs["labels"] = targets["input_ids"]
    return inputs


tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing dataset",
)


# =====================
# MODEL
# =====================
# Clear CUDA cache before loading model
if torch.cuda.is_available():
    torch.cuda.empty_cache()

model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)

model.config.forced_bos_token_id = tokenizer.lang_code_to_id[TGT_LANG]
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[TGT_LANG]


# =====================
# METRICS (BLEU)
# =====================
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [[l.strip()] for l in decoded_labels]

    return {
        "bleu": bleu.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )["score"]
    }


# =====================
# TRAINING ARGUMENTS
# =====================
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,

    eval_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=200,

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,

    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,

    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=5,

    fp16=USE_FP16,
    dataloader_pin_memory=False,

    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",

    report_to="none",
)


# =====================
# TRAINER
# =====================
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer, # Changed from 'tokenizer=tokenizer' to 'processing_class=tokenizer'
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
    compute_metrics=compute_metrics,
)


# =====================
# TRAIN
# =====================
trainer.train()


# =====================
# FINAL EVALUATION
# =====================
if "test" in tokenized_dataset:
    print(trainer.evaluate(tokenized_dataset["test"]))


# =====================
# SAVE MODEL
# =====================
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 1852
    })
    validation: Dataset({
        features: ['src', 'tgt'],
        num_rows: 102
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 104
    })
})


Tokenizing dataset:   0%|          | 0/1852 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/102 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/104 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Step,Training Loss,Validation Loss,Bleu
1000,0.2153,1.883968,7.067221


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 1.9455935955047607, 'eval_bleu': 13.819466704710765, 'eval_runtime': 44.2615, 'eval_samples_per_second': 2.35, 'eval_steps_per_second': 2.35, 'epoch': 15.0}


('outputs/mbart-bn-marma/tokenizer_config.json',
 'outputs/mbart-bn-marma/special_tokens_map.json',
 'outputs/mbart-bn-marma/sentencepiece.bpe.model',
 'outputs/mbart-bn-marma/added_tokens.json',
 'outputs/mbart-bn-marma/tokenizer.json')

mBART training code

In [None]:
import numpy as np
import evaluate
from datasets import load_from_disk, load_dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# =========================
# CONFIG
# =========================
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# IMPORTANT:
# Bangla is supported; Marma is not.
# If Marma text is Myanmar script, use my_MM.
SRC_LANG = "bn_IN"
TGT_LANG = "my_MM"

TOKENIZED_DATA_DIR = "tokenized_dataset"   # optional if saved via save_to_disk()
OUTPUT_DIR = "outputs/mbart-bn-mr"

MAX_GEN_LEN = 128
NUM_BEAMS = 5

# =========================
# Re-define tokenization configuration and process if tokenized_dataset is not defined
# This makes the cell self-sufficient for tokenized_dataset
# =========================
if 'tokenized_dataset' not in globals():
    print("Warning: 'tokenized_dataset' not found in global scope. Re-running tokenization steps within this cell.")
    # Configuration from WdpDyJIaMK5K
    DATA_FILES = {
        "train": "train.csv",
        "validation": "valid.csv",
        "test": "test.csv",
    }
    SRC_COL = "src"
    TGT_COL = "tgt"
    MAX_SRC_LEN = 256
    MAX_TGT_LEN = 256

    # Load raw dataset
    dataset = load_dataset("csv", data_files=DATA_FILES)

    # Load tokenizer (same as in this cell, but needed for tokenize_batch)
    tokenizer_for_tokenization = MBart50TokenizerFast.from_pretrained(
        MODEL_NAME,
        src_lang=SRC_LANG,
        tgt_lang=TGT_LANG,
    )

    # Tokenization function (copied from WdpDyJIaMK5K)
    def tokenize_batch(batch):
        model_inputs = tokenizer_for_tokenization(
            batch[SRC_COL],
            max_length=MAX_SRC_LEN,
            truncation=True,
        )
        labels = tokenizer_for_tokenization(
            text_target=batch[TGT_COL],
            max_length=MAX_TGT_LEN,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Apply tokenization
    tokenized_dataset = dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=dataset["train"].column_names,
        desc="Tokenizing dataset within training cell",
    )
    print("Tokenization complete within training cell.")
else:
    print("'tokenized_dataset' found in global scope, proceeding.")

# =========================
# LOAD TOKENIZED DATA
# =========================
# Use the tokenized_dataset object directly
train_ds = tokenized_dataset["train"]
valid_ds = tokenized_dataset["validation"]
test_ds  = tokenized_dataset.get("test", None)


# =========================
# LOAD TOKENIZER + MODEL
# =========================
tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_NAME,
    src_lang=SRC_LANG,
    tgt_lang=TGT_LANG,
)

model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)

# Force target language during generation
model.config.forced_bos_token_id = tokenizer.lang_code_to_id[TGT_LANG]
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[TGT_LANG]


# =========================
# DATA COLLATOR
# =========================
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)


# =========================
# METRICS
# =========================
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(
        preds, skip_special_tokens=True
    )

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True
    )

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [[l.strip()] for l in decoded_labels]

    score = bleu.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["score"]

    return {"bleu": score}


# =========================
# TRAINING ARGS
# =========================
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,

    eval_strategy="steps", # Corrected argument name
    eval_steps=1000,
    save_steps=1000,
    logging_steps=200,

    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,

    learning_rate=3e-5,
    num_train_epochs=8,

    predict_with_generate=True,
    generation_max_length=MAX_GEN_LEN,
    generation_num_beams=NUM_BEAMS,

    fp16=True,          # disable if no GPU
    save_total_limit=2,

    load_best_model_at_end=True,
    metric_for_best_model="bleu",

    report_to="none",
)


# =========================
# TRAINER
# =========================
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    # Updated 'tokenizer' to 'processing_class' for future compatibility
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


# =========================
# TRAIN
# =========================
trainer.train()


# =========================
# FINAL EVALUATION
# =========================
if test_ds is not None:
    print(trainer.evaluate(test_ds))


# =========================
# SAVE FINAL MODEL
# =========================
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


'tokenized_dataset' found in global scope, proceeding.


Step,Training Loss,Validation Loss




{'eval_loss': 1.827847957611084, 'eval_bleu': 8.825924643587731, 'eval_runtime': 27.2177, 'eval_samples_per_second': 3.821, 'eval_steps_per_second': 1.911, 'epoch': 8.0}


('outputs/mbart-bn-mr/tokenizer_config.json',
 'outputs/mbart-bn-mr/special_tokens_map.json',
 'outputs/mbart-bn-mr/sentencepiece.bpe.model',
 'outputs/mbart-bn-mr/added_tokens.json',
 'outputs/mbart-bn-mr/tokenizer.json')

Inference / Translation Code

In [None]:
import argparse
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# =====================
# ARGUMENTS
# =====================
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", default="outputs/mbart-bn-marma")
parser.add_argument("--src_lang", default="bn_IN")
parser.add_argument("--tgt_lang", default="my_MM")
parser.add_argument("--text", help="Bangla text to translate")
parser.add_argument("--input_file", help="File with one sentence per line")
parser.add_argument("--output_file", default="translations.txt")
parser.add_argument("--beam", type=int, default=5)
parser.add_argument("--max_len", type=int, default=128)
# Fix: Pass an empty list to parse_args() to prevent it from parsing kernel arguments.
# For interactive use in Colab, we'll set a default text below.
args = parser.parse_args(args=[])

# Set a default text for translation if none is provided via args (e.g., in Colab)
if not args.text and not args.input_file:
    args.text = "আমার একটি নতুন ফোন কিনতে হবে।"
    print(f"No text or input file provided. Using default text: '{args.text}'")

assert args.text or args.input_file, "Provide --text or --input_file"

# =====================
# DEVICE
# =====================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# =====================
# LOAD MODEL
# =====================
tokenizer = MBart50TokenizerFast.from_pretrained(
    args.model_dir,
    src_lang=args.src_lang,
    tgt_lang=args.tgt_lang
)

model = MBartForConditionalGeneration.from_pretrained(
    args.model_dir
).to(device)

model.eval()

# =====================
# TRANSLATION FUNCTION
# =====================
def translate(sentences):
    inputs = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=args.max_len, # Added max_length for explicit truncation
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[args.tgt_lang],
            num_beams=args.beam,
            max_new_tokens=args.max_len,
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


# =====================
# SINGLE SENTENCE
# =====================
if args.text:
    result = translate([args.text])[0]
    print("\nSOURCE :", args.text)
    print("TARGET :", result)

# =====================
# FILE TRANSLATION
# =====================
if args.input_file:
    with open(args.input_file, "r", encoding="utf-8") as f:
        sentences = [line.strip() for line in f if line.strip()]

    translations = translate(sentences)

    with open(args.output_file, "w", encoding="utf-8") as f:
        for src, tgt in zip(sentences, translations):
            f.write(f"SRC: {src}\n")
            f.write(f"TGT: {tgt}\n\n")

    print(f"✅ Translations saved to {args.output_file}")


No text or input file provided. Using default text: 'আমার একটি নতুন ফোন কিনতে হবে।'
Using device: cuda


The tokenizer you are loading from 'outputs/mbart-bn-marma' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.



SOURCE : আমার একটি নতুন ফোন কিনতে হবে।
TARGET : ငမာ ဖုန်းအသစ် တစ်လုံး ဝယ်ရမယ်။


In [None]:
# =====================
# FILE INPUT
# =====================
input_file = "input.txt"
output_file = "output.txt"

with open(input_file, "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]

translations = translate(sentences)

with open(output_file, "w", encoding="utf-8") as f:
    for src, tgt in zip(sentences, translations):
        f.write(f"SRC: {src}\n")
        f.write(f"TGT: {tgt}\n\n")

print(f"✅ Translations saved to {output_file}")


✅ Translations saved to output.txt


In [None]:
# This cell will now perform the inference on the test set before calculating BLEU
import sacrebleu
import pandas as pd # Import pandas if not already imported in the current context

# Ensure test_df and the translate function are available from previous cells
# test_df was created in the data cleaning/splitting stage (cell 9WE5ao48Lbz0)
# The translate function was defined in the inference code (cell Q0jhYl0_kB4T)

# Load test_df if it's not already in scope (e.g., if running this cell independently)
if 'test_df' not in globals():
    print("Loading test_df...")
    # Assuming 'test.csv' exists from the data cleaning/splitting step
    test_df = pd.read_csv('test.csv')

# Get source sentences from the test dataframe
src_sentences_for_bleu = test_df["src"].tolist()

# Generate predictions by translating the source sentences from the test set
# The 'translate' function is assumed to be defined in a previous cell (Q0jhYl0_kB4T)
print(f"Generating translations for {len(src_sentences_for_bleu)} sentences...")
predictions = translate(src_sentences_for_bleu)
print("Translations generated.")

# Extract references from the test_df
references = test_df["tgt"].tolist()

# Validate that predictions and references have the same length
if len(predictions) != len(references):
    raise ValueError("Mismatch between number of predictions and references. Cannot compute BLEU.")

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(predictions, [references])
print("BLEU score:", bleu.score)

Generating translations for 104 sentences...
Translations generated.
BLEU score: 14.270287570414585


chrF

In [None]:
from sacrebleu.metrics import CHRF

# Ensure test_df and translations are available
# test_df was created in the data cleaning/splitting stage
# translations was generated in the inference cell (Q0jhYl0_kB4T or UjarKDwElMPc)

# Extract references from the test_df
references = test_df["tgt"].tolist()

# 'predictions' should be the list of translated sentences, which is the 'translations' variable
# from the previous inference cell (Q0jhYl0_kB4T or UjarKDwElMPc)
predictions = translations # Assuming 'translations' from previous cell is the list of predictions

chrf = CHRF(word_order=2)  # enable chrF++
score = chrf.corpus_score(predictions, [references]) # sacrebleu expects references to be a list of lists
print(score.score)

5.543692045648198


 METEOR

In [None]:
import nltk
from nltk.translate.meteor_score import meteor_score

# Download required resources (run once)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab') # Add this line to download the specific resource

# Reference and hypothesis
reference_str = "আমি স্কুলে যাচ্ছি"
hypothesis_str = "আমি বিদ্যালয়ে যাচ্ছি"

# Tokenize the sentences
reference = [nltk.word_tokenize(reference_str)] # meteor_score expects reference to be a list of lists of tokens
hypothesis = nltk.word_tokenize(hypothesis_str)

# METEOR score
score = meteor_score(reference, hypothesis)
print("METEOR score:", score)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


METEOR score: 0.3333333333333333


TER


In [None]:
from sacrebleu.metrics import TER

# TER metric initialize
ter = TER()

# Machine-translated Marma (Bangla → Marma)
predictions = [
    "ငါ အခု အိပ်နေပါတယ်။"
]

# Human reference Marma
references = [
    ["ငါ အခု အိပ်နေပါတယ်။"]
]

# TER score
score = ter.corpus_score(predictions, references)
print("TER score:", score.score)


TER score: 0.0
