In [None]:
# ============================================
# 0) INSTALL PACKAGES
# ============================================
# IPython shell escape: quietly installs the Hugging Face stack (transformers,
# datasets, evaluate) plus sacrebleu for the BLEU metric and sentencepiece.
!pip install transformers datasets evaluate sacrebleu sentencepiece --quiet

# ============================================
# 1) IMPORTS
# ============================================
import os
from pathlib import Path
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate

# Switch off Weights & Biases through its environment flag.
os.environ.update({"WANDB_DISABLED": "true"})  # disable WandB

# ============================================
# 2) LOAD DATASET
# ============================================
dataset_root = Path("/kaggle/input/parallel-corpus-for-english-urdu-language/Dataset")
eng_file = dataset_root / "english-corpus.txt"
urd_file = dataset_root / "urdu-corpus.txt"


def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at *path*, whitespace-stripped."""
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


# Read lines
eng_lines = _read_stripped_lines(eng_file)
urd_lines = _read_stripped_lines(urd_file)

# A parallel corpus should have one target line per source line. Keep the
# original best-effort behaviour (truncate to the shorter file) but make the
# mismatch visible instead of hiding it.
if len(eng_lines) != len(urd_lines):
    print(
        f"Warning: line count mismatch (en={len(eng_lines)}, ur={len(urd_lines)}); "
        "truncating to the shorter file."
    )

# zip() stops at the shorter file; pairs with an empty side carry no
# translation signal, so they are dropped.
pairs = [(src, tgt) for src, tgt in zip(eng_lines, urd_lines) if src and tgt]
n = len(pairs)
eng_lines = [src for src, _ in pairs]
urd_lines = [tgt for _, tgt in pairs]

dataset = Dataset.from_dict({"en": eng_lines, "ur": urd_lines})

# Train/validation/test split: 10% held out for test, then 10% of the
# remainder for validation (fixed seed for reproducibility).
split = dataset.train_test_split(test_size=0.1, seed=42)
inner = split['train'].train_test_split(test_size=0.1, seed=42)
data = DatasetDict({
    "train": inner["train"],
    "validation": inner["test"],
    "test": split["test"]
})

print("Dataset sizes:", {k: len(v) for k, v in data.items()})

# ============================================
# 3) LOAD mBART MODEL + TOKENIZER
# ============================================
MODEL = "facebook/mbart-large-50-many-to-many-mmt"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = MBart50TokenizerFast.from_pretrained(MODEL)
model = MBartForConditionalGeneration.from_pretrained(MODEL).to(device)

SRC_LANG = "en_XX"
TGT_LANG = "ur_PK"

# Language codes the tokenizer uses for source-side and target-side encoding.
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# translate() passes forced_bos_token_id explicitly, but generation run by the
# trainer (predict_with_generate) only sees the model's generation config.
# Store the Urdu language-code id there so both paths start decoding with the
# same target-language token.
model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

# ============================================
# 4) PREPROCESSING
# ============================================
MAX_SOURCE_LENGTH = 64  # reduce for speed
MAX_TARGET_LENGTH = 64

def preprocess(batch):
    """Tokenize a batch of English/Urdu pairs into model inputs and labels.

    Args:
        batch: mapping with parallel lists under the keys "en" and "ur".

    Returns:
        The tokenizer output for the English side with an added "labels"
        entry holding the Urdu token ids, where padding positions are -100.
    """
    model_inputs = tokenizer(
        batch["en"], padding="max_length", truncation=True, max_length=MAX_SOURCE_LENGTH
    )
    # text_target switches the tokenizer to target mode so the labels carry
    # the tgt_lang code rather than the source-language one.
    labels = tokenizer(
        text_target=batch["ur"], padding="max_length", truncation=True, max_length=MAX_TARGET_LENGTH
    )
    # Padding is replaced by -100 so the loss ignores those positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq] for seq in labels["input_ids"]
    ]
    return model_inputs

tokenized = data.map(preprocess, batched=True, remove_columns=data["train"].column_names)

# ============================================
# 5) DATA COLLATOR
# ============================================
# Any label padding added by the collator uses -100, the index the loss
# ignores, so padded positions never contribute to training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

# ============================================
# 6) METRICS
# ============================================
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    """Compute corpus-level sacreBLEU for generated predictions.

    Args:
        eval_pred: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            be a tuple whose first element holds the generated ids.

    Returns:
        ``{"bleu": <sacreBLEU score>}``.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id; swap it for the
    # pad token on both sides so decoding cannot fail on it.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = [
        text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    # sacreBLEU expects a list of references per prediction.
    decoded_labels = [
        [text.strip()] for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# ============================================
# 7) TRAINING ARGUMENTS
# ============================================
OUT_DIR = "/kaggle/working/mbart-en-ur"
# NOTE(review): eval_steps is presumably only honoured when a step-based
# evaluation strategy is configured; none is set here, so no evaluation seems
# to run during training (only the explicit evaluate() call afterwards) -
# confirm this is intended before enabling it, since generation-based
# evaluation at batch size 1 is slow.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=3e-5,
    num_train_epochs=2,
    logging_steps=100,
    eval_steps=2000,
    save_strategy="no",
    predict_with_generate=True,
    # Cap evaluation-time generation at the same length used for the labels
    # and in translate().
    generation_max_length=MAX_TARGET_LENGTH,
    # Mixed precision needs a GPU; fall back to full precision on CPU so the
    # script still runs where `device` resolves to "cpu".
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
    report_to="none",
)


# ============================================
# 8) TRAINER
# ============================================
# Wire the model, the run configuration and the data pipeline together.
trainer = Seq2SeqTrainer(
    # what to train and how
    args=training_args,
    model=model,
    # batching and scoring helpers
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # data splits
    eval_dataset=tokenized["validation"],
    train_dataset=tokenized["train"],
)

# ============================================
# 9) TRAIN
# ============================================
trainer.train()
# Store the fine-tuned weights and the tokenizer files in the same folder.
trainer.save_model(output_dir=OUT_DIR)
tokenizer.save_pretrained(save_directory=OUT_DIR)

# ============================================
# 10) EVALUATE
# ============================================
# Score the held-out test split; metric keys carry the "eval_" prefix.
test_result = trainer.evaluate(eval_dataset=tokenized["test"])
print("Test BLEU:", test_result["eval_bleu"])

# ============================================
# 11) TRANSLATION FUNCTION
# ============================================
def translate(text_list):
    """Translate English text to Urdu with the fine-tuned model.

    Args:
        text_list: list of English sentences.

    Returns:
        A list with one decoded Urdu string per input sentence; an empty
        list when ``text_list`` is empty.
    """
    # Nothing to encode: return early rather than handing the tokenizer an
    # empty batch.
    if isinstance(text_list, (list, tuple)) and not text_list:
        return []
    tokenizer.src_lang = SRC_LANG
    tokenizer.tgt_lang = TGT_LANG
    enc = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SOURCE_LENGTH).to(device)
    # Force the first generated token to be the target-language code.
    gen = model.generate(**enc, forced_bos_token_id=tokenizer.lang_code_to_id[TGT_LANG], max_length=MAX_TARGET_LENGTH, num_beams=5)
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

# ============================================
# 12) SAMPLE TRANSLATIONS
# ============================================
# Take the first ten test pairs and translate their English side.
sample = data["test"].select(range(10))
en, ur_ref = [], []
for row in sample:
    en.append(row["en"])
    ur_ref.append(row["ur"])
ur_pred = translate(en)

# Show source, reference and model output side by side.
for number, (source, reference, prediction) in enumerate(zip(en, ur_ref, ur_pred), start=1):
    print(f"\n--- Example {number} ---")
    print("English:", source)
    print("Urdu Reference:", reference)
    print("Urdu Pred:", prediction)


# ============================================
# 13) CUSTOM GENERATION TEST
# ============================================
# One hand-written sentence as a quick sanity check of the model.
custom_text = ["I am eating mangoes in the rain while coding Python."]
source_sentence = custom_text[0]

print("\n================= CUSTOM TEST =================")
print("English:", source_sentence)

custom_pred = translate(custom_text)
predicted_urdu = custom_pred[0]

print("Urdu Prediction:", predicted_urdu)
print("================================================")

In [None]:
# ============================================
# 0) INSTALL PACKAGES
# ============================================
# IPython shell escape: quietly installs the Hugging Face stack (transformers,
# datasets, evaluate) plus sacrebleu for the BLEU metric and sentencepiece.
!pip install transformers datasets evaluate sacrebleu sentencepiece --quiet

# ============================================
# 1) IMPORTS
# ============================================
import os
from pathlib import Path
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate

# Switch off Weights & Biases through its environment flag.
os.environ.update({"WANDB_DISABLED": "true"})  # disable WandB

# ============================================
# 2) LOAD DATASET
# ============================================
dataset_root = Path("/kaggle/input/parallel-corpus-for-english-urdu-language/Dataset")
eng_file = dataset_root / "english-corpus.txt"
urd_file = dataset_root / "urdu-corpus.txt"


def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at *path*, whitespace-stripped."""
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


# Read lines
eng_lines = _read_stripped_lines(eng_file)
urd_lines = _read_stripped_lines(urd_file)

# A parallel corpus should have one target line per source line. Keep the
# original best-effort behaviour (truncate to the shorter file) but make the
# mismatch visible instead of hiding it.
if len(eng_lines) != len(urd_lines):
    print(
        f"Warning: line count mismatch (en={len(eng_lines)}, ur={len(urd_lines)}); "
        "truncating to the shorter file."
    )

# zip() stops at the shorter file; pairs with an empty side carry no
# translation signal, so they are dropped.
pairs = [(src, tgt) for src, tgt in zip(eng_lines, urd_lines) if src and tgt]
n = len(pairs)
eng_lines = [src for src, _ in pairs]
urd_lines = [tgt for _, tgt in pairs]

dataset = Dataset.from_dict({"en": eng_lines, "ur": urd_lines})

# Train/validation/test split: 10% held out for test, then 10% of the
# remainder for validation (fixed seed for reproducibility).
split = dataset.train_test_split(test_size=0.1, seed=42)
inner = split['train'].train_test_split(test_size=0.1, seed=42)
data = DatasetDict({
    "train": inner["train"],
    "validation": inner["test"],
    "test": split["test"]
})

print("Dataset sizes:", {k: len(v) for k, v in data.items()})

# ============================================
# 3) LOAD mBART MODEL + TOKENIZER
# ============================================
MODEL = "facebook/mbart-large-50-many-to-many-mmt"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = MBart50TokenizerFast.from_pretrained(MODEL)
model = MBartForConditionalGeneration.from_pretrained(MODEL).to(device)

SRC_LANG = "en_XX"
TGT_LANG = "ur_PK"

# Language codes the tokenizer uses for source-side and target-side encoding.
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

# translate() passes forced_bos_token_id explicitly, but generation run by the
# trainer (predict_with_generate) only sees the model's generation config.
# Store the Urdu language-code id there so both paths start decoding with the
# same target-language token.
model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

# ============================================
# 4) PREPROCESSING
# ============================================
MAX_SOURCE_LENGTH = 64  # reduce for speed
MAX_TARGET_LENGTH = 64

def preprocess(batch):
    """Tokenize a batch of English/Urdu pairs into model inputs and labels.

    Args:
        batch: mapping with parallel lists under the keys "en" and "ur".

    Returns:
        The tokenizer output for the English side with an added "labels"
        entry holding the Urdu token ids, where padding positions are -100.
    """
    model_inputs = tokenizer(
        batch["en"], padding="max_length", truncation=True, max_length=MAX_SOURCE_LENGTH
    )
    # text_target switches the tokenizer to target mode so the labels carry
    # the tgt_lang code rather than the source-language one.
    labels = tokenizer(
        text_target=batch["ur"], padding="max_length", truncation=True, max_length=MAX_TARGET_LENGTH
    )
    # Padding is replaced by -100 so the loss ignores those positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq] for seq in labels["input_ids"]
    ]
    return model_inputs

tokenized = data.map(preprocess, batched=True, remove_columns=data["train"].column_names)

# ============================================
# 5) DATA COLLATOR
# ============================================
# Any label padding added by the collator uses -100, the index the loss
# ignores, so padded positions never contribute to training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

# ============================================
# 6) METRICS
# ============================================
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    """Compute corpus-level sacreBLEU for generated predictions.

    Args:
        eval_pred: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            be a tuple whose first element holds the generated ids.

    Returns:
        ``{"bleu": <sacreBLEU score>}``.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id; swap it for the
    # pad token on both sides so decoding cannot fail on it.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = [
        text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    # sacreBLEU expects a list of references per prediction.
    decoded_labels = [
        [text.strip()] for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# ============================================
# 7) TRAINING ARGUMENTS
# ============================================
OUT_DIR = "/kaggle/working/mbart-en-ur"
# NOTE(review): eval_steps is presumably only honoured when a step-based
# evaluation strategy is configured; none is set here, so no evaluation seems
# to run during training (only the explicit evaluate() call afterwards) -
# confirm this is intended before enabling it, since generation-based
# evaluation at batch size 1 is slow.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=3e-5,
    num_train_epochs=2,
    logging_steps=100,
    eval_steps=2000,
    save_strategy="no",
    predict_with_generate=True,
    # Cap evaluation-time generation at the same length used for the labels
    # and in translate().
    generation_max_length=MAX_TARGET_LENGTH,
    # Mixed precision needs a GPU; fall back to full precision on CPU so the
    # script still runs where `device` resolves to "cpu".
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
    report_to="none",
)


# ============================================
# 8) TRAINER
# ============================================
# Wire the model, the run configuration and the data pipeline together.
trainer = Seq2SeqTrainer(
    # what to train and how
    args=training_args,
    model=model,
    # batching and scoring helpers
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # data splits
    eval_dataset=tokenized["validation"],
    train_dataset=tokenized["train"],
)

# ============================================
# 9) TRAIN
# ============================================
trainer.train()
# Store the fine-tuned weights and the tokenizer files in the same folder.
trainer.save_model(output_dir=OUT_DIR)
tokenizer.save_pretrained(save_directory=OUT_DIR)

# ============================================
# 10) EVALUATE
# ============================================
# Score the held-out test split; metric keys carry the "eval_" prefix.
test_result = trainer.evaluate(eval_dataset=tokenized["test"])
print("Test BLEU:", test_result["eval_bleu"])

# ============================================
# 11) TRANSLATION FUNCTION
# ============================================
def translate(text_list):
    """Translate English text to Urdu with the fine-tuned model.

    Args:
        text_list: list of English sentences.

    Returns:
        A list with one decoded Urdu string per input sentence; an empty
        list when ``text_list`` is empty.
    """
    # Nothing to encode: return early rather than handing the tokenizer an
    # empty batch.
    if isinstance(text_list, (list, tuple)) and not text_list:
        return []
    tokenizer.src_lang = SRC_LANG
    tokenizer.tgt_lang = TGT_LANG
    enc = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SOURCE_LENGTH).to(device)
    # Force the first generated token to be the target-language code.
    gen = model.generate(**enc, forced_bos_token_id=tokenizer.lang_code_to_id[TGT_LANG], max_length=MAX_TARGET_LENGTH, num_beams=5)
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

# ============================================
# 12) SAMPLE TRANSLATIONS
# ============================================
# Take the first ten test pairs and translate their English side.
sample = data["test"].select(range(10))
en, ur_ref = [], []
for row in sample:
    en.append(row["en"])
    ur_ref.append(row["ur"])
ur_pred = translate(en)

# Show source, reference and model output side by side.
for number, (source, reference, prediction) in enumerate(zip(en, ur_ref, ur_pred), start=1):
    print(f"\n--- Example {number} ---")
    print("English:", source)
    print("Urdu Reference:", reference)
    print("Urdu Pred:", prediction)


# ============================================
# 13) CUSTOM GENERATION TEST
# ============================================
# One hand-written sentence as a quick sanity check of the model.
custom_text = ["I am eating mangoes in the rain while coding Python."]
source_sentence = custom_text[0]

print("\n================= CUSTOM TEST =================")
print("English:", source_sentence)

custom_pred = translate(custom_text)
predicted_urdu = custom_pred[0]

print("Urdu Prediction:", predicted_urdu)
print("================================================")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", 

2025-11-25 03:34:09.763643: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764041649.968157      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764041650.032396      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Dataset sizes: {'train': 19864, 'validation': 2208, 'test': 2453}


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Map:   0%|          | 0/19864 [00:00<?, ? examples/s]

Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/2453 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]



Step,Training Loss
100,3.3993
200,0.2087
300,0.1813
400,0.1903
500,0.1821
600,0.1735
700,0.13
800,0.1573
900,0.1622
1000,0.1829
