In [6]:
!pip install transformers datasets sacrebleu sentencepiece --quiet


In [7]:
!pip install -U datasets fsspec


Collecting fsspec
  Using cached fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)


In [8]:
from datasets import load_dataset
# Load the "ta" configuration of the ai4bharat/samanantar dataset.
ds = load_dataset("ai4bharat/samanantar", name="ta")

# Sanity check: print the first record of the training split.
print(ds["train"][0])



{'idx': 0, 'src': 'Some 14 months later, the second calf is born.', 'tgt': 'சுமார் 14 மாதங்கள் கழித்து, இரண்டாம் கன்றை ஈனுகிறது.'}


In [9]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch

# Pretrained checkpoint shared by the tokenizer and the model.
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Language codes for this run (example: English to Tamil).
src_lang = "en_XX"
tgt_lang = "ta_IN"

# Maximum sequence length used when tokenizing; adjust based on your data.
max_length = 128

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model weights and move them to the selected device in one step.
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the tokenizer and tell it which languages it encodes from and to.
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang


In [10]:
# Show the column names of each split in the loaded dataset.
print(ds.column_names)


{'train': ['idx', 'src', 'tgt']}


In [11]:
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``: a dict with
            parallel lists under ``"src"`` (source-language sentences) and
            ``"tgt"`` (target-language sentences).

    Returns:
        The tokenizer's encoding of the source sentences, plus a ``"labels"``
        entry holding the target token ids, with every padding position
        replaced by -100 so it is ignored in the loss calculation.
    """
    # Encode sources and targets in a single call. `text_target` replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager: the targets
    # are encoded in target mode and returned under "labels".
    model_inputs = tokenizer(
        examples["src"],
        text_target=examples["tgt"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    # Replace padding token ids with -100 to ignore them in the loss calculation.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]

    return model_inputs

# Tokenize the whole training split. With batched=True, preprocess_function
# receives a dict of column lists rather than one example at a time.
tokenized_datasets = ds["train"].map(
    preprocess_function,
    batched=True,
)


In [12]:
!pip install --upgrade transformers




In [13]:
from transformers import TrainingArguments

# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # --- evaluation ---
    # "no" disables evaluation during training. This notebook uses the
    # `eval_strategy` spelling in place of the earlier `evaluation_strategy`.
    eval_strategy="no",
    # --- checkpoints ---
    output_dir="./results",   # where to save checkpoints
    save_strategy="epoch",    # save a checkpoint at the end of every epoch
    # --- logging ---
    logging_dir="./logs",
    logging_steps=10,         # log every 10 steps
    report_to="none",         # do not report to any external tracking integration
)

In [None]:
import os
# Keep Weights & Biases logging switched off for this run.
os.environ["WANDB_DISABLED"] = "true"

from transformers import Trainer

# Build the trainer on the tokenized training split. The tokenizer is passed as
# `processing_class`: the `tokenizer=` keyword of `Trainer.__init__` is
# deprecated in current transformers releases (this notebook upgrades to the
# latest one) and triggers a warning on this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    processing_class=tokenizer,
)

# Run fine-tuning with the settings in `training_args`.
trainer.train()

# Save your fine-tuned model and tokenizer
model.save_pretrained("/content/final_model")
tokenizer.save_pretrained("/content/final_model")


  trainer = Trainer(
