<a href="https://colab.research.google.com/github/himanshudas13/Translate/blob/main/BARTtranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random

import torch
from transformers import MBartTokenizer
from transformers import MBartForConditionalGeneration, MBartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments


In [8]:
import json

translate_dataset='/content/drive/MyDrive/ML-DL/Translate/Odia_poetic_sentences.json'
# Load the dataset from the JSON file
with open(translate_dataset, 'r') as f:
    data = json.load(f)

# Extract source and target texts
source_texts = data['source_texts']
target_texts = data['target_texts']

# Example: Printing the first pair
print("Source (Odia):", source_texts[0])
print("Target (English):", target_texts[0])

Source (Odia): ତୁମେ ଆସିଲେ ମୋ ଜୀବନରେ
Target (English): When you entered my life


In [12]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [13]:
from torch.utils.data import Dataset as TorchDataset

class TranslationDataset(TorchDataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_length=128):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source = self.source_texts[idx]
        target = self.target_texts[idx]

        # Tokenizing source texts
        source_encodings = self.tokenizer(
                source,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
                max_length=self.max_length,  # Ensure the input texts are truncated or padded to max_length
                add_special_tokens=True,  # Add special tokens for the encoder-decoder model
                padding_side='right'  # Pads to the right (standard for MBart)
            )

        # Tokenizing target texts
        target_encodings = self.tokenizer(
                target,
                return_tensors="pt",
                padding='max_length',
                truncation=True,
                max_length=self.max_length,  # Ensure the input texts are truncated or padded to max_length
                add_special_tokens=True,  # Add special tokens for the encoder-decoder model
                padding_side='right'  # Pads to the right (standard for MBart)
            )

        # Returning the tokenized input and output
        return {
            'input_ids': source_encodings['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': source_encodings['attention_mask'].squeeze(0),  # Remove batch dimension
            'labels': target_encodings['input_ids'].squeeze(0)  # Labels are also the tokenized target
        }

# Prepare the train and evaluation datasets
train_dataset = TranslationDataset(source_texts, target_texts, tokenizer)
eval_dataset = TranslationDataset(source_texts, target_texts, tokenizer)


In [14]:


# Fine-tuning arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=100,
    report_to=None,
    run_name="Translate-Odia-Eng",
    fp16=True,
)





In [15]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Start training
trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,3.7422,0.005889
2,0.0042,0.000694
3,0.0008,0.000621
4,0.0007,0.000576
5,0.0007,0.000552




TrainOutput(global_step=625, training_loss=0.5999337917834521, metrics={'train_runtime': 452.936, 'train_samples_per_second': 5.52, 'train_steps_per_second': 1.38, 'total_flos': 677228052480000.0, 'train_loss': 0.5999337917834521, 'epoch': 5.0})

In [24]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)  # Move model to GPU if available
model.eval()

input_text = "ତୁମେ କେଉଁଠାରେ ଥାଓ?"
inputs = tokenizer(input_text, return_tensors="pt").to(device)  # Move inputs to the same device as the model

translated_tokens = model.generate(**inputs)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
print("Translation:", translated_text)


Translation: I wish to stay by your side?


In [21]:
model.save_pretrained('./my_model')  # Replace './my_model' with your preferred directory path
