<a href="https://colab.research.google.com/github/joeyyy09/telugu-english-translation/blob/main/MTP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
# Step 1: Install necessary libraries
# NOTE: the leading `!` is notebook (IPython/Colab) shell-escape syntax, so this
# line only works when the cell is run inside a notebook; it is not valid in a
# plain Python script.
# The packages cover the imports below (transformers, datasets, evaluate) plus
# sacrebleu, whose name matches the metric loaded via evaluate.load("sacrebleu").
# sentencepiece is presumably needed by the model tokenizers -- confirm before dropping it.
!pip install transformers[torch] datasets sacrebleu sentencepiece evaluate

# Step 2: Import required libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from tqdm import tqdm
import evaluate
import io
import torch

# Step 3: Build the parallel corpus from the pre-uploaded TXT file
file_name = "English Telugu Data.txt"

try:
    with open(file_name, encoding='utf-8') as f:
        file_content = f.read()
except FileNotFoundError:
    # Tell the user what is most likely wrong, then re-raise so execution stops here.
    print(f"--- ERROR: The file '{file_name}' was not found. ---")
    print("--- Please make sure you have uploaded the file to your Colab session. ---")
    raise

english_sentences = []
telugu_sentences = []

# A usable line has the form "<english>++++$++++<telugu>". Any line that does not
# split into exactly two fields is skipped without a message.
for raw_line in file_content.strip().split('\n'):
    fields = raw_line.split('++++$++++')
    if len(fields) != 2:
        continue
    english_text, telugu_text = fields
    english_sentences.append(english_text.strip())
    telugu_sentences.append(telugu_text.strip())

# Wrap the two aligned lists in a DataFrame, then in a `datasets.Dataset`.
data = {'english': english_sentences, 'telugu': telugu_sentences}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

print("\nDataset loaded successfully from file.")
print(f"Number of examples: {len(dataset)}")

# --- OPTIMIZATION: Pick the device index that is later handed to the pipelines ---
if torch.cuda.is_available():
    print("\nGPU found. Using CUDA for acceleration.")
    device = 0  # GPU
else:
    print("\n--- WARNING: GPU not found. This will be very slow. ---")
    print("--- Go to Runtime > Change runtime type and select T4 GPU. ---")
    device = -1  # CPU

# Step 4: Load Models and Tokenizers with Optimizations
# --- OPTIMIZATION: half precision on the GPU, full precision on the CPU ---
if device == 0:
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# NLLB baseline: the `nllb-200-distilled-600M` checkpoint, configured Telugu -> English
print("\nLoading NLLB model...")
nllb_model_name = "facebook/nllb-200-distilled-600M"
nllb_tokenizer = AutoTokenizer.from_pretrained(
    nllb_model_name,
    src_lang="tel_Telu",
    tgt_lang="eng_Latn",
)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    nllb_model_name,
    torch_dtype=torch_dtype,
)
# The language codes are given to the pipeline as well as to the tokenizer.
nllb_translator = pipeline(
    'translation',
    model=nllb_model,
    tokenizer=nllb_tokenizer,
    src_lang="tel_Telu",
    tgt_lang="eng_Latn",
    device=device,
    batch_size=16,
)
print("NLLB model loaded.")

# mBART-50 many-to-many checkpoint, same direction, using mBART's own language codes
print("\nLoading mBART-50 model...")
mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"
mbart_tokenizer = AutoTokenizer.from_pretrained(
    mbart_model_name,
    src_lang="te_IN",
    tgt_lang="en_XX",
)
mbart_model = AutoModelForSeq2SeqLM.from_pretrained(
    mbart_model_name,
    torch_dtype=torch_dtype,
)
mbart_translator = pipeline(
    'translation',
    model=mbart_model,
    tokenizer=mbart_tokenizer,
    src_lang="te_IN",
    tgt_lang="en_XX",
    device=device,
    batch_size=16,
)
print("mBART-50 model loaded.")

# Step 5: Generate Translations
def get_translations(translator, dataset):
    """Translate every Telugu sentence in ``dataset`` and return the translated strings.

    Args:
        translator: Callable that takes a list of source strings and returns one
            mapping per input, each holding the output under ``'translation_text'``.
        dataset: Iterable of records, each exposing the source sentence under ``"telugu"``.

    Returns:
        A list of translated strings, in the order the translator returned them.
    """
    telugu_inputs = []
    for record in dataset:
        telugu_inputs.append(record["telugu"])

    # A single call with the full list: no batching happens here, it is left
    # entirely to the translator callable.
    raw_outputs = translator(telugu_inputs)

    translated = []
    for item in raw_outputs:
        translated.append(item['translation_text'])
    return translated

# Run both translators over the same dataset, NLLB first.
print("\nGenerating translations with NLLB...")
nllb_translations = get_translations(translator=nllb_translator, dataset=dataset)

print("\nGenerating translations with mBART-50...")
mbart_translations = get_translations(translator=mbart_translator, dataset=dataset)

# Step 6: Calculate Scores
chrf_metric = evaluate.load("chrf")
bleu_metric = evaluate.load("sacrebleu")

# References are passed as a list of lists: one single-item list holding the
# English sentence for each example.
references = []
for example in dataset:
    references.append([example["english"]])

# Scores for NLLB
print("\nCalculating scores for NLLB...")
nllb_chrf_score = chrf_metric.compute(
    predictions=nllb_translations,
    references=references,
)
nllb_bleu_score = bleu_metric.compute(
    predictions=nllb_translations,
    references=references,
)

# Scores for mBART-50
print("Calculating scores for mBART-50...")
mbart_chrf_score = chrf_metric.compute(
    predictions=mbart_translations,
    references=references,
)
mbart_bleu_score = bleu_metric.compute(
    predictions=mbart_translations,
    references=references,
)

# Step 7: Display Results
def _print_scores(heading, chrf_result, bleu_result):
    """Print ``heading`` followed by the chrF and BLEU scores, each to two decimals."""
    print(heading)
    print(f"chrF Score: {chrf_result['score']:.2f}")
    print(f"BLEU Score: {bleu_result['score']:.2f}")


print("\n--- Evaluation Results ---")
_print_scores("\nNLLB-100 Scores:", nllb_chrf_score, nllb_bleu_score)
_print_scores("\nmBART-50 Scores:", mbart_chrf_score, mbart_bleu_score)

# One row per example: source, reference, and each model's output side by side
results_df = pd.DataFrame(
    {
        "Telugu Source": df["telugu"],
        "English Reference": df["english"],
        "NLLB Translation": nllb_translations,
        "mBART-50 Translation": mbart_translations,
    }
)

print("\n--- Translation Comparison ---")
# Remove the pandas column-width cap before printing the table
pd.set_option('display.max_colwidth', None)
print(results_df.to_string())



Dataset loaded successfully from file.
Number of examples: 155798

GPU found. Using CUDA for acceleration.

Loading NLLB model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


NLLB model loaded.

Loading mBART-50 model...


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Device set to use cuda:0


mBART-50 model loaded.

Generating translations with NLLB...
