<a href="https://colab.research.google.com/github/hiterharris/Assignment-1/blob/master/MTAT_nllb_translate_without_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Install Dependencies


In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q transformers sentencepiece torch


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Load the Pretrained NLLB Model

There are multiple NLLB model sizes, but for Kaggle (to avoid memory issues), use the distilled version:

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "facebook/nllb-200-distilled-600M"  # Change to larger models if needed

# Use the GPU when one is attached; otherwise fall back to CPU so this cell
# still runs on a CPU-only runtime instead of raising on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


# Define the Source & Target Language Tokens

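Each language in NLLB-200 has a special language token named by its FLORES-200 code, written without angle brackets (for example `deu_Latn` for German or `eng_Latn` for English). Look up the correct codes in the NLLB-200 language code list.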

In [None]:
# NLLB language tokens are the bare FLORES-200 codes. With angle brackets the
# string is not a vocabulary entry and resolves to the unknown-token id.
SRC_LANG = "deu_Latn"  # Source: German
TGT_LANG = "eng_Latn"  # Target: English

# Define the Translation Function

Now, create a function that translates a list of sentences, processing them in batches.

In [None]:
import torch

def _language_token(tokenizer, lang, role):
    """Normalise a language code and return ``(code, token_id)``, or raise ValueError."""
    # Accept both the bare code ("deu_Latn") and the bracketed form ("<deu_Latn>").
    code = lang.strip().strip("<>")
    token_id = tokenizer.convert_tokens_to_ids(code)
    unk_id = getattr(tokenizer, "unk_token_id", None)
    if token_id is None or token_id == unk_id:
        raise ValueError(
            f"Unknown {role} language code {lang!r}: it is not in the tokenizer vocabulary."
        )
    return code, token_id


def translate(sentences, model, tokenizer, src_lang, tgt_lang, max_length=128, batch_size=4):
    """Translate a list of sentences, processing ``batch_size`` sentences at a time.

    Args:
        sentences: List of source-language strings.
        model: Seq2seq model exposing ``to(device)`` and ``generate(...)``.
        tokenizer: Tokenizer matching ``model``; it is called on each batch and
            must provide ``convert_tokens_to_ids`` and ``batch_decode``.
        src_lang: Source language code, e.g. ``"deu_Latn"``. Surrounding angle
            brackets (``"<deu_Latn>"``) are accepted and stripped.
        tgt_lang: Target language code, in the same format as ``src_lang``.
        max_length: Token limit used both for truncating inputs and for generation.
        batch_size: Number of sentences tokenized and generated together.

    Returns:
        List of decoded translations, in the same order as ``sentences``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if either language
            code does not map to a token in the tokenizer vocabulary.

    Side effects:
        Sets ``tokenizer.src_lang`` to the source code and moves ``model`` to
        the GPU when ``torch.cuda.is_available()``, otherwise to the CPU.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    src_code, _ = _language_token(tokenizer, src_lang, "source")
    # Looked up once: the forced first decoder token is the same for every batch.
    _, tgt_token_id = _language_token(tokenizer, tgt_lang, "target")

    # Let the tokenizer insert the source-language token itself rather than
    # pasting the code in front of the sentence text.
    tokenizer.src_lang = src_code

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    translations = []

    for start in range(0, len(sentences), batch_size):
        batch = list(sentences[start:start + batch_size])

        # Tokenize inputs
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)

        # Generate translations; no gradients are needed for inference.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                forced_bos_token_id=tgt_token_id,
            )

        # Decode and store translations
        translations.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    return translations


# Translate Example Sentences

In [None]:
test_file = '/kaggle/input/mtat25-ted-data-test/data/TED2020.de-en.de.test'

# Read test sentences from file, one per line, skipping blank lines.
# NOTE(review): skipping blank lines shifts output line numbers relative to the
# input file; confirm nothing downstream relies on line-by-line alignment.
with open(test_file, "r", encoding="utf-8") as f:
    test_sentences = [line.strip() for line in f if line.strip()]

translated_sentences = translate(test_sentences, model, tokenizer, SRC_LANG, TGT_LANG)

# Print translations
#for src, tgt in zip(test_sentences, translated_sentences):
#    print(f"🔹 Source: {src}\n🔹 Translation: {tgt}\n")

# Save Translations to a New File

If you want to save the translated output, write it to a new file:

In [None]:
output_file = "TED2020.de-en.de.test.nllb2en"

# Write one translation per line, in the order they were produced.
with open(output_file, "w", encoding="utf-8") as out:
    out.writelines(translation + "\n" for translation in translated_sentences)

print(f'Translations saved to {output_file}')