In [None]:
# Core dependencies, pinned to the versions this notebook was last run with.
# An unpinned `-U` pulls the newest `datasets`; the 4.x releases no longer execute dataset
# loading scripts, which the facebook/flores load below (trust_remote_code=True) relies on.
%pip install -q transformers==4.52.4 datasets==3.6.0 hf_xet==1.1.3

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting hf_xet
  Downloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m29.8 MB/s[

In [None]:
# Project Imports
import os
import json
import torch
from datasets import load_dataset
from transformers import pipeline

In [None]:
# Load the FLORES-200 English/Sepedi parallel data ('dev' and 'devtest') from Hugging Face.
from datasets import load_dataset

# Paired configuration: each row is read below for both its English ('sentence_eng_Latn')
# and its Sepedi ('sentence_nso_Latn') sentence.
FLORES_PAIR_CONFIG = "eng_Latn-nso_Latn"


def preview_sentences(heading: str, sentences: list, limit: int = 5) -> None:
    """Print `heading` after a blank line, then the first `limit` sentences numbered from 1."""
    print(f"\n{heading}")
    for position, sentence in enumerate(sentences[:limit], start=1):
        print(f"{position}. {sentence}")


def load_flores_pair_split(split: str):
    """
    Load one split of the FLORES-200 English/Sepedi pair and preview its contents.

    Args:
        split (str): Split name passed to `load_dataset` ("dev" or "devtest").

    Returns:
        tuple: (dataset, english_sentences, sepedi_sentences), where the two lists hold the
        'sentence_eng_Latn' and 'sentence_nso_Latn' fields of every row, in row order.
    """
    flores_split = load_dataset("facebook/flores", FLORES_PAIR_CONFIG, split=split, trust_remote_code=True)
    print(f"Loaded {len(flores_split)} English sentences from FLORES-200 '{split}' split.")

    # English side: used as the input to the MT models.
    english_sentences = [row['sentence_eng_Latn'] for row in flores_split]
    preview_sentences(f"First 5 English source sentences ({split.upper()}):", english_sentences)

    # Sepedi side: the original (uncorrected) FLORES references.
    sepedi_sentences = [row['sentence_nso_Latn'] for row in flores_split]
    print(f"Loaded {len(sepedi_sentences)} original Sepedi sentences for '{split}' split.")
    preview_sentences(f"First 5 Original Sepedi references ({split.upper()}):", sepedi_sentences)

    return flores_split, english_sentences, sepedi_sentences


print("Loading FLORES-200 English 'dev' split...")
flores_dev, english_source_sentences_dev, original_sepedi_references_dev = load_flores_pair_split("dev")

# The 'devtest' split is kept for the final, comprehensive evaluation.
print("\nLoading FLORES-200 English 'devtest' split...")
flores_devtest, english_source_sentences_devtest, original_sepedi_references_devtest = load_flores_pair_split("devtest")

Loading FLORES-200 English 'dev' split...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

flores.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Loaded 997 English sentences from FLORES-200 'dev' split.

First 5 English source sentences (DEV):
1. On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.
2. Lead researchers say this may bring early detection of cancer, tuberculosis, HIV and malaria to patients in low-income countries, where the survival rates for illnesses such as breast cancer can be half those of richer countries.
3. The JAS 39C Gripen crashed onto a runway at around 9:30 am local time (0230 UTC) and exploded, closing the airport to commercial flights.
4. The pilot was identified as Squadron Leader Dilokrit Pattavee.
5. Local media reports an airport fire vehicle rolled over while responding.
Loaded 997 original Sepedi sentences for 'dev' split.

First 5 Original Sepedi references (DEV):
1. Ka Mošupulogo, b

### Load corrected datasets from the DSFSI GitHub repo

In [None]:
# Clone the repository that holds the corrected reference files.
github_repo_url = "https://github.com/dsfsi/flores-fix-4-africa.git"
repo_name = github_repo_url.split('/')[-1].replace('.git', '') # Extracts 'flores-fix-4-africa'
repo_dir = os.path.join("/content", repo_name)

print(f"Cloning GitHub repository: {github_repo_url}")
# Skip the clone when the checkout already exists so the cell can be re-run safely.
if os.path.exists(repo_dir):
    print(f"Repository '{repo_name}' already cloned.")
else:
    # Clone into the exact directory checked above. A bare `git clone <url>` lands in the
    # current working directory, which is not guaranteed to be /content.
    !git clone {github_repo_url} {repo_dir}
    # Only report success if the checkout is actually there.
    if os.path.isdir(repo_dir):
        print(f"Repository '{repo_name}' cloned successfully.")
    else:
        print(f"ERROR: Cloning '{repo_name}' failed. See the git output above.")

# Base path to the corrected data within the cloned repository
# The structure is: repo_name/data/corrected/{split_type}/{lang_code}.{split_type}
base_corrected_data_path = os.path.join(repo_dir, "data", "corrected")
print(f"Corrected data base path set to: {base_corrected_data_path}")


Cloning GitHub repository: https://github.com/dsfsi/flores-fix-4-africa.git
Cloning into 'flores-fix-4-africa'...
remote: Enumerating objects: 90, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 90 (delta 34), reused 75 (delta 23), pack-reused 0 (from 0)[K
Receiving objects: 100% (90/90), 590.03 KiB | 16.39 MiB/s, done.
Resolving deltas: 100% (34/34), done.
Repository 'flores-fix-4-africa' cloned successfully.
Corrected data base path set to: /content/flores-fix-4-africa/data/corrected


In [None]:
def load_corrected_language_data(lang_code: str, split_type: str) -> list:
    """
    Loads corrected reference sentences for a given language and split type
    from the cloned flores-fix-4-africa GitHub repository.

    The file is read from
    `{base_corrected_data_path}/{split_type}/{lang_code}.{split_type}` and is treated as
    line-aligned: item N of the returned list is line N of the file, stripped of
    surrounding whitespace. Blank lines at the end of the file are dropped; blank lines
    elsewhere are kept as empty strings (with a warning) so that later sentences do not
    shift out of alignment with the source sentences.

    Args:
        lang_code (str): The language code (e.g., "nso_Latn", "hau_Latn").
        split_type (str): The split type ("dev" or "devtest").

    Returns:
        list: A list of corrected sentences. Returns an empty list if the file is not found.
    """
    # Construct the full file path dynamically
    file_path = os.path.join(base_corrected_data_path, split_type, f"{lang_code}.{split_type}")

    # isfile (not exists) so that a directory at this path is reported as missing
    # instead of failing inside open().
    if not os.path.isfile(file_path):
        print(f"WARNING: Corrected '{lang_code}' '{split_type}' file not found at {file_path}. Skipping.")
        return []

    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f]

    # Trailing blank lines carry no sentence; remove them so they do not inflate the count.
    while sentences and not sentences[-1]:
        sentences.pop()

    blank_count = sentences.count("")
    if blank_count:
        print(f"WARNING: {blank_count} blank line(s) inside {file_path} kept as empty sentences to preserve line alignment.")

    print(f"Loaded {len(sentences)} corrected '{lang_code}' sentences for '{split_type}'.")
    return sentences

In [None]:
# Load the corrected references for every language, stored as
# all_corrected_references_by_lang[split][lang_code] -> list of sentences.

# African languages that have corrected data
african_languages_corrected = ["nso_Latn"] #  ["hau_Latn", "nso_Latn", "tso_Latn", "zul_Latn"]

all_corrected_references_by_lang = {"dev": {}, "devtest": {}}

for lang_code in african_languages_corrected:
    # Per language: 'dev' first, then 'devtest'.
    for split_name in ("dev", "devtest"):
        loaded_sentences = load_corrected_language_data(lang_code, split_name)
        all_corrected_references_by_lang[split_name][lang_code] = loaded_sentences

# Example of accessing the data:
print("\nExample of accessing Sepedi (nso_Latn) dev corrected sentences:")
sepedi_dev_sample = all_corrected_references_by_lang["dev"]["nso_Latn"][:5]
print(sepedi_dev_sample)

Loaded 997 corrected 'nso_Latn' sentences for 'dev'.
Loaded 1012 corrected 'nso_Latn' sentences for 'devtest'.

Example of accessing Sepedi (nso_Latn) dev corrected sentences:
['Ka Mošupulogo, boramahlale ba go tšwa Sekolong sa Yunibesithi ya Stanford sa tša maphelo ba tsebišitše ka go hlangwa ga sedirišwa se sefsa sa tekolo seo se ka beakanyago disele ka mehuta: chip yeo e gatišegago ye nnyane yeo e ka tšweletšwago ka go šomiša metšhene ya go gatiša ya inkjet ya sente ye tee ya U.S. ka o tee.', 'Banyakišiši ba ketapele ba re se se ka tliša temogo ya kankere, tuberculosis, HIV le malaria go balwetši kapela dinageng tša letseno la fase, moo ditekano tša go phela tša malwetši a bjalo ka kankere ya letswele di ka bago seripa sa tša dinaga tšeo di humilego.', 'JAS 39C Gripen e wetše godimo ga moo sefofane se sepelago gona ka bo 9:30 am nako ya selegae (0230UTC) gomme ya thuthupa, gwa tswalelwa boemafofane bakeng sa difofane tša tefelo.', 'Mootledi wa sefofane o tsebišitšwe bjalo ka Moetape

## Load NLLB-200 Model

In [None]:
# NLLB-200 checkpoint to translate with. It is the default `model_name` of
# run_nllb_translation_and_save, which is defined further down in this cell
# together with the helper that saves translations to disk.

# nllb_model_name = "facebook/nllb-200-distilled-600M" # alternative: lightweight distilled-600M checkpoint
nllb_model_name = "facebook/nllb-200-1.3B" # Larger model, may have better quality of results


def save_translations_to_file(
    translated_texts: list,
    output_folder: str,
    filename_prefix: str,
    src_lang: str,
    tgt_lang: str,
    split_type: str
):
    """
    Saves a list of translated texts to a UTF-8 text file, one sentence per line.

    The file is written to
    `{output_folder}/{filename_prefix}_{src_lang}_to_{tgt_lang}_{split_type}.txt`;
    the folder is created if it does not exist. Line breaks inside a sentence are
    replaced by a single space so that line N of the file always corresponds to
    item N of `translated_texts`.

    Args:
        translated_texts (list): List of strings, where each string is a translated sentence.
        output_folder (str): The directory where the output file will be saved.
        filename_prefix (str): Prefix for the filename (e.g., "nllb200").
        src_lang (str): Source language code.
        tgt_lang (str): Target language code.
        split_type (str): Type of split (e.g., "dev", "devtest").

    Returns:
        str or None: Path of the written file, or None when the folder or file could not
        be written (the error is printed, not raised).
    """
    # Construct the filename dynamically
    output_filename = f"{filename_prefix}_{src_lang}_to_{tgt_lang}_{split_type}.txt"
    full_output_path = os.path.join(output_folder, output_filename)
    print(f"\nSaving model outputs to {output_folder}")

    try:
        # Folder creation is inside the try so that it is handled like any other write failure.
        os.makedirs(output_folder, exist_ok=True)
        with open(full_output_path, "w", encoding="utf-8") as f:
            for sentence in translated_texts:
                # Keep exactly one line per sentence, even if the text contains line breaks.
                single_line = sentence.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
                f.write(single_line + "\n")
    except OSError as e:
        print(f"Error saving translations to {full_output_path}: {e}")
        return None

    print(f"Saved translations to: {full_output_path}")
    return full_output_path


def run_nllb_translation_and_save(
    target_lang_code: str,
    split_type: str,
    english_source_sentences: list,
    model_name: str = nllb_model_name,
    output_folder: str = "nllb_generated_translations",
    source_lang_code: str = "eng_Latn",
    device: int = 0 if torch.cuda.is_available() else -1 # 0 for GPU, -1 for CPU
) -> list:
    """
    Loads NLLB-200 model, performs translation for a given target language and split,
    and saves the generated translations to a file.

    A new translation pipeline is created on every call, so the model is loaded again
    each time this function runs.

    Args:
        target_lang_code (str): The FLORES language code for the target language (e.g., "nso_Latn").
        split_type (str): The split type ("dev" or "devtest").
        english_source_sentences (list): List of English sentences to translate for this split.
        model_name (str): Name of the NLLB model to use.
        output_folder (str): Directory to save the output files.
        source_lang_code (str): Source language code, defaults to "eng_Latn".
        device (int): Device index handed to the pipeline; -1 selects the CPU. The default
            is computed once, when the function is defined, from torch.cuda.is_available().

    Returns:
        list: A list of the generated translated texts. Returns an empty list on error
        or when there are no source sentences.
    """

    # Only -1 means CPU; any other index (not just 0) is a GPU.
    selected_device_info = "CPU" if device == -1 else "GPU"
    print(f"\n--- Running NLLB-200 Translation for {source_lang_code} to {target_lang_code} ({split_type.upper()} set) ---")
    print(f"Using device: {selected_device_info}")

    # Nothing to translate: avoid loading the model for no work.
    if not english_source_sentences:
        print(f"No source sentences provided for {target_lang_code} ({split_type} set). Skipping translation.")
        return []

    try:
        # Initialize the pipeline for the specific language pair
        nllb_translator = pipeline(
            'translation',
            model=model_name,
            src_lang=source_lang_code,
            tgt_lang=target_lang_code,
            device=device
        )
        print(f"NLLB-200 model '{model_name}' loaded successfully for {source_lang_code} to {target_lang_code}.")

        print(f"Generating translations for {target_lang_code} ({split_type} set)... This might take a few minutes for {len(english_source_sentences)} sentences.")
        nllb_translations_raw = nllb_translator(english_source_sentences)
        nllb_translated_texts = [item['translation_text'] for item in nllb_translations_raw]
        print(f"Generated {len(nllb_translated_texts)} translations for {target_lang_code} ({split_type} set).")

        print(f"\nFirst 5 NLLB-200 generated translations for {target_lang_code} ({split_type.upper()} set):")
        for i, translation in enumerate(nllb_translated_texts[:5]):
            print(f"{i+1}. {translation}")

        # Save the generated translations
        save_translations_to_file(
            translated_texts=nllb_translated_texts,
            output_folder=output_folder,
            filename_prefix="nllb200",
            src_lang=source_lang_code,
            tgt_lang=target_lang_code,
            split_type=split_type
        )

        return nllb_translated_texts

    except Exception as e:
        # Best-effort by design: callers treat an empty list as "no translations" and
        # skip evaluation instead of aborting the whole run.
        print(f"An error occurred during NLLB-200 translation or saving for {target_lang_code} ({split_type} set): {e}")
        return []

## Translating + Saving model translation outputs for Evaluation

In [None]:
# Evaluation dependencies.
# BLEURT is installed straight from its GitHub repository rather than via
# `git clone` + `%cd bleurt` + `pip install .`, so the notebook's working directory never
# changes: relative output paths used later keep resolving where the notebook started,
# and the cell can be re-run without failing on an existing clone.
%pip install -q git+https://github.com/google-research/bleurt.git

# transformers / datasets / hf_xet are installed by the first cell and are deliberately
# not upgraded again here, so the versions already imported stay the ones in use.
# NOTE(review): `comet-ml` is the Comet experiment-tracking client. If the COMET MT metric
# was intended, the package is `unbabel-comet` - confirm before relying on it.
%pip install -q -U sacrebleu evaluate comet-ml



Cloning into 'bleurt'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 134 (delta 0), reused 17 (delta 0), pack-reused 116 (from 1)[K
Receiving objects: 100% (134/134), 31.28 MiB | 32.00 MiB/s, done.
Resolving deltas: 100% (49/49), done.
/content/bleurt
Processing /content/bleurt
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456766 sha256=64ac643da3f0ad62d4b8b3fb3e76aaa5241786b47660469e328f18f8cad695a6
  Stored in directory: /tmp/pip-ephem-wheel-cache-juwgpbvw/wheels/49/ab/73/9318ab38d4cd1c732bcea8335d3f8d7c0316c8d07b9084fa85
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl

In [None]:
 # --- Translations and Saving the outputs for later Evaluations: ---
# For each target language, translate the dev and devtest English sources with
# run_nllb_translation_and_save, then score the output against the corrected references.

# Imported here because sacrebleu is installed by the cell above.
from sacrebleu.metrics import CHRF, BLEU

african_target_languages = ["nso_Latn"] # ["hau_Latn", "nso_Latn", "tso_Latn", "zul_Latn"] # All 4 languages

# NOTE(review): the folder name says "distilled" although nllb_model_name currently points
# at the 1.3B checkpoint - kept as-is so existing output paths do not move; confirm intent.
NLLB_OUTPUT_FOLDER = "nllb_generated_translations_distilled"

# All generated translations, stored as all_nllb_translations[split][lang_code]
all_nllb_translations = {
    "dev": {},
    "devtest": {}
}

bleu_metric = BLEU()
# chrF++ adds word n-grams to chrF; sacrebleu's CHRF() default (word_order=0) is plain chrF,
# so word_order=2 is required for the reported number to really be chrF++.
chrf_metric = CHRF(word_order=2)


def score_against_corrected_references(target_lang: str, split_type: str, translations: list) -> tuple:
    """
    Score generated translations against the corrected references with BLEU and chrF++.

    References are read from all_corrected_references_by_lang[split_type][target_lang].
    Evaluation is skipped (with a printed explanation) when there are no translations,
    no references, or the two lists differ in length.

    Args:
        target_lang (str): Target language code (e.g., "nso_Latn").
        split_type (str): The split type ("dev" or "devtest").
        translations (list): Generated translations, one per source sentence.

    Returns:
        tuple: (bleu_score, chrf_score) as returned by sacrebleu's corpus_score, or
        (None, None) when evaluation was skipped.
    """
    split_label = split_type.upper()

    if not translations:
        print(f"No translations generated for {target_lang} ({split_label} set). Skipping {split_label} evaluation.")
        return None, None

    print(f"\n--- Evaluating {target_lang} ({split_label} set) ---")
    references_for_split = all_corrected_references_by_lang[split_type]
    if target_lang not in references_for_split:
        print(f"Corrected references not found for {target_lang} in {split_label} split dictionary. Skipping {split_label} evaluation.")
        return None, None

    references = references_for_split[target_lang]
    if not references:
        print(f"Corrected references list is empty for {target_lang} in {split_label} split. Skipping {split_label} evaluation.")
        return None, None

    # Check for matching lengths before evaluating
    if len(translations) != len(references):
        print(f"Warning: Mismatched number of generated translations ({len(translations)}) and {split_label} references ({len(references)}) for {target_lang}. Skipping {split_label} evaluation.")
        return None, None

    # sacrebleu expects a list of reference *streams*: one inner list per reference set,
    # each holding one sentence per hypothesis. With a single reference set that is
    # [references] - not one single-item list per sentence.
    reference_streams = [references]

    bleu_score = bleu_metric.corpus_score(translations, reference_streams)
    print(f"{split_label} BLEU Score for {target_lang}: {bleu_score.score}")

    chrf_score = chrf_metric.corpus_score(translations, reference_streams)
    print(f"{split_label} chrF++ Score for {target_lang}: {chrf_score.score}")

    return bleu_score, chrf_score


for target_lang in african_target_languages:
    print(f"\n--- Processing {target_lang} ---")

    # The English source sentences are the same whatever the target language, so the
    # global dev / devtest lists are reused for every target.
    dev_translations = run_nllb_translation_and_save(
        target_lang_code=target_lang,
        split_type="dev",
        english_source_sentences=english_source_sentences_dev,
        output_folder=NLLB_OUTPUT_FOLDER
    )
    all_nllb_translations["dev"][target_lang] = dev_translations
    bleu_score_dev, chrf_score_dev = score_against_corrected_references(target_lang, "dev", dev_translations)

    devtest_translations = run_nllb_translation_and_save(
        target_lang_code=target_lang,
        split_type="devtest",
        english_source_sentences=english_source_sentences_devtest,
        output_folder=NLLB_OUTPUT_FOLDER
    )
    all_nllb_translations["devtest"][target_lang] = devtest_translations
    bleu_score_devtest, chrf_score_devtest = score_against_corrected_references(target_lang, "devtest", devtest_translations)

print("\n--- All NLLB-200 translations generated and saved for all specified languages. ---")

# Final summary prints
print("\n--- Verification of a sample generated translation (DEV Sepedi) ---")
if "nso_Latn" in all_nllb_translations["dev"]:
    print("First 5 generated Sepedi (nso_Latn) sentences (DEV):")
    for i, sentence in enumerate(all_nllb_translations["dev"]["nso_Latn"][:5]):
        print(f"{i+1}. {sentence}")
else:
    print("Sepedi (nso_Latn) DEV translations not found in all_nllb_translations dictionary.")


--- Processing nso_Latn ---

--- Running NLLB-200 Translation for eng_Latn to nso_Latn (DEV set) ---
Using device: GPU


Device set to use cuda:0


NLLB-200 model 'facebook/nllb-200-1.3B' loaded successfully for eng_Latn to nso_Latn.
Generating translations for nso_Latn (dev set)... This might take a few minutes for 1000 sentences.
Generated 997 translations for nso_Latn (dev set).

First 5 NLLB-200 generated translations for nso_Latn (DEV set):
1. Ka Mošupologo, bo-rathutamahlale ba Sekolo sa tša Kalafo sa Yunibesithi ya Stanford ba ile ba tsebiša ka go hlangwa ga sedirišwa se sefsa sa go hlahloba seo se kgonago go arola disele go ya ka mohuta wa tšona: chip e nyenyane yeo e ka gatišwago yeo e ka tšweletšwago ka go diriša metšhene e tlwaelegilego ya go gatiša ka go diriša enke yeo e sa bitšego tšhelete e ka bago sente e tee ya U.S. e nngwe le e nngwe.
2. Banyakišiši ba bagolo ba bolela gore se se ka dira gore balwetši ba dinageng tšeo di humilego ba kgone go lemoga kankere, bolwetši bja mafahla, HIV le malaria e sa le ka pela, moo tekanyo ya go phologa ga malwetši a bjalo ka kankere ya matswele e ka bago seripa-gare sa ya dinaga 

Device set to use cuda:0


NLLB-200 model 'facebook/nllb-200-1.3B' loaded successfully for eng_Latn to nso_Latn.
Generating translations for nso_Latn (devtest set)... This might take a few minutes for 1000 sentences.
Generated 1012 translations for nso_Latn (devtest set).

First 5 NLLB-200 generated translations for nso_Latn (DEVTEST set):
1. O okeditše ka gore: "Ga bjale re na le magotlo a dikgwedi tše 4 ao e sego a bolwetši bja swikiri ao a kilego a ba le bolwetši bja swikiri".
2. Dr. Ehud Ur, moprofesara wa tša kalafo Yunibesithing ya Dalhousie kua Halifax, Nova Scotia le modula-setulo wa lefapha la tša kalafo le tša thutamahlale la Canadian Diabetes Association o lemošitše gore nyakišišo e sa le matšatšing a yona a mathomo.
3. Go swana le ditsebi tše dingwe, o belaela ge e ba bolwetši bja swikiri bo ka alafega, a bolela gore dilo tše di hweditšwego ga di na mohola go batho bao ba šetšego ba e-na le bolwetši bja swikiri bja Mohuta wa 1.
4. Ka Mošupologo, Sara Danius, e lego mongwaledi wa ka mehla wa Komiti ya