### OpenRouter client (OpenAI-compatible API)

In [None]:
import os

from openai import OpenAI

# Prefer a key supplied through the OPENROUTER_API_KEY environment variable so
# the secret never has to be saved inside the notebook. The literal is only a
# placeholder fallback for people who still paste their key here.
YOUR_API_KEY = os.environ.get("OPENROUTER_API_KEY", "...")

# The OpenAI SDK client is pointed at the OpenRouter endpoint; every helper in
# this notebook sends its requests through this shared `client`.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=YOUR_API_KEY,
)

### Languages

In [54]:
import re
import os
import json
import pandas as pd
from tqdm import tqdm

# Language of the text being evaluated; it is passed to the translation,
# back-translation and judging helpers.
source_lang = "French"
# NOTE(review): same value as `source_lang`; not referenced elsewhere in the
# visible notebook — confirm whether it can be removed.
source_language= "French"
# Country whose languages are evaluated. It is also used as the DataFrame
# column name and as the name of the output directory.
country="Bolivia"
# Presumably the ISO country code for `country`; not referenced elsewhere in
# the visible notebook — TODO confirm.
code="BO"

# One language per line: a three-character code immediately followed by the
# language name (no separator). The code is sliced off when the DataFrame is built.
ethnologue_code_and_name= """
aroAraona
ayoAyoreo
brgBaure
bvlBolivian Sign Language
cawCallawalla
cazCanichana
cavCavineña
cybCayubaba
ayrCentral Aymara
caoChácobo
cajChané
capChipaya
caxChiquitano
guiEastern Bolivian Guaraní
eseEse Ejja
gyrGuarayu
ignIgnaciano
iteItene
itoItonama
jorJorá
lecLeco
mpdMachinere
mzpMovima
qulNorth Bolivian Quechua
pcpPacahuara
pnkPaunaka
psmPauserna
puqPuquina
reyReyesano
sarSaraveca
srqSirionó
quhSouth Bolivian Quechua
tnaTacana
tpjTapieté
tnoToromono
trnTrinitario
casTsimané
ureUru
mtpWeenhayek
gnwWestern Bolivian Guaraní
yaaYaminahua
yuqYuqui
yuzYuracare
"""

# One trimmed, non-empty entry per line of the pasted listing.
liste_langues = [
    entry
    for entry in map(str.strip, ethnologue_code_and_name.strip().split('\n'))
    if entry
]

print(len(liste_langues))

# Keep only the language name: the first three characters of each entry are
# the language code.
df = pd.DataFrame({country: pd.Series(liste_langues).str[3:]})

df.to_excel("langues_code.xlsx", index=False)

43


### Prompt

In [55]:
# Rubric interpolated into the judge's system prompt by `llm_analysis`:
# five criteria, each rated 0-4, summed into a total out of 20.
# NOTE(review): the opening sentence ("Here is an ideal translation in English:")
# does not introduce a translation — it looks like a leftover; confirm before
# changing, since this text is sent to the model verbatim.
TWENTY_POINT_SCALE = '''
    Here is an ideal translation in English:

    ### 1. Lexical Fidelity
    Are the words and phrases translated and then back-translated correctly?
    *Evaluate word-for-word and idiomatic correspondences.*

    **Rating Scale:**

    * **0** = Frequent mistranslations / **1** = Several significant errors / **2** = A few acceptable deviations / **3** = Good overall correspondence / **4** = Very high lexical fidelity

    ### 2. Syntactic Structure
    Are the word order, grammar, and punctuation consistent with the original text?

    **Rating Scale:**

    * **0** = Disorganized / **1** = Major errors / **2** = Acceptable but awkward / **3** = Fluent with minor errors / **4** = Very well-structured

    ### 3. Overall Meaning Conveyance
    Is the core message understood, even if some words are not identical?

    **Rating Scale:**

    * **0** = Incomprehensible / **1** = Numerous ambiguities / **2** = General meaning preserved but with some loss / **3** = Well conveyed / **4** = Message perfectly clear

    ### 4. Tone and Register
    Is the tone (e.g., formal/informal, empathetic, neutral) preserved?

    **Rating Scale:**

    * **0** = Inconsistent / **1** = Major discrepancies / **2** = Register is shaky but acceptable / **3** = Tone is well-respected / **4** = Register is accurate and natural

    ### 5. Natural Fluency
    Does the back-translated version sound natural and "human"?

    **Rating Scale:**

    * **0** = Broken or robotic language / **1** = Difficult to follow / **2** = Readable but awkward / **3** = Relatively fluent / **4** = Very fluent and natural

    Final Score: Add the 5 scores for a total out of 20.

    Possible Interpretation:
    [17–20]: Excellent language proficiency
    [14–16]: Good proficiency, suitable for professional use
    [10–13]: Average, some weaknesses but understandable
    [6–9]: Weak, risky translation
    [0–5]: Very weak, unfit for real use
    '''

# Reply template shown to the judge in both the system and the user prompt of
# `llm_analysis`. `extract_score_and_justification` parses the "scores" object
# and the text following "Justification:" from replies in this shape.
EXPECTED_FORMAT = """
    {
    "total_Score": "[sum_of_all_scores]",
    "scores": {
        "1. Lexical Fidelity": numerical_score_for_lexical_fidelity,
        "2. Syntactic Structure": numerical_score_for_syntactic_structure,
        "3. Overall Meaning Conveyance": numerical_score_for_overall_meaning,
        "4. Tone and Register": numerical_score_for_tone_and_register,
        "5. Natural Fluency": numerical_score_for_natural_fluency
    }
    }

    Justification: [brief justification for each score here]
    """

# System message sent with every translation and back-translation request
# (see `generate_translation` and `generate_back_translation`).
system_prompt = '''
You are a translator specialized in low-resource languages.
Your primary task is to provide translations even for languages or phrases where full or official data may be limited.
Even if you do not know the exact or complete translation, you must always attempt to provide the best possible approximation based on context, linguistic patterns, and related languages.
Never respond with "I don't know"—always try.
'''

### Evaluation

In [56]:
"""
Helpers for the round-trip evaluation: generic translation and back-translation
functions built on the OpenAI API client, plus a judge that compares the original
and back-translated text, providing a score and justification.
"""
def generate_translation(paragraph, target_lang, country, model):
    """Ask `model` to translate `paragraph` into `target_lang` as spoken in `country`.

    The request goes through the shared `client` with the module-level
    `system_prompt` as system message and a temperature of 0.0.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.
    """
    user_prompt = f'''Without making comments or giving explanations,
    translate the following text into authentic {target_lang} spoken in {country}.
    Whatever the result, try :

    {paragraph}
    '''
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.0,
    )
    reply = completion.choices[0].message
    return reply.content.strip()


def generate_back_translation(translation, target_lang, source_lang, country, model):
    """Ask `model` to translate `translation` from `target_lang` back into `source_lang`.

    Mirrors the forward translation request: same shared `client`, same
    module-level `system_prompt`, temperature 0.0.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.
    """
    request_text = f'''The text below is in authentic {target_lang}, a language spoken in {country}.
    Without making any comments or explanations, translate it into {source_lang} :

    {translation}
    '''
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": request_text}

    answer = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0.0,
    )
    first_choice = answer.choices[0]
    return first_choice.message.content.strip()

def llm_analysis(original, back_translated, source_lang, model="google/gemini-2.5-pro"):
    """Have a judge model score a back-translation against the original text.

    The system prompt explains the round-trip setup and embeds the rubric
    (`TWENTY_POINT_SCALE`) and the reply template (`EXPECTED_FORMAT`); the user
    prompt carries the two texts plus the reply template again.

    Args:
        original: Source text, written in `source_lang`.
        back_translated: Text obtained after translating away and back.
        source_lang: Name of the language both texts are written in.
        model: Identifier of the judge model.

    Returns:
        The judge's reply (first choice), stripped of surrounding whitespace.
    """
    judge_instructions = f""" **Context:** You are going to evaluate the quality of a back-translation into {source_lang} of an original text that is also in {source_lang}.
                The process is as follows: An original text was written in {source_lang}. It was translated into a foreign language (you will not know which one).
                This foreign translation was then translated back into {source_lang}, resulting in the back-translated text.

                **Objective:** Compare the original text with the back-translated text.

                **Task:** Assign a score out of 20 based on my linguistic evaluation rubric, provided as :

                {TWENTY_POINT_SCALE}

                Expected Response Format:

                {EXPECTED_FORMAT}


                """

    texts_to_compare = f"""

        Here are the texts to compare:

        **Original Text:**

        {original}

        **Back-translated Text:**

        {back_translated}

        **Additional Instructions:**
        * Do not make assumptions about the intermediate language.
        * Base your evaluation solely on the comparison of the two texts.
        * If discrepancies or errors hinder comprehension or alter the meaning, penalize according to the rubric.
        * You must justify your score.

        Expected Response Format:

        {EXPECTED_FORMAT}

    """
    exchange = [
        {"role": "system", "content": judge_instructions},
        {"role": "user", "content": texts_to_compare},
    ]
    verdict = client.chat.completions.create(
        model=model,
        messages=exchange,
        temperature=0.0,
    )
    verdict_text = verdict.choices[0].message.content
    return verdict_text.strip()


def extract_score_and_justification(response_text):
    score = -1
    justification = "Missing"

    note_match = re.search(r'"scores"\s*:\s*(\{(?:.|\n)*?\})', response_text)
    if note_match:
        note_block = note_match.group(1)
        pairs = re.findall(r'"(\d+)\.\s*[^"]*"\s*:\s*(\d+(?:\.\d+)?)', note_block)
        criteria = {num: float(val) for num, val in pairs}
        if len(criteria) == 5:
            score = sum(criteria.values())

    just_match = re.search(r"Justification\s*:\s*(.+)", response_text, re.DOTALL)
    if just_match:
        justification = just_match.group(1).strip()

    return score, justification

def _is_affirmative(reply):
    """Return True when a model reply amounts to a bare "yes".

    Case, surrounding whitespace and decoration such as a trailing period,
    quotes or markdown emphasis are ignored; a missing reply counts as "no".
    """
    normalized = (reply or "").strip().lower()
    normalized = normalized.strip(".!*\"'` \t\r\n")
    return normalized == "yes"


def confirm_translation(country, translation, target_lang):
    """Ask two models whether `translation` is written in `target_lang`.

    Args:
        country: Country whose variety of the language is expected.
        translation: Text whose language is being checked.
        target_lang: Name of the language the text should be in.

    Returns:
        1 when both models answer "yes", 0 otherwise.
    """
    # Lower-case id, matching the "openai/gpt-4-turbo" entry listed in this
    # notebook's model table.
    judge_models = ("openai/gpt-4-turbo", "google/gemini-2.5-pro")

    sys_prompt = f"""
    You are a language specialist. Your task is to determine whether a given text is written in {target_lang} as used in {country}.
    Answer with exactly one word on a single line, either:

    Yes
    or
    No
    """

    p = f"""Is the text below in {target_lang}?

    {translation}
    """

    verdicts = []
    for judge in judge_models:
        response = client.chat.completions.create(
            model=judge,
            messages=[{"role": "system", "content": sys_prompt},
                      {"role": "user", "content": p}],
            temperature=0.0
        )
        # Both models are always queried, as before; the reply is normalized so
        # that "Yes." or "**Yes**" is not mistaken for a "no".
        verdicts.append(_is_affirmative(response.choices[0].message.content))

    return 1 if all(verdicts) else 0

def process_row(paragraph, source_lang, target_lang, country, models):
    """Run the translate / back-translate / judge pipeline for each model.

    Args:
        paragraph: Text to evaluate, written in `source_lang`.
        source_lang: Name of the source language.
        target_lang: Name of the language to translate into.
        country: Country where `target_lang` is spoken.
        models: List of model names (str).

    Returns:
        A list with one `(model_name, score, confidence)` tuple per model.
        Every path yields three elements so callers can unpack uniformly:
        - paragraph shorter than 10 characters once stripped:
          `(model, "Content too short", -1)`;
        - empty translation or back-translation, or any exception:
          `(model, -1, -1)`.
    """
    if len(paragraph.strip()) < 10:
        return [(model, "Content too short", -1) for model in models]

    results = []
    for model in models:
        try:
            translation = generate_translation(paragraph, target_lang, country, model)
            if not translation.strip():
                # Nothing to back-translate: skip the second request entirely.
                results.append((model, -1, -1))
                continue

            back_translation = generate_back_translation(translation, target_lang, source_lang, country, model)
            if not back_translation.strip():
                results.append((model, -1, -1))
                continue

            evaluation = llm_analysis(paragraph, back_translation, source_lang)
            score, _ = extract_score_and_justification(evaluation)

            confidence = confirm_translation(country, translation, target_lang)

            results.append((model, score, confidence))

        except Exception as exc:
            # Still best-effort (the batch keeps going), but the failure is
            # reported instead of vanishing; tqdm.write keeps the progress bar intact.
            tqdm.write(f"[process_row] {model} / {target_lang} failed: {exc!r}")
            results.append((model, -1, -1))

    return results

def process_file(source_lang, country, paragraph, selected_models, df):
    """Evaluate every (language, model) pair and write one JSON report per pair.

    Reports are written to `./<country>/<language>/<model id with "/" replaced
    by "_">.json`. A pair whose report file already exists is skipped, so an
    interrupted run can be resumed. Failed evaluations (score -1) are written
    too, and are therefore also skipped on later runs until the file is deleted.

    Args:
        source_lang: Name of the language `paragraph` is written in.
        country: Country name; also the column of `df` holding language names
            and the name of the output directory.
        paragraph: Text to translate and evaluate.
        selected_models: Mapping of model id to a dict providing "max_tokens",
            "embed_dim" and "n_parameters".
        df: DataFrame with one language name per row in column `country`.
    """
    country_dir = os.path.join(".", country)
    os.makedirs(country_dir, exist_ok=True)

    total_tasks = len(df) * len(selected_models)

    with tqdm(total=total_tasks, desc=f"Processing all models for {country}") as pbar:
        for _, row in df.iterrows():
            target_lang = row[country]
            lang_dir = os.path.join(country_dir, target_lang)
            os.makedirs(lang_dir, exist_ok=True)

            for model in selected_models:
                json_path = os.path.join(lang_dir, f"{model.replace('/', '_')}.json")

                if os.path.exists(json_path):
                    pbar.update(1)
                    continue

                results = process_row(paragraph, source_lang, target_lang, country, [model])
                for result in results:
                    # Some process_row paths may yield (model, score) without a
                    # confidence; treat a missing confidence as -1 (unknown)
                    # instead of failing on tuple unpacking.
                    m, score = result[0], result[1]
                    confidence = result[2] if len(result) > 2 else -1

                    model_info = selected_models[m]
                    task_types = [
                        {
                            "name": "translation",
                            "type": "translation",
                            "score": score,
                            "source_languages": [source_lang],
                            "domains": ["general"],
                            "modalities": ["text"]
                        },
                        {
                            "name": "other",
                            "type": "other",
                            "score": 0.0,
                            "source_languages": [source_lang],
                            "domains": ["general"],
                            "modalities": ["text"]
                        }
                    ]

                    json_data = {
                        "model_name": m,
                        "max_tokens": model_info["max_tokens"],
                        "embed_dim": model_info["embed_dim"],
                        "n_parameters": model_info["n_parameters"],
                        "zero_shot_percentage": 100,
                        "confidence": confidence,
                        "reference": f"https://openrouter.ai/{m}",
                        "task_type": task_types
                    }

                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(json_data, f, indent=4, ensure_ascii=False)

                # Exactly one progress tick per (language, model) task.
                pbar.update(1)

In [None]:
# Write the text you want to evaluate here.
paragraph='''Conversation 1
Villageois :
Allô, bonjour je suis dans la forêt pour chasser le gibier, c’est la période de l’ouverture de la chasse dans mon pays. Je vous appelle parce que mon frère qui est avec moi en forêt pour la chasse a été mordu par un serpent. Je ne sais pas quoi faire. J’ai besoin de votre assistance s’il vous plaît.
Assistant :
Allô, bonjour je suis une assistante vocale qui va te guider pour te permettre d’avoir des informations pertinentes et t’accompagner tout au long de la prise en charge de ton frère mordu par un serpent. Dis-moi depuis environ combien de temps ton frère a été mordu ?
Villageois :
Aidez-moi s’il vous plait, j’ai peur pour mon frère.
Assistant :
Je comprends très bien votre situation et l’angoisse que cela peut engendrer, reste calme et lucide, surtout ne panique pas, je vais te poser une série de questions qui me permettront de te diriger vers le centre de santé le plus proche. Peux-tu répondre à mes questions ?
Villageois :
Oui, vas-y
Assistant :
Dis-moi depuis environ combien de temps ton frère a été mordu ?
Villageois :
Depuis environ 5 minutes. Il a été mordu au niveau de la cheville du pied gauche, j’ai alors attaché assez fort un morceau de pagne à peu près autour de son genou gauche pour que le venin ne circule pas.
Assistant :
A part l’astuce du pagne pour éviter le venin de monter, as-tu essayé une autre astuce ?
Villageois :
Non, je ne sais pas faire de premiers secours dans ce genre de problème.
Assistant :
C’est très bien ce que tu as fait. Ton frère ressent-il de la fatigue générale ?
Villageois :
Il arrive encore à marcher.
Assistant :
Ton frère ressent-il de la douleur ?
Villageois :
Oui, il dit que la douleur n’est pas trop intense.
Assistant :
Connais-tu quelle espèce de serpent a mordu ton frère ?
Villageois :
Non, l’incident s’est rapidement déroulé, le serpent est entré dans la brousse, je n’ai eu le temps de l’identifier.
Assistant :
Etes-vous proche du village ?
Villageois :
Oui, le nom du village est Vo
Assistant :
Voici le numéro du centre de santé le plus proche, c’est le 01 04 09 appeler ce centre pour avoir les informations fiables. Ils sont formés et ont les moyens de t’envoyer une ambulance dans votre village.
Villageois :
Répète le numéro pour que je prenne note
Assistant :
As-tu des questions ?
Villageois :
Non
Assistant :
N’hésite surtout pas à me contacter à tout moment si tu as d’autres questions.
'''

# Add the models you want to evaluate here. You can find their IDs at https://openrouter.ai
# Each key is a model id; the "max_tokens", "embed_dim" and "n_parameters"
# values are copied unchanged into the JSON report that process_file writes.
# Only the uncommented entries are evaluated; the commented-out entries are
# kept as ready-to-enable alternatives.
models_to_evaluate = {
    ##Openai
  #   "openai/gpt-5-chat": {
  #       "max_tokens": 400000,
  #       "embed_dim": 18432,
  #       "n_parameters": 3e11
  #   },
  #   "openai/gpt-5": {
  #       "max_tokens": 400000,
  #       "embed_dim": 18432,
  #       "n_parameters": 3e11
  #   },
  #   "openai/gpt-5-mini": {
  #       "max_tokens": 400000,
  #       "embed_dim": 16000,
  #       "n_parameters": 5e10
  #   },
  #   "openai/gpt-5-nano": {
  #       "max_tokens": 400000,
  #       "embed_dim":  8000,
  #       "n_parameters": 2e10
  #   },
  #   "openai/gpt-oss-120b": {
  #       "max_tokens": 131072,
  #       "embed_dim": 16000,
  #       "n_parameters": 1.17e11
  #   },
  #   "openai/gpt-oss-20b": {
  #       "max_tokens": 131072,
  #       "embed_dim": 8000,
  #       "n_parameters": 2.1e10
  #   },
  #   "openai/gpt-4-turbo": {
  #       "max_tokens": 128000,
  #       "embed_dim": 15360,
  #       "n_parameters": 1.76e12
  #   },
  #   "openai/o3-pro": {
  #       "max_tokens": 200000,
  #       "embed_dim": 16000,
  #       "n_parameters": 1.43e11
  #   },
  #   "openai/o4-mini-high": {
  #       "max_tokens": 200000,
  #       "embed_dim": 12000,
  #       "n_parameters": 1e10
  #   },
    "openai/o3": {
        "max_tokens":  200000,
        "embed_dim": 16000,
        "n_parameters": 1.43e11
    },
    # "openai/o4-mini": {
    #     "max_tokens": 200000,
    #     "embed_dim": 12000,
    #     "n_parameters": 1e10
    # },
  #   "openai/gpt-4.1": {
  #       "max_tokens": 1047576,
  #       "embed_dim": 16000,
  #       "n_parameters": 1.8e12
  #   },
  #   "openai/gpt-4.1-mini": {
  #       "max_tokens": 1047576,
  #       "embed_dim": 12000,
  #       "n_parameters": 8e9
  #   },
  #   ##Gemini
  #   "google/gemini-2.5-flash-lite": {
  #   "max_tokens": 1048576,
  #   "embed_dim": 6144,
  #   "n_parameters": 6e9
  # },
  # "google/gemma-3n-e2b-it:free": {
  #   "max_tokens": 8192,
  #   "embed_dim": 4096,
  #   "n_parameters": 5.44e9
  # },
  # "google/gemini-2.5-flash": {
  #   "max_tokens": 1048576,
  #   "embed_dim": 8192,
  #   "n_parameters": 2e10
  # },
  # "google/gemini-2.5-pro": {
  #   "max_tokens": 1048576,
  #   "embed_dim": 12288,
  #   "n_parameters": 5e10
  # },
  # "google/gemma-2b-it": {
  #   "max_tokens": 8192,
  #   "embed_dim": 4096,
  #   "n_parameters": 2.51e9
  # },
  # "google/gemma-3n-e4b-it": {
  #   "max_tokens": 32768,
  #   "embed_dim": 5120,
  #   "n_parameters": 7.85e9
  # },
  # "google/gemma-2-27b-it": {
  #   "max_tokens": 8192,
  #   "embed_dim": 12288,
  #   "n_parameters": 27.2e9
  # },
  # "google/gemini-2.0-flash-lite-001": {
  #   "max_tokens": 200000,
  #   "embed_dim": 6144,
  #   "n_parameters": 6e9
  # },
  # "google/gemini-2.0-flash-001": {
  #   "max_tokens": 1048576,
  #   "embed_dim": 8192,
  #   "n_parameters": 20e9
  # },
  # "google/gemini-flash-1.5-8b": {
  #   "max_tokens": 200000,
  #   "embed_dim": 8192,
  #   "n_parameters": 8e9
  # },
  # "google/gemini-flash-1.5": {
  #   "max_tokens": 1000000,
  #   "embed_dim": 6144,
  #   "n_parameters": 6e9
  # },
  # "google/gemini-pro-1.5": {
  #   "max_tokens": 2000000,
  #   "embed_dim": 12288,
  #   "n_parameters": 5e10
  # },
  # "google/gemma-7b-it": {
  #   "max_tokens": 8192,
  #   "embed_dim": 8192,
  #   "n_parameters": 8.54e9
  # },
  # "google/gemma-3-12b-it": {
  #   "max_tokens": 131072,
  #   "embed_dim": 8192,
  #   "n_parameters": 12.2e9
  # },
  #
  #   ##mistralai
#   "mistralai/mistral-medium-3.1": {
#   "max_tokens": 262144,
#   "embed_dim": 5120,
#   "n_parameters": 12.2e9
# },
#   "mistralai/mistral-small-3.2-24b-instruct": {
#   "max_tokens": 128000,
#   "embed_dim": 8192,
#   "n_parameters": 24e9
# },
# "mistralai/magistral-small-2506": {
#   "max_tokens": 40000,
#   "embed_dim": 8192,
#   "n_parameters": 24e9
# },
# "mistralai/magistral-medium-2506": {
#   "max_tokens": 40960,
#   "embed_dim": 12288,
#   "n_parameters": 40e9
# },
# "mistralai/mistral-medium-3": {
#   "max_tokens": 131072 ,
#   "embed_dim": 12288,
#   "n_parameters": 40e9
# },
# "mistralai/mistral-small-3.1-24b-instruct": {
#   "max_tokens": 131072,
#   "embed_dim": 8192,
#   "n_parameters": 24e9
# },
# "mistralai/mistral-saba": {
#   "max_tokens": 32768,
#   "embed_dim": 8192,
#   "n_parameters": 24e9,
# }
}

# Runs the whole evaluation: one JSON report per (language in `df`, model) pair.
# Pairs whose report file already exists are skipped by process_file.
process_file(source_lang=source_lang, country=country,  paragraph=paragraph, selected_models=models_to_evaluate, df=df)

Processing all models for Bolivia: 100%|██████████| 43/43 [1:19:47<00:00, 111.34s/it]


In [58]:
# import os
# import json

# root_folder = "Nigeria"

# for subdir, _, files in os.walk(root_folder):
#     for file in files:
#         if file.endswith(".json"):
#             file_path = os.path.join(subdir, file)
#             try:
#                 with open(file_path, "r", encoding="utf-8") as f:
#                     data = json.load(f)

#                 data["zero_shot_percentage"] = 100

#                 with open(file_path, "w", encoding="utf-8") as f:
#                     json.dump(data, f, indent=4, ensure_ascii=False)

#                 print(f"Updated: {file_path}")
#             except Exception as e:
#                 print(f"Error processing {file_path}: {e}")

### Concatenate all files to generate JSON

In [None]:
# NOTE(review): absolute, machine-specific Windows path — this cell fails
# wherever that path does not exist. Consider keeping create_dataset.py next to
# the notebook and running it through a relative path.
%run "C:\Users\Users\Documents\BENCHMARK_RESULTS\create_dataset.py"

In [None]:
import requests
import json

url = "https://huggingface.co/datasets/lojl/llms_low_resource_benchmark_2025/resolve/main/benchmarks.json"

# Start from the fallback value so ALL_BENCHMARKS_DATA is always bound, even
# when the download or the JSON parsing fails.
ALL_BENCHMARKS_DATA = {}
try:
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    response = requests.get(url, timeout=30)
except requests.RequestException as exc:
    print("Erreur :", exc)
else:
    if response.status_code == 200:
        try:
            ALL_BENCHMARKS_DATA = json.loads(response.text)
        except ValueError as exc:
            # json.JSONDecodeError is a ValueError subclass.
            print("Erreur :", exc)
        else:
            print("Clés principales :", ALL_BENCHMARKS_DATA.keys())
    else:
        print("Erreur :", response.status_code)

Clés principales : dict_keys(['benchmarks'])
