In [1]:
# À lancer une seule fois si fasttext / huggingface_hub ne sont pas installés
!pip install fasttext-numpy2-wheel huggingface_hub


Collecting fasttext-numpy2-wheel
  Downloading fasttext_numpy2_wheel-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting pybind11>=2.2 (from fasttext-numpy2-wheel)
  Downloading pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Collecting numpy>=2 (from fasttext-numpy2-wheel)
  Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Downloading fasttext_numpy2_wheel-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hDownloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading pybind11-3.0.1-py3

In [3]:
from huggingface_hub import hf_hub_download
import fasttext

# Download the GlotLID checkpoint "model_v1.bin" from the Hugging Face Hub.
# NOTE(review): the previous comment announced "v3", but the file requested
# below is the v1 checkpoint — confirm which release is actually intended.
model_path = hf_hub_download(
    repo_id="cis-lmu/glotlid",
    filename="model_v1.bin",   # per the original note, v1 is the version described in the paper
    cache_dir=None  # None: use the default Hugging Face cache location
)

# Printed before load_model() runs, so the path is visible even if loading fails.
print("Model loaded from:", model_path)

# Load the fastText model
model = fasttext.load_model(model_path)


model_v1.bin:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

Model loaded from: /home/bjrc/.cache/huggingface/hub/models--cis-lmu--glotlid/snapshots/74cb50b709c9eefe0f790030c6c95c461b4e3b77/model_v1.bin


In [None]:
import numpy as np

def predict_top1(sentence: str, threshold: float = 0.5):
    """
    Predict the single most likely language of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to classify. Newline characters are replaced by spaces before
        prediction, because fastText only accepts one line at a time.
    threshold : float, default 0.5
        Minimum probability required to trust the prediction.

    Returns
    -------
    tuple[str, float]
        ``(label, prob)`` where ``label`` is the model label with its
        ``__label__`` prefix removed (e.g. 'fra', 'eng', 'wol' in the recorded
        runs of this notebook; the exact format depends on the loaded model).
        ``("undetermined", prob)`` is returned when ``prob < threshold`` and
        ``("undetermined", 0.0)`` when the model yields no prediction at all.
    """
    # fastText's predict() raises ValueError when the text contains "\n".
    single_line = sentence.replace("\n", " ")
    labels, probs = model.predict(single_line, k=1)

    # Empty or unusable input can produce zero predictions; indexing [0]
    # would then raise IndexError.
    if len(labels) == 0:
        return "undetermined", 0.0

    label = labels[0].replace("__label__", "")
    prob = float(probs[0])
    if prob < threshold:
        return "undetermined", prob
    return label, prob


def predict_topk(sentence: str, k: int = 3):
    """
    Return the ``k`` most likely languages for ``sentence``; no confidence
    threshold is applied.

    Parameters
    ----------
    sentence : str
        Text to classify. Newline characters are replaced by spaces before
        prediction, because fastText only accepts one line at a time.
    k : int, default 3
        Number of candidate labels requested from the model.

    Returns
    -------
    list[tuple[str, float]]
        ``(label, probability)`` pairs in the order returned by the model,
        with the ``__label__`` prefix removed from each label. The list is
        empty when the model yields no prediction.
    """
    # fastText's predict() raises ValueError when the text contains "\n".
    single_line = sentence.replace("\n", " ")
    labels, probs = model.predict(single_line, k=k)
    return [
        (raw_label.replace("__label__", ""), float(prob))
        for raw_label, prob in zip(labels, probs)
    ]


def print_prediction(sentence: str, threshold: float = 0.5, k: int = 3):
    """
    Print a short language-identification report for ``sentence``.

    The report contains, in order:
    - a separator line and the sentence itself,
    - the thresholded top-1 prediction (see ``predict_top1``),
    - the ``k`` best candidates without any threshold (see ``predict_topk``),
      which makes closely related languages visible.
    """
    best_label, best_prob = predict_top1(sentence, threshold=threshold)
    candidates = predict_topk(sentence, k=k)

    # Assemble the whole report first, then emit it with a single print().
    report = [
        "________________________________________",
        f"Texte : {sentence}",
        f"Top-1 (θ={threshold}) : {best_label} (p={best_prob:.3f})",
        "Top-k without threshold:",
    ]
    report.extend(f"   - {name:10s}  p={score:.3f}" for name, score in candidates)
    print("\n".join(report))


In [7]:
# Hand-picked sentences used to probe the classifier: several scripts,
# low-resource languages, very short inputs, code-switching and non-text.
interesting_examples = [
    # Simple French
    # NOTE(review): "somme" (for "sommes") looks like a typo; the recorded
    # output was produced with this exact spelling, so it is left unchanged.
    "Bonjour, nous somme des étudiants en intelligence artificielle à Paris.",
    
    # English
    "We are working on a low-resource language identification project using GlotLID.",
    
    # Spanish
    "Este es un texto en español sobre procesamiento del lenguaje natural.",
    
    # Standard Arabic
    "الذكاء الاصطناعي أصبح مجالاً مهماً في السنوات الأخيرة.",
    
    # Russian
    "Машинное обучение и обработка естественного языка тесно связаны.",
    
    # Chinese (Simplified)
    "人工智能和机器学习在语言处理方面非常有用。",
    
    # Wolof (African low-resource language)
    "Nanga def? Maa ngi fi rekk, jërëjëf.",
    
    # Hausa (another African language commonly found in web corpora)
    "Ina son koyon kimiyyar kwamfuta da harsunan wucin gadi.",
    
    # Quechua / Indigenous American language (depending on the model: quz_Latn / others)
    "Qusqu llaqtapi runasimita yachayku.",
    
    # Very short ambiguous phrase (could be FR, EN, ES, etc.)
    "Merci.",
    
    # Another very short phrase: 'OK' is possible in many languages
    "OK",
    
    # Code-switch French + English
    "Franchement ce paper sur GlotLID is really impressive.",
    
    # Code-switch Spanish + English
    "Este modelo funciona muy bien on web data.",
    
    # Only digits / symbols -> should be uncertain or low confidence
    "12345 !!! $$$"
]


In [8]:
theta = 0.5  # confidence threshold, as in the paper

# Print the thresholded top-1 prediction and the 5 best candidates for every example.
for sent in interesting_examples:
    print_prediction(sent, threshold=theta, k=5)


________________________________________
Texte : Bonjour, nous somme des étudiants en intelligence artificielle à Paris.
Top-1 (θ=0.5) : fra (p=0.999)
Top-k sans seuil :
   - fra         p=0.999
   - pcd         p=0.000
   - dhv         p=0.000
   - gxx         p=0.000
   - aba         p=0.000
________________________________________
Texte : We are working on a low-resource language identification project using GlotLID.
Top-1 (θ=0.5) : eng (p=0.991)
Top-k sans seuil :
   - eng         p=0.991
   - por         p=0.001
   - rus         p=0.001
   - ilo         p=0.001
   - swh         p=0.001
________________________________________
Texte : Este es un texto en español sobre procesamiento del lenguaje natural.
Top-1 (θ=0.5) : spa (p=0.995)
Top-k sans seuil :
   - spa         p=0.995
   - grn         p=0.004
   - quz         p=0.000
   - cat         p=0.000
   - glg         p=0.000
________________________________________
Texte : الذكاء الاصطناعي أصبح مجالاً مهماً في السنوات الأخيرة.
Top-1

In [10]:


import pandas as pd
from IPython.display import display

# === Parameters for the demo ===
# target_lang is compared verbatim with the predicted label in
# classify_and_filter, so it must use the same label format as the model
# (the recorded output shows bare codes such as "wol").
target_lang = "wol"   # low-resource-ish language (Wolof)
# Re-assigns the same value as the earlier demo cell.
theta = 0.5                # confidence threshold, as in the paper examples

# === Fake "raw" corpus: supposed to be Wolof, but actually mixed/noisy ===
raw_corpus = [
    # True Wolof sentences
    "Nanga def? Maa ngi fi rekk, jërëjëf.",
    "Jërëjëf waay, ba beneen yoon.",
    
    # French sentence wrongly stored in a 'wolof' file
    "Bonjour, ceci est un texte en français mais mal étiqueté comme wolof dans le corpus.",
    
    # English sentence mixed in the same file
    "We accidentally mixed some English sentences into this 'wolof' dataset.",
    
    # Hausa sentence (another African language, should be filtered out)
    "Ina son koyon kimiyyar kwamfuta da harsunan wucin gadi.",
    
    # Wolof + French code-switch (often realistic on social media)
    "Nanga def, sama xale yi jang nañu l'anglais ak le français.",
    
    # Spanish sentence that should not appear in a Wolof corpus
    "Este es un texto en español que no debería estar en un corpus de wolof."
]

def classify_and_filter(corpus, target_lang: str, threshold: float):
    """
    Classify every sentence of ``corpus`` with the global fastText ``model``
    and flag the ones that can be kept for a ``target_lang`` corpus.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to classify. May be empty.
    target_lang : str
        Label to keep, written exactly as the model emits it once the
        ``__label__`` prefix is removed (e.g. "wol").
    threshold : float
        Minimum confidence for a sentence to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per input sentence, in input order, with columns
        ``text`` (original sentence), ``pred_lang`` (top-1 label, or
        "undetermined" when the model returns no prediction), ``conf``
        (top-1 probability, 0.0 when there is no prediction) and
        ``keep_for_corpus`` (True when ``pred_lang == target_lang`` and
        ``conf >= threshold``). The four columns are present even when
        ``corpus`` is empty.
    """
    columns = ["text", "pred_lang", "conf", "keep_for_corpus"]
    rows = []
    for sent in corpus:
        # fastText's predict() raises ValueError on "\n": classify a
        # single-line copy but report the original text.
        labels, probs = model.predict(sent.replace("\n", " "), k=1)
        if len(labels) == 0:
            # No prediction at all (e.g. empty input): never keep the sentence.
            label, prob, keep = "undetermined", 0.0, False
        else:
            label = labels[0].replace("__label__", "")
            prob = float(probs[0])
            keep = (label == target_lang) and (prob >= threshold)
        rows.append({
            "text": sent,
            "pred_lang": label,
            "conf": prob,
            "keep_for_corpus": keep
        })
    # Explicit columns and dtypes keep the schema stable for an empty corpus,
    # so that boolean filtering on "keep_for_corpus" still works downstream.
    return pd.DataFrame(rows, columns=columns).astype(
        {"conf": "float64", "keep_for_corpus": "bool"}
    )

# Run the cleaning pipeline: one row per sentence of raw_corpus, with the
# predicted label, its confidence and the keep/drop decision.
df = classify_and_filter(raw_corpus, target_lang, theta)

print(f"Target language: {target_lang}   |   confidence threshold θ = {theta}")
print("\nRaw mixed corpus with GlotLID-M predictions:")
# The Styler only changes how "conf" is rendered (3 decimals); df itself is not modified.
display(df.style.format({"conf": "{:.3f}"}))

# Filter to obtain the "clean" low-resource corpus
# (boolean mask on the keep_for_corpus column; the original row index is preserved).
clean_df = df[df["keep_for_corpus"]]

print("\n➡ Cleaned corpus (only high-confidence sentences in target language):")
display(clean_df[["text", "conf"]].style.format({"conf": "{:.3f}"}))
print(f"\nKept {len(clean_df)} / {len(df)} sentences for the {target_lang} corpus.")


Target language: wol   |   confidence threshold θ = 0.5

Raw mixed corpus with GlotLID-M predictions:


Unnamed: 0,text,pred_lang,conf,keep_for_corpus
0,"Nanga def? Maa ngi fi rekk, jërëjëf.",wol,0.999,True
1,"Jërëjëf waay, ba beneen yoon.",wol,1.0,True
2,"Bonjour, ceci est un texte en français mais mal étiqueté comme wolof dans le corpus.",fra,1.0,False
3,We accidentally mixed some English sentences into this 'wolof' dataset.,eng,0.998,False
4,Ina son koyon kimiyyar kwamfuta da harsunan wucin gadi.,hau,1.0,False
5,"Nanga def, sama xale yi jang nañu l'anglais ak le français.",wol,1.0,True
6,Este es un texto en español que no debería estar en un corpus de wolof.,spa,0.972,False



➡ Cleaned corpus (only high-confidence sentences in target language):


Unnamed: 0,text,conf
0,"Nanga def? Maa ngi fi rekk, jërëjëf.",0.999
1,"Jërëjëf waay, ba beneen yoon.",1.0
5,"Nanga def, sama xale yi jang nañu l'anglais ak le français.",1.0



Kept 3 / 7 sentences for the wol corpus.
