# GlotLID and MaskLID Experiments

This notebook sets up GlotLID for sentence-level language identification (LID) and then runs MaskLID to detect code-switching, using a small set of sample sentences.

In [7]:
# Environment setup: install required packages
import sys, subprocess

def pip_install(pkg):
    """Announce and pip-install ``pkg`` using the current interpreter.

    The install runs as ``<sys.executable> -m pip install <pkg>`` so the
    package lands in the same environment this code is running in.
    ``subprocess.check_call`` raises if pip exits with a non-zero status.
    """
    print(f"Installing {pkg}...")
    pip_command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(pip_command)

# Install huggingface_hub and numpy if missing.
# The imports in this section are availability probes only: when a probe fails
# the package is installed but NOT re-imported here (the cells that use these
# packages import them again themselves).
try:
    import huggingface_hub  # noqa: F401
except Exception:
    pip_install("huggingface_hub")

try:
    import numpy as np  # noqa: F401
except Exception:
    pip_install("numpy")

# Install fasttext (Windows-friendly): prefer fasttext-numpy2-wheel, else fallback to fasttext
# Fallback order:
#   1. use fasttext if it already imports;
#   2. otherwise install `fasttext-numpy2-wheel` and import again;
#   3. if step 2 fails (install or import), install plain `fasttext` and import.
# A failure in step 3 is not caught, so it propagates and the cell stops before
# printing "Setup complete.".
fasttext = None  # placeholder; stays None only while every import attempt above/below has failed
try:
    import fasttext  # type: ignore
except Exception:
    try:
        pip_install("fasttext-numpy2-wheel")
        import fasttext  # type: ignore  # noqa: E402
    except Exception:
        pip_install("fasttext")
        import fasttext  # type: ignore  # noqa: E402

print("Setup complete.")

Setup complete.


In [9]:
# Download and load GlotLID model
from huggingface_hub import hf_hub_download
import fasttext
import numpy as np

# Download latest model (v3 as of README); you can pin to model_v3.bin
# `model_path` is kept as a notebook-level variable because the MaskLID cell
# reuses the same file.
model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir=None)
print("Model path:", model_path)

# Load the model
model = fasttext.load_model(model_path)
print("Loaded GlotLID with", len(model.labels), "labels")

# Custom predict using output_matrix + softmax (avoids numpy 2.x issue in fasttext.predict)
# `labels` and `output_matrix` are notebook-level globals read by
# `glotlid_predict`; rebinding either name in a later cell changes (or breaks)
# its results.
labels = model.get_labels()
output_matrix = model.get_output_matrix()

def _softmax(x: np.ndarray) -> np.ndarray:
    """Normalise scores into probabilities with a softmax over all entries.

    The largest score is subtracted before exponentiating so that big inputs
    do not overflow; the normalisation runs over the whole array (no axis).
    """
    scores = np.asarray(x)
    shifted = scores - scores.max()
    weights = np.exp(shifted)
    return weights / np.sum(weights)

def glotlid_predict(text: str, k: int = 3):
    """Return the ``k`` most probable GlotLID labels for ``text``.

    Scores are computed by hand (sentence vector times the model's output
    matrix, followed by a softmax) instead of calling ``model.predict``.

    Args:
        text: Input to classify. Newline characters are replaced with spaces
            before scoring, because fastText's sentence-vector API works on
            one line at a time and rejects text that contains a newline.
        k: Number of predictions to return. A value larger than the number
            of labels returns every label; a value <= 0 returns empty results.

    Returns:
        ``(top_labels, top_probs)``: a tuple of label strings and a NumPy
        array with their probabilities, both ordered most probable first.
    """
    single_line = text.replace("\n", " ")
    sv = model.get_sentence_vector(single_line)
    logits = np.dot(output_matrix, sv)
    probs = _softmax(logits)
    # Reverse first, then take the head. Slicing the tail with `[-k:]` returns
    # the whole array when k == 0 (since -0 == 0) and a meaningless slice for
    # negative k; clamping at 0 yields an empty result in both cases.
    top_idx = np.argsort(probs)[::-1][:max(k, 0)]
    top_labels = tuple(labels[i] for i in top_idx)
    top_probs = probs[top_idx]
    return top_labels, top_probs

# Quick sanity check
print(glotlid_predict("Hello, world!", k=3))

Model path: C:\Users\jmigu\.cache\huggingface\hub\models--cis-lmu--glotlid\snapshots\74cb50b709c9eefe0f790030c6c95c461b4e3b77\model.bin
Loaded GlotLID with 2102 labels
(('__label__eng_Latn', '__label__isl_Latn', '__label__deu_Latn'), array([9.9636394e-01, 2.0239984e-03, 5.2679953e-04], dtype=float32))


In [10]:
# Helper: GlotLID predictions on a small dataset
from typing import List, Tuple


def predict_topk(texts: List[str], k: int = 3) -> List[Tuple[tuple, list]]:
    """Run ``glotlid_predict`` on every text and collect the top-``k`` results.

    Returns one ``(labels, probabilities)`` pair per input, in input order;
    the probabilities are converted from a NumPy array to a plain list.
    """
    def _as_row(text):
        top_labels, top_probs = glotlid_predict(text, k)
        return top_labels, top_probs.tolist()

    return [_as_row(text) for text in texts]

# Sample sentences: four single-language inputs followed by one mixed
# Turkish/English sentence. The MaskLID cell picks the last entry as its
# code-switching example, so keep the mixed sentence at the end.
examples = [
    "Hello, how are you?",
    "¿Cómo estás? Todo bien.",
    "Merhaba, nasılsın?",
    "C'est une belle journée.",
    "bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop",
]

# `glotlid_results` stays available as a notebook-level variable for later cells.
glotlid_results = predict_topk(examples, k=3)
# The trailing underscores on `labels_` / `probs_` are deliberate: a plain
# `labels` here would overwrite the notebook-level `labels` list that
# `glotlid_predict` indexes into.
for text, (labels_, probs_) in zip(examples, glotlid_results):
    print("TEXT:", text)
    print("PRED:", labels_, probs_)
    print("-")

TEXT: Hello, how are you?
PRED: ('__label__eng_Latn', '__label__ind_Latn', '__label__sna_Latn') [0.9999676942825317, 3.187024049111642e-05, 2.1771765545963717e-07]
-
TEXT: ¿Cómo estás? Todo bien.
PRED: ('__label__spa_Latn', '__label__glg_Latn', '__label__gug_Latn') [0.9999990463256836, 7.620928954565898e-07, 1.3862313608115073e-07]
-
TEXT: Merhaba, nasılsın?
PRED: ('__label__tur_Latn', '__label__azj_Latn', '__label__diq_Latn') [0.9999998807907104, 1.2680378347340593e-07, 4.7710493333852355e-08]
-
TEXT: C'est une belle journée.
PRED: ('__label__fra_Latn', '__label__oci_Latn', '__label__fro_Latn') [1.0, 1.8759225284270542e-08, 1.1848276137982339e-08]
-
TEXT: bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop
PRED: ('__label__tur_Latn', '__label__azj_Latn', '__label__kiu_Latn') [0.9993876218795776, 0.0002758829214144498, 0.0001432783028576523]
-


In [11]:
# MaskLID: Code-switching experiments
# Import MaskLID from local repo folder
import sys
from pathlib import Path

# Make the local `MaskLID` folder importable. This assumes the notebook's
# working directory contains that folder; inserting at index 0 gives it
# priority over any other module named `masklid` on the path.
root = Path.cwd()
sys.path.insert(0, str(root / "MaskLID"))
from masklid import MaskLID  # noqa: E402

# Reuse GlotLID model file
# NOTE(review): `languages=-1` presumably means "do not restrict the label
# set" -- confirm against the MaskLID source.
masklid_model = MaskLID(str(model_path), languages=-1)

# Pick the code-switching example (last one from examples)
cs_text = examples[-1]
print("Code-switch text:\n", cs_text)

# The keyword values below are passed through unchanged; their exact meaning is
# defined by MaskLID's `predict_codeswitch` and is not visible in this notebook.
# NOTE(review): in the recorded run only a Turkish segment was reported for
# this sentence -- revisit these thresholds if the English tail should be
# detected as well.
ans = masklid_model.predict_codeswitch(
    cs_text,
    beta=20,
    alpha=3,
    max_lambda=3,
    min_length=10,
    min_prob=0.90,
    max_retry=3,
    alpha_step_increase=3,
    beta_step_increase=5,
)
# `ans` is iterated with `.items()`: each key is a language label and each
# value is the text segment attributed to it.
print("MaskLID segments:")
for lang, seg in ans.items():
    print(lang, ":", seg)

Code-switch text:
 bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop
MaskLID segments:
__label__tur_Latn : bir kahve dükkanında geçen tadında güzel bir şarkıya ayrılsın gece
MaskLID segments:
__label__tur_Latn : bir kahve dükkanında geçen tadında güzel bir şarkıya ayrılsın gece


In [12]:
# Optional: Load your own CSV dataset
# Expects a CSV with a text column. Adjust path/column as needed.
import sys, subprocess

def ensure(pkg: str):
    """Make sure ``pkg`` can be imported, pip-installing it if the import fails.

    ``pkg`` is used both as the module name to import and as the name handed
    to pip, so it only suits packages where the two are identical.
    """
    try:
        __import__(pkg)
    except Exception:
        install_cmd = [sys.executable, "-m", "pip", "install", pkg]
        subprocess.check_call(install_cmd)

ensure("pandas")
import pandas as pd

# Configure your dataset path and text column
# Example: dataset_path = r"C:\Users\<you>\Desktop\mydata.csv"
dataset_path = r""
text_column = "text"

if dataset_path:
    df = pd.read_csv(dataset_path)
    # Fail with an actionable message instead of a bare KeyError from pandas.
    if text_column not in df.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {dataset_path}; "
            f"available columns: {list(df.columns)}"
        )
    # Only the first 50 non-null rows are scored to keep the run short.
    texts = df[text_column].dropna().astype(str).tolist()[:50]
    print(f"Loaded {len(texts)} texts from {dataset_path}")
    preds = predict_topk(texts, k=3)
    # Use `labels_` / `probs_` (trailing underscore) as loop names: binding a
    # plain `labels` here would overwrite the notebook-level `labels` list that
    # `glotlid_predict` indexes into, breaking every prediction made afterwards.
    for t, (labels_, probs_) in zip(texts[:5], preds[:5]):
        print("TEXT:", t[:120].replace("\n", " ") + ("..." if len(t) > 120 else ""))
        print("PRED:", labels_, probs_)
        print("-")
else:
    print("Set dataset_path to your CSV to run batch predictions.")

Set dataset_path to your CSV to run batch predictions.


In [13]:
# Helper: map labels like '__label__eng_Latn' to names
import json
from pathlib import Path

lang_names_path = Path("GlotLID/assets/inference/language_names.json")

# Start from an empty mapping so `readable` exists (and falls back to the raw
# labels) even when the names file is missing.
label2name = {}


def readable(labels: tuple, names: "dict | None" = None) -> list:
    """Map model labels such as ``'__label__eng_Latn'`` to display names.

    Args:
        labels: Iterable of label strings as returned by the model.
        names: Optional label-to-name mapping; defaults to the notebook-level
            ``label2name`` loaded from ``language_names.json``.

    Returns:
        One string per label: the mapped name with any trailing
        ``" (...)"`` qualifier removed, or the label itself when no mapping
        entry matches.
    """
    table = label2name if names is None else names
    prefix = "__label__"
    pretty = []
    for label in labels:
        bare = label[len(prefix):] if label.startswith(prefix) else label
        # The key format of the JSON file is not fixed here, and looking up the
        # full "__label__xxx_Yyyy" string alone was observed to miss. Try the
        # full label, then the label without the prefix, then the bare
        # language code before the script suffix.
        candidates = (label, bare, bare.split("_")[0])
        name = next((table[key] for key in candidates if key in table), label)
        pretty.append(name.split(" (")[0])
    return pretty


if lang_names_path.exists():
    with open(lang_names_path, "r", encoding="utf-8") as f:
        label2name = json.load(f)

    # Demo on first example
    if 'glotlid_results' in globals():
        labs, probs = glotlid_results[0]
        print(readable(labs), probs)
else:
    print("language_names.json not found; using raw labels.")

['__label__eng_Latn', '__label__ind_Latn', '__label__sna_Latn'] [0.9999676942825317, 3.187024049111642e-05, 2.1771765545963717e-07]


In [4]:
# Diagnostics: environment and fastText availability
import sys, importlib.util, platform
# Troubleshooting probe: report which interpreter/platform the kernel runs on
# and whether a `fasttext` module can be located and imported.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, so it was run earlier in the session than its position suggests.
print("Python:", sys.version)
print("Exe:", sys.executable)
print("Platform:", platform.platform())
# find_spec returns None when no importable `fasttext` module is found.
print("fasttext spec:", importlib.util.find_spec("fasttext"))
try:
    import fasttext
    # Fall back to "(unknown)" when the module has no __version__ attribute.
    print("fasttext version:", getattr(fasttext, "__version__", "(unknown)"))
except Exception as e:
    # Report the failure instead of raising so the diagnostics always complete.
    print("fasttext import error:", repr(e))

Python: 3.10.19 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 16:41:31) [MSC v.1929 64 bit (AMD64)]
Exe: c:\Users\jmigu\anaconda3\envs\glotlid310\python.exe
Platform: Windows-10-10.0.26200-SP0
fasttext spec: None
fasttext import error: ModuleNotFoundError("No module named 'fasttext'")


In [5]:
# Attempt pip install of fasttext-win
import sys, subprocess
# Troubleshooting attempt: install the `fasttext-win` package into the current
# interpreter and import `fasttext`. Any failure (pip exit status or import
# error) is reported with a message rather than raised.
# NOTE(review): the recorded run of this cell failed at the pip step; the setup
# cell at the top of the notebook uses a different package fallback order.
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fasttext-win"])
    import fasttext
    print("fasttext installed:", getattr(fasttext, "__version__", "(unknown)"))
except Exception as e:
    print("fasttext-win installation failed:", e)

fasttext-win installation failed: Command '['c:\\Users\\jmigu\\anaconda3\\envs\\glotlid310\\python.exe', '-m', 'pip', 'install', 'fasttext-win']' returned non-zero exit status 1.
