# GlotLID and MaskLID Experiments

This notebook sets up GlotLID for sentence-level language identification (LID) and then runs MaskLID to detect code-switching on a handful of sample sentences.

In [1]:
# Environment setup: install required packages
import sys, subprocess

def pip_install(pkg):
    """Install `pkg` with pip, using the interpreter that runs this kernel.

    Invoking `sys.executable -m pip` (rather than a bare `pip`) makes the
    package land in the same environment the notebook imports from.
    Raises `subprocess.CalledProcessError` if pip exits with a non-zero status.
    """
    print(f"Installing {pkg}...")
    command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(command)

# Install huggingface_hub and numpy if missing.
# Each import is only a probe: on *any* failure (not just ImportError) the
# package is pip-installed. The freshly installed package is not re-imported
# here; the next cell performs the real imports.
try:
    import huggingface_hub  # noqa: F401
except Exception:
    pip_install("huggingface_hub")

try:
    import numpy as np  # noqa: F401
except Exception:
    pip_install("numpy")

# Install fasttext (Windows-friendly): prefer fasttext-numpy2-wheel, else fallback to fasttext.
# Order of attempts:
#   1. import an already-installed fasttext;
#   2. install `fasttext-numpy2-wheel` and import;
#   3. install plain `fasttext` and import.
# If step 3 also fails, the exception propagates and the cell stops before
# printing "Setup complete.".
fasttext = None
try:
    import fasttext  # type: ignore
except Exception:
    try:
        pip_install("fasttext-numpy2-wheel")
        import fasttext  # type: ignore  # noqa: E402
    except Exception:
        pip_install("fasttext")
        import fasttext  # type: ignore  # noqa: E402

print("Setup complete.")

Setup complete.


In [2]:
# Download and load GlotLID model
from huggingface_hub import hf_hub_download
import fasttext
import numpy as np

# Download latest model (v3 as of README); you can pin to model_v3.bin
# `hf_hub_download` returns the local path of the downloaded file, which is
# printed below and reused later when constructing MaskLID.
model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir=None)
print("Model path:", model_path)

# Load the model
model = fasttext.load_model(model_path)
print("Loaded GlotLID with", len(model.labels), "labels")

# Custom predict using output_matrix + softmax (avoids numpy 2.x issue in fasttext.predict)
# `labels` and `output_matrix` are module-level globals read by `glotlid_predict`;
# they are fetched once here rather than on every prediction.
# NOTE(review): assumes row i of `output_matrix` corresponds to `labels[i]` — confirm against fastText.
labels = model.get_labels()
output_matrix = model.get_output_matrix()

def _softmax(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x)
    e = np.exp(x - np.max(x))
    return e / e.sum()

def glotlid_predict(text: str, k: int = 3):
    """Return the top-k GlotLID labels and their probabilities for `text`.

    Probabilities are computed manually as
    softmax(output_matrix @ sentence_vector) instead of calling
    `model.predict` (which has a numpy 2.x issue in fasttext).

    Args:
        text: Text to classify. Newlines are replaced with spaces because
            fastText's sentence-vector API processes one line at a time.
        k: Number of labels to return. Values <= 0 give empty results;
            values above the number of labels are clamped to that number.

    Returns:
        `(top_labels, top_probs)`: a tuple of label strings and a 1-D numpy
        array of probabilities, both ordered from most to least probable.
    """
    # fastText rejects text containing '\n', so flatten multi-line input first.
    single_line = text.replace("\n", " ")
    sv = model.get_sentence_vector(single_line)
    logits = np.dot(output_matrix, sv)
    probs = _softmax(logits)
    # Clamp k. Without this, k == 0 would slice `[-0:]` (i.e. everything) and
    # a negative k would silently drop the |k| *least* likely labels instead.
    k = max(0, min(k, len(probs)))
    top_idx = np.argsort(probs)[::-1][:k]
    top_labels = tuple(labels[i] for i in top_idx)
    top_probs = probs[top_idx]
    return top_labels, top_probs

# Quick sanity check: prints the (labels, probabilities) pair returned by
# glotlid_predict for a short English phrase.
print(glotlid_predict("Hello, world!", k=3))

  from .autonotebook import tqdm as notebook_tqdm


Model path: C:\Users\jmigu\.cache\huggingface\hub\models--cis-lmu--glotlid\snapshots\74cb50b709c9eefe0f790030c6c95c461b4e3b77\model.bin
Loaded GlotLID with 2102 labels
(('__label__eng_Latn', '__label__isl_Latn', '__label__deu_Latn'), array([9.9636394e-01, 2.0239984e-03, 5.2679953e-04], dtype=float32))


In [3]:
# Helper: GlotLID predictions on a small dataset
from typing import List, Tuple


def predict_topk(texts: List[str], k: int = 3) -> List[Tuple[tuple, list]]:
    """Run `glotlid_predict` on every text and return plain-Python results.

    Returns one `(labels, probabilities)` pair per input text, in input
    order; the probabilities are converted from a numpy array to a list.
    """
    def _as_plain(prediction):
        top_labels, top_probs = prediction
        return top_labels, top_probs.tolist()

    return [_as_plain(glotlid_predict(sentence, k)) for sentence in texts]

# Small smoke test: one short sentence each in English, Spanish, Turkish and French.
examples = [
    "Hello, how are you?",
    "¿Cómo estás? Todo bien.",
    "Merhaba, nasılsın?",
    "C'est une belle journée.",
]

# Print the top-3 labels and probabilities next to each input sentence.
# Note: the loop variables (`text`, `labels_`, `probs_`) stay defined at
# notebook scope after this cell runs.
glotlid_results = predict_topk(examples, k=3)
for text, (labels_, probs_) in zip(examples, glotlid_results):
    print("TEXT:", text)
    print("PRED:", labels_, probs_)
    print("-")

TEXT: Hello, how are you?
PRED: ('__label__eng_Latn', '__label__ind_Latn', '__label__sna_Latn') [0.9999676942825317, 3.187024049111642e-05, 2.1771765545963717e-07]
-
TEXT: ¿Cómo estás? Todo bien.
PRED: ('__label__spa_Latn', '__label__glg_Latn', '__label__gug_Latn') [0.9999990463256836, 7.620928954565898e-07, 1.3862313608115073e-07]
-
TEXT: Merhaba, nasılsın?
PRED: ('__label__tur_Latn', '__label__azj_Latn', '__label__diq_Latn') [0.9999998807907104, 1.2680378347340593e-07, 4.7710493333852355e-08]
-
TEXT: C'est une belle journée.
PRED: ('__label__fra_Latn', '__label__oci_Latn', '__label__fro_Latn') [1.0, 1.8759225284270542e-08, 1.1848276137982339e-08]
-


In [4]:
def show_top_k(text: str, k: int = 3):
    """Print the top-k GlotLID predictions for `text`.

    One line is printed per prediction, formatted as `label: probability`
    with the probability rounded to four decimal places.
    """
    ranked = zip(*glotlid_predict(text, k))
    for name, score in ranked:
        print(f"{name}: {score:.4f}")

# Code-switched sentence: a Turkish passage followed by an English phrase.
# Sentence-level GlotLID is expected to report only the dominant language here,
# which motivates the MaskLID experiment in the next cell (it reuses `example_text`).
example_text = "bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop"

show_top_k(example_text, k=15)

__label__tur_Latn: 0.9994
__label__azj_Latn: 0.0003
__label__kiu_Latn: 0.0001
__label__gag_Latn: 0.0001
__label__crh_Latn: 0.0001
__label__kaa_Latn: 0.0000
__label__diq_Latn: 0.0000
__label__tuk_Latn: 0.0000
__label__tat_Latn: 0.0000
__label__uig_Latn: 0.0000
__label__kmr_Latn: 0.0000
__label__dgr_Latn: 0.0000
__label__kas_Latn: 0.0000
__label__rhg_Latn: 0.0000
__label__daa_Latn: 0.0000


In [5]:
# MaskLID: Code-switching experiments (refactored into reusable pipeline)
# Import MaskLID from local repo folder
import sys
from pathlib import Path
from collections import Counter
import re, string

# Make the local `MaskLID/` folder importable. This relies on the notebook's
# current working directory containing that folder with a `masklid` module.
root = Path.cwd()
sys.path.insert(0, str(root / "MaskLID"))
from masklid import MaskLID  # noqa: E402

# Reuse GlotLID model file (the path downloaded earlier) instead of fetching it again.
# NOTE(review): `languages=-1` presumably means "do not restrict the label set" — confirm in MaskLID.
masklid_model = MaskLID(str(model_path), languages=-1)

# --- Small helpers ---

def normalize_for_mask(text: str) -> str:
    """Clean `text` so it can be split into tokens comparable to MaskLID's.

    Newlines, `_`, `:`, `•`, `#`, `{`, `|`, `}` and ASCII digits are each
    replaced by a space; runs of whitespace are then collapsed to a single
    space and the ends are trimmed. (Intended to mirror MaskLID's own
    cleaning step — verify against the MaskLID source if it changes.)
    """
    noise_chars = "\n_:" + "•#{|}" + string.digits
    to_space = dict.fromkeys(map(ord, noise_chars), " ")
    collapsed = re.sub(r"\s+", " ", text.translate(to_space))
    return collapsed.strip()

def build_assigned_counter(segments: dict) -> Counter:
    """Count every whitespace-separated token across all segment strings.

    `segments` maps a key (a language label in this notebook) to a string of
    tokens; the result says how many times each token was assigned overall,
    regardless of which key it came from.
    """
    return Counter(
        token
        for segment_text in segments.values()
        for token in segment_text.split()
    )

def apply_mask_to_tokens(tokens: list[str], assigned: Counter, mask_token: str = "[MASK]") -> tuple[list[str], list[str]]:
    """Mask tokens that were assigned to a language and collect the rest.

    Tokens are matched by value, not by position: walking `tokens` left to
    right, each token is masked while `assigned` still has count left for it,
    so the *earliest* occurrences of a repeated token are the ones masked.

    Args:
        tokens: Tokens of the (normalized) input text, in order.
        assigned: Multiset of assigned tokens. It is NOT modified; the
            function consumes counts from a private copy so the same counter
            can be reused and re-running a cell gives the same result.
        mask_token: Placeholder that replaces each masked token.

    Returns:
        `(masked_tokens, remaining_tokens)`: `masked_tokens` has the same
        length as `tokens`; `remaining_tokens` holds only the unmasked tokens.
    """
    # Copy first: decrementing the caller's counter would drain it as a side effect.
    budget = Counter(assigned)
    masked_tokens: list[str] = []
    remaining_tokens: list[str] = []
    for tok in tokens:
        if budget.get(tok, 0) > 0:
            budget[tok] -= 1
            masked_tokens.append(mask_token)
        else:
            masked_tokens.append(tok)
            remaining_tokens.append(tok)
    return masked_tokens, remaining_tokens

def apply_mask_and_remaining(text: str, segments: dict, mask_token: str = "[MASK]") -> tuple[str, str]:
    """Produce the masked text and the leftover text for one input.

    The input is normalized and split into tokens, the tokens found in
    `segments` are counted, and every assigned token is replaced by
    `mask_token`. Returns `(masked_text, remaining_text)`, both joined with
    single spaces; `remaining_text` contains only the tokens left unmasked.
    """
    masked, leftover = apply_mask_to_tokens(
        normalize_for_mask(text).split(),
        build_assigned_counter(segments),
        mask_token,
    )
    return " ".join(masked), " ".join(leftover)

# --- Global pipeline function ---

def run_masklid_pipeline(
    text: str,
    model: MaskLID,
    *,
    beta: int = 20,
    alpha: int = 3,
    max_lambda: int = 3,
    min_length: int = 10,
    min_prob: float = 0.90,
    max_retry: int = 3,
    alpha_step_increase: int = 3,
    beta_step_increase: int = 5,
    mask_token: str = "[MASK]",
) -> dict:
    """Run MaskLID code-switch detection on `text` and mask the assigned tokens.

    All keyword-only tuning parameters except `mask_token` are forwarded
    unchanged to `model.predict_codeswitch`; see MaskLID for their meaning.
    `mask_token` is the placeholder used for tokens that were assigned to a
    language.

    Returns a dict with three keys:
        "segments": whatever `predict_codeswitch` returned (used here as a
            mapping from language label to a string of tokens);
        "masked_text": the normalized text with assigned tokens masked;
        "remaining_text": the unassigned tokens joined by spaces.
    """
    codeswitch_options = {
        "beta": beta,
        "alpha": alpha,
        "max_lambda": max_lambda,
        "min_length": min_length,
        "min_prob": min_prob,
        "max_retry": max_retry,
        "alpha_step_increase": alpha_step_increase,
        "beta_step_increase": beta_step_increase,
    }
    segments = model.predict_codeswitch(text, **codeswitch_options)
    masked, leftover = apply_mask_and_remaining(text, segments, mask_token)
    return {
        "segments": segments,
        "masked_text": masked,
        "remaining_text": leftover,
    }

# --- Example usage ---
# Pick the code-switching example (from earlier): `example_text` is defined in
# the cell that calls show_top_k with k=15, so that cell must have run first.
cs_text = example_text
print("Code-switch text:\n", cs_text)

# Every keyword below equals run_masklid_pipeline's default; they are spelled
# out so each knob is visible and easy to tweak.
res = run_masklid_pipeline(
    cs_text,
    masklid_model,
    beta=20,
    alpha=3,
    max_lambda=3,
    min_length=10,
    min_prob=0.90,
    max_retry=3,
    alpha_step_increase=3,
    beta_step_increase=5,
    mask_token="[MASK]",
)

# One line per detected language: label, then the tokens assigned to it.
print("MaskLID segments:")
for lang, seg in res["segments"].items():
    print(lang, ":", seg)

print("\nMasked text:\n", res["masked_text"])
print("\nRemaining (unassigned):\n", res["remaining_text"]) 

# Expose for downstream cells (a later cell calls show_top_k on `remaining_text`).
masked_text = res["masked_text"]
remaining_text = res["remaining_text"]


Code-switch text:
 bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop
MaskLID segments:
__label__tur_Latn : bir kahve dükkanında geçen tadında güzel bir şarkıya ayrılsın gece

Masked text:
 [MASK] [MASK] [MASK] [MASK] film [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] falling in love at a coffee shop

Remaining (unassigned):
 film falling in love at a coffee shop


In [9]:
# Run GlotLID on the tokens MaskLID left unassigned (`remaining_text` is set by
# the MaskLID example cell) to see which language the leftover text belongs to.
show_top_k(remaining_text, k=10)

__label__eng_Latn: 0.9999
__label__enm_Latn: 0.0001
__label__srd_Latn: 0.0000
__label__jam_Latn: 0.0000
__label__ita_Latn: 0.0000
__label__kin_Latn: 0.0000
__label__guj_Latn: 0.0000
__label__hau_Latn: 0.0000
__label__ind_Latn: 0.0000
__label__pap_Latn: 0.0000


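It works! With the Turkish tokens masked out, GlotLID identifies the remaining text as English (`eng_Latn`, p ≈ 0.9999).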

In [18]:
# Second, much shorter code-switching example.
new_example = "Vamos Benfica! Let's go"

# Sentence-level prediction on the full text, for comparison with the masked run below.
show_top_k(new_example, k=3)

# Only `min_length` (5 instead of the default 10) differs from the pipeline defaults.
# Note: this overwrites `res` from the previous example; the globals
# `masked_text` / `remaining_text` still refer to that earlier example.
res = run_masklid_pipeline(
    new_example,
    masklid_model,
    beta=20,
    alpha=3,
    max_lambda=3,
    min_length=5,
    min_prob=0.90,
    max_retry=3,
    alpha_step_increase=3,
    beta_step_increase=5,
    mask_token="[MASK]",
)

# Show the raw segments MaskLID returned before any masking is applied.
print("SEGMENTS FROM MASKLID:")
print(res["segments"])
print()

print("Masked text:\n", res["masked_text"])
print("Remaining (unassigned):\n", res["remaining_text"])

# Identify the language of whatever MaskLID left unassigned.
show_top_k(res["remaining_text"], k=10)

__label__por_Latn: 0.9913
__label__spa_Latn: 0.0047
__label__eng_Latn: 0.0018
SEGMENTS FROM MASKLID:
{'__label__por_Latn': 'Vamos Benfica!'}

Masked text:
 [MASK] [MASK] Let's go
Remaining (unassigned):
 Let's go
__label__eng_Latn: 0.9995
__label__fur_Latn: 0.0003
__label__nso_Latn: 0.0002
__label__deu_Latn: 0.0001
__label__srd_Latn: 0.0000
__label__vec_Latn: 0.0000
__label__tsn_Latn: 0.0000
__label__dan_Latn: 0.0000
__label__gle_Latn: 0.0000
__label__und_Batk: 0.0000
