# MaskLID Experiments

This notebook applies MaskLID to a few code-switched example sentences and compares its output with the plain sentence-level GlotLID (fastText) predictions.

In [155]:
import fasttext
from huggingface_hub import hf_hub_download
from MaskLID.masklid import MaskLID

# Resolve the GlotLID v1 fastText weights on the Hugging Face Hub to a local file path.
model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model_v1.bin", cache_dir=None)

# Plain fastText classifier, used by the sentence-level prediction helpers.
model = fasttext.load_model(model_path)

# MaskLID wrapper constructed from the same weights file, used for the code-switching analysis.
masklid_model = MaskLID(model_path)

In [156]:
import numpy as np

def predict_top1(sentence: str, threshold: float = 0.5):
    """
    Predict the single most likely language label for ``sentence``.

    Line breaks in ``sentence`` are replaced by spaces before prediction,
    because fastText's ``predict`` raises ``ValueError`` on input that
    contains a newline character.

    Args:
        sentence: Text to classify.
        threshold: Minimum probability required to accept the top label.

    Returns:
        A ``(label, probability)`` tuple. ``label`` is the model's top label
        with the ``__label__`` prefix removed (e.g. ``'fra'``), or
        ``'undetermined'`` when the probability is below ``threshold``.
        The probability of the top label is returned in both cases.
    """
    # fastText classifies one line at a time; flatten multi-line input first.
    single_line = sentence.replace("\n", " ")
    labels, probs = model.predict(single_line, k=1)
    label = labels[0].replace("__label__", "")
    prob = float(probs[0])
    if prob < threshold:
        return "undetermined", prob
    return label, prob


def predict_topk(sentence: str, k: int = 3):
    """
    Return the ``k`` most likely language labels for ``sentence``.

    No probability threshold is applied. Line breaks in ``sentence`` are
    replaced by spaces before prediction, because fastText's ``predict``
    raises ``ValueError`` on input that contains a newline character.

    Args:
        sentence: Text to classify.
        k: Number of labels requested from the model.

    Returns:
        A list of ``(label, probability)`` tuples in the order returned by
        the model, with the ``__label__`` prefix removed from each label.
    """
    # fastText classifies one line at a time; flatten multi-line input first.
    single_line = sentence.replace("\n", " ")
    raw_labels, raw_probs = model.predict(single_line, k=k)
    return [
        (raw_label.replace("__label__", ""), float(raw_prob))
        for raw_label, raw_prob in zip(raw_labels, raw_probs)
    ]


def print_prediction(sentence: str, threshold: float = 0.5, k: int = 3):
    """
    Print a short prediction report for ``sentence``.

    The report contains, in order:
    - a separator line and the sentence itself,
    - the thresholded top-1 prediction from ``predict_top1``,
    - the ``k`` best labels from ``predict_topk``, without any threshold
      (useful for spotting closely related languages).

    Args:
        sentence: Text to classify.
        threshold: Probability threshold forwarded to ``predict_top1``.
        k: Number of labels forwarded to ``predict_topk``.
    """
    best_label, best_prob = predict_top1(sentence, threshold=threshold)
    ranked = predict_topk(sentence, k=k)

    header_lines = (
        "________________________________________",
        f"Texte : {sentence}",
        f"Top-1 (θ={threshold}) : {best_label} (p={best_prob:.3f})",
        "Top-k without threshold:",
    )
    for header_line in header_lines:
        print(header_line)

    # One indented row per candidate label, padded so the probabilities align.
    for name, score in ranked:
        print(f"   - {name:10s}  p={score:.3f}")


In [157]:
# Turkish sentence that ends with an English phrase ("falling in love at a coffee shop").
text_tr_en = "bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop"

# Sentence-level prediction only: show the five best labels for the whole sentence.
print_prediction(text_tr_en, k=5)

________________________________________
Texte : bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop
Top-1 (θ=0.5) : tur (p=0.999)
Top-k without threshold:
   - tur         p=0.999
   - ota         p=0.000
   - crh         p=0.000
   - aze         p=0.000
   - azj         p=0.000


In [158]:
def run_analysis(text, *, beta=15, alpha=3, min_length=20, min_prob=0.9, max_lambda=2):
    """
    Run MaskLID code-switching detection on ``text``.

    Thin wrapper around ``masklid_model.predict_codeswitch``. The keyword
    defaults are the values this notebook takes from the MaskLID paper
    (Appendix C.2); override them per call to experiment with other settings.

    Args:
        text: Sentence to analyse.
        beta, alpha, min_length, min_prob, max_lambda: Hyperparameters
            forwarded unchanged to ``predict_codeswitch``.

    Returns:
        Whatever ``predict_codeswitch`` returns. In this notebook's outputs
        that is a dict mapping ``__label__xxx`` labels to the words assigned
        to that language.
    """
    return masklid_model.predict_codeswitch(
        text,
        beta=beta,
        alpha=alpha,
        min_length=min_length,
        min_prob=min_prob,
        max_lambda=max_lambda,
    )

# Split the Turkish–English example into per-language segments with MaskLID.
run_analysis(text_tr_en)

{'__label__tur': 'bir kahve dükkanında geçen tadında güzel bir şarkıya ayrılsın gece',
 '__label__eng': 'film falling coffee shop'}

In [170]:
text_pt_en = "Gostei muito do trabalho dos autores, o MaskLID seems to be getting some cases right."
# English gloss of the Portuguese clause: "I really liked the authors' work"

# Sentence-level prediction first, then the MaskLID split of the same sentence.
print_prediction(text_pt_en, k=5)
print("\n\nWith MaskLID:")
run_analysis(text_pt_en)

________________________________________
Texte : Gostei muito do trabalho dos autores, o MaskLID seems to be getting some cases right.
Top-1 (θ=0.5) : por (p=0.996)
Top-k without threshold:
   - por         p=0.996
   - pol         p=0.000
   - est         p=0.000
   - eng         p=0.000
   - lit         p=0.000


With MaskLID:


{'__label__por': 'Gostei muito trabalho dos autores,',
 '__label__eng': 'seems getting cases right.'}

In [165]:
# English opening followed by a French clause ending in "toujours".
text_fr_en = "I love this, it's such a nice tool mais le modéle ne marche pas toujours."

# Sentence-level prediction first, then the MaskLID split of the same sentence.
print_prediction(text_fr_en, k=5)
print("\n\nWith MaskLID:")
run_analysis(text_fr_en)

________________________________________
Texte : I love this, it's such a nice tool mais le modéle ne marche pas toujours.
Top-1 (θ=0.5) : fra (p=0.942)
Top-k without threshold:
   - fra         p=0.942
   - eng         p=0.022
   - wol         p=0.005
   - hat         p=0.002
   - dan         p=0.002


With MaskLID:


{'__label__fra': 'modéle marche pas toujours.'}

In [166]:
import pandas as pd

# 1. Sentence-level prediction: the top-1 label is treated as the dominant language (L1).
global_pred, global_prob = masklid_model.predict(text_fr_en, k=1)
l1_label = global_pred[0]
print(f"L1 Detected: {l1_label} ({global_prob[0]:.4f})\n")

# 2. Per-word scores. The loop below relies on keys of the form "<prefix>_<word>" and on
#    val['logits'] being a sequence of (label, score) pairs ordered best-first
#    -- TODO confirm against the MaskLID implementation.
word_data = masklid_model.compute_v_per_word(text_fr_en)

# Same alpha/beta values that run_analysis passes to predict_codeswitch.
alpha = 3
beta = 15
data = []
for key, val in word_data.items():
    word = key.split('_', 1)[1]
    logits = val['logits']
    labels = [x[0] for x in logits]

    # 1-based rank of L1 among this word's labels; None when L1 is absent.
    # A numeric sentinel such as 999 would be indistinguishable from a real rank,
    # since ranks above 1000 do occur for this model.
    try:
        l1_rank = labels.index(l1_label) + 1
    except ValueError:
        l1_rank = None

    # Tag the word with the tightest cut-off (alpha, then beta) that its L1 rank satisfies.
    action = " "
    if l1_rank is not None:
        if l1_rank <= alpha:
            action = "alpha"
        elif l1_rank <= beta:
            action = "beta"

    # Keep only the language code of the three best labels (drops "__label__" and any "_Script" suffix).
    top3_labels = [lbl.replace("__label__", "").split("_")[0] for lbl, _score in logits[:3]]

    data.append({
        "Word": word,
        "Rank": l1_rank,
        "Action": action,
        "Top 3 Preds": ", ".join(top3_labels)
    })

pd.set_option('display.max_colwidth', None)  # never truncate the "Top 3 Preds" column
# Explicit columns keep the frame well-formed for empty input; the nullable Int64 dtype
# keeps ranks displayed as integers even when some are missing.
pd.DataFrame(data, columns=["Word", "Rank", "Action", "Top 3 Preds"]).astype({"Rank": "Int64"})


L1 Detected: __label__fra (0.9421)



Unnamed: 0,Word,Rank,Action,Top 3 Preds
0,I,1245,,"swg, yml, zul"
1,love,1414,,"lom, din, bsq"
2,"this,",111,,"iso, bzj, koo"
3,it's,190,,"eng, sco, ind"
4,such,651,,"rar, luo, zne"
5,a,247,,"pau, mop, bfo"
6,nice,551,,"lam, eng, tuo"
7,tool,1157,,"tuc, mmo, apr"
8,mais,28,,"xog, por, mbc"
9,le,81,,"sot, fud, nso"


Only "toujours" is detected as strongly French.

Even without "toujours", the sentence "I love this, it's such a nice tool mais le modéle ne marche pas" is still predicted as French:

In [162]:
# Same French–English sentence as before, without the final "toujours".
# NOTE(review): this rebinds text_fr_en, which the word-level analysis cell above also reads;
# re-running that cell after this one analyses this shorter sentence instead.
text_fr_en = "I love this, it's such a nice tool mais le modéle ne marche pas."

# Sentence-level prediction first, then the MaskLID split of the same sentence.
print_prediction(text_fr_en, k=5)
print("\n\nWith MaskLID:")
run_analysis(text_fr_en)

________________________________________
Texte : I love this, it's such a nice tool mais le modéle ne marche pas.
Top-1 (θ=0.5) : fra (p=0.847)
Top-k without threshold:
   - fra         p=0.847
   - eng         p=0.051
   - est         p=0.010
   - slv         p=0.009
   - dan         p=0.007


With MaskLID:


{'__label__fra': 'modéle marche pas.'}