# Check Braxen multilingual entries

> "Some are very wrong"

- badges: false
- branch: master
- categories: [braxen, multilingual]

[Kaggle](https://www.kaggle.com/code/jimregan/notebook13614fe245)

In [1]:
import os, re
from collections import defaultdict
import pandas as pd

FILE_DIR = "/kaggle/input/split-braxen-by-language"
MIN_ENTRIES = 1000
SKIP_CODES = {"afr","asi","aus","sla","mix","fisa"}
SAMPLES_PER_FILE = 50

def has_cyrillic(s):
    for ch in s:
        o = ord(ch)
        if (0x0400 <= o <= 0x04FF) or (0x0500 <= o <= 0x052F) or (0x2DE0 <= o <= 0x2DFF) or (0xA640 <= o <= 0xA69F):
            return True
    return False

def has_greek(s):
    for ch in s:
        o = ord(ch)
        if (0x0370 <= o <= 0x03FF) or (0x1F00 <= o <= 0x1FFF):
            return True
    return False

DIACRITICS = {
    "pol": set("ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"),
    "cze": set("áéíóúýčďěňřšťůžÁÉÍÓÚÝČĎĚŇŘŠŤŮŽ"),
    "slk": set("áäčďéíľĺňóôŕšťúýžÁÄČĎÉÍĽĹŇÓÔŔŠŤÚÝŽ"),
    "slv": set("čšžČŠŽ"),
    "hrv": set("čćđšžČĆĐŠŽ"),
    "srp": set("čćđšžČĆĐŠŽ"),
    "rom": set("ăâîșţșțĂÂÎȘŢȚ"),
    "hun": set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ"),
    "tur": set("çğıöşüÇĞİÖŞÜ"),
    "lit": set("ąčęėįšųūžĄČĘĖĮŠŲŪŽ"),
    "lav": set("āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ"),
    "deu": set("äöüßÄÖÜ"),
    "fre": set("àâæçéèêëîïôœùûüÿÀÂÆÇÉÈÊËÎÏÔŒÙÛÜŸ"),  # <-- 'fre', not 'fra'
    "spa": set("áéíñóúüÁÉÍÑÓÚÜ"),
    "por": set("áâãàçéêíóôõúÁÂÃÀÇÉÊÍÓÔÕÚ"),
    "isl": set("áéíóúýðþæöÁÉÍÓÚÝÐÞÆÖ"),
}

def file_code_from_name(fname):
    base = os.path.basename(fname)
    if base.startswith("braxen-") and base.endswith(".txt"):
        return base[len("braxen-"):-len(".txt")]
    return base

def read_text_safely(path):
    with open(path, "rb") as f:
        data = f.read()
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError:
        return data.decode("utf-8", errors="ignore")

# start-letter, then letters/digits/underscore/apostrophes/hyphens/en-dash
WORD_RE = re.compile(r"[^\W\d_][\w’'\-\u2011\u2013\u2014]*", flags=re.UNICODE)

def tokenize_words(text):
    return WORD_RE.findall(text)

def detect_diacritic_langs(word):
    hits = []
    for code, chars in DIACRITICS.items():
        if any(ch in word for ch in chars):
            hits.append(code)
    return hits

def is_bulgarian_like_cyrillic(word):
    w = word.lower()
    if "ъ" in w:
        return True
    if (w.endswith(("ът","та","то","те")) and ("ь" not in w)) and ("ы" not in w and "ё" not in w):
        return True
    return False

CZE_DIACS = DIACRITICS["cze"]

files = [os.path.join(FILE_DIR, f) for f in os.listdir(FILE_DIR)
         if f.startswith("braxen-") and f.endswith(".txt")]

file_word_sets, file_sizes = {}, {}
for path in files:
    text = read_text_safely(path)
    words = tokenize_words(text)
    uniq = set(words)
    file_word_sets[path] = uniq
    file_sizes[path] = len(uniq)

candidate_paths = []
for p, n in file_sizes.items():
    code = file_code_from_name(p)
    if n >= MIN_ENTRIES and code not in SKIP_CODES:
        candidate_paths.append(p)

print(f"Found {len(candidate_paths)} large non-catch-all files (>= {MIN_ENTRIES} entries).")

records = []
per_file_flags = defaultdict(lambda: defaultdict(int))

def add_record(word, current_code, suggested, reason):
    records.append((word, current_code, suggested, reason))
    per_file_flags[current_code][reason] += 1

for path in candidate_paths:
    code = file_code_from_name(path)
    uniq_words = file_word_sets[path]

    for w in uniq_words:
        if has_greek(w) and code != "gre":
            add_record(w, code, "gre", "Greek script in non-Greek file")
        elif has_cyrillic(w) and code not in {"rus","ukr","bul","mkd","srp"}:
            add_record(w, code, "cyrillic?", "Cyrillic characters in non-Cyrillic file")

    for w in uniq_words:
        if all(ord(ch) < 128 for ch in w):
            continue
        langs = detect_diacritic_langs(w)
        if not langs:
            continue
        if code in langs:
            continue
        # suppress French diacritics flagged inside Arabic file
        if code == "ara" and "fre" in langs:
            continue
        suggested = langs[0] if len(langs) == 1 else "ambiguous(" + ",".join(sorted(langs)) + ")"
        add_record(w, code, suggested, f"Contains diacritics typical of {','.join(sorted(langs))}")

    if code == "lat":
        for w in uniq_words:
            if any(ch in DIACRITICS["lit"] for ch in w):
                add_record(w, code, "lit", "Lithuanian diacritics in LAT")
            elif any(ch in DIACRITICS["lav"] for ch in w):
                add_record(w, code, "lav", "Latvian diacritics in LAT")

    if code == "rus":
        for w in uniq_words:
            if has_cyrillic(w) and is_bulgarian_like_cyrillic(w):
                add_record(w, code, "bul", "Bulgarian hard vowel/definite article in RUS")

    if code == "pol":
        for w in uniq_words:
            if any(ch in CZE_DIACS for ch in w):
                add_record(w, code, "cze", "Czech diacritics in POL")

    if code == "chi":
        for w in uniq_words:
            if any(ch in CZE_DIACS for ch in w) and w not in {"Zhéng"}:
                add_record(w, code, "cze", "Czech diacritics in CHI")

detailed_df = pd.DataFrame(records, columns=["word", "current_code", "suggested_code", "reason"]).drop_duplicates()
detailed_df.to_csv("braxen_obvious_misclassifications_filtered.csv", index=False, encoding="utf-8")

summary_rows = []
for code, counters in per_file_flags.items():
    total = sum(counters.values())
    row = {"file_code": code, "total_flags": total}
    row.update(counters)
    summary_rows.append(row)
summary_df = pd.DataFrame(summary_rows).sort_values("total_flags", ascending=False)
summary_df.to_csv("braxen_obvious_misclassifications_per_file_summary.csv", index=False, encoding="utf-8")

sample_rows = []
if not detailed_df.empty:
    for code, sub in detailed_df.groupby("current_code"):
        sample_rows.append(sub.head(SAMPLES_PER_FILE))
    pd.concat(sample_rows, ignore_index=True).to_csv("braxen_obvious_misclassifications_samples.csv", index=False, encoding="utf-8")
else:
    pd.DataFrame(columns=["word","current_code","suggested_code","reason"]).to_csv(
        "braxen_obvious_misclassifications_samples.csv", index=False, encoding="utf-8"
    )

print("Wrote:")
print(" - braxen_obvious_misclassifications_filtered.csv")
print(" - braxen_obvious_misclassifications_per_file_summary.csv")
print(" - braxen_obvious_misclassifications_samples.csv")

from IPython.display import display
display(summary_df.head(20))
display(detailed_df.head(50))


Found 12 large non-catch-all files (>= 1000 entries).
Wrote:
 - braxen_obvious_misclassifications_filtered.csv
 - braxen_obvious_misclassifications_per_file_summary.csv
 - braxen_obvious_misclassifications_samples.csv


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,file_code,total_flags,Contains diacritics typical of lav,"Contains diacritics typical of deu,slk","Contains diacritics typical of cze,hun,isl,pol,por,slk,spa","Contains diacritics typical of cze,hun,isl,por,slk,spa","Contains diacritics typical of deu,hun,isl,tur",Contains diacritics typical of por,"Contains diacritics typical of fre,isl","Contains diacritics typical of deu,hun,isl,slk,tur",...,"Contains diacritics typical of cze,deu,hun,isl,pol,por,slk,spa,tur",Contains diacritics typical of pol,Contains diacritics typical of deu,"Contains diacritics typical of deu,fre,hun,isl,por,slk,tur","Contains diacritics typical of hrv,srp","Contains diacritics typical of cze,hun,isl,lav,por,slk,spa","Contains diacritics typical of cze,isl,slk","Contains diacritics typical of hrv,pol,srp","Contains diacritics typical of deu,fre,hun,por,rom,spa,tur",Bulgarian hard vowel/definite article in RUS
4,swe,155799,2.0,81218,89.0,277.0,59317.0,3.0,155.0,9671.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
10,ger,938,,210,,1.0,236.0,,,,...,,,21.0,,,,,,,
11,fin,678,,556,,1.0,63.0,,,58.0,...,,,,,,,,,,
2,dan,312,,14,,,30.0,,259.0,5.0,...,,,,,,,,,,
7,nob,268,,28,1.0,2.0,74.0,,160.0,1.0,...,,,,,,,,,,
3,lat,39,2.0,3,,2.0,3.0,,18.0,,...,,,,,,,,,,
5,eng,38,,8,1.0,3.0,8.0,,,,...,,,,,,,,,,
9,spa,13,,1,,,,,,,...,,,,,,,,,,
1,fre,12,,2,3.0,3.0,3.0,1.0,,,...,,,,,,,,,,
0,ara,11,1.0,1,1.0,8.0,,,,,...,,,,,,,,,,


Unnamed: 0,word,current_code,suggested_code,reason
0,Ḥayāh,ara,lav,Contains diacritics typical of lav
1,ä,ara,"ambiguous(deu,slk)","Contains diacritics typical of deu,slk"
2,Ómar,ara,"ambiguous(cze,hun,isl,pol,por,slk,spa)","Contains diacritics typical of cze,hun,isl,pol..."
3,Bahá,ara,"ambiguous(cze,hun,isl,por,slk,spa)","Contains diacritics typical of cze,hun,isl,por..."
4,Abdu'l-Bahá,ara,"ambiguous(cze,hun,isl,por,slk,spa)","Contains diacritics typical of cze,hun,isl,por..."
5,Baháulláh,ara,"ambiguous(cze,hun,isl,por,slk,spa)","Contains diacritics typical of cze,hun,isl,por..."
6,Bahá'u'lláh,ara,"ambiguous(cze,hun,isl,por,slk,spa)","Contains diacritics typical of cze,hun,isl,por..."
7,Al-Qáda,ara,"ambiguous(cze,hun,isl,por,slk,spa)","Contains diacritics typical of cze,hun,isl,por..."
8,Bahái,ara,"ambiguous(cze,hun,isl,por,slk,spa)","Contains diacritics typical of cze,hun,isl,por..."
9,Nabíl-i,ara,"ambiguous(cze,hun,isl,por,slk,spa)","Contains diacritics typical of cze,hun,isl,por..."
