In [13]:
!apt-get -qq update
!apt-get -qq install -y hunspell git libhunspell-dev

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libhunspell-dev:amd64.
(Reading database ... 128735 files and directories currently installed.)
Preparing to unpack .../libhunspell-dev_1.7.0-4build1_amd64.deb ...
Unpacking libhunspell-dev:amd64 (1.7.0-4build1) ...
Setting up libhunspell-dev:amd64 (1.7.0-4build1) ...
Processing triggers for man-db (2.10.2-1) ...


In [14]:
!pip install hunspell

Collecting hunspell
  Using cached hunspell-0.5.5.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: hunspell
  Building wheel for hunspell (setup.py) ... [?25l[?25hdone
  Created wheel for hunspell: filename=hunspell-0.5.5-cp311-cp311-linux_x86_64.whl size=66312 sha256=0b5000b26d6eb6e89d8cdd3e49a07e82d1beb9073da82a0aa180dad86c80c24c
  Stored in directory: /root/.cache/pip/wheels/0b/41/b3/14ebfe8dfb3116e3f1ab55ff0db766d1ef033b6842ccc67e24
Successfully built hunspell
Installing collected packages: hunspell
Successfully installed hunspell-0.5.5


In [9]:
!echo $PWD/dictionaries/dictionaries

/kaggle/working/dictionaries/dictionaries


In [18]:
!git clone https://github.com/wooorm/dictionaries
!pwd

fatal: destination path 'dictionaries' already exists and is not an empty directory.
/kaggle/working


In [None]:
import os, re, glob, pandas as pd, hunspell

FILE_DIR = "/kaggle/input/split-braxen-by-language"
DICT_ROOT = "/kaggle/working/dictionaries/dictionaries"  # as you showed
OUT_TSV = "hunspell_results.tsv"
MIN_ENTRIES = 1000
SKIP_CODES = {"afr","asi","aus","sla","mix","fisa"}

# find index.aff/index.dic pairs under your DICT_ROOT
pairs = {}
for aff in glob.glob(os.path.join(DICT_ROOT, "*", "index.aff")):
    d = os.path.dirname(aff)
    dic = os.path.join(d, "index.dic")
    code = os.path.basename(d)
    if os.path.isfile(dic):
        pairs[code] = (dic, aff)

CODE2DICT = {
    "lat":["la"],
    "swe":["sv"],
    "nor":["nb","nn"],
    "dan":["da"],
    "isl":["is"],
    "fin":["fi"],
    "est":["et"],
    "lav":["lv"],
    "lit":["lt"],
    "pol":["pl"],
    "cze":["cs"],
    "slk":["sk"],
    "slv":["sl"],
    "hrv":["hr"],
    "srp":["sr-Latn"],
    "bos":["bs"],
    "mkd":["mk"],
    "bul":["bg"],
    "ukr":["uk"],
    "rus":["ru"],
    "deu":["de"],
    "nld":["nl","dut"],
    "eng":["en","en-GB","en-CA","en-AU","en-ZA"],
    "fre":["fr"],
    "ita":["it"],
    "spa":["es","es-MX","es-AR","es-CL","es-ES"],
    "por":["pt","pt-PT"],
    "rom":["ro"],
    "hun":["hu"],
    "tur":["tr"],
    "gre":["el"],
}

WORD_RE = re.compile(r"[^\W\d_][\w’'\-\u2011\u2013\u2014]*", re.UNICODE)
def tokenize(t): return WORD_RE.findall(t)
def read_text(p):
    b = open(p,"rb").read()
    try: return b.decode("utf-8")
    except UnicodeDecodeError: return b.decode("utf-8", errors="ignore")
def file_code(p):
    b = os.path.basename(p)
    return b[len("braxen-"):-4] if b.startswith("braxen-") else b

def load_hs(dict_codes):
    for c in dict_codes:
        if c in pairs:
            dic, aff = pairs[c]
            return hunspell.HunSpell(dic, aff), c
    return None, None

files = [os.path.join(FILE_DIR, f) for f in os.listdir(FILE_DIR) if f.startswith("braxen-") and f.endswith(".txt")]

file_words, file_sizes = {}, {}
for p in files:
    ws = set(tokenize(read_text(p)))
    file_words[p] = ws
    file_sizes[p] = len(ws)

candidates = []
for p, n in file_sizes.items():
    code = file_code(p)
    if n >= MIN_ENTRIES and code not in SKIP_CODES and code in CODE2DICT:
        candidates.append(p)

rows = []
for p in sorted(candidates):
    code = file_code(p)
    hs, used = load_hs(CODE2DICT[code])
    if not hs:
        print(f"skip {code}: dict not found for {CODE2DICT[code]}")
        continue
    words = sorted(file_words[p])
    print(f"{code}: {len(words)} tokens via {used}")
    for w in words:
        if not w: 
            continue
        if all(ord(ch) < 128 for ch in w) and len(w) < 2:
            continue
        wcheck = w.replace("ö","ø").replace("Ö","Ø") if code in {"nor","dan"} else w
        if hs.spell(wcheck):
            rows.append((code, w, "OK", ""))
        else:
            sugs = ", ".join(hs.suggest(wcheck))
            rows.append((code, w, "MISS", sugs))

df = pd.DataFrame(rows, columns=["file_code","word","status","suggestions"])
df.to_csv(OUT_TSV, sep="\t", index=False, encoding="utf-8")
print(f"Wrote {OUT_TSV} with {len(df):,} rows")
df.head(20)

dan: 1941 tokens via da
