In [1]:
# System packages for the notebook: the hunspell tool, its development package
# (libhunspell-dev) and git for fetching the dictionaries repository.
# NOTE(review): libhunspell-dev is presumably what lets the later pip install
# of the `hunspell` binding build from source -- confirm.
!apt-get -qq update
!apt-get -qq install -y hunspell git libhunspell-dev

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Preconfiguring packages ...
Selecting previously unselected package libtext-iconv-perl.
(Reading database ... 128639 files and directories currently installed.)
Preparing to unpack .../0-libtext-iconv-perl_1.7-7build3_amd64.deb ...
Unpacking libtext-iconv-perl (1.7-7build3) ...
Selecting previously unselected package dictionaries-common.
Preparing to unpack .../1-dictionaries-common_1.28.14_all.deb ...
Adding 'diversion of /usr/share/dict/words to /usr/share/dict/words.pre-dictionaries-common by dictionaries-common'
Unpacking dictionaries-common (1.28.14) ...
Selecting previously unselected package hunspell-en-us.
Preparing to unpack .../2-hunspell-en-us_1%3a2020.12.07-2_all.deb ...
Unpacking hunspell-en-us (1:2020.12.07-2) ...
Selecting previously unselected package libhunspell-1.7-0:am

In [2]:
!pip install hunspell

Collecting hunspell
  Downloading hunspell-0.5.5.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: hunspell
  Building wheel for hunspell (setup.py) ... [?25l[?25hdone
  Created wheel for hunspell: filename=hunspell-0.5.5-cp311-cp311-linux_x86_64.whl size=66309 sha256=b3033f2b5e344b091a8a0e70f30582a9b62399dd9f19e1b0ab4d170f0075c8ce
  Stored in directory: /root/.cache/pip/wheels/0b/41/b3/14ebfe8dfb3116e3f1ab55ff0db766d1ef033b6842ccc67e24
Successfully built hunspell
Installing collected packages: hunspell
Successfully installed hunspell-0.5.5


In [3]:
# Print the absolute path the dictionaries checkout will occupy; this is the
# value hard-coded as DICT_ROOT in the analysis cell.
!echo $PWD/dictionaries/dictionaries

/kaggle/working/dictionaries/dictionaries


In [4]:
!git clone https://github.com/wooorm/dictionaries
!pwd

Cloning into 'dictionaries'...
remote: Enumerating objects: 11042, done.[K
remote: Counting objects: 100% (1506/1506), done.[K
remote: Compressing objects: 100% (213/213), done.[K
remote: Total 11042 (delta 1318), reused 1303 (delta 1293), pack-reused 9536 (from 1)[K
Receiving objects: 100% (11042/11042), 101.83 MiB | 18.31 MiB/s, done.
Resolving deltas: 100% (9492/9492), done.
Updating files: 100% (570/570), done.
/kaggle/working


In [5]:
import os, re, glob, pandas as pd, hunspell

# --- Configuration -----------------------------------------------------------
# Directory containing the per-language word lists (braxen-<code>.txt).
FILE_DIR = "/kaggle/input/split-braxen-by-language"
# Directory of the cloned dictionaries repo holding one sub-directory per code.
DICT_ROOT = "/kaggle/working/dictionaries/dictionaries"
# Tab-separated results file written at the end of the run.
OUT_TSV = "hunspell_results.tsv"
# Word lists with fewer distinct entries than this are not checked.
MIN_ENTRIES = 100
# File codes that are never checked, even if a dictionary mapping exists.
SKIP_CODES = {"fisa", "mix", "sla", "aus", "asi", "afr"}

# --- Dictionary discovery ----------------------------------------------------
# Map each dictionary directory name to its (index.dic, index.aff) paths.
# Directories that have an .aff file but no matching .dic file are ignored.
pairs = {}
for aff_path in glob.glob(os.path.join(DICT_ROOT, "*", "index.aff")):
    folder = os.path.split(aff_path)[0]
    dic_path = os.path.join(folder, "index.dic")
    if not os.path.isfile(dic_path):
        continue
    pairs[os.path.basename(folder)] = (dic_path, aff_path)

# Maps the language code embedded in a braxen-<code>.txt file name to the
# dictionary directory names to try, in priority order: load_hs() uses the
# first name that was discovered under DICT_ROOT.
CODE2DICT = {
    "lat": ["la"],
    "swe": ["sv"], "nob": ["nb"], "nno": ["nn"], "dan": ["da"], "isl": ["is"],
    "fin": ["fi"], "est": ["et"], "lav": ["lv"], "lit": ["lt"],
    "pol": ["pl"], "cze": ["cs"], "slk": ["sk"], "slv": ["sl"], "hrv": ["hr"],
    "srp": ["sr-Latn"], "bos": ["bs"], "mkd": ["mk"], "bul": ["bg"],
    "ukr": ["uk"], "rus": ["ru"],
    "deu": ["de"], "nld": ["nl", "dut"],
    "eng": ["en", "en-GB", "en-CA", "en-AU", "en-ZA"],
    "fre": ["fr"], "ita": ["it"],
    "spa": ["es", "es-MX", "es-AR", "es-CL", "es-ES"],
    "por": ["pt", "pt-PT"], "rom": ["ro"],
    "hun": ["hu"], "tur": ["tr"], "gre": ["el"],
}

def read_text(p):
    """Read the file at ``p`` and return its contents decoded as UTF-8.

    The file is read in binary mode and decoded strictly first; if that
    fails, it is decoded again with undecodable bytes dropped, so a few
    bad bytes do not abort the whole run.

    Args:
        p: Path of the file to read.

    Returns:
        The decoded text as ``str``.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(p, "rb") as fh:
        raw = fh.read()
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        return raw.decode("utf-8", errors="ignore")

def get_words(text):
    """Return the first tab-separated field of every ``\\n``-delimited line.

    Every line contributes exactly one item, so empty lines (including the
    one after a trailing newline) yield an empty string.
    """
    return [row.split("\t")[0] for row in text.split("\n")]

def file_code(p):
    """Extract the language code from a ``braxen-<code>.txt`` style path.

    The ``braxen-`` prefix and the last four characters of the base name are
    removed. A base name without the prefix is returned unchanged.
    """
    name = os.path.basename(p)
    prefix = "braxen-"
    if not name.startswith(prefix):
        return name
    return name[len(prefix):-4]

def load_hs(dict_codes):
    """Open a HunSpell checker for the first available dictionary code.

    ``dict_codes`` is scanned in order; the first entry that is a key of the
    module-level ``pairs`` mapping is used.

    Returns:
        ``(checker, code)`` for the chosen dictionary, or ``(None, None)``
        when none of the codes is available.
    """
    chosen = next((c for c in dict_codes if c in pairs), None)
    if chosen is None:
        return None, None
    dic_path, aff_path = pairs[chosen]
    return hunspell.HunSpell(dic_path, aff_path), chosen

# All braxen-*.txt word lists in the input directory.
files = [
    os.path.join(FILE_DIR, name)
    for name in os.listdir(FILE_DIR)
    if name.startswith("braxen-") and name.endswith(".txt")
]

# Distinct first-column entries per file, and how many there are.
file_words = {path: set(get_words(read_text(path))) for path in files}
file_sizes = {path: len(entries) for path, entries in file_words.items()}

# Keep only files that are large enough, not explicitly skipped, and whose
# code has a dictionary mapping.
candidates = []
for path, size in file_sizes.items():
    if size < MIN_ENTRIES:
        continue
    lang = file_code(path)
    if lang in SKIP_CODES or lang not in CODE2DICT:
        continue
    candidates.append(path)

# File codes whose words get ö/Ö rewritten to ø/Ø before the dictionary lookup.
# The original check listed "nor", which is not a key of CODE2DICT and so
# never reached this loop; the Norwegian codes actually in use are "nob" and
# "nno". "nor" is kept so the rule still applies if that code is ever mapped.
# NOTE(review): presumably the word lists spell these words with the Swedish
# letter -- confirm against the data.
OSLASH_CODES = {"nob", "nno", "nor", "dan"}
OE_TO_OSLASH = str.maketrans("öÖ", "øØ")

# One (file_code, word, status, suggestions) record per checked word.
rows = []
for p in sorted(candidates):
    code = file_code(p)
    hs, used = load_hs(CODE2DICT[code])
    if hs is None:
        print(f"skip {code}: dict not found for {CODE2DICT[code]}")
        continue
    words = sorted(file_words[p])
    print(f"{code}: {len(words)} tokens via {used}")
    normalise = code in OSLASH_CODES  # loop-invariant, decide once per file
    for w in words:
        if not w:
            continue
        # Ignore single ASCII characters; single non-ASCII characters are kept.
        if len(w) < 2 and w.isascii():
            continue
        # The normalised form is only used for the lookup; the original
        # spelling is what gets recorded in the results.
        wcheck = w.translate(OE_TO_OSLASH) if normalise else w
        if hs.spell(wcheck):
            rows.append((code, w, "OK", ""))
        else:
            sugs = ", ".join(hs.suggest(wcheck))
            rows.append((code, w, "MISS", sugs))

df = pd.DataFrame(rows, columns=["file_code","word","status","suggestions"])
df.to_csv(OUT_TSV, sep="\t", index=False, encoding="utf-8")
print(f"Wrote {OUT_TSV} with {len(df):,} rows")
df.head(20)

cze: 371 tokens via cs
dan: 1904 tokens via da
eng: 19700 tokens via en
skip fin: dict not found for ['fi']
fre: 5465 tokens via fr
gre: 817 tokens via el
hun: 361 tokens via hu
ita: 3051 tokens via it
lat: 3753 tokens via la
nob: 3763 tokens via nb
pol: 707 tokens via pl
por: 440 tokens via pt
rus: 1321 tokens via ru
spa: 2405 tokens via es
swe: 707415 tokens via sv
tur: 762 tokens via tr
ukr: 106 tokens via uk
Wrote hunspell_results.tsv with 752,322 rows


Unnamed: 0,file_code,word,status,suggestions
0,cze,Adamkova,OK,
1,cze,Allertova,MISS,"Albertova, Gallertová, Albertov, Tolerovat"
2,cze,Babiš,OK,
3,cze,Balazova,MISS,"Balasova, Balažova, Balažová, Balážova, Balážo..."
4,cze,Banik,OK,
5,cze,Banska,MISS,"Bánská, Banka, Baska, Blanska, Banika, Baníka,..."
6,cze,Baranka,OK,
7,cze,Bartecko,MISS,"Bartečko, Barteckou, Bartesko, Bartecký, Barte..."
8,cze,Bartok,OK,
9,cze,Bartosak,MISS,"Bartošák, Barto sak, Barto-sak, Bartok"
