In [13]:
!apt-get -qq update
!apt-get -qq install -y hunspell git libhunspell-dev

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libhunspell-dev:amd64.
(Reading database ... 128735 files and directories currently installed.)
Preparing to unpack .../libhunspell-dev_1.7.0-4build1_amd64.deb ...
Unpacking libhunspell-dev:amd64 (1.7.0-4build1) ...
Setting up libhunspell-dev:amd64 (1.7.0-4build1) ...
Processing triggers for man-db (2.10.2-1) ...


In [14]:
!pip install hunspell

Collecting hunspell
  Using cached hunspell-0.5.5.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: hunspell
  Building wheel for hunspell (setup.py) ... [?25l[?25hdone
  Created wheel for hunspell: filename=hunspell-0.5.5-cp311-cp311-linux_x86_64.whl size=66312 sha256=0b5000b26d6eb6e89d8cdd3e49a07e82d1beb9073da82a0aa180dad86c80c24c
  Stored in directory: /root/.cache/pip/wheels/0b/41/b3/14ebfe8dfb3116e3f1ab55ff0db766d1ef033b6842ccc67e24
Successfully built hunspell
Installing collected packages: hunspell
Successfully installed hunspell-0.5.5


In [9]:
!echo $PWD/dictionaries/dictionaries

/kaggle/working/dictionaries/dictionaries


In [18]:
!git clone https://github.com/wooorm/dictionaries
!pwd

fatal: destination path 'dictionaries' already exists and is not an empty directory.
/kaggle/working


In [25]:
import os, re, hunspell, pandas as pd
from collections import defaultdict

FILE_DIR = "/kaggle/input/split-braxen-by-language"
DICT_ROOT = "/kaggle/working/dictionaries/dictionaries"
OUT_TSV = "hunspell_results.tsv"

MIN_ENTRIES = 1000
SKIP_CODES = {"afr","asi","aus","sla","mix","fisa"}

CODE2DICT = {
    "swe": ["sv"],
    "nor": ["nb","nn"],
    "dan": ["da"],
    "isl": ["is"],
    "fin": ["fi"],
    "est": ["et"],
    "lat": ["la"],
    "lav": ["lv"],
    "lit": ["lt"],
    "pol": ["pl"],
    "cze": ["cs"],
    "slk": ["sk"],
    "slv": ["sl"],
    "hrv": ["hr"],
    "srp": ["sr-Latn"],
    "bos": ["bs"],
    "mkd": ["mk"],
    "bul": ["bg"],
    "ukr": ["uk"],
    "rus": ["ru"],
    "deu": ["de"],
    "nld": ["nl"],
    "eng": ["en","en-GB","en-CA"],
    "fre": ["fr"],
    "ita": ["it"],
    "spa": ["es"],
    "por": ["pt","pt-PT"],
    "rom": ["ro"],
    "hun": ["hu"],
    "tur": ["tr"],
    "gre": ["el"],
}

WORD_RE = re.compile(r"[^\W\d_][\w’'\-\u2011\u2013\u2014]*", re.UNICODE)

def tokenize(text):
    return WORD_RE.findall(text)

def read_text(path):
    with open(path, "rb") as f:
        data = f.read()
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError:
        return data.decode("utf-8", errors="ignore")

def file_code(path):
    base = os.path.basename(path)
    return base[len("braxen-"):-4]

def load_hunspell(dict_codes):
    aff, dic = None, None
    for code in dict_codes:
        base = os.path.join(DICT_ROOT, code, code)
        if os.path.exists(base + ".aff") and os.path.exists(base + ".dic"):
            return hunspell.HunSpell(base + ".dic", base + ".aff")
    return None

files = [os.path.join(FILE_DIR, f) for f in os.listdir(FILE_DIR)
         if f.startswith("braxen-") and f.endswith(".txt")]

rows = []
for path in files:
    code = file_code(path)
    if code in SKIP_CODES:
        continue
    words = set(tokenize(read_text(path)))
    if len(words) < MIN_ENTRIES:
        continue

    dict_codes = CODE2DICT.get(code)
    if not dict_codes:
        print(f"Skipping {code} — no dictionary mapping.")
        continue

    hobj = load_hunspell(dict_codes)
    if not hobj:
        print(f"Skipping {code} — dictionary not found in {DICT_ROOT}.")
        continue

    print(f"Checking {code} ({len(words)} words) ...")

    for w in sorted(words):
        if not w:
            continue
        if all(ord(ch) < 128 for ch in w) and len(w) < 2:
            continue
        # ø/ö quirk normalization
        wcheck = w.replace("ö","ø").replace("Ö","Ø") if code in {"nor","dan"} else w
        ok = hobj.spell(wcheck)
        if ok:
            rows.append((code, w, "OK", ""))
        else:
            sugs = ", ".join(hobj.suggest(wcheck))
            rows.append((code, w, "MISS", sugs))

df = pd.DataFrame(rows, columns=["file_code","word","status","suggestions"])
df.to_csv(OUT_TSV, sep="\t", index=False, encoding="utf-8")

print(f"Wrote {OUT_TSV} with {len(df)} rows")
df.head(30)


Skipping ara — no dictionary mapping.
Skipping fre — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping dan — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping lat — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping swe — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping eng — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping rus — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping nob — no dictionary mapping.
Skipping ita — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping spa — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Skipping ger — no dictionary mapping.
Skipping fin — dictionary not found in /kaggle/working/dictionaries/dictionaries.
Wrote hunspell_results.tsv with 0 rows


Unnamed: 0,file_code,word,status,suggestions


In [23]:
!ls

dictionaries  hunspell_results.tsv


In [24]:
!cat hunspell_results.tsv

file_code	word	status	suggestions
