In [2]:
!pip install hunspell

Collecting hunspell
  Using cached hunspell-0.5.5.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: hunspell
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for hunspell (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for hunspell[0m[31m
[0m[?25h  Running setup.py clean for hunspell
Failed to build hunspell
[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (hunspell)[0m[31m
[0m

In [3]:
!apt-get -qq update
!apt-get -qq install -y hunspell git

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Preconfiguring packages ...
Selecting previously unselected package libtext-iconv-perl.
(Reading database ... 128639 files and directories currently installed.)
Preparing to unpack .../libtext-iconv-perl_1.7-7build3_amd64.deb ...
Unpacking libtext-iconv-perl (1.7-7build3) ...
Selecting previously unselected package dictionaries-common.
Preparing to unpack .../dictionaries-common_1.28.14_all.deb ...
Adding 'diversion of /usr/share/dict/words to /usr/share/dict/words.pre-dictionaries-common by dictionaries-common'
Unpacking dictionaries-common (1.28.14) ...
Selecting previously unselected package hunspell-en-us.
Preparing to unpack .../hunspell-en-us_1%3a2020.12.07-2_all.deb ...
Unpacking hunspell-en-us (1:2020.12.07-2) ...
Selecting previously unselected package libhunspell-1.7-0:amd64.
Preparing to u

In [5]:
import os, re, subprocess, shlex
import pandas as pd
from collections import defaultdict

# CONFIG
FILE_DIR = "/kaggle/input/split-braxen-by-language"
MIN_ENTRIES = 1000
SKIP_CODES = {"afr","asi","aus","sla","mix","fisa"}
OUT_TSV  = "hunspell_results.tsv"
SAMPLES_PER_FILE = None  # set e.g. 500 to sample per file during testing

# Get dictionaries
if not os.path.exists("dictionaries"):
    !git clone -q https://github.com/wooorm/dictionaries.git

DICT_ROOT = os.path.abspath("dictionaries")

# Map your codes -> hunspell dictionary basenames (relative to DICT_ROOT)
# You can give multiple dicts; they will be combined (e.g. nb+nn, pt+pt-PT)
CODE2DICT = {
    "swe": ["sv"],
    "nor": ["nb","nn"],        # Norwegian Bokmål + Nynorsk
    "dan": ["da"],
    "isl": ["is"],
    "fin": ["fi"],
    "est": ["et"],
    "lav": ["lv"],
    "lit": ["lt"],
    "pol": ["pl"],
    "cze": ["cs"],
    "slk": ["sk"],
    "slv": ["sl"],
    "hrv": ["hr"],
    "srp": ["sr-Latn"],        # Latin Serbian
    "bos": ["bs"],
    "mkd": ["mk"],
    "bul": ["bg"],
    "ukr": ["uk"],
    "rus": ["ru"],
    "deu": ["de"],             # de, de-AT, de-CH also available
    "nld": ["nl"], "dut": ["nl"],
    "eng": ["en","en-GB","en-CA","en-AU","en-ZA"],  # combine EN variants
    "fre": ["fr"],
    "ita": ["it"],
    "spa": ["es","es-MX","es-AR","es-CL","es-ES"],  # add variants to taste
    "por": ["pt","pt-PT"],
    "rom": ["ro"],
    "hun": ["hu"],
    "tur": ["tr"],
    "gre": ["el"],
    "wel": ["cy"],
    "gle": ["ga"],  # if you have it
    # add more as needed; see `ls dictionaries` for available dirs
}

WORD_RE = re.compile(r"[^\W\d_][\w’'\-\u2011\u2013\u2014]*", flags=re.UNICODE)

def tokenize(text: str):
    return WORD_RE.findall(text)

def read_text(path):
    with open(path, "rb") as f:
        b = f.read()
    try:
        return b.decode("utf-8")
    except UnicodeDecodeError:
        return b.decode("utf-8", errors="ignore")

def file_code_from_name(path):
    base = os.path.basename(path)
    return base[len("braxen-"):-len(".txt")] if base.startswith("braxen-") else base

def list_candidate_files():
    files = [os.path.join(FILE_DIR, f) for f in os.listdir(FILE_DIR)
             if f.startswith("braxen-") and f.endswith(".txt")]
    sizes = {}
    uniqs = {}
    for p in files:
        ws = set(tokenize(read_text(p)))
        sizes[p] = len(ws)
        uniqs[p] = ws
    keep = []
    for p, n in sizes.items():
        code = file_code_from_name(p)
        if n >= MIN_ENTRIES and code not in SKIP_CODES and code in CODE2DICT:
            keep.append((p, uniqs[p]))
    return keep

def dict_args_for(code):
    dicts = CODE2DICT.get(code, [])
    # Validate presence of .aff/.dic, build full dict basenames for -d
    ok = []
    for d in dicts:
        base = os.path.join(DICT_ROOT, d, d)
        if os.path.exists(base + ".aff") and os.path.exists(base + ".dic"):
            ok.append(base)
    return ok

def run_hunspell(words, dict_bases, code):
    # hunspell -a -i utf-8 -d dict[,dict2,...]
    # Note: pass dict_bases comma-separated; hunspell finds .aff/.dic automatically.
    if not dict_bases:
        return []
    cmd = ["hunspell","-a","-i","utf-8","-d", ",".join(dict_bases)]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    # Input: one word per line
    stdin_data = "\n".join(words) + "\n"
    out, err = p.communicate(stdin_data, timeout=300)

    results = []
    # Output format: first line is version/banner, then one response per input token
    # Lines:
    #  *  => OK
    #  +  => OK (root/compounded)
    #  & word count suggestions: sug, sug, ...
    #  # => unknown, no suggestions
    #  ? => guess (rare)
    lines = out.splitlines()
    # Drop banner lines (start with '@' or '# version info) until first response marker
    it = iter(lines)
    # consume until we see a marker line
    first_resp_seen = False
    clean_lines = []
    for ln in it:
        if ln and ln[0] in {"*","+","&","#","?"}:
            first_resp_seen = True
            clean_lines.append(ln)
            break
    if first_resp_seen:
        clean_lines.extend(list(it))

    # Pair responses to inputs 1:1
    # hunspell returns exactly one response per input token in -a mode.
    if len(clean_lines) != len(words):
        # fallback: try to realign by skipping empty/comment lines
        cl = [ln for ln in clean_lines if ln and ln[0] in {"*","+","&","#","?"}]
        clean_lines = cl + ["#"]*(len(words)-len(cl)) if len(cl) < len(words) else cl[:len(words)]

    for w, ln in zip(words, clean_lines):
        if not ln:
            results.append((w,"ERROR",""))
            continue
        tag = ln[0]
        if tag in {"*","+"}:
            results.append((w,"OK",""))
        elif tag == "&":
            # & word count offset: sug, sug, sug
            parts = ln.split(":")
            sugs = parts[1].strip() if len(parts) > 1 else ""
            results.append((w,"SUGGEST", sugs))
        elif tag in {"#","?"}:
            results.append((w,"SUGGEST",""))  # unknown, no suggestions provided
        else:
            results.append((w,"UNKNOWN", ln))
    return results

candidates = list_candidate_files()
rows = []

for path, uniq in candidates:
    code = file_code_from_name(path)

    # Ignore the Norwegian/Danish ö→ø quirk: replace ö with ø only for checking,
    # but keep original form in output.
    def normalize_for_check(w):
        if code in {"nor","dan"}:
            return w.replace("ö","ø").replace("Ö","Ø")
        return w

    words = sorted(uniq)
    if SAMPLES_PER_FILE:
        words = words[:SAMPLES_PER_FILE]

    dict_bases = dict_args_for(code)
    if not dict_bases:
        # no dictionary, skip
        continue

    # Feed a deduped list to hunspell
    check_words = [normalize_for_check(w) for w in words]
    results = run_hunspell(check_words, dict_bases, code)

    # Attach original tokens and file code
    for orig, (checked, status, sugs) in zip(words, results):
        rows.append({
            "file_code": code,
            "word": orig,
            "status": status,          # OK | SUGGEST | UNKNOWN | ERROR
            "suggestions": sugs
        })

df = pd.DataFrame(rows, columns=["file_code","word","status","suggestions"])
df.to_csv(OUT_TSV, sep="\t", index=False)
print(f"Wrote {OUT_TSV} with {len(df):,} rows")

# small preview
df.head(30)


Wrote hunspell_results.tsv with 0 rows


Unnamed: 0,file_code,word,status,suggestions
