In [None]:
# Kaggle-ready script: obvious misclassifications in large lexicon files
# ---------------------------------------------------------------
# Focus: skip catch-all buckets, analyze files with >= MIN_ENTRIES (default: 1000)
# Output:
#   - braxen_obvious_misclassifications_filtered.csv          (detailed flags)
#   - braxen_obvious_misclassifications_per_file_summary.csv  (per-file counts)
#   - braxen_obvious_misclassifications_samples.csv           (top N examples / file)
#
# Assumptions:
#   - You uploaded `braxen.zip` in the notebook working directory
#   - Files inside the zip are UTF-8 (strict decoding is tried first; if that fails,
#     undecodable bytes are dropped via errors="ignore" so the run can continue)

import os, zipfile, re, unicodedata
from collections import defaultdict, Counter
import pandas as pd

# -------------------------
# CONFIG
# -------------------------
# Archive holding the per-language lexicon files ("braxen-<code>.txt").
ZIP_PATH = "braxen.zip"   # change if needed
# Directory the archive is unpacked into; extraction is skipped when it already exists.
EXTRACT_DIR = "braxen_extracted"
# Minimum number of unique tokens a file must contain to be analyzed.
MIN_ENTRIES = 1000
# File codes excluded from the analysis regardless of their size.
SKIP_CODES = {
    # Skip "catch-all" / broad buckets
    "afr","asi","aus","sla","mix","rom",  # adjust if "rom" is *specific* in your data
    # Add any others you want to ignore:
    # "fisa", ...
}
# How many example rows per file to keep in the samples CSV
SAMPLES_PER_FILE = 50

# -------------------------
# Helpers: script checks (no external 'regex' dependency)
# -------------------------
def has_cyrillic(s: str) -> bool:
    """Return True if any character of *s* lies in a Cyrillic Unicode block.

    Blocks checked (inclusive code point ranges):
    Cyrillic (0400-04FF), Cyrillic Supplement (0500-052F),
    Cyrillic Extended-A (2DE0-2DFF) and Cyrillic Extended-B (A640-A69F).
    """
    cyrillic_blocks = (
        (0x0400, 0x04FF),
        (0x0500, 0x052F),
        (0x2DE0, 0x2DFF),
        (0xA640, 0xA69F),
    )
    return any(
        first <= ord(ch) <= last
        for ch in s
        for first, last in cyrillic_blocks
    )

def has_greek(s: str) -> bool:
    """Return True if any character of *s* lies in a Greek Unicode block.

    Covers Greek and Coptic (0370-03FF) and Greek Extended (1F00-1FFF),
    both ranges inclusive.
    """

    def in_greek_block(code_point: int) -> bool:
        return 0x0370 <= code_point <= 0x03FF or 0x1F00 <= code_point <= 0x1FFF

    return any(in_greek_block(ord(ch)) for ch in s)

def has_han(s: str) -> bool:
    """Return True if *s* contains a CJK Unified Ideograph (4E00-9FFF).

    Only the basic block is checked; the extension blocks are deliberately
    left out.
    """
    # Comparing single-character strings compares their code points.
    return any("\u4e00" <= ch <= "\u9fff" for ch in s)

# -------------------------
# Diacritic inventories (distinctive)
# -------------------------
# Maps a file/language code to the set of non-ASCII letters treated as typical
# for that language. The inventories overlap heavily (e.g. "á" is listed for
# several codes), so a single letter often points at more than one language.
DIACRITICS = {
    "pol": set("ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"),
    "cze": set("áéíóúýčďěňřšťůžÁÉÍÓÚÝČĎĚŇŘŠŤŮŽ"),
    "slk": set("áäčďéíľĺňóôŕšťúýžÁÄČĎÉÍĽĹŇÓÔŔŠŤÚÝŽ"),
    "slv": set("čšžČŠŽ"),
    "hrv": set("čćđšžČĆĐŠŽ"),
    "srp": set("čćđšžČĆĐŠŽ"),  # Latin Serbian
    # Include both the cedilla forms (ş/ţ) and the comma-below forms (ș/ț).
    # The s/t variants are written as escapes because the two forms are
    # visually near-identical and easy to duplicate or drop by accident
    # (previously s-cedilla U+015F/U+015E was missing from this set).
    "rom": set(
        "ăâî"
        "\u015f\u0219"  # ş (cedilla), ș (comma below)
        "\u0163\u021b"  # ţ (cedilla), ț (comma below)
        "ĂÂÎ"
        "\u015e\u0218"  # Ş (cedilla), Ș (comma below)
        "\u0162\u021a"  # Ţ (cedilla), Ț (comma below)
    ),
    "hun": set("áéíóöőúüűÁÉÍÓÖŐÚÜŰ"),
    "tur": set("çğıöşüÇĞİÖŞÜ"),
    "lit": set("ąčęėįšųūžĄČĘĖĮŠŲŪŽ"),
    "lav": set("āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ"),
    "deu": set("äöüßÄÖÜ"),
    "fra": set("àâæçéèêëîïôœùûüÿÀÂÆÇÉÈÊËÎÏÔŒÙÛÜŸ"),
    "spa": set("áéíñóúüÁÉÍÑÓÚÜ"),
    "por": set("áâãàçéêíóôõúÁÂÃÀÇÉÊÍÓÔÕÚ"),
    "isl": set("áéíóúýðþæöÁÉÍÓÚÝÐÞÆÖ"),
}

def file_code_from_name(fname: str) -> str:
    """Extract the language code from a lexicon file name.

    "braxen-cze.txt" (with or without leading directories) yields "cze".
    Names that do not follow the "braxen-<code>.txt" pattern are returned
    as their bare base name.
    """
    prefix, suffix = "braxen-", ".txt"
    leaf = os.path.basename(fname)
    if not (leaf.startswith(prefix) and leaf.endswith(suffix)):
        return leaf
    return leaf[len(prefix):len(leaf) - len(suffix)]

def read_text_safely(path):
    """Read the file at *path* and return its contents decoded as UTF-8.

    Bytes that are not valid UTF-8 are silently dropped instead of raising,
    so a partially corrupt file still yields text.
    """
    with open(path, "rb") as handle:
        raw = handle.read()
    # The "ignore" handler only kicks in at invalid bytes, so valid input
    # decodes exactly as a strict decode would.
    return raw.decode("utf-8", errors="ignore")

# A token starts with a letter (any script) and continues with word
# characters, apostrophes (’ or ') or hyphens.
# The hyphen is escaped on purpose: an unescaped "'--" inside the class is
# parsed as the range U+0027..U+002D, which would also admit ( ) * + , and
# glue comma-separated words into a single token.
WORD_RE = re.compile(r"[^\W\d_][\w’'\-]*", flags=re.UNICODE)

def tokenize_words(text: str):
    """Return all word tokens found in *text*, in order of appearance.

    A token begins with a letter and may continue with letters, digits,
    underscores, apostrophes and hyphens; diacritics are preserved.
    Duplicates are kept. Returns an empty list for empty input.
    """
    return WORD_RE.findall(text)

def detect_diacritic_langs(word: str):
    """List the language codes whose diacritic inventory intersects *word*.

    Codes are returned in the iteration order of DIACRITICS; the list is
    empty when no character of *word* appears in any inventory.
    """
    return [
        lang
        for lang, inventory in DIACRITICS.items()
        if any(ch in inventory for ch in word)
    ]

# Special-case checks (lightweight and obvious)
def is_bulgarian_like_cyrillic(word: str) -> bool:
    """Heuristically decide whether a Cyrillic *word* looks Bulgarian.

    The check is case-insensitive. A word qualifies when it contains "ъ",
    or when it ends in one of the suffixes "ът", "та", "то", "те" and
    contains none of the letters "ь", "ы", "ё".
    """
    lowered = word.lower()
    if "ъ" in lowered:
        return True
    ends_like_article = lowered.endswith(("ът","та","то","те"))
    has_excluded_letter = "ь" in lowered or "ы" in lowered or "ё" in lowered
    return ends_like_article and not has_excluded_letter

# Shorthand for the Czech diacritic inventory; the POL and CHI special-case
# checks test words against this set.
CZE_DIACS = DIACRITICS["cze"]

# -------------------------
# 1) Extract the archive
# -------------------------
# Unpack only once: an existing EXTRACT_DIR is assumed to hold a previous extraction.
if not os.path.exists(EXTRACT_DIR):
    with zipfile.ZipFile(ZIP_PATH, "r") as z:
        z.extractall(EXTRACT_DIR)

# Walk the whole tree rather than listing only the top level: archives are
# often zipped together with their parent folder, which would otherwise
# leave this list empty. Sorting keeps the processing order reproducible.
files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(EXTRACT_DIR)
    for name in names
    if name.startswith("braxen-") and name.endswith(".txt")
)

# -------------------------
# 2) Count entries per file (unique tokens)
# -------------------------
# path -> set of unique tokens found in that file
file_word_sets = {}
# path -> number of unique tokens (the "entry" count used for filtering)
file_sizes = {}
for path in files:
    uniq = set(tokenize_words(read_text_safely(path)))
    file_word_sets[path] = uniq
    file_sizes[path] = len(uniq)

# Filter to large, non-catch-all files
candidate_paths = [
    p
    for p, n in file_sizes.items()
    if n >= MIN_ENTRIES and file_code_from_name(p) not in SKIP_CODES
]

print(f"Found {len(candidate_paths)} large non-catch-all files (>= {MIN_ENTRIES} entries).")

# -------------------------
# 3) Scan candidates for obvious mismatches
# -------------------------
# Flat list of (word, current_code, suggested, reason) tuples, one per flag.
records = []
# current_code -> reason -> number of flags raised with that reason.
per_file_flags = defaultdict(lambda: defaultdict(int))

def add_record(word, current_code, suggested, reason):
    """Register one suspected misclassification.

    Appends the flag to `records` and bumps the per-file counter for
    *reason* under *current_code* in `per_file_flags`.
    """
    flag = (word, current_code, suggested, reason)
    records.append(flag)
    reason_counts = per_file_flags[current_code]
    reason_counts[reason] = reason_counts[reason] + 1

# File codes in which Cyrillic script is expected and therefore not flagged.
CYRILLIC_FILE_CODES = {"rus", "ukr", "bul", "mkd", "srp"}
# Czech letters that are NOT also Polish letters. "ó"/"Ó" belongs to both
# inventories, so testing POL words against the full Czech set would flag
# ordinary Polish words as Czech.
CZE_NOT_POL_DIACS = CZE_DIACS - DIACRITICS["pol"]

for path in candidate_paths:
    code = file_code_from_name(path)
    # Sorted so the order of records (and hence the rows picked for the
    # samples CSV) is the same on every run; raw set order is not stable.
    uniq_words = sorted(file_word_sets[path])

    # a) Script mismatches: Greek/Cyrillic where they don't belong
    for w in uniq_words:
        if has_greek(w) and code != "gre":
            add_record(w, code, "gre", "Greek script in non-Greek file")
        elif has_cyrillic(w) and code not in CYRILLIC_FILE_CODES:
            add_record(w, code, "cyrillic?", "Cyrillic characters in non-Cyrillic file")

    # b) Diacritic-driven mismatches (high-confidence when unambiguous)
    for w in uniq_words:
        # Only consider words containing non-ASCII chars (skip plain ASCII names)
        if all(ord(ch) < 128 for ch in w):
            continue
        langs = detect_diacritic_langs(w)
        if langs and code not in langs:
            joined = ",".join(sorted(langs))
            suggested = langs[0] if len(langs) == 1 else "ambiguous(" + joined + ")"
            add_record(w, code, suggested, f"Contains diacritics typical of {joined}")

    # c) Special fixes (based on your notes)

    #   - Lithuanian/Latvian that landed in LAT
    #     Lithuanian is tested first, so letters shared by both inventories
    #     are reported as "lit".
    if code == "lat":
        for w in uniq_words:
            if any(ch in DIACRITICS["lit"] for ch in w):
                add_record(w, code, "lit", "Lithuanian diacritics in LAT")
            elif any(ch in DIACRITICS["lav"] for ch in w):
                add_record(w, code, "lav", "Latvian diacritics in LAT")

    #   - Bulgarian words that landed in RUS
    if code == "rus":
        for w in uniq_words:
            if has_cyrillic(w) and is_bulgarian_like_cyrillic(w):
                add_record(w, code, "bul", "Bulgarian hard vowel/definite article in RUS")

    #   - Czech diacritics that landed in POL (letters shared with Polish are ignored)
    if code == "pol":
        for w in uniq_words:
            if any(ch in CZE_NOT_POL_DIACS for ch in w):
                add_record(w, code, "cze", "Czech diacritics in POL")

    #   - Czech diacritics that landed in CHI (Chinese)
    #     NOTE(review): CZE_DIACS contains á/é/í/ó/ú/ě, which presumably also
    #     occur as pinyin tone marks; only "Zhéng" is exempted here, so other
    #     toned pinyin words are likely flagged too - confirm whether intended.
    if code == "chi":
        for w in uniq_words:
            if any(ch in CZE_DIACS for ch in w) and w not in {"Zhéng"}:  # leave Zhéng (pinyin tone) alone
                add_record(w, code, "cze", "Czech diacritics in CHI")

# -------------------------
# 4) Save outputs
# -------------------------
DETAIL_COLUMNS = ["word", "current_code", "suggested_code", "reason"]

detailed_df = pd.DataFrame(records, columns=DETAIL_COLUMNS).drop_duplicates()
detailed_df.to_csv("braxen_obvious_misclassifications_filtered.csv", index=False, encoding="utf-8")

# One summary row per file code: total flag count plus one column per reason.
summary_rows = []
for code, counters in per_file_flags.items():
    row = {"file_code": code, "total_flags": sum(counters.values())}
    row.update(counters)
    summary_rows.append(row)

if summary_rows:
    summary_df = pd.DataFrame(summary_rows)
    # A reason that never occurred for a file shows up as NaN, which also
    # turns the whole column into floats; store real integer counts instead.
    count_cols = [c for c in summary_df.columns if c != "file_code"]
    summary_df[count_cols] = summary_df[count_cols].fillna(0).astype(int)
    summary_df = summary_df.sort_values("total_flags", ascending=False)
else:
    # No flags at all: build the frame with explicit columns, because
    # sorting a column-less frame by "total_flags" would raise KeyError.
    summary_df = pd.DataFrame(columns=["file_code", "total_flags"])
summary_df.to_csv("braxen_obvious_misclassifications_per_file_summary.csv", index=False, encoding="utf-8")

# Optional: provide compact samples per file for quick eyeballing
sample_rows = [
    sub.head(SAMPLES_PER_FILE)
    for _code, sub in detailed_df.groupby("current_code")
]
if sample_rows:
    samples_df = pd.concat(sample_rows, ignore_index=True)
else:
    # Create an empty file so it's obvious no flags were found
    samples_df = pd.DataFrame(columns=DETAIL_COLUMNS)
samples_df.to_csv("braxen_obvious_misclassifications_samples.csv", index=False, encoding="utf-8")

print("Wrote:")
print(" - braxen_obvious_misclassifications_filtered.csv")
print(" - braxen_obvious_misclassifications_per_file_summary.csv")
print(" - braxen_obvious_misclassifications_samples.csv")

# Show quick summary preview
# `display` only exists inside IPython/Jupyter; fall back to print so the
# script also finishes when run with a plain Python interpreter.
try:
    show = display
except NameError:
    show = print
show(summary_df.head(20))
show(detailed_df.head(50))
