In [None]:
import os
import time
import random
import hashlib
import html
import xml.etree.ElementTree as ET
from io import BytesIO
from pathlib import Path
from collections import Counter, defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
import regex as re
import jaconv
import matplotlib as mpl
import matplotlib.pyplot as plt
from pydub import AudioSegment
from google.cloud import texttospeech
from google.api_core import exceptions
from sudachipy import tokenizer, dictionary

from consts import JMDICT_ENTITIES, JLPT_LEVELS, TRANSLATION_OVERRIDES, READING_OVERRIDES, \
    KANJI_OVERRIDES, TOP_KANJI_ORDER, SIMILAR_KANJI_GROUPS, DONT_DROP_ME

### Config

In [None]:
# Deck version as (x, y, z):
#  - x: Increment for breaking changes. This means changing the names or order of fields in the "WordRank Kanji" note
#       type (appending new fields does not count), or deleting card types. Prefer to deprecate/hide rather than delete.
#  - y: Increment for new features and non-breaking big changes, i.e. adding new processed data (e.g. adding pitch
#       accent), probably reordering, etc.
#  - z: Increment for data cleanups, typo fixes etc.
semver = (1,0,0)

# A lighter, faster run to check things work as intended
test_mode = False
# How many sentences to read from each corpus when test_mode is on
sentence_limit = 10000

# All paths are resolved relative to the current working directory
home_dir = Path.cwd()
data_dir = home_dir / "data"

# Corpus files (each should be rows of plain Japanese sentences), keyed by corpus name. "weight" is the relative
# weighting of that corpus's contribution of each kanji/vocab to the total "count" (the weights don't have to add up
# to anything in particular).
CORPORA = {
    "jesc": {"filepath": data_dir / "JESC_jp.txt", "weight": 40},
    "wl_web": {"filepath": data_dir / "wl_web.txt", "weight": 35},
    "wl_news": {"filepath": data_dir / "wl_news.txt", "weight": 16},
    "wl_wiki": {"filepath": data_dir / "wl_wiki.txt", "weight": 9},
}

# How many vocab lines to include on the cards
top_n_vocab = 15

### Stage 1: Utils and setup

In [None]:
# Small ya/yu/yo and small vowel kana, in both hiragana and katakana (the small tsu っ/ッ is not included)
SMALL_KANA = set("ゃゅょぁぃぅぇぉャュョァィゥェォ")


def is_kanji(c):
    """Tell whether ``c`` is a kanji: CJK Unified Ideographs, Extension A, or the iteration mark 々."""
    unified = '\u4e00' <= c <= '\u9fff'
    if unified:
        return True
    extension_a = '\u3400' <= c <= '\u4dbf'
    if extension_a:
        return True
    return c == '\u3005'


def is_japanese_string(s, keep_numbers=True):
    """
    Check that ``s`` is a non-empty string made up solely of Japanese script characters.

    Accepted characters are Han, Hiragana, Katakana and the prolonged sound mark ー. With ``keep_numbers=True``
    (the default) the middle dot ・ and ASCII / full-width digits are accepted as well; with ``keep_numbers=False``
    neither digits nor the middle dot are accepted.

    Non-string or empty input returns False. Relies on ``\\p{...}`` script classes, so ``re`` must be the
    third-party ``regex`` module (as imported at the top of the notebook).
    """
    if not (isinstance(s, str) and s):
        return False

    allowed = (
        r'^[\p{Han}\p{Hiragana}\p{Katakana}ー・0-9０-９]+$'
        if keep_numbers
        else r'[\p{Han}\p{Hiragana}\p{Katakana}ー]+'
    )
    match = re.fullmatch(allowed, s)
    return match is not None


def is_hiragana_string(s):
    """Return True if ``s`` is a string whose every character lies in U+3041..U+309F (an empty string passes)."""
    if not isinstance(s, str):
        return False
    for ch in s:
        if not ('\u3041' <= ch <= '\u309F'):
            return False
    return True


def is_kana(ch):
    """Return True if ``ch`` is hiragana, katakana, the prolonged sound mark ー or the middle dot ・."""
    if ch in ('ー', '・'):
        return True
    in_hiragana_block = '\u3040' <= ch <= '\u309f'
    in_katakana_block = '\u30a0' <= ch <= '\u30ff'
    return in_hiragana_block or in_katakana_block


# Version tag built from `semver`, e.g. "v1.0.0"; outputs are written to out/<version>
version = f"v{semver[0]}.{semver[1]}.{semver[2]}"
out_dir = home_dir / "out" / version

# NOTE(review): home_dir is Path.cwd(), which already exists, so the first mkdir is effectively a no-op.
# Possibly data_dir was intended here — confirm.
home_dir.mkdir(parents=True, exist_ok=True)
out_dir.mkdir(parents=True, exist_ok=True)

# Paths under data/ (presumably for generated audio, going by the names); they are not created here
wavs_dir = data_dir / "wavs"
mp3s_dir = data_dir / "mp3s"

### Stage 2: Parse JP dictionaries to usable dfs

In [None]:
def parse_jmdict_to_df(filepath, jmdict_entities):
    """
    Parse JMdict XML into a DataFrame with one row per (lemma, reading, merged POS group).

    Within an entry, senses that share the same set of POS codes are merged: their glosses are interleaved
    round-robin and their flags OR-ed. Senses tagged arch/obs/rare/poet are dropped from a POS group when that
    group also has at least one sense without those tags.

    Args:
        filepath: Path to the JMdict XML file.
        jmdict_entities: Mapping of JMdict entity code -> human-readable description. Used both to recognise raw
            codes and (inverted) to map descriptions back to codes.

    Returns DataFrame with columns:
        lemma, jmdict_reading, reading_kata, sense_glosses, sense_pos, sense_misc, is_godan, is_ichidan, is_vt,
        is_it, is_suru, is_irregular, is_uk, is_onomatopoeic, is_mimetic

        ``lemma`` is None for kana-only entries (no <keb> element).
    """
    # Column order of the boolean flags in the output rows
    FLAG_NAMES = (
        "is_godan", "is_ichidan", "is_vt", "is_it", "is_suru",
        "is_irregular", "is_uk", "is_onomatopoeic", "is_mimetic",
    )
    BAD_MISC = {"arch", "obs", "rare", "poet"}

    # Invert entity mapping: lower-cased description -> set of entity codes
    human_to_codes = {}
    for code, text in jmdict_entities.items():
        if not text:
            continue
        human_to_codes.setdefault(text.strip().lower(), set()).add(code)

    root = ET.parse(filepath).getroot()
    rows = []

    def add_codes(token, target):
        """Attach grammatical/misc codes from a POS or misc tag's text to the ``target`` set."""
        if not token:
            return
        text = token.strip().lower()
        if not text:
            return

        # 1) The tag is a raw entity code (possibly still wrapped as "&code;")
        raw = text.strip("&;")
        if raw in jmdict_entities:
            target.add(raw)
            return

        # 2) The tag is a known human-readable description
        if text in human_to_codes:
            target.update(human_to_codes[text])
            return

        # 3) Fallback: keyword heuristics on unrecognised text
        if "ichidan" in text or "v1" in text:
            target.add("v1")
        if "godan" in text or "v5" in text:
            target.add("v5")
        if "intransitive" in text:
            target.add("vi")
        # "intransitive" contains "transitive", so only count occurrences outside of it
        if "transitive" in text.replace("intransitive", ""):
            target.add("vt")
        if "suru" in text or "verbal noun" in text:
            target.add("vs")
        if "irreg" in text:
            target.add("irreg")
        if "uk" in text or "kana alone" in text:
            target.add("uk")
        if "onomatopoeic" in text or "sound" in text:
            target.add("onm")
        if "mimetic" in text or "phenomenon" in text or "sensory" in text:
            target.add("mim")

    # === Main JMdict parse ===
    for entry in root.findall("entry"):
        lemmas = [k.text for k in entry.findall("k_ele/keb")] or [None]

        # process each sense separately
        senses = []
        for sense in entry.findall("sense"):
            glosses = [g.text.strip().rstrip('. ') for g in sense.findall("gloss") if g.text]
            pos_tags = [p.text.strip() for p in sense.findall("pos") if p.text]
            misc_tags = [m.text.strip() for m in sense.findall("misc") if m.text]

            pos_codes, misc_codes = set(), set()
            for p in pos_tags:
                add_codes(p, pos_codes)
            for m in misc_tags:
                add_codes(m, misc_codes)

            senses.append({
                "glosses": glosses,
                "pos_codes": pos_codes,
                "misc_codes": misc_codes,
                "is_godan": any(code.startswith("v5") for code in pos_codes),
                "is_ichidan": any(code.startswith("v1") for code in pos_codes),
                "is_vt": "vt" in pos_codes,
                "is_it": "vi" in pos_codes,
                "is_suru": any(code.startswith("vs") for code in pos_codes),
                "is_uk": "uk" in pos_codes or "uk" in misc_codes,
                "is_irregular": "irreg" in misc_codes or any(code in ("v5ri", "vn", "vr") for code in pos_codes),
                "is_onomatopoeic": "onm" in pos_codes or "onm" in misc_codes,
                "is_mimetic": "mim" in pos_codes or "mim" in misc_codes,
            })

        # group senses by POS set, using a stable (sorted) key, e.g. ('adj-na', 'n') or ('n',) or ('adv',)
        senses_by_pos = defaultdict(list)
        for s in senses:
            senses_by_pos[tuple(sorted(s["pos_codes"]))].append(s)

        # now for each POS group, merge the senses
        merged_senses = []
        for pos_key, group in senses_by_pos.items():
            # drop archaic/obsolete/rare/poetic senses if the group has any good ones
            good = [s for s in group if not (set(s["misc_codes"]) & BAD_MISC)]
            if good:
                group = good

            # round-robin gloss merge: first gloss of every sense, then second gloss of every sense, ...
            gloss_lists = [s["glosses"] for s in group]
            max_len = max((len(gl) for gl in gloss_lists), default=0)
            merged_glosses = [gl[i] for i in range(max_len) for gl in gloss_lists if i < len(gl)]

            merged_sense = {
                "pos_codes": list(pos_key),
                "misc_codes": sorted(set().union(*[s["misc_codes"] for s in group])),
                "glosses": merged_glosses,
            }
            # merge flags (OR)
            for flag in FLAG_NAMES:
                merged_sense[flag] = any(s[flag] for s in group)
            merged_senses.append(merged_sense)

        # emit rows for each (reading, lemma, merged_pos_group)
        for r_ele in entry.findall("r_ele"):
            reading = r_ele.findtext("reb")
            if not reading:
                continue
            reading_kata = jaconv.hira2kata(reading)

            # A reading with <re_restr> elements only applies to the listed kanji forms
            re_restrs = [r.text for r in r_ele.findall("re_restr")]
            if re_restrs:
                linked_lemmas = [k for k in lemmas if k in re_restrs]
            else:
                linked_lemmas = lemmas

            for lemma in linked_lemmas:
                for ms in merged_senses:
                    rows.append({
                        "lemma": lemma,
                        "jmdict_reading": reading,
                        "reading_kata": reading_kata,
                        "sense_glosses": ms["glosses"],
                        "sense_pos": ms["pos_codes"],
                        "sense_misc": ms["misc_codes"],
                        **{flag: ms[flag] for flag in FLAG_NAMES},
                    })

    # JMdict sometimes lists both hiragana and katakana readings for the same word, which causes
    # downstream duplicates, so keep only one row per (lemma, reading_kata, glosses)
    deduped = {}
    for data in rows:
        key = (data["lemma"], data["reading_kata"], tuple(data["sense_glosses"]))
        # Prefer the row whose reading was written in hiragana
        if key not in deduped or is_hiragana_string(data["jmdict_reading"]):
            deduped[key] = data

    return pd.DataFrame(list(deduped.values()))


def parse_kanjidic2_to_df(filepath):
    """
    Parse KANJIDIC2 XML into a DataFrame with one row per <character> element.

    JLPT levels are not read from the XML; they are looked up in JLPT_LEVELS by the kanji literal. Only the first
    <rmgroup> of each character is used for readings and meanings. Meanings carrying an ``m_lang`` attribute are
    skipped, as are meanings whose text mentions "radical" or "kokuji".

    Returns DataFrame with columns:
        kanji, radical_id, jlpt_level, jouyou_grade, stroke_count, on_readings, kun_readings, meaning
    """
    root = ET.parse(filepath).getroot()
    rows = []

    for char in root.findall("character"):
        literal = char.findtext("literal")

        misc = char.find("misc")
        jlpt = JLPT_LEVELS.get(literal)

        # 1-6 are grades 1-6 (~1000 kanji), 7 doesn't exist, 8 is high school (another ~1000),
        # 9/10 aren't to do with jouyou
        jouyou = None
        grade_text = misc.findtext("grade") if misc is not None else None
        if grade_text:
            grade = int(grade_text)
            if grade and grade <= 8:
                jouyou = grade

        # When several stroke counts are listed, keep the smallest
        stroke_counts = [int(s.text) for s in misc.findall("stroke_count")] if misc is not None else []
        stroke_count = min(stroke_counts) if stroke_counts else None

        rad = char.find("radical/rad_value[@rad_type='classical']")
        radical = int(rad.text) if rad is not None else None

        rm = char.find("reading_meaning/rmgroup")
        on_yomi, kun_yomi, meanings = [], [], []
        if rm is not None:
            for r in rm.findall("reading"):
                r_type = r.attrib.get("r_type")
                if r_type == "ja_on":
                    on_yomi.append(r.text)
                elif r_type == "ja_kun":
                    kun_yomi.append(r.text)
            for m in rm.findall("meaning"):
                if m.attrib.get("m_lang") is not None:
                    continue
                # An empty <meaning/> element has text None; skip it instead of raising TypeError
                if not m.text:
                    continue
                if "radical" not in m.text and "kokuji" not in m.text:
                    meanings.append(m.text)

        rows.append({
            "kanji": literal,
            "radical_id": radical,
            "jlpt_level": int(jlpt) if jlpt else None,
            "jouyou_grade": jouyou,
            "stroke_count": stroke_count,
            "on_readings": on_yomi,
            "kun_readings": kun_yomi,
            "meaning": meanings
        })

    return pd.DataFrame(rows)


def parse_pitch_accents_to_df(filepath):
    """
    Parse a tab-separated pitch accent file into a DataFrame.

    Each non-blank line must hold exactly three tab-separated fields: surface, reading, accents. The reading is
    converted from katakana to hiragana. Blank lines are skipped, and exact duplicate rows are dropped.

    Returns DataFrame with columns:
        surface, reading, accents

    Raises:
        ValueError: if a non-blank line does not have exactly three tab-separated fields.
    """
    columns = ["surface", "reading", "accents"]
    rows = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. trailing empty lines) rather than failing on unpacking
                continue

            fields = line.split("\t")
            if len(fields) != len(columns):
                raise ValueError(
                    f"{filepath}:{line_no}: expected {len(columns)} tab-separated fields, got {len(fields)}"
                )
            surface, reading, accents = fields

            rows.append({
                "surface": surface,
                "reading": jaconv.kata2hira(reading),
                "accents": accents,
            })

    # Passing columns explicitly keeps the schema stable even when the file has no usable rows
    return pd.DataFrame(rows, columns=columns).drop_duplicates()

In [None]:
# Load the three reference datasets from data/: KANJIDIC2 (per-kanji info), JMdict (vocab senses/readings)
# and the tab-separated pitch accent list
kanjidic2_df = parse_kanjidic2_to_df(data_dir / "kanjidic2.xml")
jmdict_df = parse_jmdict_to_df(data_dir / "JMdict_e.xml", JMDICT_ENTITIES)
pitch_df = parse_pitch_accents_to_df(data_dir / "pitch_accents.txt")

### Stage 3: Get the base kanji and vocab count df's
"Base" means that the frequency analysis and some very minimal cleaning have been done, but none of the manual touch-ups have been applied yet.

In [None]:
def get_kanji_df(corpora):
    """
    Tally how often each kanji occurs in every corpus file.

    Reads each corpus line by line; when the global ``test_mode`` is on, only the first ``sentence_limit`` lines
    of each corpus are read. A kanji missing from a corpus ends up with a count of 0 for it.

    Returns DataFrame with columns:
        kanji, count_<corpus>
    """
    records = []
    for name, cfg in corpora.items():
        print(f"Starting kanji_df extraction for {name}")
        tally = Counter()
        with open(cfg["filepath"], "r", encoding="utf-8") as fh:
            for line_no, sentence in enumerate(fh, start=1):
                if test_mode and line_no > sentence_limit:
                    break
                tally.update(ch for ch in sentence if is_kanji(ch))

        column = f"count_{name}"
        records.extend({"kanji": kanji, column: n} for kanji, n in tally.items())
        print(f"Total kanji so far: {len(records)}")

    long_df = pd.DataFrame(records)

    # One record exists per (kanji, corpus); summing per kanji folds them into a single wide row
    count_cols = [col for col in long_df.columns if col.startswith("count_")]
    wide_df = long_df.groupby("kanji", as_index=False)[count_cols].sum()
    for col in count_cols:
        wide_df[col] = wide_df[col].astype(int)

    return wide_df


def get_vocab_df(corpora):
    """
    Tokenise every corpus with Sudachi (split mode C) and count vocab occurrences per corpus.

    Tokens are keyed by (dictionary form, reading, top-level POS, POS field 4), with digits stripped from the
    dictionary form and reading. Particles, auxiliary verbs and symbols are not counted. When the global
    ``test_mode`` is on, only the first ``sentence_limit`` lines of each corpus are read.

    Returns DataFrame with columns:
        lemma, reading_kata, pos, conj, count_<corpus>
    """
    sudachi = dictionary.Dictionary().create()
    split_mode = tokenizer.Tokenizer.SplitMode.C

    key_cols = ["lemma", "reading_kata", "pos", "conj"]
    # particles, verb endings, symbols, punctuation
    skipped_pos = ("助詞", "助動詞", "補助記号", "記号")

    def strip_numbers(s):
        """Remove ASCII and full-width digits from a string; non-strings pass through untouched."""
        if not isinstance(s, str):
            return s
        half_width = jaconv.z2h(s, digit=True, ascii=False, kana=False)
        return re.sub(r'[0-9]+', '', half_width)

    records = []
    for name, cfg in corpora.items():
        print(f"Starting vocab_df extraction for {name}")
        tally = Counter()
        with open(cfg["filepath"], "r", encoding="utf-8") as fh:
            for line_no, sentence in enumerate(fh, start=1):
                if test_mode and line_no > sentence_limit:
                    break
                for token in sudachi.tokenize(sentence, split_mode):
                    pos_fields = token.part_of_speech()
                    pos, conj = pos_fields[0], pos_fields[4]
                    if pos in skipped_pos:
                        continue

                    # Normalise out numeric digits (人[hito], 1人[hito], 2人[hito] are all counted as 人[hito])
                    lemma = strip_numbers(token.dictionary_form())
                    reading_kata = strip_numbers(token.reading_form())  # always in katakana from Sudachi

                    tally[(lemma, reading_kata, pos, conj)] += 1

        column = f"count_{name}"
        for key, n in tally.items():
            record = dict(zip(key_cols, key))
            record[column] = n
            records.append(record)

    long_df = pd.DataFrame(records)

    # One record exists per (key, corpus); summing per key folds them into a single wide row
    count_cols = [col for col in long_df.columns if col.startswith("count_")]
    wide_df = long_df.groupby(key_cols, as_index=False)[count_cols].sum()
    for col in count_cols:
        wide_df[col] = wide_df[col].astype(int)

    return wide_df


def filter_non_japanese_vocab(vocab_df):
    """
    Split vocab rows by whether their lemma passes is_japanese_string.

    Uses a boolean mask rather than rebuilding the frame row by row, so column dtypes are preserved and the
    result keeps its columns even when no row is kept.

    Returns:
        (clean_df, dropped): ``clean_df`` holds the rows whose lemma is a Japanese string, re-indexed from 0;
        ``dropped`` is the list of rejected lemmas in their original order.
    """
    is_japanese = vocab_df["lemma"].map(is_japanese_string).astype(bool)
    clean_df = vocab_df[is_japanese].reset_index(drop=True)
    dropped = vocab_df.loc[~is_japanese, "lemma"].tolist()
    return clean_df, dropped


def merge_jmdict_to_vocab(vocab_df, jmdict_df, n_translations=3):
    """
    Merge JMdict info into vocab_df on (lemma, reading_kata).

    Adds columns:
        jmdict_reading, translations, translation,
        is_godan, is_ichidan, is_vt, is_it, is_suru
    """

    # Simple POS bucket matcher
    def sudachi_pos_bucket(pos):
        if not isinstance(pos, str):
            return None
        if pos.startswith("動詞"):
            return "verb"
        if pos.startswith("形容詞"):
            return "adj-i"
        if pos.startswith("形容動詞"):
            return "adj-na"
        if pos.startswith("名詞"):
            return "noun"
        if pos.startswith("副詞"):
            return "adv"
        if pos.startswith("形状詞"):
            return "noun"
        if pos.startswith("連体詞"):
            return "adj-na"
        return None

    JM_POS_GROUPS = {
        "verb": {"v1", "v5", "vs", "vi", "vt"},
        "adj-i": {"adj-i"},
        "adj-na": {"adj-na"},
        "noun": {"n", "n-adv", "n-t", "n-pref", "n-suf"},
    }

    def pick_sense_for_row(senses, sudachi_pos):
        """Return gloss list from JMdict sense matching Sudachi POS best."""
        bucket = sudachi_pos_bucket(sudachi_pos)
        if not isinstance(senses, list):
            return []

        best = None
        for s in senses:
            codes = set(s.get("sense_pos", []))
            if bucket and bucket in JM_POS_GROUPS:
                if codes & JM_POS_GROUPS[bucket]:
                    return s.get("sense_glosses", [])
            if best is None:
                best = s
        return best.get("sense_glosses", []) if best else []

    # 1) group all senses per (lemma, reading_kata)
    j_grouped = (
        jmdict_df
        .groupby(["lemma", "reading_kata"], dropna=False)
        .apply(lambda g: g.to_dict(orient="records"), include_groups=False)
        .rename("senses")
        .reset_index()
    )

    # 2) ALSO get one of the constant jmdict_reading per (lemma, reading_kata)
    j_reading_map = (
        jmdict_df
        .groupby(["lemma", "reading_kata"], dropna=False)["jmdict_reading"]
        .first()
        .rename("jmdict_reading")
        .reset_index()
    )

    # merge both into vocab
    merged = vocab_df.merge(j_grouped, on=["lemma", "reading_kata"], how="left")
    merged = merged.merge(j_reading_map, on=["lemma", "reading_kata"], how="left")

    # 3) pick the right sense for each Sudachi row
    translations = []
    for _, row in merged.iterrows():
        glosses = pick_sense_for_row(row.get("senses"), row.get("pos"))
        translations.append("; ".join(glosses[:n_translations]) if glosses else "")
    merged["translation"] = translations

    # Boolean flags: OR them (from all sense rows)
    bool_cols = ["is_godan", "is_ichidan", "is_vt", "is_it", "is_suru"]
    for col in bool_cols:
        if col in jmdict_df:
            m = (jmdict_df.groupby(["lemma", "reading_kata"], dropna=False)[col]
                 .any()
                 .rename(col)
                 .reset_index())
            merged = merged.drop(columns=col, errors="ignore").merge(m, on=["lemma", "reading_kata"], how="left")
            merged[col] = merged[col].astype("boolean").fillna(False)

    return merged


def merge_kanjidic2_to_kanji(kanji_df, kanjidic2_df):
    """
    Inner-join the kanji counts with the KANJIDIC2 data on 'kanji'; kanji missing from either side are dropped.

    Returns DataFrame with columns:
        kanji, count_<corpus>, radical_id, jlpt_level, jouyou_grade, stroke_count, on_readings, kun_readings, meaning
    """
    return pd.merge(kanji_df, kanjidic2_df, how="inner", on="kanji")


def get_kanji_used_in_vocab(vocab_df):
    """
    Collect every kanji that occurs in the lemma of at least one translated vocab row.

    A row contributes only if its lemma is a string and its translation is a non-blank string. KANJIDIC2 is
    deliberately not consulted; this is purely usage-based.
    """
    found = set()

    for _, record in vocab_df.iterrows():
        word = record.get("lemma")
        gloss = record.get("translation", "")

        has_gloss = isinstance(gloss, str) and bool(gloss.strip())
        if isinstance(word, str) and has_gloss:
            found.update(ch for ch in word if is_kanji(ch))

    return found


def filter_non_japanese_kanji(kanji_df, vocab_kanji):
    """
    Drop kanji with no evidence of Japanese usage.

    A row survives when at least one of these holds:
      - its jouyou_grade is not null
      - its jlpt_level is not null
      - its kanji is a member of ``vocab_kanji`` (kanji seen in at least one real vocab item)

    The surviving rows are returned re-indexed from 0.
    """

    def is_good(row):
        if pd.notna(row.get("jouyou_grade")):
            return True
        if pd.notna(row.get("jlpt_level")):
            return True
        return row["kanji"] in vocab_kanji

    keep_mask = kanji_df.apply(is_good, axis=1)
    return kanji_df[keep_mask].reset_index(drop=True)

In [None]:
# The bulk of computation time is in this cell, so skip rerunning it if wanted:
#  - reload_base_dfs = True:  load the base dfs from the parquet files written by a previous run
#  - reload_base_dfs = False: recompute them from the corpora and (over)write the parquet files
# NOTE(review): the reload branch does not define `dropped_vocab` or `vocab_kanji`; any later cell that uses them
# would only work after a full (non-reload) run in the same kernel — confirm.
reload_base_dfs = False
if reload_base_dfs:
    base_kanji_df = pd.read_parquet(out_dir / "base_kanji_df.parquet")
    base_vocab_df = pd.read_parquet(out_dir / "base_vocab_df.parquet")

else:
    # Vocab first: the kanji filter below needs to know which kanji appear in translated vocab
    base_vocab_df = get_vocab_df(CORPORA)
    base_vocab_df, dropped_vocab = filter_non_japanese_vocab(base_vocab_df)
    base_vocab_df = merge_jmdict_to_vocab(base_vocab_df, jmdict_df)

    base_kanji_df = get_kanji_df(CORPORA)
    base_kanji_df = merge_kanjidic2_to_kanji(base_kanji_df, kanjidic2_df)
    vocab_kanji = get_kanji_used_in_vocab(base_vocab_df)
    base_kanji_df = filter_non_japanese_kanji(base_kanji_df, vocab_kanji)

    # Cache the results for later runs with reload_base_dfs = True
    base_kanji_df.to_parquet(out_dir / "base_kanji_df.parquet", index=False)
    base_vocab_df.to_parquet(out_dir / "base_vocab_df.parquet", index=False)

### Stage 4: Clean up the dfs
We rely on the dictionaries for most things which aren't raw frequency counts. But partly due to the specifics of this
use-case, and partly due to the way this use-case is written, what we get isn't completely what would be ideal. So clean
up the vocab and kanji dfs, and apply manual corrections defined in consts.py.

In [None]:
def apply_bad_reading_fixes(vocab_df, fixes):
    """
    Return a copy of vocab_df with known-bad readings replaced.

    ``fixes`` maps (lemma, bad jmdict_reading) -> corrected reading. For every matching row, jmdict_reading is
    set to the corrected reading and reading_kata to its katakana form. Entries that match no row are ignored.
    """
    fixed = vocab_df.copy()
    for key, new_reading in fixes.items():
        lemma, bad_reading = key
        hits = fixed["lemma"].eq(lemma) & fixed["jmdict_reading"].eq(bad_reading)
        if hits.any():
            fixed.loc[hits, "jmdict_reading"] = new_reading
            fixed.loc[hits, "reading_kata"] = jaconv.hira2kata(new_reading)

    return fixed


def apply_translation_overrides(vocab_df, overrides):
    """
    Return a copy of vocab_df with manual translations applied.

    ``overrides`` maps (lemma, reading_kata) -> translation; every row matching both values gets that translation.
    """
    patched = vocab_df.copy()

    for key, replacement in overrides.items():
        word, kata = key
        targets = patched["lemma"].eq(word) & patched["reading_kata"].eq(kata)
        patched.loc[targets, "translation"] = replacement

    return patched


def collapse_vocab_reading_variants(vocab_df):
    """
    Sudachi can sometimes output multiple rows for the same lemma+POS+conj, differing only in reading_kata
    (e.g. 行く: イク, イコ, イキ, etc.). Collapse each such group down to one row where possible.

    A row counts as "JMdict-backed" when its translation is non-null and non-empty.

    - If no JMdict-backed rows exist in the group, leave the group untouched.
    - If one or more rows are JMdict-backed:
      - Choose the JMdict-backed row with the highest count (first count_* column) (expect which corpus it is to
        not matter).
      - Sum ALL rows' counts (even non-JMdict weird Sudachi variants) into that row.
      - Drop all other rows.

    Afterwards all 両 rows (any POS) are merged into the JMdict-backed 名詞 row, if there is one.
    """
    count_cols = [c for c in vocab_df.columns if c.startswith("count_")]
    primary_count = count_cols[0]  # If multiple corpora, choose the first for comparison

    def has_translation(df):
        # NaN is truthy under astype(bool), so a null translation must be excluded explicitly
        return df["translation"].notna() & df["translation"].astype(bool)

    collapsed = []

    for _, group in vocab_df.groupby(["lemma", "pos", "conj"], sort=False):
        jmdict_rows = group[has_translation(group)]

        if jmdict_rows.empty:
            # No dictionary-backed row, keep all variants
            collapsed.append(group)
            continue

        # Pick the JMdict-backed row with the highest primary count
        keep = jmdict_rows.sort_values(primary_count, ascending=False).iloc[0].copy()

        # Sum counts from ALL variants (dict and non-dict)
        for c in count_cols:
            keep[c] = group[c].sum()

        collapsed.append(keep.to_frame().T)

    out = pd.concat(collapsed, ignore_index=True)

    # Hacky fix for 両: merge all 両 rows (any POS) into the JMdict-backed 名詞 row if present
    # (the noun form (an old unit of currency) was getting badly over-represented)
    mask_all_ryo = out["lemma"] == "両"
    if mask_all_ryo.any():
        noun_mask = mask_all_ryo & (out["pos"] == "名詞") & has_translation(out)
        if noun_mask.any():
            noun_idx = out[noun_mask].index[0]
            for c in count_cols:
                out.at[noun_idx, c] = out.loc[mask_all_ryo, c].sum()
            # drop the other 両 rows
            drop_idx = out[mask_all_ryo].index.difference([noun_idx])
            if len(drop_idx):
                out = out.drop(drop_idx).reset_index(drop=True)

    return out


def collapse_vocab_by_definition(vocab_df):
    """
    Second-pass dedupe.

    After Sudachi variant collapsing, we can still have rows like:
        lemma=何, translation="what"
    twice, because Sudachi gave different POS buckets, but our JMdict sense-picker resolved both to the same English
    gloss.

    Strategy:
      - build a *normalized translation key* (order-insensitive, lowercase)
      - group by (lemma, norm_translation_key)
      - sum all count_* columns
      - `OR` all boolean JMdict flags
      - pick a representative row for the other columns

    Rows without a translation are never merged. The input frame is not modified, and the temporary key column
    is not part of the returned frame.
    """
    key_col = "__norm_tr__"

    # 1) which columns are counts / booleans
    count_cols = [c for c in vocab_df.columns if c.startswith("count_")]
    bool_cols = ["is_godan", "is_ichidan", "is_vt", "is_it", "is_suru"]

    def norm_translation(s: str) -> str:
        if not isinstance(s, str):
            return ""
        # split on ';', normalise, sort so "what; how many" == "how many; what"
        parts = sorted({p.strip().lower() for p in s.split(";") if p.strip()})
        return "; ".join(parts)

    def total_count(row):
        return sum(row[c] for c in count_cols)

    # 2) build the key column
    vocab_df = vocab_df.copy()
    vocab_df[key_col] = vocab_df["translation"].apply(norm_translation)

    rows = []
    for _, grp in vocab_df.groupby(["lemma", key_col], dropna=False):
        # don't merge rows that have *no* translation – those are often junk / unlinked
        if grp[key_col].iloc[0] == "":
            rows.append(grp)
            continue

        # pick representative row:
        # - prefer one that actually had a JMdict reading
        # - else the one with the highest total count
        with_jmdict = grp[grp["jmdict_reading"].notna() & (grp["jmdict_reading"] != "")]
        candidates = grp if with_jmdict.empty else with_jmdict
        rep = candidates.loc[candidates.apply(total_count, axis=1).idxmax()].copy()

        # sum counts across the group
        for c in count_cols:
            rep[c] = grp[c].sum()

        # OR the boolean flags
        for b in bool_cols:
            if b in grp:
                rep[b] = bool(grp[b].any())

        # pos / conj: pick the most common in the group,
        # but only overwrite if there was genuine variation
        if grp["pos"].nunique() > 1:
            rep["pos"] = grp["pos"].mode().iloc[0]
        if "conj" in grp and grp["conj"].nunique() > 1:
            rep["conj"] = grp["conj"].mode().iloc[0]

        rows.append(rep.to_frame().T)

    out = pd.concat(rows, ignore_index=True)
    # The key column is an internal helper; drop it once, for merged and untouched groups alike
    return out.drop(columns=key_col, errors="ignore")


def merge_pitch_accent(vocab_df, pitch_df):
    """
    Attach a single chosen pitch accent per vocab row.

    Matching key:
        (lemma, reading_hira)
    where reading_hira is jmdict_reading if present, else reading_kata. Whichever is used, it is converted to
    hiragana first, because the pitch readings are normalised to hiragana too (a katakana jmdict_reading would
    otherwise never match).

    Accent choice for a matched entry, in priority order:
        1) first accent listed under a POS marker matching the row's Sudachi POS
        2) first accent listed without any POS marker
        3) first accent of the alphabetically first POS marker

    Args:
        vocab_df: needs columns lemma, jmdict_reading, reading_kata, pos. Not modified.
        pitch_df: needs columns surface, reading, accents (e.g. "0" or "(副)0,(名)3,4,0"). Not modified.

    Returns:
        Copy of vocab_df with a new nullable-integer column:
            vocab_df["pitch_accent"]
    """

    # Sudachi POS prefix -> short POS code used in the pitch data
    pos_prefix_to_code = (
        ("名詞", "名"),
        ("副詞", "副"),
        ("形容動詞", "形動"),
        ("感動詞", "感"),
        ("代名詞", "代"),
    )

    # --- helper: map Sudachi POS -> short POS code(s) used in pitch data ---
    def sudachi_pos_to_pitch_codes(pos: str):
        if not isinstance(pos, str):
            return []
        for prefix, code in pos_prefix_to_code:
            if pos.startswith(prefix):
                return [code]
        return []

    # --- helper: parse an accents string into POS-specific + default lists ---
    def parse_pos_accent(accent_str: str):
        """
        Returns:
            pos_map: dict[str, list[int]]  e.g. {'副': [0], '名': [3,4,0]}
            default_accents: list[int]      for numbers not under any (POS)
        """
        pos_map = {}
        default_accents = []

        if not isinstance(accent_str, str):
            return pos_map, default_accents

        cur_poses = None  # POS codes of the most recent "(POS)" marker; it applies until the next marker

        for tok in (t.strip() for t in accent_str.split(",")):
            # Peel off leading "(POS)" markers; tolerate several in one token
            while tok.startswith("("):
                close = tok.find(")")
                if close == -1:
                    break
                # e.g. "副" or "名;形動"
                pos_codes = [c.strip() for c in tok[1:close].split(";") if c.strip()]
                cur_poses = pos_codes or None
                tok = tok[close + 1:].strip()

            # isdecimal (not isdigit) so that anything accepted here is also accepted by int()
            if not tok.isdecimal():
                continue

            val = int(tok)
            if cur_poses:
                for code in cur_poses:
                    pos_map.setdefault(code, []).append(val)
            else:
                default_accents.append(val)

        return pos_map, default_accents

    # Rows without a surface or reading can never be matched; drop them before astype(str) turns NaN into "nan"
    p = pitch_df.dropna(subset=["surface", "reading"]).copy()
    p["reading_hira"] = p["reading"].astype(str).map(lambda s: jaconv.kata2hira(s))
    p["surface"] = p["surface"].astype(str)
    p = p[(p["surface"] != "") & (p["reading_hira"] != "")]
    p = p.drop_duplicates(subset=["surface", "reading_hira"], keep="first")

    pitch_map = dict(zip(zip(p["surface"], p["reading_hira"]), p["accents"]))

    # --- per-row resolver ---
    def resolve_pitch_for_row(row):
        lemma = row.get("lemma")
        if not isinstance(lemma, str) or not lemma:
            return pd.NA

        # prefer jmdict_reading, else Sudachi reading_kata
        reading = row.get("jmdict_reading")
        if not isinstance(reading, str) or not reading:
            reading = row.get("reading_kata")
            if not isinstance(reading, str) or not reading:
                return pd.NA
        # normalise whichever reading we got, so it is comparable with the pitch_map keys
        reading_hira = jaconv.kata2hira(reading)

        accents_str = pitch_map.get((lemma, reading_hira))
        if not isinstance(accents_str, str) or not accents_str.strip():
            return pd.NA

        pos_map, default_accents = parse_pos_accent(accents_str)

        # 1) POS-specific match
        for code in sudachi_pos_to_pitch_codes(row.get("pos", "")):
            vals = pos_map.get(code)
            if vals:
                return vals[0]

        # 2) default accents (no POS)
        if default_accents:
            return default_accents[0]

        # 3) fallback: first available POS group (sorted, so the choice is deterministic)
        if pos_map:
            vals = pos_map[sorted(pos_map.keys())[0]]
            if vals:
                return vals[0]

        return pd.NA

    df = vocab_df.copy()
    # Build the column from a plain list: DataFrame.apply(axis=1) returns a DataFrame (not a Series) on an
    # empty frame, which cannot be assigned to a single column.
    accents = [resolve_pitch_for_row(row) for _, row in df.iterrows()]
    df["pitch_accent"] = pd.array(accents, dtype="Int64")
    return df


def apply_weights_and_freq_rank(df, corpora):
    """
    Combine the per-corpus counts into one weighted frequency and rank rows by it.

    Process:
      - Normalise each count_<corpus> column by total occurrences in that corpus.
      - Multiply each by its corpus weight (weights are rescaled to sum to 1).
      - Sum to get weighted_count.
      - Multiply by 1e6 to get more intuitive magnitudes.
      - Sort by weighted_count (descending) and add freq_rank (1 = most frequent).

    Args:
        df: DataFrame with one count_<corpus> column per key of `corpora`. Not modified.
        corpora: dict of corpus name -> {"weight": number, ...}.

    Returns:
        A new DataFrame, re-indexed 0..n-1 in rank order, with added columns:
            weighted_count, freq_rank

    Raises:
        ValueError: if the count_ columns and the corpora keys differ, or a corpus column sums to 0.
    """
    prefix = "count_"
    count_cols = [c for c in df.columns if c.startswith(prefix)]

    # Strip only the leading prefix (str.replace would also eat a "count_" inside the corpus name)
    corpus_of = {c: c[len(prefix):] for c in count_cols}

    # Sanity check that all corpora are present
    if set(corpus_of.values()) != set(corpora.keys()):
        raise ValueError("Mismatch between corpora dict and df columns")

    total_weight = sum(v["weight"] for v in corpora.values())
    weighted_count = pd.Series(0.0, index=df.index)
    for col in count_cols:
        # Upstream merges can leave the counts as object dtype; coerce so the result is a real float column
        counts = pd.to_numeric(df[col])
        total = counts.sum()
        if total == 0:
            raise ValueError("Somehow a corpus has 0 vocab")
        weighted_count += (counts / total) * (corpora[corpus_of[col]]["weight"] / total_weight)

    weighted_count *= 1e6

    # Work on a copy so the caller's frame does not silently gain a column
    ranked = df.copy()
    ranked["weighted_count"] = weighted_count
    ranked = ranked.sort_values("weighted_count", ascending=False).reset_index(drop=True)
    ranked["freq_rank"] = ranked.index + 1

    return ranked


def apply_kanji_overrides(kanji_df, overrides):
    """
    Replace the "meaning" of selected kanji with hand-written meanings.

    `overrides` maps a kanji to a ';'-separated string; it is split into a list of stripped, non-empty parts
    and written to the first row whose "kanji" matches. Kanji absent from the frame are ignored.
    Returns a patched copy; `kanji_df` itself is left alone.
    """
    patched = kanji_df.copy()
    for target, raw_meaning in overrides.items():
        pieces = (piece.strip() for piece in raw_meaning.split(";"))
        meanings = [piece for piece in pieces if piece]
        matches = patched.index[patched["kanji"] == target]
        if matches.empty:
            continue
        patched.at[matches[0], "meaning"] = meanings
    return patched


def filter_kanji(kanji_df):
    """
    Keep only the kanji worth a card: anything with a jouyou grade, a JLPT level, a frequency rank within
    the top 2500, or an entry in DONT_DROP_ME. Returns a re-indexed copy.
    """
    candidates = kanji_df.copy()

    has_jouyou = candidates["jouyou_grade"].notna()
    has_jlpt = candidates["jlpt_level"].notna()
    is_frequent = candidates["freq_rank"] <= 2500
    is_whitelisted = candidates["kanji"].isin(DONT_DROP_ME)

    keep_mask = has_jouyou | has_jlpt | is_frequent | is_whitelisted
    return candidates.loc[keep_mask].reset_index(drop=True)


def apply_manual_ordering(kanji_df, top_kanji_order, kanji_groups):
    """
    Work out the deck order and return the frame sorted by it, with a new 'deck_rank' column (1 = first card).

    The sequence is built in three passes; a kanji is only ever placed once:
      1) the kanji of `top_kanji_order`, in the given order;
      2) every kanji by ascending 'freq_rank';
      3) a final sweep over all kanji in the frame, as a safety net.
    Whenever a kanji that belongs to one of `kanji_groups` is placed, the rest of its group (restricted to
    kanji present in the frame, in the group's own order) is placed together with it. A kanji listed in
    several groups belongs to the first one. 'freq_rank' is left untouched.
    """
    df = kanji_df.copy()
    present = set(df["kanji"])

    # plain frequency order, most frequent first
    by_frequency = list(df.sort_values("freq_rank", ascending=True)["kanji"])

    # kanji -> the present members of the first group that mentions it
    group_of = {}
    for raw_group in (kanji_groups or []):
        members = [k for k in raw_group if k in present]
        for k in members:
            group_of.setdefault(k, members)

    placed = set()
    sequence = []

    def place(kanji):
        if kanji in placed or kanji not in present:
            return
        # a grouped kanji drags its whole group along; an ungrouped one is a group of one
        for member in group_of.get(kanji) or [kanji]:
            if member not in placed:
                placed.add(member)
                sequence.append(member)

    for source in ((top_kanji_order or []), by_frequency, present):
        for kanji in source:
            place(kanji)

    df["deck_rank"] = df["kanji"].map(dict(zip(sequence, range(1, len(sequence) + 1))))

    # hand the rows back in deck order
    return df.set_index("kanji").loc[sequence].reset_index()


In [None]:
def vocab_df_pipeline(df, pitch_df):
    """
    Run the vocab post-processing stages in order: reading fixes, translation overrides, collapsing of
    reading variants and of same-definition rows, pitch accent lookup, then weighting + frequency ranking.
    """
    fixed = apply_bad_reading_fixes(df, READING_OVERRIDES)
    translated = apply_translation_overrides(fixed, TRANSLATION_OVERRIDES)
    collapsed = collapse_vocab_by_definition(collapse_vocab_reading_variants(translated))
    with_pitch = merge_pitch_accent(collapsed, pitch_df)
    return apply_weights_and_freq_rank(with_pitch, CORPORA)

def kanji_df_pipeline(df):
    """
    Run the kanji post-processing stages in order: meaning overrides, weighting + frequency ranking,
    filtering, then the manual deck ordering.
    """
    overridden = apply_kanji_overrides(df, KANJI_OVERRIDES)
    ranked = apply_weights_and_freq_rank(overridden, CORPORA)
    kept = filter_kanji(ranked)
    return apply_manual_ordering(kept, TOP_KANJI_ORDER, SIMILAR_KANJI_GROUPS)

# Run both pipelines on the base frames built by the earlier stages
vocab_df = vocab_df_pipeline(base_vocab_df, pitch_df)
kanji_df = kanji_df_pipeline(base_kanji_df)

# Persist the end-of-stage frames to parquet files in out_dir
vocab_df.to_parquet(out_dir / "vocab_df_end.parquet", index=False)
kanji_df.to_parquet(out_dir / "kanji_df_end.parquet", index=False)

### Stage 5: Build the Anki notes .tsv

In [None]:
def count_mora(reading_hira):
    """
    Count mora in a hiragana string.
    - a character in SMALL_KANA merges with the character before it, so it adds nothing to the count
    - every other character counts as 1 mora (っ is meant to count as 1 mora, so presumably it is not in
      SMALL_KANA; verify against its definition)
    Anything that is not a string counts as 0.
    """
    if not isinstance(reading_hira, str):
        return 0
    return sum(1 for ch in reading_hira if ch not in SMALL_KANA)


def highlight_mora(text, mora_index, span_class="pitch-drop"):
    """
    Wrap the mora number `mora_index` (1-based) of `text` in <span class="{span_class}">...</span>.

    Characters in SMALL_KANA do not start a mora: they stay attached to the character before them, so a
    highlighted mora includes any small kana that directly follow it. `text` is returned unchanged when
    `mora_index` is None or not positive, and comes back without a span when it has fewer mora than
    `mora_index`.
    """
    if mora_index is None or mora_index <= 0:
        return text

    pieces = []
    mora_seen = 0
    span_open = False

    for ch in text:
        if ch in SMALL_KANA:
            # rides along with whatever came before (inside the span, if one is open)
            pieces.append(ch)
            continue

        # a full-size character ends the highlighted mora, if we were inside it
        if span_open:
            pieces.append('</span>')
            span_open = False

        mora_seen += 1
        if mora_seen == mora_index:
            pieces.append(f'<span class="{span_class}">{ch}')
            span_open = True
        else:
            pieces.append(ch)

    if span_open:
        pieces.append('</span>')

    return "".join(pieces)


def build_furigana_html(lemma, reading, pitch=None):
    """
    Build the HTML for a word with furigana, optionally highlighting one mora for pitch accent.

    The lemma is cut into alternating kanji / non-kanji runs. Non-kanji runs are emitted as-is and used as
    anchors into the reading; each kanji run is emitted as <ruby><rb>kanji</rb><rt>reading part</rt></ruby>,
    where the reading part is whatever lies between the surrounding anchors.

    Pitch highlighting (the chosen mora is wrapped in a <span> by highlight_mora):
      - pitch 0: the first mora gets class "pitch-heiban"
      - pitch p with 1 <= p <= mora count: mora p gets "pitch-atamadaka" (p == 1), "pitch-odaka"
        (p == mora count) or "pitch-nakadaka" (anything in between)
      - pitch None / NA / larger than the mora count: no highlighting

    Args:
        lemma: the word as written; must be a string ("" gives "").
        reading: kana reading (converted to hiragana internally); falsy values are treated as "".
        pitch: accent number or None / NA.

    Returns:
        The HTML string. If the kana in the generated HTML do not add up to the reading, the result is a
        single ruby of the whole lemma with the whole (unconverted) reading and no pitch highlighting.
    """
    # 1) normalise reading
    if not reading:
        reading = ""
    reading_hira = jaconv.kata2hira(reading)

    # --- pitch processing: which mora to highlight? ---
    heiban_mora = None
    drop_mora = None
    drop_class = "pitch-nakadaka"  # default
    if pitch is not None and not pd.isna(pitch):
        p = int(pitch)
        total_mora = count_mora(reading_hira)
        if p == 0:
            if total_mora >= 1:
                heiban_mora = 1
        elif 1 <= p <= total_mora:
            drop_mora = p
            # p == 1 is checked first, so a one-mora word with pitch 1 is classed atamadaka, not odaka
            if p == 1:
                drop_class = "pitch-atamadaka"
            elif p == total_mora:
                drop_class = "pitch-odaka"
            else:
                drop_class = "pitch-nakadaka"

    # 2) make a normalised copy of the lemma *for logic only*
    # (both replacements are one character for one character, so lemma and lemma_norm stay index-aligned)
    lemma_norm = (lemma
                  .replace("ヶ", "ケ")
                  .replace("ヵ", "カ"))

    # 3) segment BOTH strings in parallel, using lemma_norm to decide boundaries
    norm_segs = []
    orig_segs = []

    if not lemma_norm:
        return ""

    buf_norm = lemma_norm[0]
    buf_orig = lemma[0]
    last_is_kanji = is_kanji(lemma_norm[0])

    for idx in range(1, len(lemma_norm)):
        ch_norm = lemma_norm[idx]
        ch_orig = lemma[idx]
        now_is_kanji = is_kanji(ch_norm)

        if now_is_kanji == last_is_kanji:
            buf_norm += ch_norm
            buf_orig += ch_orig
        else:
            # kanji <-> non-kanji boundary: close the current run and start a new one
            norm_segs.append(buf_norm)
            orig_segs.append(buf_orig)
            buf_norm = ch_norm
            buf_orig = ch_orig
            last_is_kanji = now_is_kanji

    norm_segs.append(buf_norm)
    orig_segs.append(buf_orig)

    # from here on, use norm_segs for alignment, orig_segs for output
    r_idx = 0  # how many characters of reading_hira have been consumed so far
    mora_pos = 0  # global mora index consumed so far
    out_parts = []
    hira_segs = [jaconv.kata2hira(s) for s in norm_segs]

    for i, seg_norm in enumerate(norm_segs):
        seg_orig = orig_segs[i]
        seg_hira = hira_segs[i]
        is_kanji_seg = is_kanji(seg_norm[0])

        # -------- KANA SEGMENT --------
        if not is_kanji_seg:
            if reading_hira[r_idx:].startswith(seg_hira):
                seg_mora = count_mora(seg_hira)

                # decide which mora (if any) to highlight in this segment
                local_idx = None
                span_class = drop_class

                # drop mora?
                if drop_mora is not None and (mora_pos < drop_mora <= mora_pos + seg_mora):
                    local_idx = drop_mora - mora_pos
                    span_class = drop_class

                # heiban first mora?
                elif heiban_mora is not None and (mora_pos < heiban_mora <= mora_pos + seg_mora):
                    local_idx = heiban_mora - mora_pos
                    span_class = "pitch-heiban"

                if local_idx is not None:
                    seg_out = highlight_mora(seg_orig, local_idx, span_class=span_class)
                else:
                    seg_out = seg_orig

                r_idx += len(seg_hira)
                mora_pos += seg_mora
            else:
                # reading alignment failed. Fall back, no highlighting here
                # (r_idx / mora_pos are not advanced in this case)
                seg_out = seg_orig

            out_parts.append(seg_out)
            continue

        # -------- KANJI SEGMENT --------
        # find next kana anchor
        anchor_hira = ""
        for j in range(i + 1, len(norm_segs)):
            if not is_kanji(norm_segs[j][0]):
                anchor_hira = hira_segs[j]
                break

        remain = reading_hira[r_idx:]
        if anchor_hira:
            # NOTE: `p` is reused here as a character offset into `remain`; the pitch value it held
            # earlier has already been folded into drop_mora / heiban_mora.
            p = None
            # the kanji run takes at least one reading character, up to the first place the anchor matches
            for cand_p in range(1, len(remain) + 1):
                if remain[cand_p:].startswith(anchor_hira):
                    p = cand_p
                    break
            if p is None:
                # anchor never found: give this kanji run the whole remaining reading
                p = len(remain)
            furikana = remain[:p]
            r_idx += p
        else:
            # no kana after this kanji run: it takes everything that is left
            furikana = remain
            r_idx = len(reading_hira)
        
        seg_mora = count_mora(furikana)

        local_idx = None
        span_class = drop_class

        # drop mora in this segment?
        if drop_mora is not None and (mora_pos < drop_mora <= mora_pos + seg_mora):
            local_idx = drop_mora - mora_pos
            span_class = drop_class

        # heiban first mora in this segment?
        elif heiban_mora is not None and (mora_pos < heiban_mora <= mora_pos + seg_mora):
            local_idx = heiban_mora - mora_pos
            span_class = "pitch-heiban"

        if local_idx is not None:
            furikana_out = highlight_mora(furikana, local_idx, span_class=span_class)
        else:
            furikana_out = furikana

        mora_pos += seg_mora

        if furikana:
            out_parts.append(f"<ruby><rb>{seg_orig}</rb><rt>{furikana_out}</rt></ruby>")
        else:
            out_parts.append(seg_orig)

    html_str = "".join(out_parts)

    # final consistency check
    # (the characters of html_str that is_kana accepts should spell out the reading again)
    reconstructed = "".join(c for c in html_str if is_kana(c)).replace("ヶ", "ケ").replace("ヵ", "カ")
    if reading and jaconv.kata2hira(reconstructed) != reading_hira:
        # Just dump the whole reading above the whole lemma if we built an inconsistent furigana reading
        html_str = f"<ruby><rb>{lemma}</rb><rt>{reading}</rt></ruby>"

    return html_str


def index_vocab_by_kanji(vocab_df):
    """
    Map each kanji to the index labels of the vocab rows whose lemma contains it.

    A row is listed at most once per kanji, even when the lemma repeats that kanji (e.g. 日曜日 under 日);
    listing it twice would make `vocab_df.loc[...]` return the row twice, duplicating the vocab line and
    double-counting it in the per-kanji totals. Rows whose lemma is not a string are skipped.

    Returns:
        defaultdict(list): kanji -> list of `vocab_df` index labels, in row order.
    """
    idx = defaultdict(list)
    for label, lemma in zip(vocab_df.index, vocab_df["lemma"]):
        if not isinstance(lemma, str):
            continue
        # dict.fromkeys de-duplicates the characters while keeping their order
        for ch in dict.fromkeys(lemma):
            if is_kanji(ch):
                idx[ch].append(label)
    return idx


def build_final_kanji_notes(kanji_df, vocab_df, vocab_idx, top_n_vocab=15, mp3s_dir=None):
    """
    Build final notes dataset for Anki export: one note row per kanji in `kanji_df`.

    Args:
        kanji_df: kanji rows. Reads kanji, freq_rank, deck_rank, meaning, jlpt_level, jouyou_grade,
            stroke_count, radical_id, on_readings, kun_readings.
        vocab_df: vocab rows. Reads lemma, jmdict_reading, reading_kata, translation, weighted_count,
            freq_rank, pitch_accent, pos, is_suru, is_ichidan, is_godan, is_it, is_vt.
        vocab_idx: mapping kanji -> list of `vocab_df` index labels of the words containing that kanji.
        top_n_vocab: maximum number of vocab lines per note.
        mp3s_dir: directory holding the generated "wordrank_kanji__<lemma>_<reading>.mp3" files. A vocab
            line only gets an audio tag if its file exists; None or a missing directory means no audio.

    Returns:
        (notes_df, vocab_used)
        notes_df columns:
            Kanji (Note ID), WordRank Kanji Order, Frequency Order, Deck Order (overwrite this one),
            Meaning, OnKun, Metadata, Vocab Block, Tags
        vocab_used columns (one row per distinct vocab that made it onto a card):
            lemma, reading, pitch_accent

    Raises:
        ValueError: if a jouyou grade is neither 1-6 nor 8.
    """

    # Lemmas forced to 他 / 一 when the verb flags leave transitivity / verb class undecided
    transitive_fallbacks = frozenset([
        "下さる", "組む", "語りかける", "連れ戻す", "連れ帰る", "持ち去る", "超す", "恐る",
        "越す", "盗み出す", "削り出す", "奪い返す", "奪い去る", "招き入れる", "祈り求める", "泣かす",
        "繋ぎ合わせる", "狂わす", "待ち伏せる", "恥じる", "封切る", "叩き潰す", "乗り潰す", "阻む", "蹴り出す",
        "垂れ流す", "捧ぐ", "炊きあげる"
    ])
    ichidan_fallbacks = frozenset(["恐る"])

    # Resolve the audio directory once. mp3s_dir=None (the default) simply disables audio instead of
    # crashing in os.path.exists(None); a plain string path is accepted too.
    audio_dir = Path(mp3s_dir) if mp3s_dir is not None else None
    audio_dir_exists = audio_dir is not None and audio_dir.exists()

    rows = []
    vocab_used = set()

    for _, krow in kanji_df.iterrows():
        kanji = krow["kanji"]

        # === Frequency index ===
        # "" stands for "unknown" in the exported fields
        freq_rank = int(krow["freq_rank"]) if "freq_rank" in krow and pd.notna(krow["freq_rank"]) else ""
        deck_rank = int(krow["deck_rank"]) if "deck_rank" in krow and pd.notna(krow["deck_rank"]) else freq_rank

        # === Meanings ===
        meanings = krow.get("meaning", [])
        if isinstance(meanings, str):
            meanings = [meanings]
        meaning_str = "; ".join(m for m in meanings[:3] if m)
        if kanji == "日":
            # special case: 日 keeps all of its meanings rather than the first three
            meaning_str = "; ".join(m for m in meanings if m)

        # === Metadata ===
        jlpt = krow.get("jlpt_level")
        jouyou = krow.get("jouyou_grade")
        strokes = krow.get("stroke_count")
        radical = krow.get("radical_id")

        meta_parts = []
        if pd.notna(radical):
            meta_parts.append(f"部首{int(radical)}")
        if pd.notna(strokes):
            meta_parts.append(f"画数{int(strokes)}")
        if pd.notna(jouyou):
            jouyou = int(jouyou)
            if jouyou <= 6:
                meta_parts.append(f"常用小{int(jouyou)}")
            elif jouyou == 8:
                meta_parts.append("常用中")
            else:
                raise ValueError("Unexpected jouyou value")
        if pd.notna(jlpt):
            meta_parts.append(f"JLPT N{int(jlpt)}")

        metadata = "・".join(meta_parts)

        # === On/Kun readings ===
        on_yomi = krow.get("on_readings", [])
        kun_yomi = krow.get("kun_readings", [])
        on_str, kun_str = "", ""
        if isinstance(on_yomi, (list, np.ndarray)):
            on_str = "・".join(list(on_yomi))
        elif on_yomi:
            on_str = str(on_yomi)
        if isinstance(kun_yomi, (list, np.ndarray)):
            kun_str = "・".join(list(kun_yomi))
        elif kun_yomi:
            kun_str = str(kun_yomi)

        # a lone full-width space keeps the line's height when there is no reading
        on_line  = f"音：　{on_str}" if on_str else "　"
        kun_line = f"訓：　{kun_str}" if kun_str else "　"

        onkun_str = f'<div class="on">{on_line}</div><div class="kun">{kun_line}</div>'

        # === Vocab breakdown ===
        sub = vocab_df.loc[vocab_idx.get(kanji, [])].copy()
        if sub.empty:
            vocab_block = ""
        else:
            total = sub["weighted_count"].sum()
            sub = sub.sort_values("freq_rank")

            vocab_lines = []
            added_count, low_count = 0, 0
            for _, v in sub.iterrows():
                if added_count >= top_n_vocab:
                    break

                lemma = v.get("lemma", "")
                # NOTE: a NaN jmdict_reading is truthy, so it does NOT fall through to reading_kata here;
                # it ends up as "" below. vocab_key_from_row builds its keys with the same rule.
                reading = v.get("jmdict_reading") or v.get("reading_kata") or ""
                # ensure strings
                if not isinstance(lemma, str):
                    lemma = ""
                if not isinstance(reading, str):
                    reading = ""

                translations = v.get("translation", "")
                if not isinstance(translations, str):
                    # NaN / None translation: treat as empty instead of crashing in html.escape
                    translations = ""
                translations = html.escape(translations)
                # Skip empty translation rows (I guess appearing when Sudachipy gives a POS or whatever that doesn't
                # match to JMDict)
                if not translations.strip():
                    continue

                percent = int(round(100.0 * v["weighted_count"] / total)) if total > 0 else 0
                percent = 99 if percent == 100 else percent  # 100% is a strong claim...
                percent_str = f"{percent}" if percent >= 1 else "<1"

                # at most 3 "<1%" lines per card; the 4th one ends the block
                if percent < 1:
                    low_count += 1
                    if low_count > 3:
                        break

                # Colour the percent by frequency band
                freq_rank_val = v.get("freq_rank")
                if freq_rank_val <= 5000:
                    freq_band_class = "v-percent--core"
                elif freq_rank_val <= 10000:
                    freq_band_class = "v-percent--mid"
                else:
                    freq_band_class = "v-percent--tail"

                pitch = v.get("pitch_accent", pd.NA)
                ruby_html = build_furigana_html(lemma, reading, pitch=pitch)

                pos = v.get("pos", "")
                if not isinstance(pos, str):
                    pos = ""

                verb_class, transitivity = "", ""
                if pos.startswith("動詞") and not v["is_suru"] and v["lemma"] != "来る":
                    if v.get("is_ichidan"):
                        verb_class = "一"
                    if v.get("is_godan"):
                        verb_class = "五"
                    if v.get("is_it"):
                        transitivity = "自"
                    if v.get("is_vt"):
                        transitivity = "他"
                # fill in whichever half of the badge is missing
                if transitivity == "" and verb_class:
                    transitivity = "自"
                    if lemma in transitive_fallbacks:
                        transitivity = "他"
                if verb_class == "" and transitivity:
                    verb_class = "五"
                    if lemma in ichidan_fallbacks:
                        verb_class = "一"

                verb_badge = verb_class + transitivity

                if verb_badge:
                    translations = f'<span class="verb-flags" data-type="{verb_badge}">{verb_badge}</span> {translations}'

                # === Vocab Line html ===
                audio_id = None
                audio_tag_html = ""

                if audio_dir_exists and lemma and reading:
                    # 1) unique, namespaced *filename* in media folder
                    base = f"wordrank_kanji__{lemma}_{reading}"
                    audio_filename = f"{base}.mp3"
                    audio_path = audio_dir / audio_filename

                    # only add audio if the file actually exists
                    if audio_path.is_file():
                        # 2) unique, namespaced HTML id (never leaves the card)
                        audio_id = f"wrk_kanji_audio__{lemma}_{reading}"
                        # 3) the Anki [sound:...] tag that refers to the filename
                        audio_tag_html = f'[sound:{html.escape(audio_filename)}]'

                # visible vocab part; inner span is the actual tap target
                if audio_id:
                    vocab_part = (
                        f'<div class="v-vocab">'
                        f'  <span class="v-vocab-hit tappable" data-audio="{audio_id}">{ruby_html}</span>'
                        f'</div>'
                        f'<div id="{audio_id}" class="v-audio">{audio_tag_html}</div>'
                    )
                else:
                    vocab_part = (
                        f'<div class="v-vocab">'
                        f'  <span class="v-vocab-hit">{ruby_html}</span>'
                        f'</div>'
                    )

                line_html = (
                    f'<div class="vocab-line">'
                    f'<div class="v-percent {freq_band_class}">{percent_str}</div>'
                    f'{vocab_part}'
                    f'<div class="v-translations">{translations}</div>'
                    f'</div>'
                )

                vocab_lines.append(line_html)
                added_count += 1
                vocab_used.add((lemma, reading, pitch))

            vocab_block = "".join(vocab_lines)

        # === Tags ===
        tags = []

        if pd.notna(jlpt):
            tags.append(f"jlpt_n{int(jlpt)}")
        else:
            tags.append("non_jlpt")

        if pd.notna(jouyou):
            if jouyou <= 6:
                tags.append(f"jouyou_sho{int(jouyou)}")
            elif jouyou == 8:
                tags.append("jouyou_chu")
        else:
            tags.append("non_jouyou")

        if pd.notna(strokes):
            tags.append(f"strokes_{int(strokes):02d}")

        if pd.notna(radical):
            tags.append(f"radical_{int(radical):03d}")

        # freq_rank is "" when the rank is unknown; such a kanji gets no frequency tag
        # (comparing "" with 2500 would raise a TypeError)
        if freq_rank != "":
            if freq_rank <= 2500:
                block = (freq_rank - 1) // 100
                start = block * 100 + 1
                end = start + 99
                tags.append(f"freq_rank_{start:04d}_to_{end:04d}")  # eg 0301_to_0400
            else:
                tags.append("freq_rank_2501_plus")

        tag_str = " ".join(f"wordrank_kanji::{t}" for t in tags)

        # === Assemble row ===
        rows.append({
            "Kanji (Note ID)": kanji,
            "WordRank Kanji Order": deck_rank,
            "Frequency Order": freq_rank,
            "Deck Order (overwrite this one)": deck_rank,
            "Meaning": meaning_str,
            "OnKun": onkun_str,
            "Metadata": metadata,
            "Vocab Block": vocab_block,
            "Tags": tag_str,
        })

    assert len(rows) == len(kanji_df)

    vocab_used = pd.DataFrame(
        list(vocab_used),
        columns=["lemma", "reading", "pitch_accent"]
    )

    return pd.DataFrame(rows), vocab_used


In [None]:
# kanji -> vocab rows containing it, for the per-card vocab blocks
vocab_idx = index_vocab_by_kanji(vocab_df)

# test mode only builds notes for the first 100 rows of kanji_df
kanji_df_ = kanji_df[:100] if test_mode else kanji_df.copy()

notes_df, vocab_used = build_final_kanji_notes(
    kanji_df_,
    vocab_df,
    vocab_idx,
    top_n_vocab=top_n_vocab,
    mp3s_dir=mp3s_dir,
)

# The .tsv is what gets imported into Anki (no header row); the parquet copy is kept alongside it
out_path = out_dir / f"WordRank Kanji{' (test)' if test_mode else ''} {version}.tsv"
notes_df.to_csv(out_path, sep="\t", index=False, header=False, encoding="utf-8")
notes_df.to_parquet(out_dir / "notes_df_end.parquet", index=False)

### You now have the tsv you can import to Anki
Below are stats/checks for the file, followed by the audio generation code. Audio generation isn't part of the main
deck-producing pipeline, because which audio to generate depends on what ends up in your final dataframes. However, for
the cards to reference the audio, build_final_kanji_notes() needs to know which audio files exist (referencing
non-existent files makes Anki sad). So if this is your first run, generate the audio below, and then rerun just the
notes-building cell.

Plot which kanji are in the deck, by frequency rank, broken down by Jōyō grade and JLPT level

In [None]:
# Register a CJK-capable font so the Japanese labels (小 / 中) render
font_path = home_dir / "fonts" / "NotoSansCJK-Regular.ttc"
mpl.font_manager.fontManager.addfont(font_path)
mpl.rcParams['font.family'] = mpl.font_manager.FontProperties(fname=font_path).get_name()

# Work on a copy with nullable-integer levels, so they compare cleanly against the ints below
df = kanji_df.copy()
df['jlpt_level']  = df['jlpt_level'].astype('Int64')
df['jouyou_grade'] = df['jouyou_grade'].astype('Int64')

# --- Y POSITIONS ---
# One horizontal row per category: Jōyō grades 1-6 on rows 1-6, grade 8 on row 7, Non-Jōyō on row 8
jouyou_grades = [1,2,3,4,5,6,8]
jouyou_positions = {g: (g if g != 8 else 7) for g in jouyou_grades}
non_jouyou_row = 8

jlpt_levels = [5,4,3,2,1]
# JLPT rows continue above Non-Jōyō (N5 lowest), offset by an extra 0.5 to create a visual gap between the blocks
jlpt_positions = {lvl: non_jouyou_row + (5 - lvl) + 1.5 for lvl in jlpt_levels}
non_jlpt_row = max(jlpt_positions.values()) + 1

# colours
jouyou_cmap = plt.cm.viridis_r(np.linspace(0.1, 0.75, len(jouyou_grades)))
jlpt_cmap   = plt.cm.magma_r(np.linspace(0.15, 0.6, len(jlpt_levels)))

plt.figure(figsize=(10, 7))
added_labels = set()
MS = 10  # marker size for every scatter

# --- NON−JŌYŌ ---
sub = df[df['jouyou_grade'].isna()]
if len(sub) > 0:
    label = f"Non-Jōyō ({len(sub)})"
    plt.scatter(sub['freq_rank'], np.full(len(sub), non_jouyou_row),
                color="grey", s=MS, alpha=0.25, label=label)
    added_labels.add(label)

# --- JŌYŌ ---
# One scatter per grade; grades without any kanji are skipped (and so get no legend entry)
for g, col in zip(jouyou_grades, jouyou_cmap):
    subset = df[df['jouyou_grade'] == g]
    if len(subset) == 0:
        continue

    y = jouyou_positions[g]
    nice = "中" if g == 8 else f"小{g}"
    label = f"Jōyō {nice} ({len(subset)})"

    plt.scatter(
        subset['freq_rank'], np.full(len(subset), y), color=col, s=MS, alpha=0.25,
        label=None if label in added_labels else label
    )
    added_labels.add(label)

# --- JLPT ---
# Same again per JLPT level
for lvl, col in zip(jlpt_levels, jlpt_cmap):
    subset = df[df['jlpt_level'] == lvl]
    if len(subset) == 0:
        continue

    y = jlpt_positions[lvl]
    label = f"JLPT N{lvl} ({len(subset)})"

    plt.scatter(
        subset['freq_rank'], np.full(len(subset), y), color=col, s=MS, alpha=0.25,
        label=None if label in added_labels else label
    )
    added_labels.add(label)

# --- NON-JLPT ---
sub = df[df['jlpt_level'].isna()]
if len(sub) > 0:
    label = f"Non-JLPT ({len(sub)})"
    plt.scatter(sub['freq_rank'], np.full(len(sub), non_jlpt_row),
                color="grey", s=MS, alpha=0.25, label=label)

# --- AXES ---
plt.xlabel("Frequency rank", fontsize=11)
plt.ylabel("")
plt.xlim(0, 4250)
plt.xticks(np.arange(0, 4001, 500), fontsize=11)

# y ticks: one tick per category row, in the same bottom-to-top order as the rows above
yticks = []
ylabels = []

for g in jouyou_grades:
    yticks.append(jouyou_positions[g])
    ylabels.append("Jōyō 中" if g == 8 else f"Jōyō 小{g}")

yticks.append(non_jouyou_row)
ylabels.append("Non-Jōyō")

for lvl in jlpt_levels:
    yticks.append(jlpt_positions[lvl])
    ylabels.append(f"JLPT N{lvl}")

yticks.append(non_jlpt_row)
ylabels.append("Non-JLPT")

plt.yticks(yticks, ylabels, fontsize=11)
plt.grid(alpha=0.25)

plt.title(f"WordRank Kanji Inclusions (Total: {len(df)})", fontsize=15)

# ---- LEGEND ORDER ----
# Two-column legend: JLPT categories on the left, Jōyō categories on the right, each listed top-to-bottom in
# the same order as the rows of the plot.
handles, labels = plt.gca().get_legend_handles_labels()

# invisible entry, used to pad the shorter column so both columns end on the same line
dummy = plt.Line2D([0], [0], marker='o', markersize=8,
                   linestyle='', color='white', markerfacecolor='white')


def _legend_column(prefixes):
    """(handle, label) pairs for the plotted labels starting with each prefix, in prefix order.

    Categories that were not plotted (no kanji, so no scatter and no label) are left out rather than
    raising, matching the way the scatter code skips empty categories.
    """
    column = []
    for prefix in prefixes:
        for handle, label in zip(handles, labels):
            if label.startswith(prefix):
                column.append((handle, label))
                break
    return column


jlpt_column = _legend_column(
    ["Non-JLPT", "JLPT N1", "JLPT N2", "JLPT N3", "JLPT N4", "JLPT N5"]
)
jouyou_column = _legend_column(
    ["Non-Jōyō", "Jōyō 中", "Jōyō 小6", "Jōyō 小5", "Jōyō 小4", "Jōyō 小3", "Jōyō 小2", "Jōyō 小1"]
)

# The legend fills column by column, so pad each column at the top up to the longer one's length
column_len = max(len(jlpt_column), len(jouyou_column))
legend_entries = []
for column in (jlpt_column, jouyou_column):
    legend_entries += [(dummy, "")] * (column_len - len(column)) + column

leg = plt.legend(
    [handle for handle, _ in legend_entries],
    [label for _, label in legend_entries],
    loc="lower right",
    bbox_to_anchor=(0.98, 0.02),
    fontsize=10.5,
    frameon=True,
    ncol=2,
    columnspacing=1.3,
    labelspacing=0.4,
    borderpad=0.7
)

# The points are drawn semi-transparent; make the legend markers opaque so the colours are readable.
# Best effort only: a handle that cannot take an alpha is simply left as it is.
for h in leg.legend_handles:
    try:
        h.set_alpha(1.0)
    except Exception:
        pass

plt.tight_layout()
plt.show()

How much of the top 5k vocab set is missing from the deck (due to the vocab lines cutoff)?

In [None]:
def has_kanji(s):
    """Return True when `s` is a string with at least one character accepted by `is_kanji`."""
    if not isinstance(s, str):
        return False
    for ch in s:
        if is_kanji(ch):
            return True
    return False

def vocab_key_from_row(row):
    """
    Build the (lemma, reading) lookup key for one vocab row.

    row: any object with a dict-style .get(), e.g. a dict or a pandas Series.

    The lemma comes from "lemma". The reading is the first of "jmdict_reading"
    and "reading_kata" that holds a non-empty string. Anything missing or
    non-string (None, NaN, pd.NA, numbers) counts as absent and yields "".

    Returns a (lemma, reading) tuple of plain strings.
    """
    def first_text(*fields):
        for field in fields:
            value = row.get(field)
            # Test the type explicitly: NaN is truthy, so an `a or b` chain would stop
            # on a missing jmdict_reading and never reach the reading_kata fallback.
            if isinstance(value, str) and value:
                return value
        return ""

    return (first_text("lemma"), first_text("jmdict_reading", "reading_kata"))

# Every (lemma, reading) pair present in vocab_used, with missing values as ""
used_lemmas = vocab_used["lemma"].fillna("").astype(str)
used_readings = vocab_used["reading"].fillna("").astype(str)
vocab_used_keys = set(zip(used_lemmas, used_readings))

# Only vocab that has a frequency rank, most frequent first
vf = vocab_df.loc[vocab_df["freq_rank"].notna()].sort_values("freq_rank")

# --- top 5k with kanji in lemma ---
in_top_5k = vf["freq_rank"] <= 5000
lemma_has_kanji = vf["lemma"].apply(has_kanji)
top5 = vf[in_top_5k & lemma_has_kanji].copy()
top5["key"] = top5.apply(vocab_key_from_row, axis=1)

# Keep one row per key whose key is not among the vocab_used keys
not_in_deck = ~top5["key"].isin(vocab_used_keys)
missing_top5 = top5[not_in_deck].drop_duplicates("key")

cols = ["lemma", "reading_kata", "jmdict_reading", "translation", "pitch_accent", "freq_rank"]
# NOTE(review): astype(str) turns a missing translation (NaN/None) into "nan"/"None",
# which passes this filter — confirm translations are always strings at this point.
has_translation = missing_top5["translation"].astype(str) != ""
missing_top5 = missing_top5.loc[has_translation, cols]

# These are the vocab in the top 5k that have a translation but never appear in the deck.
# I think it's acceptable
missing_top5

In [None]:
# Deliberate stop for "Run All": raising here halts execution before the cells below,
# which are meant to be run by hand.
raise Exception("Break the auto-run here because this last part is a bit more intentional")

Audio generation

In [None]:
# Generate wav readings of the vocabs

def iter_mora_chunks(reading_hira: str):
    """
    Yield the chunks of a kana string, in order.

    A chunk is one base character plus every directly following character that is
    in SMALL_KANA or is the long vowel mark "ー". The first character always opens
    a chunk, whatever it is.

    NOTE(review): "ー" is normally counted as a mora of its own in accent notation;
    merging it into the previous chunk may shift or invalidate the accent index for
    readings that contain it — confirm against the pitch data before relying on it.
    """
    chunk = ""
    for ch in reading_hira:
        if chunk and (ch in SMALL_KANA or ch == "ー"):
            # trailing small kana / long vowel mark: stays with the current chunk
            chunk += ch
            continue
        if chunk:
            yield chunk
        chunk = ch
    if chunk:
        yield chunk


def build_yomigana_with_pitch(reading: str, pitch) -> str | None:
    """
    reading: kana (hiragana/katakana)
    pitch: NHK-style accent index (0 = heiban, 1..N = drop after mora p)
    Returns something like '^は!し' or '^あめ!' for alphabet='yomigana'.
    """
    if not isinstance(reading, str) or not reading.strip():
        return None

    reading_hira = jaconv.kata2hira(reading)
    mora_chunks = list(iter_mora_chunks(reading_hira))
    total_mora = len(mora_chunks)

    try:
        p = int(pitch) if pitch is not None else 0
    except (TypeError, ValueError):
        p = 0

    if p < 0 or p > total_mora:
        p = 0  # treat garbage as heiban

    out = ["^"]
    for i, chunk in enumerate(mora_chunks, start=1):
        out.append(chunk)
        if p > 0 and i == p:
            out.append("!")

    return "".join(out)


def get_and_save_tts(lemma: str, reading: str, pitch_accent):
    """
    Synthesize audio for a single (lemma, reading, pitch_accent) and save it as
    `wavs_dir / "wordrank_kanji__{lemma}_{reading}.wav"`.

    Does nothing when `pitch_accent` is missing (pd.isna) or the WAV already exists.

    The voice is picked deterministically from `voices` using the first byte of
    sha256("{lemma}+{reading}"), so a given vocab always gets the same voice.

    Raises:
        ValueError: `reading` cannot be turned into a yomigana string.
        google.api_core.exceptions.ResourceExhausted: still rejected on the last retry.
        Any other exception from the TTS client or from audio decoding/export.
    """
    if pd.isna(pitch_accent):
        return

    wav_path = wavs_dir / f"wordrank_kanji__{lemma}_{reading}.wav"
    if wav_path.exists():
        return

    yomi = build_yomigana_with_pitch(reading, pitch_accent)
    if yomi is None:
        # Fail with a clear message instead of an AttributeError inside html.escape(None)
        raise ValueError(f"No usable kana reading for {lemma}+{reading!r}")

    hash_this = f"{lemma}+{reading}"
    h = hashlib.sha256(hash_this.encode("utf-8")).digest()
    voice_name = voices[h[0] % len(voices)]
    voice = texttospeech.VoiceSelectionParams(
        language_code="ja-JP",
        name=voice_name,
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
    )

    safe_yomi = html.escape(yomi)
    safe_reading = html.escape(reading)

    # The kana reading is used as the tag's spoken text. Using the lemma there was
    # also tried; neither variant forces the expected pronunciation.
    ssml = f'<speak><phoneme alphabet="yomigana" ph="{safe_yomi}">{safe_reading}</phoneme></speak>'

    synthesis_input = texttospeech.SynthesisInput(ssml=ssml)

    response = None
    max_retries = 10
    for attempt in range(max_retries):
        try:
            response = client.synthesize_speech(
                input=synthesis_input,
                voice=voice,
                audio_config=audio_config,
            )
            break
        except exceptions.ResourceExhausted:
            if attempt == max_retries - 1:
                raise
            # exponential backoff with jitter before the next attempt
            sleep_s = (2 ** attempt) + random.random()
            time.sleep(sleep_s)

    if response is None:
        raise RuntimeError(f"TTS synthesis failed for {lemma}+{reading!r}")

    wav_buf = BytesIO(response.audio_content)
    audio = AudioSegment.from_file(wav_buf, format="wav")

    # Export to a uniquely named temp file and rename it into place, so an interrupted
    # export can never leave a truncated WAV that the exists-check above would then
    # skip on every later run. The ".part" suffix keeps it out of "*.wav" globs.
    tmp_path = wav_path.with_name(f"{wav_path.name}.{random.getrandbits(32):08x}.part")
    try:
        # export() returns the output file handle; close it before renaming
        audio.export(tmp_path, format="wav").close()
        tmp_path.replace(wav_path)
    finally:
        tmp_path.unlink(missing_ok=True)


def tts_for_row(row):
    """
    Run get_and_save_tts for one row exposing .lemma, .reading and .pitch_accent.

    Returns None on success, or "{lemma}+{reading}: {exception repr}" when the
    synthesis raised, so a worker thread reports a failure instead of propagating it.
    """
    lemma, reading, pitch_accent = row.lemma, row.reading, row.pitch_accent

    try:
        get_and_save_tts(lemma, reading, pitch_accent)
    except Exception as exc:
        return f"{lemma}+{reading}: {exc!r}"
    return None


# Point the Google client library at the service-account key file next to the notebook
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(home_dir / "sa.json")
client = texttospeech.TextToSpeechClient()

# Voice pool; get_and_save_tts picks one per vocab from a hash of lemma+reading
voices = [
    "ja-JP-Chirp3-HD-Alnilam",
    "ja-JP-Chirp3-HD-Aoede",
    "ja-JP-Chirp3-HD-Callirrhoe",
    "ja-JP-Chirp3-HD-Fenrir",
    "ja-JP-Chirp3-HD-Iapetus",
    "ja-JP-Chirp3-HD-Leda",
    "ja-JP-Chirp3-HD-Orus",
    "ja-JP-Chirp3-HD-Rasalgethi",
    "ja-JP-Chirp3-HD-Sulafat",
]

wavs_dir.mkdir(parents=True, exist_ok=True)

rows = list(vocab_used.itertuples(index=False))
print(f"Total number of vocabs (including ones without pitch): {len(rows)}")

workers = 4
failed = []  # one error string per vocab whose synthesis raised
with ThreadPoolExecutor(max_workers=workers) as pool:
    futures = [pool.submit(tts_for_row, row) for row in rows]

    completed = 0
    for finished in as_completed(futures):
        completed += 1
        outcome = finished.result()
        if outcome is not None:
            failed.append(outcome)
        if completed % 1000 == 0:
            print(f"Done {completed}")

print(f"Failed {len(failed)} vocabs")

In [None]:
# Reconcile the mp3 files on disk with the vocab that should have audio

def parse_audio_filename(filename):
    """
    Extract (lemma, reading) from a name like 'wordrank_kanji__<lemma>_<reading>.mp3'.

    Returns None when the name does not split into exactly a lemma and a reading
    (e.g. '.DS_Store'), so stray files are skipped instead of raising ValueError.
    """
    parts = filename.split("__")[-1].split(".")[0].split("_")
    if len(parts) != 2:
        return None
    return (parts[0], parts[1])


check_dir = mp3s_dir
files = os.listdir(check_dir)
print(f"Total files: {len(files)}")

# Vocabs (from vocab_used) with a pitch accent
valid_vocabs = {
    (str(row.lemma), str(row.reading))
    for row in vocab_used.itertuples(index=False)
    if not pd.isna(row.pitch_accent)
}
print(f"Should have: {len(valid_vocabs)}")

# Vocabs with an audio file
vocab_by_file = {f: parse_audio_filename(f) for f in files}
unrecognised = sorted(f for f, vocab in vocab_by_file.items() if vocab is None)
if unrecognised:
    print(f"Ignoring {len(unrecognised)} files with unexpected names, e.g. {unrecognised[:5]}")
found_vocabs = {vocab for vocab in vocab_by_file.values() if vocab is not None}
print(f"Have: {len(found_vocabs)}")

should_del = found_vocabs - valid_vocabs
print(f"Should delete: {len(should_del)}")

# `missing` is read again by the wav -> mp3 conversion cell
missing = valid_vocabs - found_vocabs
print(f"Should generate: {len(missing)}")

# Safety switch: set to True to actually delete the files counted under "Should delete"
delete_stale_files = False
if delete_stale_files:
    for f, vocab in vocab_by_file.items():
        if vocab in should_del:
            os.remove(check_dir / f)

In [None]:
# Cut wavs and export to mp3

def detect_leading_silence(sound, *, silence_thresh_db=-35, chunk_size_ms=10, safety_margin_ms=50):
    """
    Return how many milliseconds can be trimmed from the start of `sound`.

    `sound` is scanned in consecutive windows of `chunk_size_ms`. The first window
    whose dBFS is strictly above `silence_thresh_db` marks the onset; the result is
    that window's start minus `safety_margin_ms`, never below 0.

    Returns 0 for an empty sound and when no window exceeds the threshold.
    """
    total_ms = len(sound)
    if total_ms == 0:
        return 0

    window_count = max(1, int(np.ceil(total_ms / chunk_size_ms)))
    for onset_ms in (idx * chunk_size_ms for idx in range(window_count)):
        # use pydub's dBFS (already relative to full scale); digital silence is -inf
        level = sound[onset_ms:onset_ms + chunk_size_ms].dBFS
        if level == float("-inf"):
            level = -1000.0
        if level > silence_thresh_db:
            return max(0, onset_ms - safety_margin_ms)

    return 0

def _wav_to_mp3(wav_path, mp3_path):
    """Trim leading silence from one wav, normalise its loudness and export it as a small mono mp3."""
    audio = AudioSegment.from_wav(wav_path)

    # strip leading silence
    start_trim = detect_leading_silence(audio)
    trimmed = audio[start_trim:] if start_trim < len(audio) else audio

    # normalise loudness (skip if completely silent)
    if np.isinf(trimmed.dBFS):
        normalised = trimmed
    else:
        target_dbfs = -18
        normalised = trimmed.apply_gain(target_dbfs - trimmed.dBFS)

    # keep it small: mono, modest sample rate, low-ish bitrate
    normalised = normalised.set_channels(1).set_frame_rate(24000)
    normalised.export(
        mp3_path,
        format="mp3",
        bitrate="40k"  # bump to 64k if you want it nicer
    )


mp3s_dir.mkdir(parents=True, exist_ok=True)

# Process every wav in wavs_dir (no hard-coded single-file override of this list)
wav_files = sorted(wavs_dir.glob("*.wav"))
print(f"Found {len(wav_files)} wav files")

# When True, only convert wavs whose (lemma, reading) is in `missing`, the set
# computed by the mp3 reconciliation cell — run that cell first.
only_make_missing_files = True

failed_mp3 = []

for n, wav_path in enumerate(wav_files, start=1):
    mp3_path = mp3s_dir / (wav_path.stem + ".mp3")

    wanted = not mp3_path.exists()
    if wanted and only_make_missing_files:
        name_parts = wav_path.stem.split("__")[-1].split("_")
        if len(name_parts) != 2:
            # cannot tell which vocab this is; report it instead of crashing the loop
            failed_mp3.append((wav_path.name, "unexpected file name"))
            wanted = False
        else:
            wanted = (name_parts[0], name_parts[1]) in missing

    if wanted:
        try:
            # wav_path from glob() already includes wavs_dir, so it is used as-is
            _wav_to_mp3(wav_path, mp3_path)
        except Exception as e:
            failed_mp3.append((wav_path.name, repr(e)))

    # reported for every file, including skipped ones, so the counter stays meaningful
    if n % 1000 == 0:
        print(f"Processed {n} wav files")

print(f"Done. Failed: {len(failed_mp3)}")

If you want to jump back in from a final set of dfs

In [None]:
# Reload the three saved end-of-pipeline DataFrames from out_dir
fp = out_dir / "vocab_df_end.parquet"
vocab_df_end = pd.read_parquet(fp)
fp = out_dir / "kanji_df_end.parquet"
kanji_df_end = pd.read_parquet(fp)
fp = out_dir / "notes_df_end.parquet"
notes_df_end = pd.read_parquet(fp)

# Rebuild the vocab index from the reloaded vocab frame
vocab_idx = index_vocab_by_kanji(vocab_df_end)

# Re-parse the reference data files from data_dir
kanjidic2_df = parse_kanjidic2_to_df(data_dir / "kanjidic2.xml")
jmdict_df = parse_jmdict_to_df(data_dir / "JMdict_e.xml", JMDICT_ENTITIES)
pitch_df = parse_pitch_accents_to_df(data_dir / "pitch_accents.txt")

# Only the second return value (vocab_used) is kept; the first is discarded
_, vocab_used = build_final_kanji_notes(kanji_df_end, vocab_df_end, vocab_idx, top_n_vocab=top_n_vocab, mp3s_dir=mp3s_dir)