In [2]:
import pandas as pd

In [3]:
df_hsk = pd.concat(
    (
        pd.read_csv(f"data/hsk{i}.csv", names=["simplified", "pinyin", "meanings"])
        .assign(level=i)
    )
    for i in range(1,7)
)[["simplified", "pinyin", "meanings", "level"]]

In [46]:
def vowelsToTone(pinyin):
    accentMap = {
        "a": ["ā", "á", "ǎ", "à"],  #, "a"],
        "A": ["Ā", "Á", "Ǎ", "À"],  #, "A"],
        "e": ["ē", "é", "ě", "è"],  #, "e"],
        "E": ["Ē", "É", "Ě", "È"],  #, "E"],
        "i": ["ī", "í", "ǐ", "ì"],  #, "i"],
        "I": ["Ī", "Í", "Ǐ", "Ì"],  #, "I"],
        "o": ["ō", "ó", "ǒ", "ò"],  #, "o"],
        "O": ["Ō", "Ó", "Ǒ", "Ò"],  #, "o"],
        "u": ["ū", "ú", "ǔ", "ù"],  #, "u"],
        "U": ["Ū", "Ú", "Ǔ", "Ù"],  #, "u"],
        "v": ["ǖ", "ǘ", "ǚ", "ǜ"],  #, "ü"],
        "V": ["Ǖ", "Ǘ", "Ǚ", "Ǜ"],  #, "Ü"],
    }
    def vowelToTone(word):
        for noAccent, accents in accentMap.items():
            for i, accent in enumerate(accents):
                if accent in word:
                    return f"{word.replace(accent, noAccent)}{i+1}"
        return f"{word}5"
        
    return " ".join(vowelToTone(x) for x in pinyin.split(" "))

df_hsk = df_hsk.assign(pinyin=df_hsk["pinyin"].map(vowelsToTone))
df_hsk

Unnamed: 0,simplified,pinyin,meanings,level
0,爱,ai45,to love,1
1,八,ba15,eight,1
2,爸爸,ba45 ba55,(informal) father,1
3,杯子,bei15 zi55,cup,1
4,北京,Bei35 jing15,Beijing,1
...,...,...,...,...
2495,座右铭,zuo45 you45 ming25,motto,6
2496,作弊,zuo45 bi45,to practice fraud,6
2497,作废,zuo45 fei45,to become invalid,6
2498,作风,zuo45 feng15,style,6


In [105]:
# https://github.com/mike-fabian/ibus-table-chinese/blob/main/tables/
df_cj = pd.read_csv(
#     "data/cangjie5.txt",
    "data/cangjie-big.txt",
    comment="#",
    sep="\t",
    quoting=3,  # csv.QUOTE_NONE
#     skiprows=148,
    skiprows=160,
    skipfooter=2,
    keep_default_na=False,
    names=["cangjie", "hanzi", "noidea"],
)[["hanzi", "cangjie"]]

df_cj = (
    df_cj
    .assign(
        hanzi=df_cj["hanzi"].astype("string"),
        cangjie=df_cj["cangjie"].astype("string")
    )
    .groupby("hanzi")
    .agg("first")  # first is fine because the only duplicates are weird symbols
)

  df_cj = pd.read_csv(


In [108]:
df_wb = pd.read_csv(
    "data/wubi-haifeng86.UTF-8",
    comment="#",
    sep="\t",
    quoting=3,  # csv.QUOTE_NONE
    skiprows=0,
    skipfooter=0,
    keep_default_na=False,
    names=["wubi", "hanzi", "noidea"],
)[["hanzi", "wubi"]]

# not sure what len(hanzi)>1 means when it has a full stop, but just remove
df_wb = df_wb[df_wb["hanzi"].map(len).eq(1)]#.groupby("hanzi").agg({"wubi": lambda x: " ".join(x)})
df_wb = (
    df_wb
    .assign(
        hanzi=df_wb["hanzi"].astype("string"),
        wubi=df_wb["wubi"].astype("string")
    )
    .groupby("hanzi")
    # there are many ways to type the same zi but choose the shortest one for our purposes
    .agg(lambda x: min(sorted(x), key=len))
)

In [109]:
hsk_levels = {1,2,3}
# df_hsk["simplified"].str.split("")
zi_set = set(''.join(df_hsk[df_hsk["level"].isin(hsk_levels)]["simplified"]))
zi_set

{'一',
 '七',
 '万',
 '丈',
 '三',
 '上',
 '下',
 '不',
 '且',
 '世',
 '业',
 '东',
 '两',
 '个',
 '中',
 '为',
 '主',
 '举',
 '久',
 '么',
 '乎',
 '乐',
 '九',
 '也',
 '习',
 '书',
 '买',
 '了',
 '事',
 '二',
 '于',
 '云',
 '五',
 '些',
 '京',
 '亮',
 '人',
 '什',
 '今',
 '介',
 '从',
 '他',
 '以',
 '们',
 '件',
 '休',
 '会',
 '伞',
 '但',
 '位',
 '低',
 '住',
 '体',
 '作',
 '你',
 '使',
 '便',
 '信',
 '候',
 '借',
 '做',
 '健',
 '像',
 '儿',
 '元',
 '先',
 '八',
 '公',
 '六',
 '共',
 '关',
 '兴',
 '其',
 '典',
 '再',
 '冒',
 '写',
 '冬',
 '冰',
 '决',
 '冷',
 '净',
 '准',
 '几',
 '出',
 '分',
 '刚',
 '别',
 '刮',
 '到',
 '刷',
 '刻',
 '前',
 '力',
 '办',
 '加',
 '务',
 '动',
 '助',
 '努',
 '包',
 '化',
 '北',
 '医',
 '十',
 '千',
 '午',
 '半',
 '单',
 '卖',
 '南',
 '历',
 '厨',
 '去',
 '参',
 '又',
 '友',
 '双',
 '发',
 '叔',
 '变',
 '口',
 '句',
 '只',
 '叫',
 '可',
 '史',
 '右',
 '号',
 '司',
 '吃',
 '同',
 '名',
 '后',
 '向',
 '吗',
 '吧',
 '听',
 '告',
 '员',
 '呢',
 '周',
 '和',
 '咖',
 '响',
 '哥',
 '哪',
 '哭',
 '唱',
 '商',
 '啊',
 '啡',
 '啤',
 '喂',
 '喜',
 '喝',
 '四',
 '回',
 '因',
 '园',
 '国',
 '图',
 '在',
 '地',
 '场',
 '坏',
 '坐'

In [116]:
df_hier = df_cj[df_cj.index.isin(zi_set)].join(df_wb, how="inner")

assert len(df_hier) == len(zi_set)
assert sum(df_hier["cangjie"].str.contains(" ")) == 0
assert sum(df_hier["wubi"].str.contains(" ")) == 0

df_hier

Unnamed: 0_level_0,cangjie,wubi
hanzi,Unnamed: 1_level_1,Unnamed: 2_level_1
一,m,g
七,jv,ag
万,ms,dnv
丈,jk,dyi
三,mmm,dg
...,...,...
鸟,hsim,qyng
鸡,ehsm,cqy
黄,twc,amw
黑,wgf,lfo


hierarchy needs a placement for leaves and branches
for cangjie we can just use alphabetical order for positioning nodes
                 abcdefghijklmnopqrstuvwxyz
for wubi we want gfdsahjklmtrewqyuiopnbvcxz

but what about when there is a ab abc, where a and b are both branch and leaf?
I think the cleanest is something like:
     a
    / \
   .   b
   |  / \
   . .   c

this keeps the layout planar
if there is a long line of single children then we can consider only keeping the first single parent with the leaf
I don't think we have enough layers for this to be a big issue though