In [1]:
import pandas as pd

In [2]:
df_hsk = pd.concat(
    (
        pd.read_csv(f"data/hsk{i}.csv", names=["simplified", "pinyin", "meanings"])
        .assign(level=i)
    )
    for i in range(1,7)
)[["simplified", "pinyin", "meanings", "level"]]

In [3]:
def vowelsToTone(pinyin):
    accentMap = {
        "a": ["ā", "á", "ǎ", "à"],  #, "a"],
        "A": ["Ā", "Á", "Ǎ", "À"],  #, "A"],
        "e": ["ē", "é", "ě", "è"],  #, "e"],
        "E": ["Ē", "É", "Ě", "È"],  #, "E"],
        "i": ["ī", "í", "ǐ", "ì"],  #, "i"],
        "I": ["Ī", "Í", "Ǐ", "Ì"],  #, "I"],
        "o": ["ō", "ó", "ǒ", "ò"],  #, "o"],
        "O": ["Ō", "Ó", "Ǒ", "Ò"],  #, "o"],
        "u": ["ū", "ú", "ǔ", "ù"],  #, "u"],
        "U": ["Ū", "Ú", "Ǔ", "Ù"],  #, "u"],
        "v": ["ǖ", "ǘ", "ǚ", "ǜ"],  #, "ü"],
        "V": ["Ǖ", "Ǘ", "Ǚ", "Ǜ"],  #, "Ü"],
    }
    def vowelToTone(word):
        for noAccent, accents in accentMap.items():
            for i, accent in enumerate(accents):
                if accent in word:
                    return f"{word.replace(accent, noAccent)}{i+1}"
        return f"{word}5"
        
    return " ".join(vowelToTone(x) for x in pinyin.split(" "))

df_hsk = df_hsk.assign(pinyin=df_hsk["pinyin"].map(vowelsToTone))
df_hsk

Unnamed: 0,simplified,pinyin,meanings,level
0,爱,ai4,to love,1
1,八,ba1,eight,1
2,爸爸,ba4 ba5,(informal) father,1
3,杯子,bei1 zi5,cup,1
4,北京,Bei3 jing1,Beijing,1
...,...,...,...,...
2495,座右铭,zuo4 you4 ming2,motto,6
2496,作弊,zuo4 bi4,to practice fraud,6
2497,作废,zuo4 fei4,to become invalid,6
2498,作风,zuo4 feng1,style,6


In [4]:
# https://github.com/mike-fabian/ibus-table-chinese/blob/main/tables/
df_cj = pd.read_csv(
#     "data/cangjie5.txt",
    "data/cangjie-big.txt",
    comment="#",
    sep="\t",
    quoting=3,  # csv.QUOTE_NONE
#     skiprows=148,
    skiprows=160,
    skipfooter=2,
    keep_default_na=False,
    names=["cangjie", "hanzi", "noidea"],
)[["hanzi", "cangjie"]]

df_cj = (
    df_cj
    .assign(
        hanzi=df_cj["hanzi"].astype("string"),
        cangjie=df_cj["cangjie"].astype("string")
    )
    .groupby("hanzi")
    .agg("first")  # first is fine because the only duplicates are weird symbols
)

  df_cj = pd.read_csv(


In [5]:
df_wb = pd.read_csv(
    "data/wubi-haifeng86.UTF-8",
    comment="#",
    sep="\t",
    quoting=3,  # csv.QUOTE_NONE
    skiprows=0,
    skipfooter=0,
    keep_default_na=False,
    names=["wubi", "hanzi", "noidea"],
)[["hanzi", "wubi"]]

# not sure what len(hanzi)>1 means when it has a full stop, but just remove
df_wb = df_wb[df_wb["hanzi"].map(len).eq(1)]#.groupby("hanzi").agg({"wubi": lambda x: " ".join(x)})
df_wb = (
    df_wb
    .assign(
        hanzi=df_wb["hanzi"].astype("string"),
        wubi=df_wb["wubi"].astype("string")
    )
    .groupby("hanzi")
    # there are many ways to type the same zi but choose the shortest one for our purposes
    .agg(lambda x: min(sorted(x), key=len))
)

In [6]:
df_hsk["level"].value_counts()

6    2500
5    1300
4     601
3     299
1     150
2     150
Name: level, dtype: int64

In [34]:
hsk_levels = {1,2,3,4,5,6}
words = df_hsk[df_hsk["level"].isin(hsk_levels)]["simplified"]
zi_set = set(''.join(words))
len(zi_set)

2632

In [41]:
df_hier = df_cj[df_cj.index.isin(zi_set)].join(df_wb, how="inner")

assert len(df_hier) == len(zi_set)
assert sum(df_hier["cangjie"].str.contains(" ")) == 0
assert sum(df_hier["wubi"].str.contains(" ")) == 0

df_hier.head()

Unnamed: 0_level_0,cangjie,wubi
hanzi,Unnamed: 1_level_1,Unnamed: 2_level_1
一,m,g
丁,mn,sgh
七,jv,ag
万,ms,dnv
丈,jk,dyi


In [42]:
import networkx as nx
import math
import numpy as np

In [43]:
def construct_graph(words: pd.Series):
    g = nx.DiGraph()
        
    for _, word in words.items():
        for i in range(len(word) - 1):
            source, target = word[i:i+2]
            # TODO handle same adjacency for different ci
            g.add_edge(source, target, duality="adjacency", ci=word)
            
    return g

In [44]:
g = construct_graph(words)

In [47]:
cc = list(nx.weakly_connected_components(g))
len(cc), [len(x) for x in cc]

(129,
 [1,
  2072,
  1,
  3,
  3,
  5,
  1,
  2,
  1,
  3,
  2,
  2,
  2,
  1,
  2,
  2,
  1,
  2,
  2,
  3,
  2,
  2,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  2,
  1,
  2,
  2,
  2,
  1,
  2,
  2,
  2,
  1,
  2,
  3,
  2,
  4,
  2,
  4,
  2,
  3,
  2,
  2,
  3,
  2,
  3,
  2,
  4,
  2,
  3,
  2,
  3,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  4,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  3,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  4,
  2,
  3,
  3,
  2,
  2,
  1,
  2,
  3,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2])

In [51]:
import s_gd2

In [62]:
g2 = g.subgraph(cc[1])
zi2idx = {v: i for i, v in enumerate(cc[1])}
I = [zi2idx[source] for source, _ in g2.edges]
J = [zi2idx[target] for _, target in g2.edges]


In [63]:
X = s_gd2.layout(I, J)

In [64]:
X

array([[ 1.40296187, -2.08052172],
       [ 0.52238526,  0.3472082 ],
       [ 3.58395206, -1.14116816],
       ...,
       [ 0.2359335 , -1.28459933],
       [-1.48376138,  1.22610832],
       [ 4.41641147, -0.75649418]])

In [68]:
s_gd2.draw_svg(X, I, J, "stress.svg", noderadius=.05, nodeopacity=.5, linkopacity=0.1)