In [69]:
import pandas as pd

In [70]:
# Read the six HSK word lists (data/hsk1.csv ... data/hsk6.csv) and stack them
# into one table, tagging every row with the level of the file it came from.
# pd.concat is called without ignore_index, so each file's own row labels are
# kept and repeat from one level to the next.
df_hsk = pd.concat(
    [
        pd.read_csv(
            f"data/hsk{level}.csv",
            names=["simplified", "pinyin", "meanings"],
        ).assign(level=level)
        for level in range(1, 7)
    ]
).loc[:, ["simplified", "pinyin", "meanings", "level"]]

In [71]:
def vowelsToTone(pinyin):
    """Convert tone-marked pinyin into tone-numbered pinyin.

    The input is split on whitespace into syllables.  In each syllable the
    tone-marked vowel is replaced by its plain letter (``ü`` is written as
    ``v``) and the tone number 1-4 is appended; a syllable without a tone
    mark gets the neutral-tone number 5.

    Edge cases handled here:

    * Leading, trailing and repeated whitespace is ignored and empty input
      gives ``""``, instead of producing stray ``"5"`` syllables.
    * An unmarked ``ü``/``Ü`` is written as ``v``/``V`` as well, matching what
      is done for the tone-marked forms.
    * A syllable that already ends in a tone digit is left as it is, so
      applying the function to its own output changes nothing.

    Args:
        pinyin: String of whitespace-separated syllables, e.g. ``"bà ba"``.

    Returns:
        The syllables in tone-numbered form joined by single spaces,
        e.g. ``"ba4 ba5"``.
    """
    # Plain letter -> its four tone-marked forms, in tone order 1..4.
    accentMap = {
        "a": ["ā", "á", "ǎ", "à"],
        "A": ["Ā", "Á", "Ǎ", "À"],
        "e": ["ē", "é", "ě", "è"],
        "E": ["Ē", "É", "Ě", "È"],
        "i": ["ī", "í", "ǐ", "ì"],
        "I": ["Ī", "Í", "Ǐ", "Ì"],
        "o": ["ō", "ó", "ǒ", "ò"],
        "O": ["Ō", "Ó", "Ǒ", "Ò"],
        "u": ["ū", "ú", "ǔ", "ù"],
        "U": ["Ū", "Ú", "Ǔ", "Ù"],
        "v": ["ǖ", "ǘ", "ǚ", "ǜ"],
        "V": ["Ǖ", "Ǘ", "Ǚ", "Ǜ"],
    }
    toneDigits = ("1", "2", "3", "4", "5")

    def vowelToTone(word):
        # The first tone mark found (in accentMap order) decides the tone.
        for noAccent, accents in accentMap.items():
            for i, accent in enumerate(accents):
                if accent in word:
                    return f"{word.replace(accent, noAccent)}{i+1}"
        # No tone mark: keep the ü -> v spelling consistent with the marked case.
        plain = word.replace("ü", "v").replace("Ü", "V")
        if plain.endswith(toneDigits):
            # Already numbered (e.g. this function's own output): do not
            # append a second digit.
            return plain
        return f"{plain}5"

    # split() without an argument drops empty tokens, so "" -> "".
    return " ".join(vowelToTone(x) for x in pinyin.split())

# Replace the pinyin column with the result of vowelsToTone for every row,
# then display the table.
# NOTE: df_hsk is overwritten here, so on a re-run this cell receives its own
# previous output as input.
df_hsk = df_hsk.assign(pinyin=lambda frame: frame["pinyin"].map(vowelsToTone))
df_hsk

Unnamed: 0,simplified,pinyin,meanings,level
0,爱,ai4,to love,1
1,八,ba1,eight,1
2,爸爸,ba4 ba5,(informal) father,1
3,杯子,bei1 zi5,cup,1
4,北京,Bei3 jing1,Beijing,1
...,...,...,...,...
2495,座右铭,zuo4 you4 ming2,motto,6
2496,作弊,zuo4 bi4,to practice fraud,6
2497,作废,zuo4 fei4,to become invalid,6
2498,作风,zuo4 feng1,style,6


In [72]:
# Cangjie codes, read from a table file taken from
# https://github.com/mike-fabian/ibus-table-chinese/blob/main/tables/
# (An alternative table, "data/cangjie5.txt", was previously read with
# skiprows=148.)
df_cj = pd.read_csv(
    "data/cangjie-big.txt",
    comment="#",
    sep="\t",
    quoting=3,  # csv.QUOTE_NONE
    skiprows=160,  # presumably the table's header section -- TODO confirm
    skipfooter=2,
    # skipfooter is not supported by the C parser; naming the Python parser
    # explicitly avoids the ParserWarning pandas emits when it has to fall back.
    engine="python",
    keep_default_na=False,  # keep literal strings instead of turning them into NaN
    names=["cangjie", "hanzi", "noidea"],
)[["hanzi", "cangjie"]]  # the third column is dropped

# One row per character, indexed by "hanzi".
df_cj = (
    df_cj
    .astype({"hanzi": "string", "cangjie": "string"})
    .groupby("hanzi")
    .agg("first")  # first is fine because the only duplicates are weird symbols
)

  df_cj = pd.read_csv(


In [73]:
# Wubi codes from the haifeng86 table file, reduced to one code per character
# and indexed by "hanzi".
df_wb = (
    pd.read_csv(
        "data/wubi-haifeng86.UTF-8",
        names=["wubi", "hanzi", "noidea"],
        sep="\t",
        comment="#",
        quoting=3,  # csv.QUOTE_NONE
        skiprows=0,
        skipfooter=0,
        keep_default_na=False,
    )
    .loc[:, ["hanzi", "wubi"]]
    # not sure what len(hanzi)>1 means when it has a full stop, but just remove
    .loc[lambda table: table["hanzi"].map(len) == 1]
    .astype({"hanzi": "string", "wubi": "string"})
    .groupby("hanzi")
    # there are many ways to type the same zi but choose the shortest one for
    # our purposes; ties between equally short codes go to the one that sorts first
    .agg(lambda codes: min(codes, key=lambda code: (len(code), code)))
)

In [74]:
# Number of rows contributed by each HSK level.
df_hsk.loc[:, "level"].value_counts()

6    2500
5    1300
4     601
3     299
1     150
2     150
Name: level, dtype: int64

In [121]:
# HSK levels whose words are used from here on.
hsk_levels = {1, 2, 3}
# The words (ci) that belong to those levels.
words = df_hsk.loc[df_hsk["level"].isin(hsk_levels), "simplified"]
# Every distinct character (zi) occurring in any of those words.
zi_set = {zi for word in words for zi in word}
len(zi_set)

618

In [127]:
import networkx as nx
import math
import numpy as np

In [128]:
def construct_graph(words: pd.Series):
    """Build a directed graph linking each character to the one that follows it.

    For every word, each pair of neighbouring characters becomes an edge from
    the earlier character to the later one, carrying ``duality="adjacency"``
    and ``ci`` set to the word the pair was taken from.  Words shorter than
    two characters yield no pairs, so they add nothing to the graph.

    Args:
        words: Series of words; only the values are used, not the labels.

    Returns:
        The resulting ``nx.DiGraph``.
    """
    graph = nx.DiGraph()

    for _label, ci in words.items():
        # Pair every character with its successor: "abc" -> (a, b), (b, c).
        for left, right in zip(ci, ci[1:]):
            # TODO handle same adjacency for different ci
            graph.add_edge(left, right, duality="adjacency", ci=ci)

    return graph

In [129]:
# Character-adjacency graph of the selected words.
g = construct_graph(words=words)

In [132]:
# Split the graph into its weakly connected components, then show the node
# count, the number of components, and the size of each component.
cc = [component for component in nx.weakly_connected_components(g)]
g.number_of_nodes(), len(cc), list(map(len, cc))

(454,
 103,
 [1,
  191,
  13,
  3,
  2,
  4,
  4,
  3,
  2,
  1,
  4,
  2,
  3,
  6,
  2,
  5,
  2,
  3,
  1,
  2,
  5,
  3,
  2,
  2,
  4,
  1,
  11,
  2,
  1,
  2,
  3,
  2,
  2,
  3,
  2,
  4,
  4,
  1,
  2,
  2,
  3,
  2,
  2,
  2,
  2,
  2,
  3,
  2,
  4,
  2,
  3,
  2,
  2,
  2,
  3,
  2,
  3,
  3,
  2,
  3,
  2,
  3,
  3,
  2,
  2,
  2,
  3,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  3,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  3,
  2,
  2,
  2,
  2,
  2,
  1,
  2,
  2,
  3,
  2,
  5,
  2,
  1,
  2,
  2,
  2])

In [133]:
import s_gd2

In [134]:
# Work on the largest weakly connected component only.  It is chosen by size
# rather than by a fixed position in ``cc``, so the cell keeps selecting the
# big component when the component order changes (e.g. after editing
# ``hsk_levels``).
# NOTE(review): per the networkx docs ``subgraph`` returns a view that shares
# node attribute dicts with ``g`` -- confirm if ``g`` must stay untouched.
g2 = g.subgraph(max(cc, key=len))

# Two-way mapping between characters and consecutive integer indices.
zi2idx = {zi: idx for idx, zi in enumerate(g2.nodes)}
idx2zi = {idx: zi for zi, idx in zi2idx.items()}

# Edge list as two parallel index lists: edge k runs from I[k] to J[k].
I = [zi2idx[source] for source, _ in g2.edges]
J = [zi2idx[target] for _, target in g2.edges]

In [135]:
# Compute a layout from the edge index lists, then store entry i of the result
# as the "position" attribute of the node with index i (draw_svg reads the
# first two entries of that attribute as x and y).
# NOTE(review): no seed is passed to the layout call -- confirm whether the
# result is reproducible between runs.
X = s_gd2.layout(I, J)
for idx, coords in enumerate(X):
    g2.nodes[idx2zi[idx]]["position"] = coords

In [144]:
def draw_svg(
    trie: "nx.Graph",
    beta=.75,
    width=750,
    border=50,
    linkwidth=.05,
    noderadius=.1,
    linkopacity=1,
    nodeopacity=1,
    fontsize=10,
) -> str:
    """Render a positioned graph as a square SVG document.

    Node positions are shifted so the smallest x and y become 0, scaled
    uniformly so the larger of the two extents spans ``width - 2 * border``
    pixels, and offset by ``border``.  Edges are drawn as straight line
    segments (self-loops are skipped) and every node is drawn as its own
    label, centred on its position.

    If the bounding box has no extent (a single node, or all nodes on the
    same point) no scaling is applied, so such nodes land on
    ``(border, border)`` instead of triggering a division by zero.

    Args:
        trie: Graph whose nodes each carry a ``"position"`` attribute; its
            first two entries are read as x and y.  A missing attribute
            raises ``KeyError``.
        beta: Accepted for compatibility with existing calls; not used.
        width: Width and height of the SVG canvas in pixels.
        border: Margin in pixels kept free on the top and left of the drawing.
        linkwidth: Edge stroke width as a fraction of the drawable size.
        noderadius: Radius written into the ``circle`` CSS rule, as a fraction
            of the drawable size.  No ``<circle>`` element is emitted here.
        linkopacity: Stroke opacity of edges.
        nodeopacity: Fill opacity written into the ``circle`` CSS rule.
        fontsize: Font size of the node labels.

    Returns:
        The SVG document as a string, one element per line.
    """
    # Standard library only; imported here so the function is self-contained.
    from xml.sax.saxutils import escape

    scale = width - 2 * border
    svg = [
        f'<svg width="{width:.0f}" height="{width:.0f}" xmlns="http://www.w3.org/2000/svg">',
        '<style type="text/css">',
        f'path{{stroke:black;stroke-width:{scale*linkwidth:.3f};stroke-opacity:{linkopacity:.3f};stroke-linecap:round;fill:transparent}}',
        f'circle{{r:{scale*noderadius:.3f};stroke-width:0;fill-opacity:{nodeopacity:.3f}}}',
        '</style>',
        # add white background
        '<rect width="100%" height="100%" fill="white"/>',
    ]

    # Bounding box of all node positions.
    positions = [data["position"] for _, data in trie.nodes(data=True)]
    if positions:
        xs = [position[0] for position in positions]
        ys = [position[1] for position in positions]
        x_min, y_min = min(xs), min(ys)
        extent = max(max(xs) - x_min, max(ys) - y_min)
    else:
        x_min = y_min = 0.0
        extent = 0.0

    # want to scale everything to 0-1 first; a zero extent is left unscaled
    unit_scale = 1 / extent if extent > 0 else 1.0
    unit_trans = -np.array([x_min, y_min])
    offset = np.array([border, border])

    def place_node(position):
        # move to (0,0)-(1,1) then expand to square
        return (position + unit_trans) * unit_scale * scale + offset

    # draw links as straight line segments
    for source, target, _data in trie.edges(data=True):
        if source == target:
            continue

        x1, y1 = place_node(trie.nodes[source]["position"])
        x2, y2 = place_node(trie.nodes[target]["position"])
        svg.append(f'<path d="M {x1} {y1} L {x2} {y2}"/>')

    # draw each node as its label; escape it so characters such as "<" or "&"
    # cannot produce malformed XML
    for node, data in trie.nodes(data=True):
        zi_position = place_node(data["position"])
        svg.append(f'<text x="{zi_position[0]}" y="{zi_position[1]}" text-anchor="middle" alignment-baseline="central" font-size="{fontsize}">{escape(str(node))}</text>')

    svg.append('</svg>')

    return '\n'.join(svg)

In [145]:
from cairosvg import svg2png

# Render the laid-out component and save it as SVG and as PNG.
svg = draw_svg(g2, beta=0.75, linkwidth=0.003, linkopacity=0.2, noderadius=0.012)
# The node labels are Chinese characters, so the file encoding is fixed to
# UTF-8 instead of depending on the platform's default text encoding.
with open('stress.svg', 'w', encoding='utf-8') as f:
    print(svg, file=f)
svg2png(bytestring=svg, write_to='stress.png', dpi=400)