# TBCL parser

In [1]:
import os, re, glob, requests, io, urllib, json
import urllib.parse
import pandas as pd
import opencc

# Raise the pandas display limit so long frames can be inspected in full.
pd.options.display.max_rows = 2000

# OpenCC converter using the 'tw2s' configuration; used below to derive Simplified forms.
opencc_tw2s = opencc.OpenCC('tw2s')

Download files from TBCL home page: https://coct.naer.edu.tw/download/tech_report/

In [2]:
![[ ! -d downloads && -d ../downloads/tbcl ]] && ln -s ../downloads/tbcl downloads
!mkdir -p downloads

if not os.path.exists('downloads/.done'):
    home_url = 'https://coct.naer.edu.tw/download/tech_report/'
    resp = requests.get(home_url).content.decode('utf-8')
    for url in sorted(re.findall('<a href="([^"]+[.](?:xlsx|docx))"', resp)):
        url = os.path.join(home_url, url)
        !cd downloads && wget -nc "{url}"
    !chmod a-w downloads/*.xlsx; touch downloads/.done

Symlinks for convenience and checksums:

In [3]:
%%bash -e
# Make the downloaded files read-only, create short ASCII aliases, print checksums.
cd downloads
# The download cell only fetches .xlsx/.docx, so .ods/.pdf files may be absent.
# An unmatched glob (or a dangling alias from an earlier run) would make a plain
# `chmod a-w *.xlsx *.ods *.pdf` fail and abort the whole cell under -e.
for f in *.xlsx *.ods *.pdf; do
    if [ -e "$f" ]; then chmod a-w "$f"; fi
done
ln -sf '臺灣華語文能力基準詞語表_111-11-14.xlsx' words.xlsx
ln -sf '臺灣華語文能力基準詞語表_111-11-14.ods' words.ods
ln -sf '臺灣華語文能力基準漢字表_111-09-20.docx' chars.docx
ln -sf '臺灣華語文能力基準漢字表_111-09-20.xlsx' chars.xlsx
ln -sf '臺灣華語文能力基準類詞綴表_111-09-20.docx' affixes.docx
ln -sf '臺灣華語文能力基準類詞綴表_111-09-20.xlsx' affixes.xlsx
ln -sf '臺灣華語文能力基準語法點表_112-01-04.xlsx' grammar.xlsx
ln -sf '臺灣華語文能力基準語法點表_112-01-04.docx' grammar.docx
ln -sf '臺灣華語文能力基準基礎詞彙表_111-09-20.xlsx' glossary.xlsx
sha256sum *.xlsx

5e92ac49c5bb203e16fea29c53a2b2cb790033fb332a699c7689adde21528b8f  affixes.xlsx
6329e2516c5dbe416b85f6a94d200ebe95493f24f233a63dc10d85aa257a088f  chars.xlsx
b6ce3747a06c8482ce5f4059689463de01a45d2b78707feb85917404ccffae62  glossary.xlsx
c587989cf89992d55d97a2f932289ef071648ca80339ccfaff7bb823914e5bcf  grammar.xlsx
cb16dcd262eb3e499273f972c9a3a404c40042a7def35631fc40b2a64dd50eb0  words.xlsx
b6ce3747a06c8482ce5f4059689463de01a45d2b78707feb85917404ccffae62  臺灣華語文能力基準基礎詞彙表_111-09-20.xlsx
6329e2516c5dbe416b85f6a94d200ebe95493f24f233a63dc10d85aa257a088f  臺灣華語文能力基準漢字表_111-09-20.xlsx
cb16dcd262eb3e499273f972c9a3a404c40042a7def35631fc40b2a64dd50eb0  臺灣華語文能力基準詞語表_111-11-14.xlsx
c587989cf89992d55d97a2f932289ef071648ca80339ccfaff7bb823914e5bcf  臺灣華語文能力基準語法點表_112-01-04.xlsx
5e92ac49c5bb203e16fea29c53a2b2cb790033fb332a699c7689adde21528b8f  臺灣華語文能力基準類詞綴表_111-09-20.xlsx


## Parse wordlist

In [4]:
# Load the basic-vocabulary glossary sheet and rename its columns to English.
glossary_columns = {
    '序號': 'ID',
    '詞語': 'Traditional',
    '注音': 'Zhuyin',
    '漢拼': 'Pinyin',
    '詞類/性質': 'POS',
    '詞彙英譯': 'Meaning',
    '語義/義項': 'Meaning2',
    '用法-常用搭配詞': 'Compounds',
    '例句': 'Examples',
    '級別': 'Level',
}
glossary_df = pd.read_excel('downloads/glossary.xlsx').rename(columns=glossary_columns)

# IDs are expected to run 1..N in sheet order.
assert (glossary_df.ID - 1).tolist() == glossary_df.index.tolist()

# Keep only the level token between 第 and 級: a digit 1-7 with an optional '*'.
glossary_df['Level'] = glossary_df['Level'].str.extract('^第([1-7][*]?)級$')[0]
assert not glossary_df['Level'].isnull().any()

glossary_df.to_csv('glossary.csv', index=False)
print(f'glossary.csv: {len(glossary_df)} rows')

glossary.csv: 1518 rows


In [5]:
# NOTE(review): this re-creates the same 'tw2s' converter that the first cell already builds.
opencc_tw2s = opencc.OpenCC('tw2s')

# Character levels from Table of General Standard Chinese Characters for verification.
# Dict from each value of the 'char' column to the matching 'level' value.
tgh_level = pd.read_csv('../chars/tgh.csv').set_index('char').level.to_dict()

# Convert to simplified characters + verify
def to_simplified(trad):
    """Return the Simplified form of a Traditional headword.

    The text is converted with ``opencc_tw2s`` and a fixed set of characters
    left in Traditional form is then mapped by hand. If the result is a
    '/'-separated list whose alternatives are all identical, it is collapsed
    to a single alternative.

    Raises AssertionError when any resulting character is neither a key of
    ``tgh_level`` nor one of the explicitly allowed extras.
    """
    # One-to-one replacements applied on top of the OpenCC output.
    manual_fixes = str.maketrans('擡砲牠妳姪瞇舖搥暱瑯', '抬炮它你侄眯铺捶昵琅')
    simp = opencc_tw2s.convert(trad).translate(manual_fixes)

    alternatives = simp.split('/')
    if len(alternatives) > 1 and len(set(alternatives)) == 1:
        simp = alternatives[0]

    for c in simp:
        assert c in tgh_level or c in '/(),吋拚徬祂', (trad, simp, c)
    return simp

In [6]:
# Hand-written expansions for (headword, pinyin) pairs whose variant structure
# cannot be split positionally; consulted by get_variants before the generic rules.
VARIANTS_EXC = {
    ('姊姊/姐姐/姊/姐', 'jiějie/jiě'): [['姊姊','jiějie'], ['姐姐','jiějie'], ['姊','jiě'], ['姐','jiě']],
    ('那/那裡/那裏/那兒', 'nà/nàlǐ/nàr'): [['那', 'nà'], ['那裡', 'nàlǐ'], ['那裏', 'nàlǐ'], ['那兒', 'nàr']],
    ('這/這裡/這裏/這兒', 'zhè/zhèlǐ/zhèr'): [['這', 'zhè'], ['這裡', 'zhèlǐ'], ['這裏', 'zhèlǐ'], ['這兒', 'zhèr']],
    ('手錶/手表/錶/表', 'shǒubiǎo/biǎo'): [['手錶', 'shǒubiǎo'], ['手表', 'shǒubiǎo'], ['錶', 'biǎo'], ['表', 'biǎo']],
    ('新台幣/新臺幣/台幣/臺幣', 'xīntáibì/táibì'): [['新台幣', 'xīntáibì'], ['新臺幣', 'xīntáibì'], ['台幣', 'táibì'], ['臺幣', 'táibì']],
    ('慾望/欲望/慾', 'yùwàng/yù'): [['慾望', 'yùwàng'], ['欲望', 'yùwàng'], ['慾', 'yù']],
    ('侄子/姪子/侄兒/姪兒', 'zhízi/zhír'): [['侄子', 'zhízi'], ['姪子', 'zhízi'], ['侄兒', 'zhír'], ['姪兒', 'zhír']],
    ('嘴脣/嘴唇/脣/唇', 'zuǐchún/chún'): [['嘴脣', 'zuǐchún'], ['嘴唇', 'zuǐchún'], ['脣', 'chún'], ['唇', 'chún']],
    ('沒(有)用', 'méi(yǒu)yòng'): [['沒用', 'méiyòng'], ['沒有用', 'méiyǒuyòng']],
    ('一邊(兒)', 'yìbiān(r)'): [['一邊', 'yìbiān'], ['一邊兒', 'yìbiānr']],
}

def get_variants(vocab, pinyin):
    """Split a headword/pinyin pair into its [headword, pinyin] variants.

    Returns [] when neither string contains '/', '(' or ')'. Pairs listed in
    VARIANTS_EXC return the stored expansion. Otherwise:
      * one headword, several pinyin -> the headword paired with each reading;
      * several headwords, one pinyin -> each headword paired with that reading
        (all headwords must have the same length);
      * equally many of both -> paired positionally.
    Anything else (e.g. parentheses not covered by VARIANTS_EXC) fails an assert.
    """
    vocab = vocab.strip()
    pinyin = re.sub(' */ *', '/', pinyin.strip())

    def structure(text):
        # Keep only the characters that encode variant structure.
        return ''.join(ch for ch in text if ch in '()/')

    vocab_marks = structure(vocab)
    pinyin_marks = structure(pinyin)
    if not vocab_marks and not pinyin_marks:
        return []

    key = (vocab, pinyin)
    if key in VARIANTS_EXC:
        return VARIANTS_EXC[key]

    terms = vocab.split('/')
    readings = pinyin.split('/')

    if not vocab_marks:
        assert set(pinyin_marks) == {'/'}
        return [[vocab, reading.strip()] for reading in readings]

    if not pinyin_marks:
        assert set(vocab_marks) == {'/'}
        assert len({len(term) for term in terms}) == 1, vocab  # all terms same length
        return [[term, pinyin] for term in terms]

    assert vocab_marks == pinyin_marks and set(pinyin_marks) == {'/'}, (vocab, pinyin)
    return [[term.strip(), reading.strip()] for term, reading in zip(terms, readings)]

def get_variants_str(vocab, pinyin, moe):
    """Serialise the variants of a wordlist entry as a JSON array string.

    Args:
        vocab: headword, possibly holding several variants (see get_variants).
        pinyin: tone-change-marked pinyin for the headword.
        moe: string form of a list of ``[headword, [id, ...]]`` pairs written
            with single quotes; the quotes are swapped so it parses as JSON.

    Returns:
        '' when the entry has no variants, otherwise a JSON array with one
        object per variant (Traditional, Simplified, Pinyin, PinyinYB, MOE).
        MOE is the space-joined ID list of the pair whose headword equals the
        variant, or '' when there is none.

    Raises:
        AssertionError: if more than one MOE pair matches a single variant.
    """
    variants = get_variants(vocab, pinyin)
    if not variants:
        return ''

    # Parse once: the MOE list does not change from one variant to the next.
    moe_entries = json.loads(moe.replace("'", '"'))

    arr = []
    for (trad, py) in variants:
        hits = [entry for entry in moe_entries if entry[0] == trad]
        assert len(hits) <= 1
        arr.append({
            'Traditional': trad,
            'Simplified': to_simplified(trad),
            'Pinyin': normalize_pinyin(py, trad),
            'PinyinYB': py,
            'MOE': ' '.join(hits[0][1]) if hits else '',
        })
    return json.dumps(arr, ensure_ascii=False)

def fix_traditional(s):
    """Normalise a raw headword cell.

    Removes ASCII digits and replaces the full-width slash '／' with '/'.

    Returns:
        The cleaned headword.

    Raises:
        AssertionError: if the result contains anything other than characters
            in U+4E00..U+9FFF, '/', '(' or ')'. The message carries the
            original and the cleaned value.
    """
    original = s
    # number suffixes for different pronunciations, +3 weird duplicate entries 空檔 道 來往
    s = re.sub('[0-9]', '', s)
    s = s.replace('／', '/')
    # The message must only use local values: this function runs inside Series.map,
    # where no `row` variable exists, so referencing one would raise NameError
    # and hide the offending headword.
    assert re.match('^[\u4E00-\u9FFF/()]+$', s), (original, s)
    return s

# Manual pinyin corrections used by fix_pinyin.
# Key: (cleaned pinyin with spaces removed, Traditional headword); value: pinyin to use instead.
PINYIN_MP = {
    ('nǚér', '女兒'): "nǚ'ér",
    ('wǎnān', '晚安'): "wǎn'ān",
    ('zǎoān', '早安'): "zǎo'ān",
    ('kěài', '可愛'): "kě'ài",
    ('xiǎpéngyǒu', '小朋友'): 'xiǎopéngyǒu',
    ('dáàn', '答案'): "dá'àn",
    ('jú', '橘子'): 'júzi',
    ('pèngchù', '碰觸/觸碰'): 'pèngchù/chùpèng',
    ('wányèr', '玩意兒'): "wányìr",
    ('qīněr', '親耳'): "qīn'ěr",
    ('yāgēr', '壓根兒'): "yāgēnr",
    ('yìdiǎn/yìdiǎndiǎn/yìdiǎr', '一點/一點點/一點兒'): 'yìdiǎn/yìdiǎndiǎn/yìdiǎnr',
    ('xiáchí', '挾持'): 'xiéchí', #https://dict.revised.moe.edu.tw/dictView.jsp?ID=105985&word=%E6%8C%BE%E6%8C%81
}


def normalize_pinyin(pinyin, hz):
    """Undo the tone-change marking of 不 and 一 in a pinyin string.

    When ``hz`` contains 不 and the pinyin contains 'bú', every 'bú' becomes
    'bù'. When ``hz`` contains 一 and the pinyin contains 'yí' or 'yì', both
    become 'yī'. Each rewrite is guarded by an assertion that the number of
    such syllables equals the number of 不 / 一 characters, so that syllables
    belonging to other characters are not rewritten silently. Two entries that
    would fail those guards are answered from a small override table.
    """
    overrides = {
        ('búzhìyú', '不至於/不致於'): 'bùzhìyú',
        ('yíyìgūxíng', '一意孤行'): 'yīyìgūxíng',
    }
    if (pinyin, hz) in overrides:
        return overrides[(pinyin, hz)]

    if '不' in hz and 'bú' in pinyin:
        bu_syllables = pinyin.count('bú') + pinyin.count('bù')
        assert hz.count('不') == bu_syllables, (pinyin, hz)
        pinyin = pinyin.replace('bú', 'bù')

    if '一' in hz and ('yí' in pinyin or 'yì' in pinyin):
        yi_syllables = pinyin.count('yí') + pinyin.count('yì')
        assert hz.count('一') == yi_syllables, (pinyin, hz)
        pinyin = pinyin.replace('yí', 'yī').replace('yì', 'yī')

    return pinyin

def fix_pinyin(py, trad=''):
    """Clean a raw pinyin cell and merge its space-separated syllables.

    Steps: replace the look-alike letter 'ɑ' with 'a', tighten spacing around
    '/', collapse whitespace runs, drop a leading or trailing '/'. If the
    cleaned value (spaces removed) together with ``trad`` is a key of
    PINYIN_MP, the stored correction is returned as is. Otherwise the
    syllables are concatenated, with an apostrophe inserted before a syllable
    that starts with a/e/o unless it directly follows '/', '(' or ')'.

    Raises AssertionError if the input has upper-case letters or an
    apostrophe, or if the result contains unexpected characters.
    """
    cleanup_steps = [
        ('ɑ', 'a'),
        (' */ *', '/'),
        (r'\s+', ' '),
        (' */$', ''),
        ('^/ *', ''),
    ]
    for pattern, replacement in cleanup_steps:
        py = re.sub(pattern, replacement, py).strip()

    override_key = (py.replace(' ', ''), trad)
    if override_key in PINYIN_MP:
        return PINYIN_MP[override_key]

    # Pinyin spaces are not meaningful in TBCL lists, mostly just syllable spaces there.
    # Remove to make more mergeable with TOCFL. Also no upper letters.
    assert py == py.lower() and "'" not in py

    pieces = []
    for syllable in py.split():
        needs_apostrophe = (
            bool(pieces)
            and pieces[-1][-1] not in '/()'
            and syllable[0] in 'aeoāáǎàēéěèōóǒò'
        )
        if needs_apostrophe:
            pieces.append("'")
        pieces.append(syllable)
    py = ''.join(pieces)

    assert re.match("^[a-zāáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ/()']+$", py), (py, repr(py))
    return py


# Load the main TBCL wordlist sheet and rename its columns to English.
df = pd.read_excel('downloads/words.xlsx').rename(columns={
    '序號': 'ID',
    '詞語': 'Traditional',
    '等別': 'Grade',
    '級別': 'Level',
    '情境': 'Context',
    '書面字頻(每百萬字)': 'WritingFreq',
    '口語字頻(每百萬字)': 'SpeakingFreq',
    '簡編本系統號': 'MOE', # MOE dict IDs, https://dict.concised.moe.edu.tw/dictView.jsp?ID=.
    '參考注音': 'Zhuyin',
    '參考漢語拼音': 'PinyinYB'  # pinyin with tone change indication for 一 and 不
})

# IDs are expected to run 1..N in sheet order.
assert list(df.ID - 1) == list(df.index)
df = df.drop(columns=['Grade', 'Zhuyin', 'Context'])

# Keep only the level token between 第 and 級 (digit 1-7, optional '*').
df['Level'] = df.Level.str.extract('^第([1-7][*]?)級$')[0]
assert sum(df.Level.isnull()) == 0

# Join key = first character of Level + headword. On the wordlist side it is built
# from the raw headword, i.e. before fix_traditional is applied below.
df['glossary_key'] = (df.Level.str.slice(0, 1) + df.Traditional)
# NOTE: rebinds glossary_df from the glossary cell (NaN -> '') and adds a column to it.
glossary_df = glossary_df.fillna('')
glossary_df['glossary_key'] = (glossary_df.Level.str.slice(0, 1) + glossary_df.Traditional)
# Series: glossary_key -> list of glossary_df index labels sharing that key.
glossary_mp = glossary_df.assign(idx=glossary_df.index).groupby('glossary_key').idx.apply(list)

# Order matters: each step reads the columns produced by the previous ones.
df['Traditional'] = df.Traditional.map(fix_traditional)
df.insert(2, 'Simplified', df.Traditional.map(to_simplified))
df['PinyinYB'] = [fix_pinyin(row.PinyinYB, row.Traditional) for row in df.itertuples()]
df['Pinyin'] = [normalize_pinyin(row.PinyinYB, row.Traditional) for row in df.itertuples()]
df['Variants'] = [get_variants_str(row.Traditional, row.PinyinYB, row.MOE) for row in df.itertuples()]

# Rewrite the MOE column: blank for rows with variants (their IDs were copied into
# the Variants JSON above) or with an empty list, otherwise the space-joined ID list.
for row in df.itertuples():
    variants = [v['Traditional'] for v in json.loads(row.Variants)] if row.Variants else [row.Traditional]
    moe = json.loads(row.MOE.replace("'", '"'))
    if row.Variants or row.MOE == '[]':
        # Every MOE headword must correspond to one of the row's variants.
        for v, ids in moe:
            assert v in variants
        df.loc[row.Index, 'MOE'] = ''
    else:
        assert len(moe) == 1 and moe[0][0] == row.Traditional
        df.loc[row.Index, 'MOE'] = ' '.join(moe[0][1])

# Join with glossary_df: copy POS / Meaning / Compounds / Examples from the glossary
# rows that share a wordlist row's glossary_key. Several glossary rows may map to one
# wordlist row; their distinct values are joined with ' / '.
for col in ['POS', 'Meaning', 'Compounds', 'Examples']:
    df[col] = ''
    for row in df.itertuples():
        text = [glossary_df.loc[i, col] for i in glossary_mp.get(row.glossary_key, [])]
        text = [s.strip() for s in text if s.strip()]
        # ' / ' is the separator written below, so it must not occur inside a cell.
        assert ' / ' not in ''.join(text), text
        if not text: continue
        # dict.fromkeys removes duplicates while keeping first-seen order.
        text = ' / '.join(dict.fromkeys(text))
        if col == 'POS':
            text = text.replace(' ', '')
            text = text.replace('Phrase', 'Ph')
        elif col == 'Compounds':
            text = text.replace(';', '')
            items = text.replace(' / ', '，').split('，')
            # Join the de-duplicated items. The previous code built the de-duplicated
            # list but then joined the original one, so repeated compounds survived.
            text = '，'.join(dict.fromkeys(items))
            if text: text += '。'
        # Normalise spacing around parentheses and slashes.
        for x, y in [(' *[(] +', ' ('), (' +[)]', ')'), (' +/ +', ' / ')]:
            text = re.sub(x, y, text).strip()
        assert '\n' not in text
        df.loc[row.Index, col] = text.strip()

# Glossary keys with no wordlist row of the same level digit + headword.
print('Unjoined glossary vocab: %s' % ' '.join(set(glossary_df.glossary_key) - set(df.glossary_key)))
df = df.drop(columns=['glossary_key'])

df.to_csv('tbcl.csv', index=False)
print('tbcl.csv: %d rows' % len(df))

Unjoined glossary vocab: 1中國 1法國 3非洲 2瓶子/瓶瓶 3亞洲 1車/車子 1英國 3義大利 1臺灣/台灣 3韓國 1日本 1應該/應 3美洲 2德國 2罐 3歐洲 3月台/月臺 1美國 1還
tbcl.csv: 14425 rows


In [7]:
# Generate version with variants expanded.

# One output record per variant: the variant's own fields override the row's.
# Rows without variants are emitted once, unchanged.
expanded_rows = [
    {**record, **variant}
    for record in df.fillna('').to_dict(orient='records')
    for variant in (json.loads(record['Variants']) if record['Variants'] else [{}])
]

expanded_df = pd.DataFrame(expanded_rows).drop(columns=['Variants'])
expanded_df.to_csv('tbcl-expanded.csv', index=False)
print(f'tbcl-expanded.csv: {len(expanded_df)} entries')

tbcl-expanded.csv: 14868 entries


In [8]:
# Private-use character sequences written into the Pleco definitions.
EAC1_TAG = '\uEAC1\uEC00\uEC00\uECCC\uEC99'  # tag color, #00cc99 green
EAC1_EX  = '\uEAC1\uEC00\uEC05\uECAA\uECFF'  # examples, #05aaff blue
EAC1_HL  = '\uEAC1\uEC00\uEC00\uECCC\uECCC'  # term highlight in examples, teal

# Write tbcl-pleco.txt: a '//TBCL/Level N' header whenever the level changes, then
# one tab-separated line (key, pinyin, definition) per variant of every row.
# The encoding is explicit because the file holds Chinese text and private-use
# characters; the platform default encoding is not guaranteed to handle them.
with open('tbcl-pleco.txt', 'w', encoding='utf-8') as fout:
    last_header = ''
    for row in pd.read_csv('tbcl.csv', dtype='str').fillna('').to_dict(orient='records'):
        header = f"//TBCL/Level {row['Level']}"
        if header != last_header:
            last_header = header
            fout.write(header + '\n')

        variants = json.loads(row['Variants']) if row['Variants'] else [{}]
        for variant in variants:
            # Variant fields (Traditional, Simplified, Pinyin, ...) override the row's.
            var = dict(row)
            var.update(variant)
            # For rows with variants the definition starts with the full combined
            # headword and pinyin taken from the row, not from the single variant.
            defn = ' '.join([
                f"{row['Traditional']} [{row['Pinyin']}]\uEAB1" if row['Variants'] else '',
                f"({row['POS']})" if row.get('POS') else '',
                f"{row['Meaning']}" if row.get('Meaning') else '',
                f"{EAC1_TAG}[TBCL{row['Level']}]\uEAC2",
            ])
            defn = re.sub(r'\s+', ' ', defn).replace('\uEAB1 ', '\uEAB1').strip()
            # Compounds and examples in light blue on separate lines
            for ex in [var['Compounds'], var['Examples']]:
                if not ex: continue
                defn += (
                    f'\uEAB1{EAC1_EX}' +
                    ex.replace(' / ', '\uEAB1').replace(
                        var['Traditional'],
                        f"\uEAC2{EAC1_HL}{var['Traditional']}\uEAC2{EAC1_EX}"
                    ) +
                    '\uEAC2'
                )
            key = f"{var['Simplified']}[{var['Traditional']}]\t{var['Pinyin']}"
            fout.write(f'{key}\t{defn}\n')

!ls -l tbcl-pleco.txt

-rw-r--r-- 1 jovyan users 1121375 Nov  7 14:15 tbcl-pleco.txt


## Convert other files

In [9]:
# Load the character list sheet and rename its columns to English.
chars_columns = {
    '序號': 'ID',
    '漢字': 'Traditional',
    '等別': 'Grade',
    '級別': 'Level',
    '情境': 'Context',
    '書面字頻（每百萬字）': 'WritingFreq',
    '口語字頻（每百萬字）': 'SpeakingFreq',
}
df = pd.read_excel('downloads/chars.xlsx').rename(columns=chars_columns)

# IDs are expected to run 1..N in sheet order.
assert (df.ID - 1).tolist() == df.index.tolist()
df = df.drop(columns=['Grade'])

# Keep only the level token between 第 and 級 (digit 1-7, optional '*').
df['Level'] = df['Level'].str.extract('^第([1-7][*]?)級$')[0]
assert not df['Level'].isnull().any()

df['Traditional'] = df['Traditional'].map(fix_traditional)
df.to_csv('chars.csv', index=False)
print(f'chars.csv: {len(df)} rows')

chars.csv: 3100 rows


In [10]:
# One output row per character: entries listing several forms separated by '/'
# are repeated once for each form, with that form stored in a new 'char' column.
rows = [
    {**record, 'char': ch}
    for record in df.to_dict(orient='records')
    for ch in record['Traditional'].split('/')
]

column_order = ['char'] + list(df.columns)
expanded_df = pd.DataFrame(rows)[column_order]
expanded_df.to_csv('chars-expanded.csv', index=False)
print(f'chars-expanded.csv: {len(expanded_df)} rows')

chars-expanded.csv: 3133 rows


In [11]:
# Load the grammar-point sheet and rename its columns to English.
grammar_columns = {
    '序號': 'ID',
    '語法點': 'Grammar',
    '等別': 'Grade',
    '級別': 'Level',
    '例句': 'Example',
}
df = pd.read_excel('downloads/grammar.xlsx').rename(columns=grammar_columns)

# IDs are expected to run 1..N in sheet order.
assert (df.ID - 1).tolist() == df.index.tolist()
df = df.drop(columns=['Grade'])

# Keep only the level token between 第 and 級 (digit 1-7, optional '*').
df['Level'] = df['Level'].str.extract('^第([1-7][*]?)級$')[0]
assert not df['Level'].isnull().any()

df.to_csv('grammar.csv', index=False)
print(f'grammar.csv: {len(df)} rows')

grammar.csv: 496 rows


In [12]:
# Load the affix sheet and rename its columns to English.
affix_columns = {
    '序號': 'ID',
    '類詞綴': 'Affix',
    '語法點': 'Grammar',
    '級別': 'Level',
    '說明': 'Explanation',
    '相關詞彙': 'Words',
}
df = pd.read_excel('downloads/affixes.xlsx').rename(columns=affix_columns)

# IDs are expected to run 1..N in sheet order.
assert (df.ID - 1).tolist() == df.index.tolist()

# Keep only the level token between 第 and 級 (digit 1-7, optional '*').
df['Level'] = df['Level'].str.extract('^第([1-7][*]?)級$')[0]
assert not df['Level'].isnull().any()

df.to_csv('affixes.csv', index=False)
print(f'affixes.csv: {len(df)} rows')

affixes.csv: 73 rows


## Readings check

In [13]:
# Reload the written wordlist with every column read as string and missing cells as ''.
df = pd.read_csv('tbcl.csv', dtype='str').fillna('')

In [14]:
# Check readings
# For each expanded entry, build every pinyin string obtainable by combining the
# per-character readings from ../cedict/syllables.csv, and print the entries whose
# TBCL pinyin is not among them. Skipped entirely when that file is absent.
if os.path.exists('../cedict/syllables.csv'):
    readings_mp = {}  # {'一': set(['yì','yí'])}
    syll_df = pd.read_csv('../cedict/syllables.csv', dtype='str').fillna('')
    for row in syll_df.itertuples():
        readings_mp.setdefault(row.Traditional, set()).add(row.Pinyin.lower())
        readings_mp.setdefault(row.Simplified, set()).add(row.Pinyin.lower())
    # Readings were lower-cased on insertion; only trim them and drop blank ones here.
    readings_mp = {x: set([y.strip() for y in readings_mp[x] if y.strip()]) for x in readings_mp}
    # TBCL pinyin was normalised to 'bù' for 不, so that is the only reading accepted.
    readings_mp['不'] = set(['bù'])

    def gen_readings(trad):
        """Yield every concatenation of per-character readings for ``trad``.

        Characters missing from readings_mp, or with a code point below 0x3E00,
        contribute nothing. An apostrophe is inserted before a following part
        that starts with a/e/o.
        """
        if trad == '':
            yield ''
        # NOTE(review): 0x3E00 is not a CJK block boundary (0x3400 / 0x4E00 are) - confirm intent.
        elif trad[0] not in readings_mp or ord(trad[0]) < 0x3E00:
            yield from gen_readings(trad[1:])
        else:
            for x in readings_mp[trad[0]]:
                for y in gen_readings(trad[1:]):
                    yield x + ("'" if y and y[0] in 'aāáǎàeēéěèoōóǒò' else '') + y

    for row in pd.read_csv('tbcl-expanded.csv', dtype='str').fillna('').itertuples():
        trad, pinyin = row.Traditional, row.Pinyin
        readings = list(gen_readings(trad))
        # The pinyin is compared as is. The previous re.sub('', '', pinyin) replaced
        # empty matches with nothing, i.e. returned the string unchanged.
        if pinyin not in readings:
            print(list(row._asdict().values())[1:9], ':\t', trad, pinyin, 'vs.', readings[:10])

['1533', '噢', '噢', '4', '80', '19', '44379', 'yǔ'] :	 噢 yǔ vs. ['ō']
['1572', '阿嬤', '阿嬷', '4', '41', '301', '', 'āma'] :	 阿嬤 āma vs. ['ēmo', 'ēmó', 'ēmā', 'àmo', 'àmó', 'àmā', 'āmo', 'āmó', 'āmā']
['2889', '尺寸', '尺寸', '5', '13', '13', '31455', 'chícun'] :	 尺寸 chícun vs. ['chǐcun', 'chǐcùn', 'chěcun', 'chěcùn']
['3331', '古蹟', '古迹', '5', '7', '26', '', 'gǔjī'] :	 古蹟 gǔjī vs. ['gǔjì']
['4063', '奇蹟', '奇迹', '5', '31', '24', '24507', 'qíjī'] :	 奇蹟 qíjī vs. ['jījì', 'qíjì']
['5329', '磅', '磅', '6', '12', '3', '2839 1146', 'pāng'] :	 磅 pāng vs. ['bàng', 'páng']
['7155', '摟', '搂', '6', '9', '1', '13652 13656 13677', 'lóu'] :	 摟 lóu vs. ['lǒu', 'lou', 'lōu']
['7486', '舖', '铺', '6', '32', '26', '', 'pū'] :	 舖 pū vs. ['pù']
['7542', '齊', '齐', '6', '33', '22', '21691 24545 36690', 'zī'] :	 齊 zī vs. ['qí']
['7946', '事蹟', '事迹', '6', '15', '5', '33653', 'shìjī'] :	 事蹟 shìjī vs. ['shìjì', 'shijì']
['8358', '蜿蜒', '蜿蜒', '6', '8', '7', '43635', 'wǎnyán'] :	 蜿蜒 wǎnyán vs. ['wānyán', 'wānyan']
['8440', '無妨',

## Merge with CEDICT and generate anki deck

In [15]:
# Maps every vowel, toned or not, to its toneless form.
UNTONE_MP = {
    'a': 'a', 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
    'e': 'e', 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
    'o': 'o', 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
    'i': 'i', 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
    'u': 'u', 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
    'ü': 'ü', 'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
}

# Check if pinyin from data (py1) matches cedict's (py2)
# Optionally matching untoned vowels with tones if untone==True.
def pinyin_matches(py1, py2, hz='', untone=False, yi=False, bu=False):
    """Compare two pinyin strings, ignoring case and separator characters.

    Separators (hyphen, apostrophe, comma, slash, parentheses, space) are
    skipped on both sides; the remaining characters must match one to one.

    Args:
        py1: pinyin from the data.
        py2: pinyin from CEDICT.
        hz: headword; only consulted by the ``yi`` / ``bu`` relaxations.
        untone: also accept a toneless vowel against its toned form (either side).
        yi: accept 'yí'/'yì' in py1 against 'yī' in py2 when hz contains 一.
        bu: accept 'bú' in py1 against 'bù' in py2 when hz contains 不.

    Returns:
        True if the strings match under the selected rules, else False.
    """
    separators = "-',/() "
    left = py1.lower()
    right = py2.lower()

    def skip_separators(text, pos):
        while pos < len(text) and text[pos] in separators:
            pos += 1
        return pos

    i = j = 0
    while True:
        i = skip_separators(left, i)
        j = skip_separators(right, j)
        if i >= len(left) and j >= len(right):
            return True

        # An exhausted side contributes '', which matches no remaining character.
        a = left[i] if i < len(left) else ''
        b = right[j] if j < len(right) else ''

        same = (a == b)
        if not same and untone:
            same = UNTONE_MP.get(a, a) == b or a == UNTONE_MP.get(b, b)
        if not same and i > 0 and j > 0:
            # The relaxations look at the current character together with the one before it.
            if yi and left[i-1:i+1] in ['yí', 'yì'] and right[j-1:j+1] == 'yī' and '一' in hz:
                same = True
            elif bu and left[i-1:i+1] == 'bú' and right[j-1:j+1] == 'bù' and '不' in hz:
                same = True

        if not same:
            return False
        i += 1
        j += 1

In [16]:
# Attach CC-CEDICT definitions to wordlist rows that have no glossary meaning.
df = pd.read_csv('tbcl.csv', dtype='str').fillna('')
cedict_df = pd.read_csv('../cedict/cedict.csv')
# Series: Traditional headword -> list of cedict_df index labels with that headword.
cedict_idx_mp = cedict_df.assign(idx=cedict_df.index).groupby('Traditional').idx.apply(list)

rows = []

for row in df.fillna('').to_dict(orient='records'):
    pinyin_set = set([row['Pinyin']])
    matches = cedict_idx_mp.get(row['Traditional'], [])
    # No entry for the combined headword: fall back to the entries of each variant.
    if len(matches) == 0 and row['Variants']:
        variants = json.loads(row['Variants']) if row['Variants'] else [{}]
        for variant in variants:
            matches.extend(cedict_idx_mp.get(variant['Traditional'], []))
            pinyin_set.add(variant['Pinyin'])

    matches = list(sorted(set(matches)))

    flag = ''  # NOTE(review): never changed; its only use is commented out at the end of the loop.
    if len(matches) != 0:
        # Prioritize pronunciation matches, downpriorize names and variants
        # TODO: match based on taiwanese pronunciation
        if len(matches) > 1:
            # Lower key sorts first: -1 for an exact pinyin match, -1 for a toneless match,
            # +10 if the definition starts with 'variant', +100 if the pinyin is capitalised.
            matches.sort(key=lambda i: (
                -int(any(pinyin_matches(py, cedict_df.Pinyin[i], untone=False) for py in pinyin_set))
                -int(any(pinyin_matches(py, cedict_df.Pinyin[i], untone=True) for py in pinyin_set))
                +10*int(re.match('^variant', cedict_df.Definitions[i]) is not None)
                +100*int(cedict_df.Pinyin[i][0].isupper())
            ))

        # Report rows whose Simplified form disagrees with, or is ambiguous in, CEDICT.
        ce_simp = set([cedict_df.Simplified[i] for i in matches])
        cc_simp = opencc_tw2s.convert(row['Traditional'])
        if not row['Variants'] and ce_simp:
            if row['Simplified'] not in ce_simp:
                print('Simplified diff:', row, 'ce', ce_simp, 'cc', cc_simp)
            if len(ce_simp) > 1:
                print('Ambigous simplified:', row, 'ce', ce_simp, 'cc', cc_simp)

        defs = []
        for i in matches:
            # Only a single unambiguous pinyin is compared; otherwise '' is used.
            py1 = list(pinyin_set)[0] if len(pinyin_set) == 1 else ''
            defn = cedict_df.Definitions[i]
            defn = re.sub(r'/CL:個\|个\[ge4\](|/.*)$', r'\1', defn)  # uninformative
            if row['Variants']:
                # Rows with variants: prefix every definition with its own headword and pinyin.
                defn = '%s [%s] %s' % (cedict_df.Traditional[i], cedict_df.Pinyin[i], defn)
            elif not pinyin_matches(py1, cedict_df.Pinyin[i], untone=False):
                # CEDICT pronunciation differs from the TBCL one: show CEDICT's pinyin.
                defn = '[%s] %s' % (cedict_df.Pinyin[i], defn)
            defs.append(defn)

        # Glossary meanings take precedence; CEDICT only fills empty ones.
        if not row['Meaning']:
            row['Meaning'] = '<br> '.join(defs)

    #row['Flag'] = flag
    rows.append(row)

merged_df = pd.DataFrame(rows)
merged_df.to_csv('tbcl-cedict.csv', index=False)

Ambigous simplified: {'ID': '85', 'Traditional': '妳', 'Simplified': '你', 'Level': '1', 'WritingFreq': '463', 'SpeakingFreq': '1653', 'MOE': '12580', 'PinyinYB': 'nǐ', 'Pinyin': 'nǐ', 'Variants': '', 'POS': 'N', 'Meaning': 'you (female)', 'Compounds': '', 'Examples': '妳好，我是妳的同學。'} ce {'奶', '你'} cc 妳
Ambigous simplified: {'ID': '853', 'Traditional': '乾', 'Simplified': '干', 'Level': '3', 'WritingFreq': '50', 'SpeakingFreq': '99', 'MOE': '16341 25228', 'PinyinYB': 'gān', 'Pinyin': 'gān', 'Variants': '', 'POS': 'Vs/N', 'Meaning': 'dry / dried food', 'Compounds': '魚乾，肉乾，葡萄乾。', 'Examples': '今天早上洗的衣服，下午就乾了。 / 最近的天氣很乾，都沒有下雨。 / 我喜歡吃水果乾。'} ce {'干', '乾'} cc 干
Simplified diff: {'ID': '1482', 'Traditional': '牠', 'Simplified': '它', 'Level': '4', 'WritingFreq': '343', 'SpeakingFreq': '659', 'MOE': '9925', 'PinyinYB': 'tā', 'Pinyin': 'tā', 'Variants': '', 'POS': '', 'Meaning': '', 'Compounds': '', 'Examples': ''} ce {'牠'} cc 牠
Simplified diff: {'ID': '3615', 'Traditional': '藉口', 'Simplified': '借口', 'Le

*Taiwan TBCL wordlist (Traditional)*

TBCL (Taiwan Benchmarks for the Chinese Language) wordlist, 14425 words over 7 levels. Parsed from the official Excel sheets on the [TBCL](https://coct.naer.edu.tw/TBCL/) website, including the definitions and examples they provide for about 1500 lower-level words. CC-CEDICT definitions are used for the rest.

Pinyin is normalized so that tone changes for 一 and 不 are not indicated, which makes it easier to join with other data sources.

In [17]:
import genanki, shutil

# Build the Anki package tbcl.apkg from the CEDICT-merged wordlist.
df = merged_df.copy().fillna('')

# Copy the font into data/media; everything in that directory is packaged below.
!mkdir -p data/media
!cp -f ../downloads/fonts/MoeStandardKai.ttf data/media/_MoeStandardKai.ttf

# Note fields, in the order they are stored on each note.
cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'Level',
        'POS', 'Meaning', 'Compounds', 'Examples', 'Variants']

# NOTE(review): the template and CSS files are opened without an explicit encoding
# and the file objects are never closed.
model = genanki.Model(
    1698579990,
    'TBCL',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'TBCL',
        # Reuses the dangdai question template, showing the TBCL level where it showed {{ID}}.
        'qfmt': open('../dangdai/dangdai-qfmt.html').read().replace('{{ID}}', 'TBCL L{{Level}}'),
        # TODO fix template
        # Answer side: inline HTML plus the dangdai answer template with the text before
        # '<script>' on its line removed; the replaced condition is forced false when Variants is set.
        'afmt': '''{{FrontSide}}
<hr id=answer>
<div lang="en"><span id="ddzw-pinyin">{{Pinyin}}</span></div><br>
<div lang="en">{{#POS}}({{POS}}) {{/POS}}{{Meaning}}</div><br>
<div>{{#Compounds}}{{Compounds}}<br>{{/Compounds}}{{Examples}}</div><br>
''' + re.sub('^.*<script>', '<script>', open('../dangdai/dangdai-afmt.html').read(), flags=re.M).replace(
            'if (pinyinEl && hanziEl)',
            'if (pinyinEl && hanziEl {{#Variants}}&& false{{/Variants}})'),
    }],
    css=open('../dangdai/dangdai.css').read(),
)

deck = genanki.Deck(1698579991, name='tbcl')

# One note per wordlist row; the GUID is derived from the row ID and the tag is
# 'L' plus the first character of Level.
for row in df.reset_index().to_dict(orient='records'):
    note = genanki.Note(
        model=model,
        fields=[row[c] for c in cols],
        guid=genanki.guid_for('tbcl', row['ID']),
        tags=['L%s' % row['Level'][0]],
    )
    deck.add_note(note)

!rm -f tbcl.apkg
genanki.Package(deck, media_files=glob.glob('data/media/*')).write_to_file('tbcl.apkg')
!ls -l tbcl.apkg

-rw-r--r-- 1 jovyan users 16947510 Nov  7 14:15 tbcl.apkg
