# TOCFL/CCCC wordlists parser

Parse wordlists from https://tocfl.edu.tw/index.php/exam/download

In [1]:
!pip install -q opencc genanki

import os, re, glob, requests, io, urllib, json, shutil
import pandas as pd
import opencc
import genanki

In [2]:
# Path of the TOCFL .xlsx workbook to parse. If this name is left undefined (assignment
# commented out) or the file does not exist, the download cell below fetches the latest
# release from the TOCFL website and sets TOCFL_XLS itself.
TOCFL_XLS="downloads/8000zhuyin_202307.xlsx"
#TOCFL_XLS="downloads/8000zhuyin_202204.xlsx"
#TOCFL_XLS="downloads/8000zhuyin_20180419.xlsx"

### Download latest .xls

In [3]:
# Get latest file automatically from https://tocfl.edu.tw/index.php/exam/download
# The whole cell is skipped when TOCFL_XLS is defined and the file it names exists.
if 'TOCFL_XLS' not in globals() or not os.path.exists(TOCFL_XLS):
    TOCFL_HOME = 'https://tocfl.edu.tw/index.php/exam/download'
    print(f'Downloading {TOCFL_HOME}')
    resp = requests.get(TOCFL_HOME).content.decode('utf-8')

    # Scrape the link to the 8000zhuyin_<digits>.zip archive; exactly one is expected.
    # NOTE(review): the '.' in front of 'zip' is unescaped, so it matches any character.
    urls = re.findall('<a href="(/assets/files/vocabulary/8000zhuyin_[0-9]+.zip)"', resp)
    assert len(urls) == 1
    TOCFL_URL = urllib.parse.urljoin(TOCFL_HOME, urls[0])
    # The workbook is stored under downloads/ with the archive's base name and an .xlsx suffix.
    TOCFL_XLS = 'downloads/' + os.path.basename(TOCFL_URL).replace('.zip', '.xlsx')

    # If downloads/ is missing but ../downloads/tocfl exists, symlink to it instead of
    # creating a fresh directory.
    ![[ ! -d downloads && -d ../downloads/tocfl ]] && ln -s ../downloads/tocfl downloads
    !mkdir -p downloads
    # Fetch the archive only when it is not there yet (wget -nc does not overwrite either).
    if not os.path.exists(f"downloads/{os.path.basename(TOCFL_URL)}"):
        !echo "Downloading {TOCFL_URL}"
        !cd downloads && wget -nc "{TOCFL_URL}"
    # Unpack into a scratch directory, copy the .xlsx found inside to TOCFL_XLS, clean up.
    !rm -rf downloads/unpacked && mkdir downloads/unpacked
    !cd downloads/unpacked && unzip "../{os.path.basename(TOCFL_URL)}"
    !cp -fv "$(find downloads/unpacked -name '*.xlsx')" "{TOCFL_XLS}"
    !rm -rf downloads/unpacked
    # Show size and checksum of the workbook, then remove write permission from it.
    !echo; ls -l "{TOCFL_XLS}"; sha256sum "{TOCFL_XLS}"; chmod a-w "{TOCFL_XLS}"

    # Print both values in a form that can be pasted into the configuration cell.
    print(f'\nTOCFL_URL="{TOCFL_URL}"\nTOCFL_XLS="{TOCFL_XLS}"')

### Parse .xls

In [4]:
# Sheet names carry the English level name in parentheses, e.g. "入門級(Level 1)";
# this maps that English name to the level code used as ID prefix.
LEVELS_EN_MP = {
    'Novice 1': 'L0-1',
    'Novice 2': 'L0-2',
    'Level 1': 'L1',
    'Level 2': 'L2',
    'Level 3': 'L3',
    'Level 4': 'L4',
    'Level 5': 'L5',
}
# Sheet names consisting of the Chinese level name only.
LEVELS_CN_MP = {  # for 2018 file
    '準備級一級': 'L0-1',
    '準備級二級': 'L0-2',
    '入門級': 'L1',
    '基礎級': 'L2',
    '進階級': 'L3',
    '高階級': 'L4',
    '流利級': 'L5',
}

print(f'Parsing {TOCFL_XLS}')
!sha256sum {TOCFL_XLS}

xls = pd.ExcelFile(TOCFL_XLS)
sheets = {}  # level code -> cleaned DataFrame of that level's sheet

for i, name in enumerate(xls.sheet_names):
    df = xls.parse(name, dtype='str').fillna('')
    # Stop at the entry-count summary sheet. `name` and `df` keep pointing at the sheet
    # the loop stopped on and are used for the cross-check after the loop.
    if 'Entry Number' in name or '各等詞條數' in name:
        break

    # Exact Chinese sheet names are looked up directly; otherwise the level is taken
    # from the English name inside the parentheses.
    if name in LEVELS_CN_MP:
        level = LEVELS_CN_MP[name]
    else:
        level = LEVELS_EN_MP[re.findall('[(](.*)[)]', name)[0]]
    print(f'Sheet {i+1}: {name:<15}\t{level}\t{len(df)} rows')

    # Column headers may span several lines; keep only the last line of each one ...
    df = df.rename(columns=lambda s: s.strip().split('\n')[-1])
    # ... then map the Chinese and English header variants onto common column names.
    df = df.rename(columns={
        'Parts of Speech': 'POS',
        '詞彙': 'Vocabulary',
        '漢語拼音': 'Pinyin',
        '注音': 'Zhuyin',
        '詞類': 'POS',
    })
    df['Level'] = level
    # Drop rows without a vocabulary entry before numbering the remaining ones.
    df = df[df.Vocabulary.fillna('') != ''].copy()
    # IDs: level code + 3-digit running number for novice sheets (e.g. L0-1001),
    # level code + '-' + 4-digit running number for the others (e.g. L1-0001).
    if level.startswith('L0'):
        df['ID'] = ['%s%.3d' % (level, i+1) for i in range(len(df))]
    else:
        df['ID'] = ['%s-%.4d' % (level, i+1) for i in range(len(df))]
    sheets[level] = df

excel_df = pd.concat(sheets.values())
print(f'Total: {len(excel_df)}')

# Cross-check: print the sheet the loop stopped on and require that the number of
# parsed entries appears somewhere in its text.
print(f'\n{name}\n%s' % str(df))
assert str(len(excel_df)) in str(df)

Parsing downloads/8000zhuyin_202307.xlsx
e979ac6d953fb493502e54536b4b6ff534d06e3938700052aa15806d514efc92  downloads/8000zhuyin_202307.xlsx
Sheet 1: 準備級一級(Novice 1)	L0-1	160 rows
Sheet 2: 準備級二級(Novice 2)	L0-2	234 rows
Sheet 3: 入門級(Level 1)   	L1	347 rows
Sheet 4: 基礎級(Level 2)   	L2	485 rows
Sheet 5: 進階級(Level 3)   	L3	1173 rows
Sheet 6: 高階級(Level 4)   	L4	2342 rows
Sheet 7: 流利級(Level 5)   	L5	2776 rows
Total: 7517

各等詞條數(Entry Number)
  Unnamed: 0 準備1級 準備2級  入門級   基礎級   進階級   高階級   流利級    總計
0      各等詞彙量  160  234  347   485  1173  2342  2776  7517
1      累計詞彙量       394  741  1226  2399  4741  7517      


In [5]:
excel_df.describe()

Unnamed: 0,Context,Vocabulary,Pinyin,POS,Level,ID
count,1226,7517,7517,7517,7517,7517
unique,12,7189,6780,104,7,7517
top,個人資料,中,jí,N,L5,L0-1001
freq,220,3,7,2985,2776,1


### Cleanup and convert to .csv

In [6]:
opencc_tw2s = opencc.OpenCC('tw2s')

# Per-character level from the Table of General Standard Chinese Characters,
# used by to_simplified() to verify its output.
tgh_level = pd.read_csv('../chars/tgh.csv').set_index('char')['level'].to_dict()


def to_simplified(trad):
    """Converts a traditional-character term to simplified characters and verifies it.

    The text is converted with the OpenCC 'tw2s' converter, a few extra character
    replacements are applied on top, and '/'-separated alternatives that end up
    identical are collapsed into one.

    Raises AssertionError if a resulting character is neither one of '/(),' nor
    listed in `tgh_level` with a level of at most 2.
    """
    simp = opencc_tw2s.convert(trad)
    for pair in ('擡抬', '砲炮', '牠它', '艶艳', '妳你'):
        simp = simp.replace(pair[0], pair[1])

    # "x/x" after conversion means the alternatives differed only in traditional form.
    alternatives = simp.split('/')
    if len(alternatives) > 1 and len(set(alternatives)) == 1:
        simp = alternatives[0]

    for ch in simp:
        # Characters missing from the table count as level 9, i.e. they are rejected.
        assert tgh_level.get(ch, 9) <= 2 or ch in '/(),', (trad, simp, ch, tgh_level.get(ch))
    return simp

In [7]:
# Errata for the pinyin column: mostly missing apostrophes, plus a few more serious
# errors listed at the front of the file.
pinyin_corr_df = pd.read_csv('data/errata-pinyin.csv', comment='#', dtype='str')

def fix_pinyin(s, trad=''):
    """Cleans up text in 'Pinyin' column.

    Normalizes look-alike characters, punctuation and whitespace, then applies every
    erratum from `pinyin_corr_df` whose Pinyin and Traditional values equal the cleaned
    text and `trad`.

    Raises AssertionError if the result contains anything but pinyin letters,
    '/', '(', ')', ',', space and apostrophe.
    """
    # Two-character strings are (wrong char, right char) pairs; tuples are regex rewrites.
    substitutions = [
        'ăǎ', 'ŏǒ', 'ĭǐ', 'ŭǔ', 'ɑa', '；/', '（(', '）)',
        ('\u200b', ''), (' +[)]', ')'), (' */ *', '/'), (r'\s+', ' '),
    ]
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s).strip()

    # Every erratum is checked against the current text, in file order.
    for erratum in pinyin_corr_df.itertuples():
        if erratum.Traditional == trad and erratum.Pinyin == s:
            s = erratum.Corrected
    s = s.strip()

    is_valid = re.match('^[a-zāáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ/(), \']+$', s.lower())
    assert is_valid, (s, repr(s))
    return s

assert fix_pinyin('xiăo') == 'xiǎo'   # breve (ă) is replaced by caron (ǎ)
assert fix_pinyin('chuāng(zi )/chuānghu') == 'chuāng(zi)/chuānghu'
assert fix_pinyin('liànài', trad='戀愛') == "liàn'ài"

In [8]:
def fix_vocabulary(s):
    """Cleans up text in 'Vocabulary' column.

    Removes zero-width spaces, converts full-width parentheses and the enumeration
    comma to their ASCII counterparts, normalizes whitespace around '/', rewrites the
    "AB[CD]" notation to "AB/CD" and drops parenthesized zhuyin hints.

    Returns the cleaned string.

    Raises AssertionError (carrying the original and the cleaned text) if the result
    contains anything but CJK characters (U+4E00-U+9FFF), '/', '(', ')' and ','.
    """
    original = s

    for x, y in [
        ('\u200b', ''), ('（', '('), ('）', ')'), (' */ *', '/'), (r'\s+', ' '),
        ('[(]面˙ㄇㄧㄢ[)]', '(面)'), ('、', ','),
        (r'^([\u4E00-\u9FFF]{2,3})\[([\u4E00-\u9FFF]{2,3})\]$', r'\1/\2'), #CCCC simplified
    ]:
        s = re.sub(x, y, s).strip()

    # Zhuyin hints are redundant with pinyin and not very relevant for foreigners, drop them
    # NOTE(review): the character class appears to contain the CJK character 一 next to
    # bopomofo ㄧ, which would also remove a parenthesized "(一)" - confirm this is intended.
    s = re.sub(r'[(（][ㄅㄈㄉㄊㄋㄍㄎㄏㄐㄑㄒㄓㄕㄗㄙㄚㄛㄞㄌㄟㄠㄡㄢㄣㄤㄧ一ㄨㄩㄇㄝㄆㄌㄨㄥㄔㄘㄖˊˋ˙\uf8f8]+[)）]', '', s).strip()

    # Report the function's own input rather than a `row` variable that only exists
    # while one of the notebook's loops happens to have defined it.
    assert re.match('^[\u4E00-\u9FFF/(),]+$', s), (original, s)
    return s

assert fix_vocabulary('名字(˙ㄗ)') == '名字'

In [9]:
# Ambiguous variant entries - manually disambiguated
variants_exc_df = pd.read_csv('data/variants.csv', comment='#', dtype='str')
variants_exc_mp = variants_exc_df.set_index(['Vocabulary', 'Pinyin', 'POS']).Variants.to_dict()

# Some variants in TOCFL are specified as "...x/y..." character pairs. Valid pairs here:
variant_pairs = [
    '做作', '布佈', '嘗嚐', '溼濕', '分份', '畫劃', '裡裏', '秘祕', '台臺', '周週',
    '汙污', '消宵', '占佔', '證証', '雇僱', '迴回', '剎煞', '的地', '艶豔', '嘆歎',
    '連聯', '伙夥',
]

def get_variants(vocab, pinyin, pos) -> str:
    """Returns disambiguated list of variants from TOCFL's vocab+pinyin+pos strings.

    Args:
        vocab: term, possibly containing '/'-separated alternatives and optional
            parts in parentheses.
        pinyin: reading of the term, possibly using the same markup.
        pos: part-of-speech string of the entry.

    Returns:
        '' when neither `vocab` nor `pinyin` contains any of '(', ')', '/', ','.
        Otherwise the variants as "term [pinyin]" items joined by ' / ', or the value
        from the manual table (`variants_exc_mp`) when the entry is listed there.

    Raises:
        Exception: the entry is ambiguous and not listed in the manual table.
        AssertionError: the markup of the entry is not in one of the supported shapes.
    """
    # Manually disambiguated entries take precedence over all rules below.
    res = variants_exc_mp.get((vocab, pinyin, pos))
    if res:
        return res.strip()

    # Reduce both strings to their variant markup only.
    ps = re.sub('[^()/,]', '', pinyin)
    vs = re.sub('[^()/,]', '', vocab)
    if ps == '' and vs == '':
        return ''

    # "...x/y..." with a single reading: expand a known character pair in place.
    # When no known pair matches, fall through to the generic rules.
    if ps == '' and vs == '/':
        for x, y in variant_pairs:
            if not (f'{x}/{y}' in vocab or f'{y}/{x}' in vocab): continue
            if f'{y}/{x}' in vocab:
                x, y = y, x
            assert f'{x}/{y}' in vocab
            vx = vocab.replace(f'{x}/{y}', x)
            vy = vocab.replace(f'{x}/{y}', y)
            return f'{vx} [{pinyin}] / {vy} [{pinyin}]'

    # Entries that cannot be expanded mechanically: markup differs between term and
    # reading, several parts of speech, or alternatives of unequal length sharing one
    # reading. The message has the shape of a CSV row (vocab,pinyin,pos,vocab) -
    # presumably a template for a new line in data/variants.csv.
    if (ps != vs and ps != '' and vs != '') or \
       (ps+vs != '' and '/' in pos) or \
       (ps == '' and vs != '' and len(set(map(len, vocab.split('/')))) != 1):
        raise Exception('Ambiguous entry: %s' % ','.join([vocab, pinyin, pos, vocab]))

    assert '/' not in pos
    # One term with several readings.
    if ps != '' and vs == '':
        assert ps == '/'
        res = ' / '.join([f'{vocab} [{p.strip()}]' for p in pinyin.split('/')])
        return res
    # Several equally long terms sharing one reading.
    if ps == '' and vs != '':
        res = vocab.split('/')
        assert len(set(map(len, res))) == 1, vocab
        res = ' / '.join([f'{s} [{pinyin}]' for s in res])
        return res
    assert ps == vs

    # Term and reading carry identical markup: pair the alternatives up one by one and
    # expand an optional "(...)" part into the long form followed by the short form.
    optional_part_re = '^([^() ]*) *[(]([^() ]+)[)] *([^() ]*)$'
    res = []
    for vo, py in zip(vocab.split('/'), pinyin.split('/')):
        assert vo.count('(') == py.count('(') and vo.count('(') <= 1
        assert vo.count(')') == py.count(')') and vo.count(')') <= 1
        assert vo.count('(') == vo.count(')')
        if '(' in vo:
            vm = re.match(optional_part_re, vo)
            assert vm, vo
            pm = re.match(optional_part_re, py)
            assert pm, py
            res.append(f'{vm[1]}{vm[2]}{vm[3]} [{pm[1]}{pm[2]}{pm[3]}]')
            res.append(f'{vm[1]}{vm[3]} [{pm[1]}{pm[3]}]')
        else:
            res.append(f'{vo} [{py}]')
    res = ' / '.join(res)
    return res

def variants_to_json(variants):
    """Converts a "term [pinyin] / term [pinyin] (POS)" string into a JSON list.

    Each ' / '-separated item becomes an object with the keys 'Traditional',
    'Simplified' and 'Pinyin', plus 'POS' when the item ends in a parenthesized
    upper-case tag. Returns '' for empty or NaN input.

    Raises AssertionError if an item does not have the expected shape.
    """
    # `variants != variants` holds only for NaN (missing cells read by pandas).
    is_missing = not variants or variants != variants
    if is_missing:
        return ''

    item_re = re.compile(r'^([^ ()\[\]]+) \[([^()\[\]]+)\](?:$| [(]([A-Z]+)[)])$')
    entries = []
    for item in variants.split(' / '):
        parsed = item_re.match(item)
        assert parsed, variants
        trad, pinyin, pos = parsed.groups()
        entry = {
            'Traditional': trad,
            'Simplified': to_simplified(trad),
            'Pinyin': fix_pinyin(pinyin, trad),
        }
        if pos:
            entry['POS'] = pos
        entries.append(entry)
    return json.dumps(entries, ensure_ascii=False)

# Self-checks for get_variants(). It works on the strings exactly as passed in and
# does not normalize pinyin: the breve in 'xiăohái(zi)' comes back unchanged.
assert get_variants('台灣/臺灣', 'táiwān', 'N') == '台灣 [táiwān] / 臺灣 [táiwān]'
assert get_variants('小孩(子)', 'xiăohái(zi)', 'N') == '小孩子 [xiăoháizi] / 小孩 [xiăohái]'
assert get_variants('公共汽車/公車', 'gōnggòngqìchē/gōngchē', 'N') == '公共汽車 [gōnggòngqìchē] / 公車 [gōngchē]'
# Term and reading carry different markup here ('/()' vs. '/'), which the generic rules
# reject as ambiguous - this result has to come from the table in data/variants.csv.
assert get_variants('盒/盒(子)', 'hé/hézi', 'M / N') == '盒 [hé] (M) / 盒子 [hézi] (N)'
assert get_variants('差(一)點/差(一)點兒', 'chà(yī)diǎn/chà(yī)diǎnr','Adv') == \
                    '差一點 [chàyīdiǎn] / 差點 [chàdiǎn] / 差一點兒 [chàyīdiǎnr] / 差點兒 [chàdiǎnr]'
# A '/' in the reading only: the whole term is paired with each '/'-separated piece.
assert get_variants('角色', 'jiǎo/juésè', 'N') == '角色 [jiǎo] / 角色 [juésè]'
# "x/y" character pair from variant_pairs inside a longer term.
assert get_variants('計畫/劃', 'jìhuà', 'V') == '計畫 [jìhuà] / 計劃 [jìhuà]'
# Optional part at the start and at the end of the term.
assert get_variants('(老)鼠', '(lǎo)shǔ', 'N') == '老鼠 [lǎoshǔ] / 鼠 [shǔ]'
assert get_variants('鼻(子)', 'bí(zi)', 'N') == '鼻子 [bízi] / 鼻 [bí]'

# Self-checks for variants_to_json(): plain items, and items carrying a POS tag.
assert (variants_to_json('台灣 [táiwān] / 臺灣 [táiwān]') ==
        '[{"Traditional": "台灣", "Simplified": "台湾", "Pinyin": "táiwān"}, ' +
        '{"Traditional": "臺灣", "Simplified": "台湾", "Pinyin": "táiwān"}]')
assert (variants_to_json('盤 [pán] (M) / 盤子 [pánzi] (N)') == 
        '[{"Traditional": "盤", "Simplified": "盘", "Pinyin": "pán", "POS": "M"}, ' +
        '{"Traditional": "盤子", "Simplified": "盘子", "Pinyin": "pánzi", "POS": "N"}]')

In [10]:
# Clean every parsed TOCFL entry and collect it twice: once as-is (`rows`) and once
# expanded into one record per variant (`expanded_rows`).
rows = []
expanded_rows = []

for row in excel_df.fillna('').to_dict(orient='records'):
    row['Traditional'] = fix_vocabulary(row['Vocabulary'])
    row['Pinyin'] = fix_pinyin(row['Pinyin'], row['Traditional'])
    # get_variants() must see the POS text as found in the sheet, so POS is
    # normalized only after the variants have been computed.
    row.update(
        Variants=variants_to_json(get_variants(row['Traditional'], row['Pinyin'], row['POS'])),
        POS=row['POS'].replace(' ', '').replace('；', '/'),
        Simplified=to_simplified(row['Traditional']),
    )
    rows.append(row)

    # Entries without variants contribute themselves; otherwise one copy per variant,
    # with the variant's values overriding those of the entry.
    variants = json.loads(row['Variants']) if row['Variants'] else [{}]
    expanded_rows.extend([{**row, **variant} for variant in variants])

cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS']

tocfl_df = pd.DataFrame(rows)[[*cols, 'Variants']]
tocfl_df.to_csv('tocfl.csv', index=False)
print('tocfl.csv: %d entries' % len(tocfl_df))
assert list(tocfl_df.index) == sorted(tocfl_df.index)

expanded_df = pd.DataFrame(expanded_rows)[cols]
expanded_df.to_csv('tocfl-expanded.csv', index=False)
assert list(expanded_df.index) == sorted(expanded_df.index)
print('tocfl-expanded.csv: %d entries' % len(expanded_df))

tocfl.csv: 7517 entries
tocfl-expanded.csv: 7847 entries


## CCCC

Parse CCCC (Children's Chinese Competency Certification) wordlist, including definitions.

In [11]:
# Read the CCCC workbook from the website, or from the local copy when one exists.
url = 'https://tocfl.edu.tw/assets/files/vocabulary/CCCC_Vocabulary_2022.xls'
if os.path.exists('downloads/CCCC_Vocabulary_2022.xls'):
    url = 'downloads/CCCC_Vocabulary_2022.xls'

xls = pd.ExcelFile(url)

sheets = {}  # level code -> cleaned DataFrame of that level's sheet
# Only the first three sheets are parsed; each must carry one of the level names below.
for i, name in enumerate(xls.sheet_names[:3]):
    # The first row of each sheet is skipped; the row after it provides the headers.
    df = xls.parse(name, dtype='str', skiprows=1).fillna('')
    level = {'萌芽級': 'L1', '成長級': 'L2', '茁壯級': 'L3'}[name]
    print(f'Sheet {i+1}: {name:<15}\t{level}\t{len(df)} rows')
    # Keep only the last line of multi-line headers, then switch to English column names.
    df = df.rename(columns=lambda s: s.strip().split('\n')[-1])
    df = df.rename(columns={
        '分類': 'Category',
        '細目': 'Subcategory',
        '正體字': 'Traditional',
        '简体字': 'Simplified',
        '漢拼': 'Pinyin',
        '詞性': 'POS',
        '英文': 'Meaning',
    })
    # IDs are assigned BEFORE rows without a Traditional value are dropped below, so they
    # follow the position of a row in the sheet and have gaps where rows were dropped.
    df['ID'] = ['%s-%.3d' % (level, i+1) for i in range(len(df))]
    for col in df:
        df[col] = df[col].fillna('').str.strip()
    df = df[df.Traditional != ''].copy()
    sheets[level] = df

cccc_excel_df = pd.concat(sheets.values())
print(f'Total: {len(cccc_excel_df)}')

Sheet 1: 萌芽級            	L1	474 rows
Sheet 2: 成長級            	L2	387 rows
Sheet 3: 茁壯級            	L3	366 rows
Total: 1197


In [12]:
def fix_pos(s):
    """Normalizes a part-of-speech string from the CCCC sheets.

    Semicolons (ASCII or full-width) become '/', spaces are removed and a few tags are
    rewritten: ADV -> Adv, VS -> Vs, particle/Particle -> Ptc, affix -> Affix.
    """
    rewrites = (
        ('[;；]', '/'),
        (' ', ''),
        ('ADV', 'Adv'),
        ('VS', 'Vs'),
        ('[Pp]article', 'Ptc'),
        ('affix', 'Affix'),
    )
    cleaned = s
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned).strip()
    return cleaned

# Clean every parsed CCCC entry and collect it twice: once as-is (`rows`) and once
# expanded into one record per variant (`expanded_rows`).
rows = []
expanded_rows = []

for row in cccc_excel_df.fillna('').to_dict(orient='records'):
    row['Traditional'] = fix_vocabulary(row['Traditional'])
    # The sheet's own Simplified column is only validated here: the value that ends up
    # in the output is the OpenCC conversion assigned further down.
    row['Simplified'] = fix_vocabulary(row['Simplified'])
    row['Pinyin'] = fix_pinyin(row['Pinyin'], row['Traditional'])
    # get_variants() receives the POS text as found in the sheet; it is normalized after.
    row['Variants'] = variants_to_json(get_variants(row['Traditional'], row['Pinyin'], row['POS']))
    row['POS'] = fix_pos(row['POS'])
    row['Simplified'] = to_simplified(row['Traditional'])
    rows.append(row)

    # Entries without variants contribute themselves; otherwise one copy per variant,
    # with the variant's values overriding those of the entry.
    variants = json.loads(row['Variants']) if row['Variants'] else [{}]
    for variant in variants:
        var = dict(row)
        var.update(variant)
        expanded_rows.append(var)

cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning',
        'Category', 'Subcategory']

df = pd.DataFrame(rows)[cols + ['Variants']]
df.to_csv('cccc.csv', index=False)
# Name the file that was actually written (it used to be reported as cccc.tsv).
print('cccc.csv: %d rows' % len(df))

expanded_df = pd.DataFrame(expanded_rows)[cols]
expanded_df.to_csv('cccc-expanded.csv', index=False)
assert list(expanded_df.index) == list(sorted(expanded_df.index))
print('cccc-expanded.csv: %d entries' % len(expanded_df))

cccc.tsv: 1197 rows
cccc-expanded.csv: 1344 entries


## Export as Pleco user dictionary

In [13]:
EAC1_TAG = '\uEAC1\uEC00\uEC00\uECCC\uEC99'  # tag color, #00cc99 green

def gen_pleco(input_fn, output_fn):
    """Writes a Pleco user-dictionary text file from one of the generated CSV files.

    Args:
        input_fn: CSV file with the columns ID, Traditional, Simplified, Pinyin,
            Variants and optionally POS and Meaning. A file name containing 'cccc'
            selects the CCCC headers, level mapping and tag; anything else is
            treated as TOCFL.
        output_fn: path of the text file to write (UTF-8).

    The output has a "//<list>/<level> (<CEFR>)" header line whenever the level changes,
    followed by one tab-separated line "simplified[traditional]<TAB>pinyin<TAB>definition"
    per entry, or per variant for entries that have variants. The definition of a
    variant line starts with the entry's own headword and pinyin.

    Raises:
        KeyError: the level digit of an ID is unknown for the selected list.
    """
    is_cccc = 'cccc' in input_fn
    if is_cccc:
        cefr_by_level = {'1': 'pre-A1', '2': 'A1', '3': 'A2'}
    else:
        cefr_by_level = {'0': 'pre-A1', '1': 'A1', '2': 'A2', '3': 'B1', '4': 'B2', '5': 'C1+'}

    # The file holds CJK text and private-use formatting characters, so the encoding is
    # fixed to UTF-8 instead of depending on the platform's default.
    with open(output_fn, 'w', encoding='utf-8') as fout:
        last_header = ''
        for row in pd.read_csv(input_fn, dtype='str').fillna('').to_dict(orient='records'):
            level = row['ID'][1]  # IDs look like L0-1001, L1-0001 or L1-001
            cefr = cefr_by_level[level]
            if is_cccc:
                header = f"//CCCC 2022/Level {level} ({cefr})"
            elif row['ID'].startswith('L0'):
                # Novice IDs carry the sub-level right after the dash: L0-1nnn, L0-2nnn.
                header = f"//TOCFL 2023/Novice {row['ID'][3]} ({cefr})"
            else:
                header = f"//TOCFL 2023/Level {level} ({cefr})"

            if header != last_header:
                last_header = header
                fout.write(header + '\n')

            tag = f"{EAC1_TAG}[CCCC{level}]\uEAC2" if is_cccc else f"{EAC1_TAG}[TOCFL{level}]\uEAC2"
            # \uEAB1 is presumably Pleco's line-break code; the space that the join puts
            # after it is removed again below.
            defn = ' '.join([
                f"{row['Traditional']} [{row['Pinyin']}]\uEAB1" if row['Variants'] else '',
                f"({row['POS']})" if row.get('POS') else '',
                f"{row['Meaning']}" if row.get('Meaning') else '',
                tag,
            ])
            defn = re.sub(r'\s+', ' ', defn).replace('\uEAB1 ', '\uEAB1').strip()

            # One output line per variant, all sharing the entry's definition text.
            variants = json.loads(row['Variants']) if row['Variants'] else [{}]
            for variant in variants:
                var = dict(row)
                var.update(variant)
                key = f"{var['Simplified']}[{var['Traditional']}]\t{var['Pinyin']}"
                fout.write(f'{key}\t{defn}\n')

gen_pleco('tocfl.csv', 'tocfl-pleco.txt')
gen_pleco('cccc.csv', 'cccc-pleco.txt')

!ls -l *pleco.txt

-rw-r--r-- 1 jovyan users  94671 Nov  7 13:53 cccc-pleco.txt
-rw-r--r-- 1 jovyan users 445784 Nov  7 13:53 tocfl-pleco.txt


## Readings check

In [14]:
# Optional sanity check, skipped when the syllable table is not available: report
# every expanded entry whose pinyin cannot be assembled from per-character readings.
if os.path.exists('../cedict/syllables.csv'):
    syll_df = pd.read_csv('../cedict/syllables.csv', dtype='str').fillna('')

    # Character -> set of known readings, lower-cased, with blank readings dropped.
    readings_mp = {} #{'一': set(['yì','yí'])}
    for row in syll_df.itertuples():
        readings_mp.setdefault(row.Traditional, set()).add(row.Pinyin.lower())
    readings_mp = {
        char: set([reading.strip().lower() for reading in char_readings if reading.strip()])
        for char, char_readings in readings_mp.items()
    }

    def gen_readings(trad):
        """Yields every pinyin string obtainable by combining per-character readings.

        Characters without known readings, or below code point 0x3E00, are skipped.
        An apostrophe is inserted before a following syllable that starts with a, e or o.
        """
        if trad == '':
            yield ''
            return
        head, tail = trad[0], trad[1:]
        if ord(head) < 0x3E00 or head not in readings_mp:
            yield from gen_readings(tail)
            return
        for syllable in readings_mp[head]:
            for rest in gen_readings(tail):
                separator = "'" if rest and rest[0] in 'aāáǎàeēéěèoōóǒò' else ''
                yield syllable.lower() + separator + rest

    for filename in ['cccc-expanded.csv', 'tocfl-expanded.csv']:
        print(filename)
        for row in pd.read_csv(filename, dtype='str').fillna('').itertuples():
            trad, pinyin = row.Traditional, row.Pinyin
            readings = list(gen_readings(trad))
            if pinyin.lower() in readings:
                continue
            print(row.ID, list(row._asdict().values())[2:5], 'vs.', readings)

cccc-expanded.csv
L2-104 ['去年', '去年', 'qùnian'] vs. ['qunián', 'qùnián']
L3-203 ['新鮮', '新鲜', 'xīnxian'] vs. ['xīnxiǎn', 'xīnxiān']
tocfl-expanded.csv
L1-0079 ['部份', '部份', 'bùfen'] vs. ['bùfèn']
L1-0284 ['差不多', '差不多', 'chabùduō'] vs. ['cībúduó', 'cībúduō', 'cībuduó', 'cībuduō', 'cībùduó', 'cībùduō', 'cībūduó', 'cībūduō', 'chābúduó', 'chābúduō', 'chābuduó', 'chābuduō', 'chābùduó', 'chābùduō', 'chābūduó', 'chābūduō', 'chāibúduó', 'chāibúduō', 'chāibuduó', 'chāibuduō', 'chāibùduó', 'chāibùduō', 'chāibūduó', 'chāibūduō', 'chàbúduó', 'chàbúduō', 'chàbuduó', 'chàbuduō', 'chàbùduó', 'chàbùduō', 'chàbūduó', 'chàbūduō']
L2-0009 ['白天', '白天', 'báitian'] vs. ['báitiān', 'baitiān']
L2-0051 ['差', '差', 'cha'] vs. ['cī', 'chā', 'chāi', 'chà']
L2-0414 ['新鮮', '新鲜', 'xīnxian'] vs. ['xīnxiǎn', 'xīnxiān']
L3-0205 ['多多少少', '多多少少', 'duōduoshǎoshǎo'] vs. ['duóduóshàoshào', 'duóduóshàoshǎo', 'duóduóshàoshao', 'duóduóshǎoshào', 'duóduóshǎoshǎo', 'duóduóshǎoshao', 'duóduóshaoshào', 'duóduóshaoshǎo', 'duóduóshaosh

## Join with CC-CEDICT

In [15]:
# Maps every vowel, with or without tone mark, to the same vowel without tone mark.
UNTONE_MP = {
    toned: base
    for base, group in (
        ('a', 'aāáǎà'),
        ('e', 'eēéěè'),
        ('o', 'oōóǒò'),
        ('i', 'iīíǐì'),
        ('u', 'uūúǔù'),
        ('ü', 'üǖǘǚǜ'),
    )
    for toned in group
}

def pinyin_matches(py1, py2, hz='', untone=False):
    """Checks if pinyin from the list (py1) matches cedict's (py2).

    The comparison ignores case and the separator characters - ' , / ( ) and space.
    With `untone=True` a vowel also matches its counterpart when one of the two
    carries no tone mark. `hz` is accepted but not used.

    Returns True when both strings agree character by character, False otherwise.
    """
    separators = "-',/() "
    left = [ch for ch in py1.lower() if ch not in separators]
    right = [ch for ch in py2.lower() if ch not in separators]

    # A surplus of letters on either side can never match.
    if len(left) != len(right):
        return False

    for a, b in zip(left, right):
        if a == b:
            continue
        if untone and (UNTONE_MP.get(a, a) == b or a == UNTONE_MP.get(b, b)):
            continue
        return False
    return True

In [16]:
tocfl_df = pd.read_csv('tocfl.csv', dtype='str').fillna('')
cedict_df = pd.read_csv('../cedict/cedict.csv', dtype='str').fillna('')
# Traditional headword -> list of row indices of its entries in cedict_df.
cedict_idx_mp = cedict_df.assign(idx=cedict_df.index).groupby('Traditional').idx.apply(list)

rows = []

for row in tocfl_df.to_dict(orient='records'):
    pinyin_set = set([row['Pinyin']])
    matches = cedict_idx_mp.get(row['Traditional'], [])
    # Fall back to the individual variants only when the headword itself has no entry.
    # `matches` is the fresh default list in that case, so extending it does not
    # modify the lists stored in cedict_idx_mp.
    if len(matches) == 0 and row['Variants']:
        variants = json.loads(row['Variants']) if row['Variants'] else [{}]
        for variant in variants:
            matches.extend(cedict_idx_mp.get(variant['Traditional'], []))
            pinyin_set.add(variant['Pinyin'])

    matches = list(sorted(set(matches)))

    if len(matches) == 0:
        print('No entry for %s' % row)
    else:
        # Prioritize pronunciation matches, deprioritize names and variants.
        # Lower keys sort first: an exact and a tone-insensitive pinyin match subtract 1
        # each, a definition starting with "variant" adds 10, and pinyin starting with
        # an upper-case letter adds 100.
        # TODO: match based on Taiwanese pronunciation
        if len(matches) > 1:
            matches.sort(key=lambda i: (
                -int(any(pinyin_matches(py, cedict_df.Pinyin[i], untone=False) for py in pinyin_set))
                -int(any(pinyin_matches(py, cedict_df.Pinyin[i], untone=True) for py in pinyin_set))
                +10*int(re.match('^variant', cedict_df.Definitions[i]) is not None)
                +100*int(cedict_df.Pinyin[i][0].isupper())
            ))

        # Report entries whose simplified form disagrees with CC-CEDICT's, or for which
        # CC-CEDICT lists more than one simplified form (entries with variants excluded).
        ce_simp = set([cedict_df.Simplified[i] for i in matches])
        cc_simp = opencc_tw2s.convert(row['Traditional'])
        if not row['Variants'] and ce_simp:
            if row['Simplified'] not in ce_simp:
                print('Simplified diff:', row, 'ce', ce_simp, 'cc', cc_simp)
            if len(ce_simp) > 1:
                print('Ambigous simplified:', row, 'ce', ce_simp, 'cc', cc_simp)

        defs = []
        for i in matches:
            # Single reading of the entry, or '' when variants contributed several.
            py1 = list(pinyin_set)[0] if len(pinyin_set) == 1 else ''
            defn = cedict_df.Definitions[i]
            defn = re.sub(r'/CL:個\|个\[ge4\](|/.*)$', r'\1', defn)  # uninformative
            # Entries with variants get each definition prefixed with CC-CEDICT's headword
            # and pinyin; other entries get the pinyin prefix only when it differs.
            if row['Variants']:
                defn = '%s [%s] %s' % (cedict_df.Traditional[i], cedict_df.Pinyin[i], defn)
            elif not pinyin_matches(py1, cedict_df.Pinyin[i], untone=False):
                defn = '[%s] %s' % (cedict_df.Pinyin[i], defn)
            defs.append(defn)

        row['Meaning'] = '<br> '.join(defs)

    # Entries without any CC-CEDICT match are kept, just without a 'Meaning' value.
    rows.append(row)

merged_df = pd.DataFrame(rows)
merged_df.to_csv('tocfl-cedict.csv', index=False)

# The differences reported above are mostly due to variant characters.

Ambigous simplified: {'ID': 'L0-2234', 'Traditional': '著', 'Simplified': '着', 'Pinyin': 'zhe', 'POS': 'Ptc', 'Variants': ''} ce {'着', '著'} cc 着
Ambigous simplified: {'ID': 'L2-0171', 'Traditional': '乾', 'Simplified': '干', 'Pinyin': 'gān', 'POS': 'Vp', 'Variants': ''} ce {'干', '乾'} cc 干
No entry for {'ID': 'L2-0178', 'Traditional': '汙染', 'Simplified': '污染', 'Pinyin': 'wūrǎn', 'POS': 'V', 'Variants': ''}
No entry for {'ID': 'L2-0292', 'Traditional': '月台', 'Simplified': '月台', 'Pinyin': 'yuètái', 'POS': 'N', 'Variants': ''}
No entry for {'ID': 'L3-0326', 'Traditional': '還要', 'Simplified': '还要', 'Pinyin': 'háiyào', 'POS': 'Adv', 'Variants': ''}
No entry for {'ID': 'L3-0335', 'Traditional': '好了', 'Simplified': '好了', 'Pinyin': 'hǎole', 'POS': 'Ptc', 'Variants': ''}
No entry for {'ID': 'L3-0809', 'Traditional': '說起來', 'Simplified': '说起来', 'Pinyin': 'shuōqǐlái', 'POS': 'Adv', 'Variants': ''}
Ambigous simplified: {'ID': 'L3-1144', 'Traditional': '著', 'Simplified': '着', 'Pinyin': 'zhuó', 'POS': '

## Generate Anki deck

*Taiwan TOCFL 2023 wordlist with audio (Traditional)*

Complete wordlist of TOCFL (Test of Chinese as a Foreign Language), a Taiwanese equivalent of HSK.

Parsed from the official Excel sheets on the [TOCFL](https://tocfl.edu.tw/) website. This is the new 2022/2023 version ([8000zhuyin_202307.zip](https://tocfl.edu.tw/assets/files/vocabulary/8000zhuyin_202307.zip)) of the list with 7517 entries (the previous 2018 list had 7945 entries).

Columns:
  * `ID`: term's level + index (row number in original excel file which has one sheet per level):
    * `L0-1nnn` = Novice 1 (準備級一級), `L0-2nnn` = Novice 2 (準備級二級), both pre-A1, `L1-nnnn`..`L5-nnnn` = Level 1..5 (入門級/基礎級/進階級/高階級/流利級) = CEFR A1/A2/B1/B2/C1+.
    * Levels are also added as tags.
  * `Traditional`: term in traditional characters per TOCFL.
  * `Simplified`: term converted to simplified characters.
  * `Pinyin`: pinyin with diacritics, slightly cleaned up from TOCFL sheets, e.g. missing apostrophes added and a few clear errors corrected. Tone changes are not indicated.
  * `POS`: part of speech, `/`-separated. See the [description](https://tocfl.edu.tw/assets/files/vocabulary/8000_description_202204.pdf) on the TOCFL website for the meaning of the abbreviations (the 202204 list is essentially the same).
  * `Meaning`: definitions from [CC-CEDICT](https://www.mdbg.net/chinese/dictionary?page=cedict) for convenience. Note that it mainly lists Mainland pronunciations, which may differ from the Taiwanese ones in some cases. [CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) licensed.
  * `Audio`: good-quality neural TTS audio with a Taiwanese Mandarin voice.
  * `Variants`: for entries where TOCFL gives multiple variants of a term, an expanded, disambiguated list encoded as a JSON array of objects, each holding the column values of one alternative. If you use this deck for automated analysis (such as merging with other sources or your Anki decks), you might find this field useful, as the original source formats variants inconsistently.

In [17]:
df = pd.read_csv('tocfl-cedict.csv', dtype='str').fillna('')
df['Audio'] = ''

!mkdir -p data/media
!cp -f ../downloads/fonts/MoeStandardKai.ttf data/media/_MoeStandardKai.ttf
if os.path.exists('../Anki2/hypertts.tsv'):
    tts_mp = pd.read_csv('../Anki2/hypertts.tsv', sep='\t').set_index('Text').Hash.to_dict()
    for row in df.itertuples():
        text = json.loads(row.Variants)[0]['Traditional'] if row.Variants else row.Traditional
        dst = 'data/media/tocfl-tts-%s.mp3' % text
        if not os.path.exists(dst) and text in tts_mp:
            shutil.copy('../Anki2/tts/collection.media/hypertts-%s.mp3' % tts_mp[text], 'data/media/tocfl-tts-%s.mp3' % text)
        df.loc[row.Index, 'Audio'] = '[sound:tocfl-tts-%s.mp3]' % text

cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Variants', 'Audio']

model = genanki.Model(
    1698016000,
    'TOCFL',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'TOCFL',
        'qfmt': open('../dangdai/dangdai-qfmt.html').read(),
        'afmt': open('../dangdai/dangdai-afmt.html').read().replace(
            'if (pinyinEl && hanziEl)',
            'if (pinyinEl && hanziEl {{#Variants}}&& false{{/Variants}})'),
    }],
    css=open('../dangdai/dangdai.css').read(),
)

deck = genanki.Deck(1698016001, name='tocfl')

for row in df.reset_index().to_dict(orient='records'):
    note = genanki.Note(
        model=model,
        fields=[row[c] for c in cols],
        guid=genanki.guid_for('tocfl', row['ID']),
        tags=[row['ID'][:2]],
    )
    deck.add_note(note)

!rm -f tocfl.apkg
genanki.Package(deck, media_files=glob.glob('data/media/*')).write_to_file('tocfl.apkg')
!ls -l tocfl.apkg

-rw-r--r-- 1 jovyan users 153082035 Nov  7 13:53 tocfl.apkg
