# TOCFL wordlists parser

Specify source spreadsheet/URL to parse:

In [1]:
# Run configuration.
#   APPLY_ERRATA - when truthy, fix_pinyin() applies the corrections listed in errata.csv.
#   TOCFL_XLS    - source to parse: a URL string, a local path, or a (URL, local file name) tuple
#                  (the tuple form is used for web.archive.org snapshots that share one remote name).
#   OUTPUT_CSV   - name of the CSV to write; later cells also branch on its prefix
#                  (e.g. startswith('cccc') / startswith('tocfl') in the parsing cell).
# Leave exactly one TOCFL_XLS/OUTPUT_CSV line uncommented.
APPLY_ERRATA=1  # apply corrections from errata.csv?
#TOCFL_XLS='https://tocfl.edu.tw/index.php/exam/download'; OUTPUT_CSV='tocfl-latest.csv'  # find latest file automatically
TOCFL_XLS='https://tocfl.edu.tw/assets/files/vocabulary/8000zhuyin_202307.zip'; OUTPUT_CSV='tocfl-202307.csv'
##TOCFL_XLS='https://tocfl.edu.tw/assets/files/vocabulary/8000zhuyin_202204.zip'; OUTPUT_CSV='tocfl-202204.csv' # same as 202307
#TOCFL_XLS=('https://web.archive.org/web/20200227052851/http://www.sc-top.org.tw/download/8000zhuyin.zip', '8000zhuyin_20180419.zip'); OUTPUT_CSV='tocfl-20180419.csv'
#TOCFL_XLS=('https://web.archive.org/web/20170621183549/http://www.sc-top.org.tw/download/8000zhuyin.zip', '8000zhuyin_20170324.zip'); OUTPUT_CSV='tocfl-20170324.csv'
#TOCFL_XLS=('https://web.archive.org/web/20170223051537/http://www.sc-top.org.tw/download/8000zhuyin.zip', '8000zhuyin_20161230.zip'); OUTPUT_CSV='tocfl-20161230.csv'
#TOCFL_XLS=('https://web.archive.org/web/20160818155004/http://www.sc-top.org.tw/download/8000zhuyin.zip', '8000zhuyin_20160316.zip'); OUTPUT_CSV='tocfl-20160316.csv'
#TOCFL_XLS=('https://web.archive.org/web/20161215061819/http://www.sc-top.org.tw/download/8000zhuyin.rar', '8000zhuyin_20160215.rar'); OUTPUT_CSV='tocfl-20160215.csv'
#TOCFL_XLS='https://web.archive.org/web/20140908011551/http://www.sc-top.org.tw/download/L1-L5vocabualry%20list20111208.xls'; OUTPUT_CSV='top-20111208.csv'
#TOCFL_XLS='https://web.archive.org/web/20160116074948/http://www.sc-top.org.tw/download/800+800020100915.xls'; OUTPUT_CSV='top-20100915.csv'
#TOCFL_XLS='https://tocfl.edu.tw/assets/files/vocabulary/CCCC_Vocabulary_2022.xls'; OUTPUT_CSV='cccc.csv'
##TOCFL_XLS='https://tocfl.edu.tw/assets/files/vocabulary/CCCC_Vocabulary_2017.xls'; OUTPUT_CSV='cccc-2017.csv'

In [2]:
!pip install -q opencc genanki
import os, re, glob, requests, io, urllib, json, shutil, opencc
import pandas as pd
import genanki
pd.options.display.max_rows = 1000

# Resolve TOCFL_XLS to a local spreadsheet path: optionally discover the latest archive URL,
# download it into downloads/ if it is not there yet, and unpack .zip/.rar archives.
if TOCFL_XLS == 'https://tocfl.edu.tw/index.php/exam/download':
    # Find link to latest file automatically
    print(f'Downloading {TOCFL_XLS}')
    resp = requests.get(TOCFL_XLS).content.decode('utf-8')
    # NOTE(review): the '.' before 'zip' is an unescaped regex wildcard; harmless here but '\.' was probably meant.
    urls = re.findall('<a href="(/assets/files/vocabulary/8000zhuyin_[0-9]+.zip)"', resp)
    # Exactly one matching link is expected on the download page.
    assert len(urls) == 1
    TOCFL_XLS = urllib.parse.urljoin(TOCFL_XLS, urls[0])

# Split the configured source into the remote URL (may stay empty) and the local path under downloads/.
TOCFL_URL = ''
if type(TOCFL_XLS) is tuple:
    # (URL, local file name) pair: the local name is given explicitly.
    TOCFL_URL, TOCFL_XLS = TOCFL_XLS
    TOCFL_XLS = 'downloads/' + TOCFL_XLS
elif TOCFL_XLS.startswith('http'):
    # Plain URL: the local name is the last path component of the URL.
    TOCFL_URL = TOCFL_XLS
    TOCFL_XLS = f"downloads/{os.path.basename(TOCFL_URL)}"
    print(f'\nTOCFL_URL="{TOCFL_URL}"')

# Download only when the local file is missing.
if not os.path.exists(TOCFL_XLS):
    assert TOCFL_URL != '', f'{TOCFL_XLS} does not exist and is not a URL'
    print(f'Downloading {TOCFL_URL} to {TOCFL_XLS}')
    # If there is no downloads/ directory but ../downloads/tocfl exists, symlink to it.
    ![[ ! -d downloads && -d ../downloads/tocfl ]] && ln -s ../downloads/tocfl downloads
    !mkdir -p downloads && curl -o "{TOCFL_XLS}" "{TOCFL_URL}"
    assert os.path.exists(TOCFL_XLS)

# Archives: reuse a previously extracted .xls/.xlsx next to the archive, otherwise unpack it.
if TOCFL_XLS.endswith('.zip') or TOCFL_XLS.endswith('.rar'):
    TOCFL_ZIP = TOCFL_XLS
    if os.path.exists(TOCFL_XLS.replace('.zip', '.xls').replace('.rar', '.xls')):
        TOCFL_XLS = TOCFL_XLS.replace('.zip', '.xls').replace('.rar', '.xls')
    elif os.path.exists(TOCFL_XLS.replace('.zip', '.xlsx').replace('.rar', '.xlsx')):
        TOCFL_XLS = TOCFL_XLS.replace('.zip', '.xlsx').replace('.rar', '.xlsx')
    else:
        TOCFL_XLS = TOCFL_XLS.replace('.zip', '.xlsx').replace('.rar', '.xlsx')
        print(f'Unpacking {TOCFL_ZIP} to {TOCFL_XLS}')
        # Extract into a scratch directory that is removed again afterwards.
        !rm -rf downloads/unpacked && mkdir downloads/unpacked
        if TOCFL_ZIP.endswith('.zip'):
            !cd downloads/unpacked && unzip "../{os.path.basename(TOCFL_ZIP)}"
        else:
            !cd downloads/unpacked && rar x "../{os.path.basename(TOCFL_ZIP)}"
        # NOTE(review): assumes the archive contains exactly one *.xlsx file - confirm for new releases.
        !cp -fv "$(find downloads/unpacked -name '*.xlsx')" "{TOCFL_XLS}"
        !rm -rf downloads/unpacked
        assert os.path.exists(TOCFL_XLS)
        # Print size and checksum, then make the extracted file read-only.
        !echo; ls -l "{TOCFL_XLS}"; sha256sum "{TOCFL_XLS}"; chmod a-w "{TOCFL_XLS}"
    del TOCFL_ZIP

print(f'TOCFL_XLS="{TOCFL_XLS}"')
print(f'OUTPUT_CSV="{OUTPUT_CSV}"')


TOCFL_URL="https://tocfl.edu.tw/assets/files/vocabulary/8000zhuyin_202307.zip"
TOCFL_XLS="downloads/8000zhuyin_202307.xlsx"
OUTPUT_CSV="tocfl-202307.csv"


### Parse and clean up .xls

In [3]:
# Maps a sheet name (or the parenthesized part of a sheet name) to the level code used in IDs.
# Values starting with 'drop-' mark sheets that the parsing loop reports and then ignores.
LEVELS_MP = {
    # 8000zhuyin_202204.xlsx, 8000zhuyin_202307.xlsx
    'Novice 1': 'L0-1',
    'Novice 2': 'L0-2',
    'Level 1': 'L1',
    'Level 2': 'L2',
    'Level 3': 'L3',
    'Level 4': 'L4',
    'Level 5': 'L5',

    # 20180419
    '準備級一級': 'L0-1',
    '準備級二級': 'L0-2',
    '入門級': 'L1',
    '基礎級': 'L2',
    '進階級': 'L3',
    '高階級': 'L4',
    '流利級': 'L5',

    # 20161230
    '準備一級': 'L0-1',
    '準備二級': 'L0-2',

    # SC-TOP 800.xls
    '基礎': 'L2', # Band A / L1-L2
    '初等': 'L3',
    '初等（舊）': 'drop-L3',
    '中等': 'L4',
    '高等': 'L5',

    # CCCC
    '萌芽級': 'L1',
    '成長級': 'L2',
    '茁壯級': 'L3',
    '三級詞彙': 'drop-vocab',
    '詞表說明': 'drop-expl',
    '詞性縮寫對照表': 'drop-pos',
}

# Parse every level sheet of the workbook into a DataFrame with normalized column names,
# a Level column and a generated ID, then concatenate the sheets into excel_df.
print(f'Parsing {TOCFL_XLS}')
!sha256sum {TOCFL_XLS}

xls = pd.ExcelFile(TOCFL_XLS)
sheets = {}

for i, name in enumerate(xls.sheet_names):
    # CCCC workbooks are parsed with the first row skipped (skiprows=1).
    if OUTPUT_CSV.startswith('cccc'):
        df = xls.parse(name, dtype='str', skiprows=1).fillna('')
    else:
        df = xls.parse(name, dtype='str').fillna('')

    # The entry-count summary sheet ends the loop. It is parsed before the break on purpose:
    # `name` and `df` are reused after the loop to cross-check the total.
    if 'Entry Number' in name or '各等詞條數' in name:
        break

    # Level comes from the full sheet name, or from the part inside parentheses, e.g. '入門級(Level 1)'.
    if name in LEVELS_MP:
        level = LEVELS_MP[name]
    else:
        level = LEVELS_MP[re.findall('[(](.*)[)]', name)[0]]
    if level.startswith('drop-'):
        print(f'Sheet {i+1}: {name:<15}\t{level}\t{len(df)} rows - ignoring')
        continue
    print(f'Sheet {i+1}: {name:<15}\t{level}\t{len(df)} rows')

    # Headers may span several lines; keep only the stripped header's last line, then map it
    # to a canonical English name. Columns renamed to 'drop-*' are removed further below.
    df = df.rename(columns=lambda s: s.strip().split('\n')[-1])
    df = df.rename(columns={
        'Parts of Speech': 'POS',
        '詞類': 'POS',
        '詞彙': 'Vocabulary',
        '漢語拼音': 'Pinyin',
        '拼音': 'Pinyin',
        '注音': 'Zhuyin',
        '編號': 'drop-No1',
        '拼音排序': 'drop-No2',
        '等級': 'drop-Level',
        '英文解釋': 'Meaning',
        '任務領域': 'Category',

        # CCCC
        '分類': 'Category',
        '細目': 'Subcategory',
        '正體字': 'Vocabulary',
        '简体字': 'Simplified',
        '漢拼': 'Pinyin',
        '詞性': 'POS',
        '英文': 'Meaning',
    })
    df['Level'] = level
    # Drop rows without a vocabulary entry and rows whose entry is a single capital letter
    # (presumably alphabetical section headers - TODO confirm against the spreadsheets).
    df = df[df.Vocabulary.fillna('') != ''].copy()
    df = df[~df.Vocabulary.str.match('^[A-Z]$')].copy()

    # IDs are the level code plus a 1-based running number within the sheet;
    # separator and zero-padding width depend on the source/level.
    if OUTPUT_CSV.startswith('cccc'):
        df['ID'] = ['%s-%.3d' % (level, i+1) for i in range(len(df))]
    elif level.startswith('L0'):
        df['ID'] = ['%s%.3d' % (level, i+1) for i in range(len(df))]
    else:
        df['ID'] = ['%s-%.4d' % (level, i+1) for i in range(len(df))]
    # Remove 'drop-*' columns and columns whose only values are one of the listed stray cell contents.
    for col in list(df):  # 800.xls
        if col.startswith('drop-') or repr(sorted(set(df[col]))) in ["['', '15']", "['', 'pī ']", "['', '匹_1']"]:
            df = df.drop(columns=[col])
    # NOTE(review): keyed by level, so two sheets mapping to the same level would overwrite each other.
    sheets[level] = df

excel_df = pd.concat(sheets.values())
print(f'Total: {len(excel_df)}')

# For TOCFL workbooks `df` is the summary sheet left over from the loop's break:
# print it and check that the parsed total appears somewhere in it.
if OUTPUT_CSV.startswith('tocfl'):
    print(f'\n{name}\n%s' % str(df))
    assert str(len(excel_df)) in str(df)

excel_df.describe()

Parsing downloads/8000zhuyin_202307.xlsx
e979ac6d953fb493502e54536b4b6ff534d06e3938700052aa15806d514efc92  downloads/8000zhuyin_202307.xlsx
Sheet 1: 準備級一級(Novice 1)	L0-1	160 rows
Sheet 2: 準備級二級(Novice 2)	L0-2	234 rows
Sheet 3: 入門級(Level 1)   	L1	347 rows
Sheet 4: 基礎級(Level 2)   	L2	485 rows
Sheet 5: 進階級(Level 3)   	L3	1173 rows
Sheet 6: 高階級(Level 4)   	L4	2342 rows
Sheet 7: 流利級(Level 5)   	L5	2776 rows
Total: 7517

各等詞條數(Entry Number)
  Unnamed: 0 準備1級 準備2級  入門級   基礎級   進階級   高階級   流利級    總計
0      各等詞彙量  160  234  347   485  1173  2342  2776  7517
1      累計詞彙量       394  741  1226  2399  4741  7517      


Unnamed: 0,Context,Vocabulary,Pinyin,POS,Level,ID
count,1226,7517,7517,7517,7517,7517
unique,12,7189,6780,104,7,7517
top,個人資料,中,jí,N,L5,L0-1001
freq,220,3,7,2985,2776,1


In [4]:
# OpenCC converter built from the 'tw2s' configuration; to_simplified() uses it to derive the Simplified form.
opencc_tw2s = opencc.OpenCC('tw2s')

# Optional char -> level lookup; stays None (verification skipped) when ../chars/tgh.csv is absent.
tgh_level = None
if not os.path.exists('../chars/tgh.csv'):
    print("../chars/tgh.csv doesn't exist - skipping verification against it")
else:
    # Character levels from Table of General Standard Chinese Characters for verification.
    tgh_level = pd.read_csv('../chars/tgh.csv').set_index('char').level.to_dict()

# Convert to simplified characters + verify
def to_simplified(trad):
    """Return the simplified-character form of a traditional vocabulary string.

    Two spellings of "what" are special-cased. Everything else goes through the
    OpenCC converter, followed by a few manual character substitutions. If the
    result consists of '/'-separated alternatives that are all identical, it is
    collapsed to a single alternative. When the optional `tgh_level` lookup is
    loaded, every character other than '/', '(', ')' and ',' must have a level
    of at most 2, otherwise an AssertionError is raised.
    """
    if trad in ('什麼/甚麼', '甚麼'):
        return '什么'

    simp = opencc_tw2s.convert(trad)
    # Each two-character string is (character to replace, replacement).
    for pair in ('擡抬', '砲炮', '牠它', '艶艳', '妳你'):
        simp = simp.replace(pair[0], pair[1])

    # Collapse 'X/X' to 'X' once both alternatives became the same simplified string.
    alternatives = simp.split('/')
    if len(alternatives) > 1 and len(set(alternatives)) == 1:
        simp = alternatives[0]

    if tgh_level:
        for ch in simp:
            is_separator = ch in '/(),'
            assert tgh_level.get(ch, 9) <= 2 or is_separator, (trad, simp, ch, tgh_level.get(ch))
    return simp

In [5]:
# Cleanup Pinyin column

# Manual pinyin corrections; '#' starts a comment in the file. fix_pinyin() reads the
# Pinyin, Traditional and Corrected columns of each row.
pinyin_corr_df = pd.read_csv('errata.csv', comment='#', dtype='str')

def fix_pinyin(py, trad=''):
    """Normalize a raw pinyin string and optionally apply errata corrections.

    Args:
        py: pinyin text as found in the spreadsheet.
        trad: traditional-character form of the entry; used only to select
            matching errata rows.

    Returns:
        The cleaned pinyin string.

    Raises:
        AssertionError: if the result contains anything other than pinyin
            letters, '/', '(', ')', ',', space or an apostrophe.
    """
    # Each rule is a (regex, replacement) pair; a two-character string unpacks to
    # the same shape. The rules are applied in order, stripping after each one.
    cleanup_rules = (
        'ăǎ', 'ŏǒ', 'ĭǐ', 'ŭǔ', 'ɑa', '；/', '（(', '）)',
        ('\u200b', ''), (' +[)]', ')'), (' */ *', '/'), (r'\s+', ' '),
        ('; ', '/'), ('nǔ:', 'nǚ'),
    )
    for pattern, replacement in cleanup_rules:
        py = re.sub(pattern, replacement, py).strip()

    if APPLY_ERRATA:
        # Rows are checked in file order against the current value of `py`.
        for erratum in pinyin_corr_df.itertuples():
            if erratum.Pinyin == py and erratum.Traditional == trad:
                py = erratum.Corrected

    py = py.strip()
    is_valid = re.match('^[a-zāáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ/(), \']+$', py.lower())
    assert is_valid, (py, repr(py))
    return py

# Self-tests for fix_pinyin().
assert fix_pinyin('xiăo') == 'xiǎo'   # 'a' with breve is normalized to 'a' with caron (third tone)
assert fix_pinyin('chuāng(zi )/chuānghu') == 'chuāng(zi)/chuānghu'  # space before ')' is removed
# With errata enabled the correction for this entry is expected to add the apostrophe.
assert fix_pinyin('liànài', trad='戀愛') == ("liàn'ài" if APPLY_ERRATA else 'liànài')

In [6]:
# Cleanup Vocabulary column (hanzi)

def fix_vocabulary(s):
    """Normalize a raw Vocabulary (hanzi) cell.

    Removes zero-width spaces, converts full-width parentheses and the
    ideographic comma to ASCII, tightens spacing around '/', rewrites the CCCC
    'XX[YY]' notation to 'XX/YY', drops parenthesized zhuyin hints, and strips
    homograph suffixes such as '丟1' or '匹_1'.

    Args:
        s: vocabulary text as found in the spreadsheet.

    Returns:
        The cleaned string, made up only of CJK ideographs and '/', '(', ')', ','.

    Raises:
        AssertionError: if the cleaned string contains any other character;
            the message is the tuple (original input, cleaned string).
    """
    original = s  # kept for the assertion message below
    for x, y in [
        ('\u200b', ''), ('（', '('), ('）', ')'), (' */ *', '/'), (r'\s+', ' '),
        ('[(]面˙ㄇㄧㄢ[)]', '(面)'), ('、', ','),
        (r'^([\u4E00-\u9FFF]{2,3})\[([\u4E00-\u9FFF]{2,3})\]$', r'\1/\2'), #CCCC simplified
    ]:
        s = re.sub(x, y, s).strip()

    # Zhuyin hints are redundant with pinyin and not very relevant for foreigners, drop them
    # Take care to not drop (一) in 差(一)... etc
    # The two replacements below swap a hanzi 一 that was typed in place of the zhuyin ㄧ,
    # so that the hint is matched (and removed) by the zhuyin pattern.
    s = s.replace('(˙ㄇ一)', '(˙ㄇㄧ)')
    s = s.replace('(一˙)', '(ㄧ˙)')
    s = re.sub(r'[(（][ㄅㄈㄉㄊㄋㄍㄎㄏㄐㄑㄒㄓㄕㄗㄙㄚㄛㄞㄌㄟㄠㄡㄢㄣㄤㄧㄨㄩㄇㄝㄆㄌㄨㄥㄔㄘㄖˊˋ˙\uf8f8]+[)）]', '', s).strip()

    if len(s) == 2 and s[-1] in '1234': s = s[0]  # 丟1 叫4
    if len(s) == 3 and s[-2] == '_' and s[-1] in '1234': s = s[0]

    # Report the function's own input rather than the global `row`, which does not exist
    # until the export loop runs and would turn a failed check into a NameError.
    assert re.match('^[\u4E00-\u9FFF/(),]+$', s), (original, s)
    return s

# Self-tests for fix_vocabulary(): a zhuyin hint is dropped, an optional hanzi in parentheses is kept.
assert fix_vocabulary('名字(˙ㄗ)') == '名字'
assert fix_vocabulary('差(一)點(兒)') == '差(一)點(兒)'

In [7]:
# Ambiguous variant entries - manually disambiguated
# Variants column format here:
#  trad [pinyin] / ...
#  trad [pinyin] (POS) / ...
variants_exc_txt = '''
Vocabulary,Pinyin,POS,Variants
盒/盒(子),hé/hézi,M / N,盒 [hé] (M) / 盒子 [hézi] (N)
盤/盤(子),pán(zi),M / N,盤 [pán] (M) / 盤子 [pánzi] (N)
瓶/瓶(子),píng(zi),M / N,瓶 [píng] (M) / 瓶子 [píngzi] (N)
箱/箱(子),xiāng/xiāng(zi),M / N,箱 [xiāng] (M) / 箱子 [xiāngzi] (N)
葉(子),yè(zi),M / N,葉 [yè] / 葉子 [yèzi]
姊姊/姐姐/姊/姐,jiějie/jiě,N,姊姊 [jiějie] / 姐姐 [jiějie] / 姊 [jiě] / 姐 [jiě]
這裡/這裏/這兒,zhèlǐ/zhèr,N,這裡 [zhèlǐ] / 這裏 [zhèlǐ] / 這兒 [zhèr]
那裡/那裏/那兒,nàlǐ/nàr,N,那裡 [nàlǐ] / 那裏 [nàlǐ] / 那兒 [nàr]
哪裡/哪裏/哪兒,nǎlǐ/nǎr,N,哪裡 [nǎlǐ] / 哪裏 [nǎlǐ] / 哪兒 [nǎr]
畫/畫兒,huà/huàr,V / N,畫 [huà] / 畫兒 [huàr]
計畫/計劃,jìhuà,V / N,計畫 [jìhuà] / 計劃 [jìhuà]
手錶/手表/錶/表,shǒubiǎo/biǎo,N,手錶 [shǒubiǎo] / 手表 [shǒubiǎo] / 錶 [biǎo] / 表 [biǎo]
刷(子)/刷,shuā(zi)/shuā,N / V,刷 [shuā] (N) / 刷子 [shuāzi] (N) / 刷 [shuā] (V)
保證/証,bǎozhèng,N / V,保證 [bǎozhèng] / 保証 [bǎozhèng]
規畫/規劃,guīhuà,N / V,規畫 [guīhuà] / 規劃 [guīhuà]
駕駛/駕,jiàshǐ/jià,N / V,駕駛 [jiàshǐ] / 駕 [jià]
黏/粘,nián,V / Vs,黏 [nián] / 粘 [nián]
占/佔,zhàn,V / Vst,占 [zhàn] / 佔 [zhàn]
架/架(子),jià/jià(zi),V / N,架 [jià] (V) / 架子 [jiàzi] (N)
以至/以致/以至於/以致於,yǐzhì/yǐzhìyú,Conj,以至 [yǐzhì] / 以致 [yǐzhì] / 以至於 [yǐzhìyú] / 以致於 [yǐzhìyú]
嘴唇/嘴脣/唇/脣,zuǐchún/chún,N,嘴唇 [zuǐchún] / 嘴脣 [zuǐchún] / 唇 [chún] / 脣 [chún]
聲音/聲,shēngyīn,N,聲音 [shēngyīn] / 聲 [shēng]
訪問/訪,fǎngwèn,V,訪問 [fǎngwèn] / 訪 [fǎng]
舅舅/舅,jiùjiu,N,舅舅 [jiùjiu] / 舅 [jiù]
漸漸/漸,jiànjiàn,Adv,漸漸 [jiànjiàn] / 漸 [jiàn]
夜裡/裏,yèlǐ/lǐ,N,夜裡 [yèlǐ] / 夜裏 [yèlǐ]
默默/默,mòmò,Adv,默默 [mòmò] / 默 [mò]
偶而/偶爾,ǒuér,Adv,偶而 [ǒu'ér] / 偶爾 [ǒu'ěr]

部分/部份,bùfen,,部分 [bùfen] / 部份 [bùfen]
部分/部份,bùfèn,,部分 [bùfèn] / 部份 [bùfèn]
部分/部份,bùfen / bùfèn,,部分 [bùfen] / 部份 [bùfèn]
部分/份,bùfen / bùfèn,,部分 [bùfen] / 部份 [bùfèn]
差不多,chàbuduō / chābùduō,,差不多 [chàbuduō] / 差不多 [chābùduō]

# older tocfl
一點/一點兒,yīdiǎn(r),Det,一點 [yīdiǎn] / 一點兒 [yīdiǎnr]
窗/窗戶,chuānghu,N,窗 [chuāng] / 窗戶 [chuānghu]
差點/差點兒,chà(yī)diǎn/chà(yī)diǎnr,Adv,差點 [chàdiǎn] / 差點兒 [chàdiǎnr]
有點/有點兒,yǒu(yì)diǎn/yǒu(yì)diǎnr,Adv,有點 [yǒudiǎn] / 有點兒 [yǒudiǎnr]
以至/致(於),yǐzhì(yú),Conj,以至 [yǐzhì] / 以致 [yǐzhì] / 以至於 [yǐzhìyú] / 以致於 [yǐzhìyú]
凡/凡是,fánshì,Conj,凡 [fán] / 凡是 [fánshì]
偶而/爾,ǒuér/ěr,Adv,偶而 [ǒu'ér] / 偶爾 [ǒu'ěr]
角色,jiǎo/juésè,N,角色 [jiǎosè] / 角色 [juésè]
主角,zhǔjiǎo/jué,N,主角 [zhǔjiǎo] / 主角 [zhǔjué]
外(面),wàimiàn,N,外 [wài] / 外面 [wàimiàn]
窗/窗戶,chuānghù,N,窗 [chuāng] / 窗戶 [chuānghù]
沒有,méi(yǒu),,沒 [méi] / 沒有 [méiyǒu]
公共汽車(公車),gōnggòngqìchē(gōngchē),,公共汽車 [gōnggòngqìchē] / 公車 [gōngchē]
角/角色,jiǎo/juésè,,角色 [jiǎosè] / 角色 [juésè]
主角/角,zhǔjiǎo/jué,,主角 [zhǔjiǎo] / 主角 [zhǔjué]

# cccc
"災(火災,水災)","zāi(huǒzāi, shuǐzāi)",N,災 [zāi] / 火災 [huǒzāi] / 水災 [shuǐzāi]
星期天/星期日,xīngqítiān/xīngqītiān/xīngqírì/xīngqīrì,N/ADV,星期天 [xīngqítiān] / 星期天 [xīngqītiān] / 星期日 [xīngqírì] / 星期日 [xīngqīrì]
星期天/星期日,Xīngqítiān/Xīngqītiān/Xīngqírì/Xīngqīrì,N/ADV,星期天 [Xīngqítiān] / 星期天 [Xīngqītiān] / 星期日 [Xīngqírì] / 星期日 [Xīngqīrì]
圓/圓形,yuán/yuánxíng,VS/N,圓 [yuán] / 圓形 [yuánxíng]
杯(子),bēi(zi),N/M,杯 [bēi] (M) / 杯子 [bēizi] (N)
運動,yùndòng/yùndong,N/V,運動 [yùndòng] / 運動 [yùndong]
畫/畫兒,huà/huàr,V/N,畫 [huà] / 畫兒 [huàr]
和,hé/hàn,Prep/Conj,和 [hé] / 和 [hàn]
事/事情,shì/shìqíng/shìqing,N,事 [shì] / 事情 [shìqíng] / 事情 [shìqing]
盒(子),hé(zi),N/M,盒 [hé] (M) / 盒子 [hézi] (N)
盤(子),pán(zhi),N/M,盤 [pán] (M) / 盤子 [pánzi] (N)
有時/有時候,yǒushí/yǒushíhòu/yǒushíhou,ADV,有時 [yǒushí] / 有時候 [yǒushíhòu] / 有時候 [yǒushíhou]
廣播,guǎngbò/guǎngbō,N/V,廣播 [guǎngbò] / 廣播 [guǎngbō]
椰(子),yézi,N,椰 [yé] / 椰子 [yézi]

# top
點(兒),diǎn(ér),M/N,點 [diǎn] / 點兒 [diǎnr]
男孩(兒)(子),nánhái(ér)(zi),N,男孩 [nánhái] / 男孩兒 [nánháir] / 男孩子 [nánháizi]
女孩(兒)(子),nǚhái(ér)(zi),N,女孩 [nǚhái] / 女孩兒 [nǚháir] / 女孩子 [nǚháizi]
差(一)點(兒),chà(yì)diǎn(ér),Adv,差一點 [chàyīdiǎn] / 差一點兒 [chàyīdiǎnr] / 差點 [chàdiǎn] / 差點兒 [chàdiǎnr]
哪裡/裏(兒),nǎlǐ/er,Adv,哪裡 [nǎlǐ] / 哪裏 [nǎlǐ] / 哪兒 [nǎr]
男孩(子/兒),nánhái(zi/ér),N,男孩 [nánhái] / 男孩兒 [nánháir] / 男孩子 [nánháizi]
女孩(子/兒),nǚhái(zi/ér),N,女孩 [nǚhái] / 女孩兒 [nǚháir] / 女孩子 [nǚháizi]
小孩(子/兒),xiǎohái(zi/ér),N,小孩 [xiǎohái] / 小孩兒 [xiǎoháir] / 小孩子 [xiǎoháizi]
老頭(兒/子),lǎotóu(ér/zi),N,老頭 [lǎotóu] / 老頭兒 [lǎotóur] / 老頭子 [lǎotóuzi]
偶而/爾,ǒuér(ěr),Adv,偶而 [ǒu'ér] / 偶爾 [ǒu'ěr]
不至/致於,búzhìyú,Adv,不至於 [bùzhìyú] / 不致於 [bùzhìyú]
裡面,lǐ(miàn),N,裡 [lǐ] / 裡面 [lǐmiàn]
謝,xiè/xièxie,V,謝 [xiè] / 謝謝 [xièxie]
謝謝,xiè/xièxie,V,謝謝 [xièxie] / 謝 [xiè]
哪裡/兒,nǎlǐ/ér,,哪裡 [nǎlǐ] / 哪兒 [nǎr]
那裡/兒,nàlǐ/ér,,那裡 [nàlǐ] / 那兒 [nàr]
這裡/兒,zhèlǐ/ér,,這裡 [zhèlǐ] / 這兒 [zhèr]
那麼(樣),nàme(yàng),,那麼 [nàme] / 那樣 [nàyàng] / 那麼樣 [nàmeyàng]
這麼(樣),zhème(yàng),,這麼 [zhème] / 這樣 [zhèyàng] / 這麼樣 [zhèmeyàng]
部分/份,bùfèn/fèn,,部分 [bùfèn] / 部份 [bùfèn]
大部分/份,dàbùfèn/fèn,,大部分 [dàbùfèn] / 大部份 [dàbùfèn]
老闆/板,lǎobǎn/bǎn,,老闆 [lǎobǎn] / 老板 [lǎobǎn]
哪裡/兒,nǎlǐ/ér,,哪裡 [nǎlǐ] / 哪兒 [nǎr]
其他/它,qítā/tā,,其他 [qítā] / 其它 [qítā]
夜裡/裏,yèlǐ/lǐ,,夜裡 [yèlǐ] / 夜裏 [yèlǐ]
一下子/兒,yíxiàzi/ér,,一下子 [yīxiàzi] / 一下兒 [yīxiàr]
似的/地,sìde/di,,似的 [sìde] / 似地 [sìdi]
計畫/劃,jìhuà/huà,,計畫 [jìhuà] / 計劃 [jìhuà]
'''
# Parse the manual disambiguation table above; lines starting with '#' are skipped and
# empty cells (e.g. a blank POS) become ''.
variants_exc_df = pd.read_csv(io.StringIO(variants_exc_txt), comment='#', dtype='str').fillna('')
# Lookup used by get_variants(): (Vocabulary, Pinyin, POS) -> Variants string.
# Rows with an empty POS act as a fallback that matches any POS.
variants_exc_mp = variants_exc_df.set_index(['Vocabulary', 'Pinyin', 'POS']).Variants.to_dict()

# Some variants in TOCFL are specified as "...x/y..." character pairs. Valid pairs here:
# Each two-character string is one pair; get_variants() checks both orders, and the
# first pair found in the entry wins.
variant_pairs = [
    '做作', '布佈', '嘗嚐', '溼濕', '分份', '畫劃', '裡裏', '秘祕', '台臺', '周週',
    '汙污', '消宵', '占佔', '證証', '雇僱', '迴回', '剎煞', '的地', '艶豔', '嘆歎',
    '連聯', '秘祕', '伙夥', '闆板', '煙菸', '至致', '蹟跡', '畫劃', '分份', '份分',
    '他它', '裡裏', '證証', '溼濕', '煙菸'
]

def get_variants(vocab, pinyin, pos) -> str:
    """Returns disambiguated list of variants from TOCFL's vocab+pinyin+pos strings.

    The result has the form 'trad [pinyin] / trad [pinyin] ...', where each item
    may carry a trailing ' (POS)'. An empty string means the entry has no variants.

    Resolution order:
      1. Manual table `variants_exc_mp`, first with the exact POS, then with ''.
      2. Entries whose vocab and pinyin contain none of '(', ')', '/', ',' have no variants.
      3. 'AB/C' shorthand with a single pinyin: expanded through `variant_pairs`.
      4. Only the pinyin has '/': same word, several readings.
      5. Only the vocab has '/': several equal-length spellings, one reading.
      6. Same punctuation on both sides: the '/'-separated parts are paired up and
         a parenthesized optional part yields the form without it and the form with it.

    Args:
        vocab: cleaned traditional vocabulary string.
        pinyin: cleaned pinyin string.
        pos: part-of-speech string; surrounding parentheses are ignored.

    Raises:
        Exception: 'Ambiguous entry: ...' when the entry cannot be expanded
            mechanically. The message is formatted as a CSV line
            (vocab, pinyin, pos, vocab).
        AssertionError: when an entry violates the shape a branch expects.
    """

    pos = pos.strip('()')

    res = variants_exc_mp.get((vocab, pinyin, pos))
    if res:
        return res.strip()
    res = variants_exc_mp.get((vocab, pinyin, ''))
    if res:
        return res.strip()

    # Reduce both strings to their punctuation so their structure can be compared.
    ps = re.sub('[^()/,]', '', pinyin)
    vs = re.sub('[^()/,]', '', vocab)
    if ps == '' and vs == '':
        return ''

    if ps == '' and vs == '/':
        # 'AB/C' shorthand: C replaces the character right before the slash.
        for x, y in variant_pairs:
            if not (f'{x}/{y}' in vocab or f'{y}/{x}' in vocab): continue
            if f'{y}/{x}' in vocab:
                x, y = y, x
            assert f'{x}/{y}' in vocab
            vx = vocab.replace(f'{x}/{y}', x)
            vy = vocab.replace(f'{x}/{y}', y)
            return f'{vx} [{pinyin}] / {vy} [{pinyin}]'

    if (ps != vs and ps != '' and vs != '') or \
       (ps+vs != '' and '/' in pos and pos not in ('N/M', 'M/N')) or \
       (ps == '' and vs != '' and len(set(map(len, vocab.split('/')))) != 1):
        print('ps=%s vs=%s' % (ps, vs))
        raise Exception('Ambiguous entry: %s' % ','.join([vocab, pinyin, pos, vocab]))

    if ps != '' and vs == '':
        # One spelling, several readings.
        assert '/' not in pos
        assert ps == '/'
        res = ' / '.join([f'{vocab} [{p.strip()}]' for p in pinyin.split('/')])
        return res
    if ps == '' and vs != '':
        # Several spellings of equal length, one reading.
        assert '/' not in pos
        res = vocab.split('/')
        assert len(set(map(len, res))) == 1, vocab
        res = ' / '.join([f'{s} [{pinyin}]' for s in res])
        return res
    assert ps == vs

    res = []
    for vo, py in zip(vocab.split('/'), pinyin.split('/')):
        # At most one parenthesized optional part per alternative, mirrored in the pinyin.
        assert vo.count('(') == py.count('(') and vo.count('(') <= 1
        assert vo.count(')') == py.count(')') and vo.count(')') <= 1
        assert vo.count('(') == vo.count(')')
        if '(' in vo:
            vm = re.match('^([^() ]*) *[(]([^() ]+)[)] *([^() ]*)$', vo)
            assert vm, vo
            pm = re.match('^([^() ]*) *[(]([^() ]+)[)] *([^() ]*)$', py)
            assert pm, py

            # A trailing optional 兒 written as '(ér)' becomes the suffix 'r'.
            if vm[2] == '兒' and vm[3] == '' and pm[2] == 'ér' and pm[3] == '':
                pm = [pm[0], pm[1], 'r', '']

            # Form without the optional part first, then the form that includes it.
            res.append(f'{vm[1]}{vm[3]} [{pm[1]}{pm[3]}]')
            res.append(f'{vm[1]}{vm[2]}{vm[3]} [{pm[1]}{pm[2]}{pm[3]}]')
        else:
            res.append(f'{vo} [{py}]')

    if pos in ('N/M', 'M/N'):
        # Measure word / noun pair: the shorter form is tagged (M), the form with 子 (N).
        assert len(res) == 2
        res.sort(key=len)
        assert '子' not in res[0] and '子' in res[1]
        res[0] += ' (M)'
        res[1] += ' (N)'

    w = [x.split()[0] for x in res]
    if len(w) == 2 and len(w[0]) == 1 and w[1] == w[0] + '子' and pos != 'M/N':
        # reorder 房子 / 房
        res[0], res[1] = res[1], res[0]
    if len(w) == 2 and len(w[0]) == 1 and w[1] == w[0] + w[0] and w[0] in '弟哥姐姊爸媽':
        # Same reordering for doubled kinship terms: the doubled form comes first.
        res[0], res[1] = res[1], res[0]

    res = ' / '.join(res)
    return res

def variants_to_json(variants):
    """Convert a 'trad [pinyin] (POS) / ...' variants string to a JSON array.

    Each variant becomes an object with 'Traditional', 'Simplified' and 'Pinyin'
    keys, plus 'POS' when the variant carries one. Empty, None and NaN input
    yields ''.

    Raises:
        AssertionError: if an item does not match the expected syntax.
    """
    if not variants:
        return ''
    if variants != variants:  # NaN is the only value that differs from itself
        return ''

    item_re = re.compile(r'^([^ ()\[\]]+) \[([^()\[\]]+)\](?:$| [(]([A-Z]+)[)])$')
    entries = []
    for item in variants.split(' / '):
        parsed = item_re.match(item)
        assert parsed, variants
        trad, pinyin, pos = parsed.groups()
        entry = {
            'Traditional': trad,
            'Simplified': to_simplified(trad),
            'Pinyin': fix_pinyin(pinyin, trad),
        }
        if pos:
            entry['POS'] = pos
        entries.append(entry)
    return json.dumps(entries, ensure_ascii=False)

# Self-tests for get_variants(): one case per expansion path
# (character pairs, optional parts, manual table entries, measure word/noun pairs, 兒 suffix).
assert get_variants('台灣/臺灣', 'táiwān', 'N') == '台灣 [táiwān] / 臺灣 [táiwān]'
assert get_variants('小孩(子)', 'xiăohái(zi)', 'N') == '小孩 [xiăohái] / 小孩子 [xiăoháizi]'
assert get_variants('公共汽車/公車', 'gōnggòngqìchē/gōngchē', 'N') == '公共汽車 [gōnggòngqìchē] / 公車 [gōngchē]'
assert get_variants('公共汽車(公車)', 'gōnggòngqìchē(gōngchē)', 'N') == '公共汽車 [gōnggòngqìchē] / 公車 [gōngchē]'
assert get_variants('盒/盒(子)', 'hé/hézi', 'M / N') == '盒 [hé] (M) / 盒子 [hézi] (N)'
assert get_variants('差(一)點/差(一)點兒', 'chà(yī)diǎn/chà(yī)diǎnr','Adv') == \
                    '差點 [chàdiǎn] / 差一點 [chàyīdiǎn] / 差點兒 [chàdiǎnr] / 差一點兒 [chàyīdiǎnr]'
assert get_variants('角色', 'jiǎo/juésè', 'N') == '角色 [jiǎosè] / 角色 [juésè]' #XXX
assert get_variants('計畫/劃', 'jìhuà', 'V') == '計畫 [jìhuà] / 計劃 [jìhuà]'
assert get_variants('(老)鼠', '(lǎo)shǔ', 'N') == '鼠 [shǔ] / 老鼠 [lǎoshǔ]'
assert get_variants('鼻(子)', 'bí(zi)', 'N') == '鼻子 [bízi] / 鼻 [bí]'
assert get_variants('瓶(子)', 'píng(zi)', 'M/N') == '瓶 [píng] (M) / 瓶子 [píngzi] (N)'
assert get_variants('唱歌(兒)', 'chànggē(ér)', 'V') == '唱歌 [chànggē] / 唱歌兒 [chànggēr]'

# Self-tests for variants_to_json(): without and with per-variant POS tags.
assert (variants_to_json('台灣 [Táiwān] / 臺灣 [Táiwān]') ==
        '[{"Traditional": "台灣", "Simplified": "台湾", "Pinyin": "Táiwān"}, ' +
        '{"Traditional": "臺灣", "Simplified": "台湾", "Pinyin": "Táiwān"}]')
assert (variants_to_json('盤 [pán] (M) / 盤子 [pánzi] (N)') ==
        '[{"Traditional": "盤", "Simplified": "盘", "Pinyin": "pán", "POS": "M"}, ' +
        '{"Traditional": "盤子", "Simplified": "盘子", "Pinyin": "pánzi", "POS": "N"}]')

In [8]:
def fix_meaning(text):
    """Tidy up an English meaning string.

    Applies an ordered list of regex rewrites: whitespace and punctuation
    spacing, abbreviation of 'something'/'someone', 'etc' cleanup, '(1)'-style
    numbering replaced by circled digits, and lower-casing of leading
    '... measure word' phrases and of bracketed domain tags.
    """
    spacing_and_wording = [
        (r'\s+', ' '),
        (r'([^ ])/ ', r'\1/'),
        (r'([^ ])([:;,\]])([^ ])', r'\1\2 \3'),
        (r'([^ ])([(])([^ )]+[ ,/;:.])', r'\1 \2\3'),
        (r'([^ ])([(])([0-9]+[)])', r'\1 \2\3'),
        (r' [)] ', ') '), (r' [)]$', ')'), (r' \( ', ' ('), (r'^\( ', '('),
        (r' [)], ', '), '),
        (' ; ', '; '), (' , ', ', '),
        ('…+', '...'), (' [.]', '.'),
        (r'\.etc', '. etc'),
        (r'\.[(]', '. ('),
        (r'something[.]+', 'sth.'),
        (r'someone[.]+', 'sb.'),
        (r'something', 'sth'),
        (r'someone', 'sb'),
        (r'etc[.]+[)]', 'etc)'),
        (r'etc[.]+$', 'etc'),
        (r'(,|[.][.][.]) etc', ' etc'),
        (' : ', ': '),
        (' *[;,]$', ''),
    ]
    # '(1) ' .. '(5) ' -> circled digits.
    numbering = [(r'\(%d\) *' % number, mark) for number, mark in enumerate('①②③④⑤', start=1)]
    # Leading measure-word phrases are lower-cased ('Cointainer' is a typo in the source data).
    measure_words = [
        ('^Measure word', 'measure word'),
        ('^Partial measure word', 'partial measure word'),
        ('^Collective measure word', 'collective measure word'),
        ('^Coi?ntainer measure word', 'container measure word'),
        ('^Individual measure word', 'individual measure word'),
    ]
    # Bracketed domain tags are lower-cased.
    domain_tags = [
        (r'\[%s\]' % tag, '[%s]' % tag.lower())
        for tag in ('Commerce', 'Economics', 'Formal', 'Informal', 'Medicine',
                    'Military', 'Music', 'Physics', 'Polite')
    ]

    cleaned = text.strip()
    for pattern, replacement in spacing_and_wording + numbering + measure_words + domain_tags:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def fix_pos(text):
    """Normalize a part-of-speech cell.

    Removes spaces and surrounding parentheses, turns ';' / '；' separators into
    '/', and rewrites a few spellings to one form ('ADV' -> 'Adv', 'VS' -> 'Vs',
    'particle'/'Particle' -> 'Ptc', 'affix' -> 'Affix').
    """
    tag = text.replace(' ', '')
    tag = tag.replace('；', '/')
    tag = tag.strip('()').strip()

    rewrites = (
        ('[;；]', '/'),
        (' ', ''),
        ('ADV', 'Adv'),
        ('VS', 'Vs'),
        ('[Pp]article', 'Ptc'),
        ('affix', 'Affix'),
    )
    for pattern, replacement in rewrites:
        tag = re.sub(pattern, replacement, tag).strip()
    return tag

In [9]:
# Clean every parsed row and write two CSVs: OUTPUT_CSV with one row per spreadsheet entry
# (variants kept as JSON in the Variants column) and expanded/OUTPUT_CSV with one row per variant.
rows = []
expanded_rows = []
prev_cat = ''  # last non-empty Category seen; used to fill blank Category cells

for row in excel_df.fillna('').to_dict(orient='records'):
    # The category carry-over restarts with the first entry of each level.
    if row['ID'].endswith('-0001'):
        prev_cat = ''
    # One-off repair: this entry has a blank vocabulary cell in the source; it is recognized by its ID and meaning.
    if row['Vocabulary'] == ' ' and row['ID'] == 'L2-0597' and row.get('Meaning') == 'Individual measure word for insects or animals.':
        row['Vocabulary'] = '隻'
        row['Pinyin'] = 'zhī'
    row['Traditional'] = fix_vocabulary(row['Vocabulary'])
    row['Pinyin'] = fix_pinyin(row['Pinyin'], row['Traditional'])
    # Variants are derived from the raw POS, so this has to run before POS is normalized.
    row['Variants'] = variants_to_json(get_variants(row['Traditional'], row['Pinyin'], row['POS']))
    row['POS'] = fix_pos(row['POS'])
    row['Simplified'] = to_simplified(row['Traditional'])
    if 'Meaning' in row:
        row['Meaning'] = fix_meaning(row['Meaning'])
    if 'Category' in row:
        row['Category'] = row['Category'].strip()
        if row['Category']:
            prev_cat = row['Category']
        else:
            # Blank category cell: inherit the most recent non-empty one.
            row['Category'] = prev_cat
    rows.append(row)

    # Expanded output: one copy of the row per variant, with the variant's fields overriding
    # the row's; entries without variants contribute the row unchanged.
    variants = json.loads(row['Variants']) if row['Variants'] else [{}]
    for variant in variants:
        var = dict(row)
        var.update(variant)
        expanded_rows.append(var)

# Fixed leading columns, plus the optional ones that exist in this source.
cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS']
for col in ['Meaning', 'Category', 'Subcategory']:
    if col in excel_df:
        cols += [col]
!rm -f "{OUTPUT_CSV}"
tocfl_df = pd.DataFrame(rows)[cols + ['Variants']]
tocfl_df.to_csv(OUTPUT_CSV, index=False)
print('%s: %d terms, %d unique' % (OUTPUT_CSV, len(tocfl_df), len(set(tocfl_df.Traditional))))
assert list(tocfl_df.index) == list(sorted(tocfl_df.index))

# NOTE(review): nothing in this cell creates the expanded/ directory - presumably it already exists; confirm.
expanded_csv = 'expanded/' + OUTPUT_CSV
expanded_df = pd.DataFrame(expanded_rows)[cols]
expanded_df.to_csv(expanded_csv, index=False)
assert list(expanded_df.index) == list(sorted(expanded_df.index))
print('%s: %d terms, %d unique' % (expanded_csv, len(expanded_df), len(set(expanded_df.Traditional))))

tocfl-202307.csv: 7517 terms, 7187 unique
expanded/tocfl-202307.csv: 7848 terms, 7480 unique


## Readings check

In [10]:
# Cross-check the pinyin of every expanded entry against per-character readings from
# ../cedict/syllables.csv; the whole check is skipped when that file is absent.
if os.path.exists('../cedict/syllables.csv'):
    # Seed entries: extra readings for two characters, and an (initially empty) set for '一'.
    readings_mp = {'一': set([]), '蹟': set(['jī']), '噢': set(['yǔ'])}
    syll_df = pd.read_csv('../cedict/syllables.csv', dtype='str').fillna('')
    for row in syll_df.itertuples():
        readings_mp.setdefault(row.Traditional, set()).add(row.Pinyin.lower())
    # Normalize: strip and lower-case every reading, dropping empty ones.
    readings_mp = {x: set([y.strip().lower() for y in readings_mp[x] if y.strip()]) for x in readings_mp}
    # 'bú' is not accepted as a reading of '不'.
    readings_mp['不'] -= set(['bú'])

    def gen_readings(trad):
        """Yield every pinyin string obtainable by combining the known readings of each character of `trad`."""
        if trad == '':
            yield ''
        elif trad[0] not in readings_mp or ord(trad[0]) < 0x3E00:
            # Characters without known readings, or below U+3E00, contribute nothing.
            yield from gen_readings(trad[1:])
        else:
            for x in readings_mp[trad[0]]:
                for y in gen_readings(trad[1:]):
                    # An apostrophe separates syllables when the following one starts with a, e or o.
                    yield x.lower() + ("'" if y and y[0] in 'aāáǎàeēéěèoōóǒò' else '') + y

    for filename in sorted(glob.glob('expanded/*.csv')):
        if 'tbcl' in filename: continue
        if 'chars' in filename: continue
        # df0 is the CSV of the same name in the current directory; it is used only for the term count.
        df0 = pd.read_csv(os.path.basename(filename), dtype='str')
        df = pd.read_csv(filename, dtype='str').fillna('')
        print('%s: %d terms (%d with variants, %d unique)' % (os.path.basename(filename), len(df0), len(df), len(set(df.Traditional))))
        for row in df.itertuples():
            # Spaces in the pinyin are ignored for the comparison.
            trad, pinyin = row.Traditional, row.Pinyin.replace(' ', '')
            readings = list(gen_readings(trad))
            if pinyin.lower() not in readings:
                # [4:5] picks the fifth tuple field (index first) - presumably the Pinyin column given
                # the column order written by the export cell; confirm if the CSV layout changes.
                print(' ', row.ID, row.Traditional, list(row._asdict().values())[4:5], 'vs.', readings)
                #print('%s,%s,%s' % (row.Traditional, row.Pinyin, row.Pinyin.replace('yì', 'yī').replace('yí', 'yī').replace('bú', 'bù')))

cccc.csv: 1197 terms (1344 with variants, 1312 unique)
tocfl-20160215.csv: 7966 terms (8137 with variants, 7410 unique)
tocfl-20160316.csv: 7966 terms (8136 with variants, 7409 unique)
tocfl-20161230.csv: 7965 terms (8132 with variants, 7405 unique)
tocfl-20170324.csv: 7965 terms (8132 with variants, 7405 unique)
tocfl-20180419.csv: 7945 terms (8106 with variants, 7399 unique)
tocfl-202307.csv: 7517 terms (7848 with variants, 7480 unique)
tocfl.csv: 7517 terms (7848 with variants, 7480 unique)
top-20100915.csv: 8868 terms (9061 with variants, 7457 unique)
top-20111208.csv: 8013 terms (8177 with variants, 7427 unique)
  L1-0496 姊姊 ['zǐzi'] vs. ['jiějiě', 'jiězǐ', 'jiějie', 'zǐjiě', 'zǐzǐ', 'zǐjie', 'jiejiě', 'jiezǐ', 'jiejie']


## Export as HTML

In [11]:
# https://github.com/Mottie/tablesorter
# https://getmdl.io/components/index.html#tables-section
# https://materialui.co/colors
# gh-pages branch, publish to github.io

import html

HTML_HEAD = r'''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>{HTML_TITLE}</title>
<style>
html {
  font-family: 'Roboto', sans-serif;
  background: #999999;
}
.titlediv {
  text-align: center;
  padding: 10px 20px 10px 20px;
  background-color: #ffffff;
  font-size: 22pt;
  box-shadow: rgba(0, 0, 0, 0.6) 0px 15px 25px;
  margin-left: auto;
  margin-right: auto;
  /*width: fit-content;*/
  border-radius: 10px;
  margin-bottom: 5px;
}
table.datatable {
  background-color: #ffffff;
  border-spacing: 0px; border-collapse: separate;
  box-shadow: rgba(0, 0, 0, 0.6) 0px 15px 25px;
  border: 1px solid #999999;
}
table.datatable th {
  font-weight: bold;
  border-style: none;
  border-width: 1px;
  color: #000000;
  background-color: #ffffff;
  text-align: left;
  padding: 10px 10px 10px 10px;
  border-bottom: 1px solid #aaaaaa;
}
table.datatable td {
  padding: 5px 10px 5px 10px;
  vertical-align: top;
}
.CID, .CLe/*vel*/, .CTr/*aditional*/, .CSi/*mplified*/, .CAf/*fix*/ { 
  white-space: nowrap;
}
tr.L0.odd  { background-color: #FFCDD2CC; }  /* red */
tr.L0.even { background-color: #FFCDD266; }
tr.L01.odd  { background-color: #FFCDD2CC; }  /* red */
tr.L01.even { background-color: #FFCDD266; }
tr.L02.odd  { background-color: #FFCCBCCC; }  /* deep orange */
tr.L02.even { background-color: #FFCCBC66; }
tr.L1.odd   { background-color: #FFECB3CC; }  /* amber */
tr.L1.even  { background-color: #FFECB366; }
tr.L2.odd   { background-color: #DCEDC8FF; }  /* light green */
tr.L2.even  { background-color: #DCEDC866; }
tr.L3.odd   { background-color: #B2DFDBCC; }  /* teal */
tr.L3.even  { background-color: #B2DFDB66; }
tr.L4.odd   { background-color: #BBDEFBCC; }  /* blue */
tr.L4.even  { background-color: #BBDEFB66; }
tr.L5.odd   { background-color: #D1C4E9CC; }  /* deep purple */
tr.L5.even  { background-color: #D1C4E966; }
tr.L6.odd   { background-color: #E1BEE7CC; }  /* purple */
tr.L6.even  { background-color: #E1BEE766; }
/*tbody:hover, tr:hover td { background-color: #ffffff; }*/
</style>
</head>
<body>
<div style="max-width: 90%; width: fit-content; margin-left: auto; margin-right: auto">
{TITLE_DIV}
<table class="datatable" style="margin-left: auto; margin-right: auto">
'''

# Closing markup written after the table rows. It closes, in order, the <table>, the wrapper
# <div>, the <body> and the <html> elements opened in HTML_HEAD (</body> was previously missing).
HTML_FOOTER = '''
</table>
</div>
</body>
</html>
'''

def _level_css_class(row, csv_path):
    """Return the CSS level class for one CSV record (e.g. 'L01', 'L3'), or '' if none applies.

    The class is taken from the `ID` column when it starts with 'L', otherwise
    from the leading digit of the `Level` column.
    """
    row_id = row.get('ID', '')
    level = row.get('Level', '')
    if row_id.startswith('L'):
        lev = row_id[:2]
        if lev == 'L0':
            # 'L0-1nnn' / 'L0-2nnn' -> 'L01' / 'L02' (the digit after the dash)
            lev += row_id[3:4]
    elif level[:1].isdecimal():
        lev = f'L{int(level[0])}'
    else:
        # No usable level (missing, empty or non-numeric): row gets no color class.
        lev = ''
    if ('tbcl' in csv_path) and lev:
        # TBCL files: shift the level number down by one before it is used as a
        # class name (presumably so TBCL levels line up with the tr.L* color
        # rules -- confirm against the TBCL data).
        lev = 'L' + str(int(lev[1]) - 1) + lev[2:]
    return lev


def gen_html(csv_path, html_path):
    """Render a wordlist CSV as a standalone HTML table.

    Args:
        csv_path: input CSV; all columns are read as strings, missing values as ''.
            For 'tocfl-20*', 'tocfl-cedict*' and 'tocfl-latest*' files a human
            readable `Level` column is derived from `ID` and inserted as column 1.
        html_path: output file, written as UTF-8 (overwritten if present).

    Every column except `Variants` becomes a table column. When a row has a
    non-empty `Variants` JSON list, one <tr> is emitted per variant; columns
    whose value is identical across all variants are merged with `rowspan`.
    The first occurrence of each `ID` / `Traditional` / `Simplified` value is
    wrapped in a named anchor so it can be linked to.

    Prints a 'Generated ...' line when done.
    """
    df = pd.read_csv(csv_path, dtype='str').fillna('')

    csv_name = os.path.basename(csv_path)
    if csv_name.startswith(('tocfl-20', 'tocfl-cedict', 'tocfl-latest')):
        level_codes = df.ID.str.extract('^(L0-.|L[1-9])', expand=False)
        level_names = level_codes.map({
            'L0-1': 'Novice 1',
            'L0-2': 'Novice 2',
            'L1': '1',
            'L2': '2',
            'L3': '3',
            'L4': '4',
            'L5': '5',
        })
        # IDs outside the map would otherwise become NaN (a float) and break
        # the string handling below; show them as an empty cell instead.
        df.insert(1, 'Level', level_names.fillna(''))

    html_title = csv_name
    title_div = ''
    if csv_name.startswith('tocfl-20'):
        html_title = 'TOCFL wordlist (' + csv_name[len('tocfl-'):].replace('.csv', '') + ')'
    elif csv_name.startswith('top-20'):
        html_title = 'SC-TOP/TOCFL wordlist (' + csv_name[len('top-'):].replace('.csv', '') + ')'
    elif csv_name == 'cccc.csv':
        html_title = "CCCC (Children's Chinese Competency Certification) wordlist"

    columns = [c for c in df.columns if c not in ['Variants']]
    # True when Meaning, Compounds and Examples are adjacent columns, so an
    # entry with only a Meaning can span all three.
    can_span_meaning = 'Meaning Compounds Examples' in ' '.join(columns)

    # Explicit UTF-8: the tables contain Chinese text and must not depend on
    # the platform's default locale encoding.
    with open(html_path, 'w', encoding='utf-8') as fp:
        fp.write(HTML_HEAD.replace('{HTML_TITLE}', html_title).replace('{TITLE_DIV}', title_div))
        fp.write('  <thead>\n')
        fp.write('    <tr>\n')
        for col in columns:
            fp.write('      <th>%s</th>\n' % html.escape(col))
        fp.write('    </tr>\n')
        fp.write('  </thead>\n')
        fp.write('  <tbody>\n')
        row_idx = 0
        used_anc = set()  # anchor names already emitted (anchors must be unique)

        for row in df.to_dict(orient='records'):
            lev = _level_css_class(row, csv_path)

            if row.get('Variants'):
                # One table row per variant: the base record overlaid with the
                # variant's own column values.
                vrows = []
                for var in json.loads(row['Variants']):
                    vrows.append(dict(row))
                    vrows[-1].update(var)
            else:
                vrows = [row]

            # Striping alternates per CSV entry, not per emitted <tr>, so all
            # variants of one entry share a shade.
            row_idx += 1
            parity = 'odd' if row_idx % 2 == 1 else 'even'
            for vi, vrow in enumerate(vrows):
                fp.write('  <tr class="%s %s">\n' % (lev, parity))
                for col in columns:
                    val = vrow[col]
                    if val is None:
                        # Cell already covered by a rowspan/colspan emitted earlier.
                        continue

                    cls = 'C%s' % (col[:2])
                    anchor = ''
                    if val and col in ('ID', 'Traditional', 'Simplified') and val not in used_anc:
                        anchor = val
                        used_anc.add(val)

                    rowspan = ''
                    colspan = ''
                    if vi == 0 and len(set(v[col] for v in vrows)) == 1:
                        if len(vrows) > 1:
                            rowspan = f' rowspan="{len(vrows)}"'
                        # Mark the cell as consumed in every variant row
                        # (`val` above still holds the text to write).
                        for v in vrows:
                            v[col] = None
                        if col == 'Meaning' and can_span_meaning \
                           and not row['Compounds'] and not row['Examples']:
                            colspan = ' colspan="3"'
                            for v in vrows:
                                v['Compounds'] = None
                                v['Examples'] = None
                    fp.write(f'    <td class="{cls}"{rowspan}{colspan}>')

                    if val:
                        if anchor:
                            fp.write(f'<a name="{html.escape(anchor)}">{html.escape(val)}</a>')
                        elif col == 'MOE':
                            # Space-separated dictionary entry IDs, each linked
                            # to the MOE concise dictionary.
                            for moe_id in val.split():
                                moe_id = html.escape(moe_id)
                                fp.write(f'<a href="https://dict.concised.moe.edu.tw/dictView.jsp?ID={moe_id}">{moe_id}</a> ')
                        elif '<br>' in val:
                            # Keep intentional line breaks, escape everything else.
                            val = '<br>'.join([html.escape(s) for s in val.split('<br>')])
                            fp.write(val)
                        else:
                            fp.write(html.escape(val))
                    fp.write('</td>\n')

                fp.write('  </tr>\n')

        fp.write('  </tbody>\n')
        fp.write(HTML_FOOTER)
    print(f'Generated {html_path}')

# Render every CSV in the working directory to html/<name>.html.
os.makedirs('html', exist_ok=True)  # pure-Python equivalent of `mkdir -p html/`
for fn in sorted(glob.glob('*.csv')):  # sorted so the run log is reproducible
    if fn == 'errata.csv': continue  # corrections table, not a wordlist
    gen_html(fn, 'html/' + fn.replace('.csv', '.html'))

Generated html/z.html
Generated html/tbcl-grammar.html
Generated html/tbcl.html
Generated html/tbcl-affix.html
Generated html/tocfl-20160215.html
Generated html/cccc.html
Generated html/top-20111208.html
Generated html/tocfl-202307.html
Generated html/tocfl-cedict.html
Generated html/top-20100915.html
Generated html/tocfl-20170324.html
Generated html/tocfl-20180419.html
Generated html/tbcl-chars.html
Generated html/tocfl.html
Generated html/tocfl-20161230.html
Generated html/tbcl-cedict.html
Generated html/tocfl-20160316.html


## Export as Pleco user dictionary

In [12]:
EAC1_TAG = '\uEAC1\uEC00\uEC00\uECCC\uEC99'  # tag color, #00cc99 green

def gen_pleco(input_fn, output_fn, title, tag):
    """Export a wordlist CSV as a Pleco user-dictionary / flashcard import file.

    Args:
        input_fn: wordlist CSV with at least `ID`, `Traditional`, `Simplified`
            and `Pinyin` columns; `POS`, `Meaning` and `Variants` are optional.
            The second character of `ID` is taken as the level digit.
        output_fn: text file to write (UTF-8); its directory is created if missing.
        title: top-level category name used in the `//title/...` header lines.
        tag: prefix of the colored `[<tag><level digit>]` marker appended to
            every definition.

    Output format: a `//title/Level N (CEFR)` (or `Novice N`) header line
    whenever the level changes, followed by one
    `simplified[traditional]<TAB>pinyin<TAB>definition` line per entry, or per
    variant when `Variants` holds a JSON list.

    Raises:
        KeyError: if the level digit is not in the CEFR table for the list type.
    """
    tocfl_cefr = {'0': 'pre-A1', '1': 'A1', '2': 'A2', '3': 'B1', '4': 'B2', '5': 'C1+'}
    cccc_cefr = {'1': 'pre-A1', '2': 'A1', '3': 'A2'}
    is_cccc = 'cccc' in input_fn

    # Read the input before touching the output, so a bad input path does not
    # leave a truncated export behind.
    records = pd.read_csv(input_fn, dtype='str').fillna('').to_dict(orient='records')

    out_dir = os.path.dirname(output_fn)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8: the file contains CJK text and private-use formatting
    # characters and must not depend on the platform's locale encoding.
    with open(output_fn, 'w', encoding='utf-8') as fout:
        last_header = ''
        for row in records:
            level = row['ID'][1]
            # Looked up in the TOCFL table first even for CCCC lists, so an
            # unknown level digit fails the same way as before.
            cefr = tocfl_cefr[level]
            if is_cccc:
                cefr = cccc_cefr[level]
                header = f"//{title}/Level {level} ({cefr})"
            elif row['ID'].startswith('L0'):
                # 'L0-1nnn' / 'L0-2nnn': the digit after the dash is the novice sub-level.
                header = f"//{title}/Novice {row['ID'][3]} ({cefr})"
            else:
                header = f"//{title}/Level {level} ({cefr})"

            if header != last_header:
                last_header = header
                fout.write(header + '\n')

            has_variants = bool(row.get('Variants'))
            variants = json.loads(row['Variants']) if has_variants else [{}]
            for variant in variants:
                var = dict(row)
                var.update(variant)
                # The definition is built from the base row, not the variant:
                # entries with variants start with the original combined
                # headword, followed by \uEAB1 (presumably Pleco's line-break
                # code -- confirm against the Pleco import format).
                defn = ' '.join([
                    f"{row['Traditional']} [{row['Pinyin']}]\uEAB1" if has_variants else '',
                    f"({row['POS']})" if row.get('POS') else '',
                    f"{row['Meaning']}" if row.get('Meaning') else '',
                    f"{EAC1_TAG}[{tag}{level}]\uEAC2"
                ])
                # Collapse the gaps left by empty parts; no space after \uEAB1.
                defn = re.sub(r'\s+', ' ', defn).replace('\uEAB1 ', '\uEAB1').strip()
                key = f"{var['Simplified']}[{var['Traditional']}]\t{var['Pinyin']}"
                fout.write(f'{key}\t{defn}\n')

gen_pleco('top-20111208.csv', 'pleco/top-pleco.txt', 'TOP 20111208', 'TOP')
gen_pleco('tocfl.csv', 'pleco/tocfl-pleco.txt', 'TOCFL 2023', 'T')
gen_pleco('cccc.csv', 'pleco/cccc-pleco.txt', 'CCCC 2022', 'CCCC')
!ls -l pleco/

total 788
-rw-r--r-- 1 jovyan users   94605 Apr 15 00:37 cccc-pleco.txt
-rw-r--r-- 1 jovyan users 1121374 Apr 15 00:19 tbcl-pleco.txt
-rw-r--r-- 1 jovyan users  414516 Apr 15 00:37 tocfl-pleco.txt
-rw-r--r-- 1 jovyan users  634408 Apr 15 00:37 top-pleco.txt


## Join with CC-CEDICT

In [13]:
# Maps every pinyin vowel (plain or with a tone mark) to its unmarked base vowel.
UNTONE_MP = {
    vowel: group[0]
    for group in ('aāáǎà', 'eēéěè', 'oōóǒò', 'iīíǐì', 'uūúǔù', 'üǖǘǚǜ')
    for vowel in group
}

def pinyin_matches(py1, py2, hz='', untone=False):
    """Check if pinyin from the list (py1) matches cedict's (py2).

    Comparison is case-insensitive and ignores the separator characters
    ``- ' , / ( )`` and spaces in both strings. With ``untone=True`` a vowel
    also matches when one side is the unmarked form of the other (two
    different tone marks on the same vowel still do not match).
    `hz` is accepted but not used.

    Returns True only if the remaining characters agree position by position.
    """
    separators = "-',/() "

    def significant(text):
        # Lowercased characters that take part in the comparison.
        return [ch for ch in text.lower() if ch not in separators]

    ours, theirs = significant(py1), significant(py2)
    if len(ours) != len(theirs):
        return False

    for a, b in zip(ours, theirs):
        if a == b:
            continue
        if untone and (UNTONE_MP.get(a, a) == b or a == UNTONE_MP.get(b, b)):
            continue
        return False
    return True

In [14]:
# Attach CC-CEDICT definitions to every TOCFL entry and save the result as
# tocfl-cedict.csv. Diagnostics (missing entries, simplified-form differences)
# are printed along the way.
tocfl_df = pd.read_csv('tocfl.csv', dtype='str').fillna('')
top_df = pd.read_csv('top-20111208.csv', dtype='str').fillna('')  # loaded here but not referenced below in this cell
cedict_df = pd.read_csv('../cedict/cedict.csv', dtype='str').fillna('')
# Traditional headword -> list of row indices in cedict_df with that headword.
cedict_idx_mp = cedict_df.assign(idx=cedict_df.index).groupby('Traditional').idx.apply(list)

rows = []

for row in tocfl_df.to_dict(orient='records'):
    pinyin_set = set([row['Pinyin']])
    matches = cedict_idx_mp.get(row['Traditional'], [])
    # Only when the headword as written has no CEDICT entry, fall back to the
    # individual variants (and accept their pinyin as candidates too).
    if len(matches) == 0 and row['Variants']:
        variants = json.loads(row['Variants']) if row['Variants'] else [{}]
        for variant in variants:
            matches.extend(cedict_idx_mp.get(variant['Traditional'], []))
            pinyin_set.add(variant['Pinyin'])

    # De-duplicate and start from CEDICT file order.
    matches = list(sorted(set(matches)))

    if len(matches) == 0:
        print('No entry for %s' % row)
    else:
        # Prioritize pronunciation matches, deprioritize names and variants
        # TODO: match based on Taiwanese pronunciation
        # Sort key (lower sorts first): -1 for an exact pinyin match, another -1
        # for a match with untone=True, +10 if the definition starts with
        # "variant"/"used in", +100 if the CEDICT pinyin is capitalized, and a
        # further +1000 if that capitalized entry is a "surname ..." definition.
        if len(matches) > 1:
            matches.sort(key=lambda i: (
                -int(any(pinyin_matches(py, cedict_df.Pinyin[i], untone=False) for py in pinyin_set))
                -int(any(pinyin_matches(py, cedict_df.Pinyin[i], untone=True) for py in pinyin_set))
                +10*int(re.match('^(variant|used in)', cedict_df.Definitions[i]) is not None)
                +100*int(cedict_df.Pinyin[i][0].isupper())
                +1000*int(cedict_df.Pinyin[i][0].isupper())*int(re.match('^surname ', cedict_df.Definitions[i]) is not None)
            ))

        # Sanity check of the list's Simplified column against CEDICT's;
        # cc_simp (OpenCC conversion) is only shown in the messages for reference.
        ce_simp = set([cedict_df.Simplified[i] for i in matches])
        cc_simp = opencc_tw2s.convert(row['Traditional'])
        if not row['Variants'] and ce_simp:
            if row['Simplified'] not in ce_simp:
                print('Simplified diff:', row, 'ce', ce_simp, 'cc', cc_simp)
            if len(ce_simp) > 1:
                print('Ambigous simplified:', row, 'ce', ce_simp, 'cc', cc_simp)

        defs = []
        for i in matches:
            # py1 is the single candidate pinyin, or '' when there are several.
            py1 = list(pinyin_set)[0] if len(pinyin_set) == 1 else ''
            defn = cedict_df.Definitions[i]
            # Drop surname entries unless they would be the first definition kept.
            if defn.startswith('surname ') and cedict_df.Pinyin[i][0].isupper() and len(defs) > 0:
                continue
            # Best match is capitalized in CEDICT but lowercase in the list: print a
            # "traditional,pinyin,Capitalized pinyin" line (looks like a candidate
            # errata row -- confirm against the errata.csv format).
            if len(defs) == 0 and cedict_df.Pinyin[i][0].isupper() and row['Pinyin'][0].islower():
                print('%s,%s,%s' % (row['Traditional'], row['Pinyin'], row['Pinyin'][0].upper() + row['Pinyin'][1:]))
            defn = re.sub(r'/CL:個\|个\[ge4\](|/.*)$', r'\1', defn)  # uninformative
            defn = defn.replace(' (CL:個|个[ge4])/', '/')
            # Entries with variants always show which CEDICT headword and reading a
            # definition belongs to; other entries show the CEDICT reading only
            # when it does not match the list's pinyin exactly.
            if row['Variants']:
                defn = '%s [%s] %s' % (cedict_df.Traditional[i], cedict_df.Pinyin[i], defn)
            elif not pinyin_matches(py1, cedict_df.Pinyin[i], untone=False):
                defn = '[%s] %s' % (cedict_df.Pinyin[i], defn)
            defs.append(defn)

        row['Meaning'] = '<br> '.join(defs)

    # Entries without any CEDICT match are kept, just without a Meaning.
    rows.append(row)

merged_df = pd.DataFrame(rows)
merged_df.to_csv('tocfl-cedict.csv', index=False)

# The reported diffs are mostly due to variant characters.

Ambigous simplified: {'ID': 'L0-2234', 'Traditional': '著', 'Simplified': '着', 'Pinyin': 'zhe', 'POS': 'Ptc', 'Variants': ''} ce {'著', '着'} cc 着
Ambigous simplified: {'ID': 'L2-0171', 'Traditional': '乾', 'Simplified': '干', 'Pinyin': 'gān', 'POS': 'Vp', 'Variants': ''} ce {'乾', '干'} cc 干
No entry for {'ID': 'L2-0178', 'Traditional': '汙染', 'Simplified': '污染', 'Pinyin': 'wūrǎn', 'POS': 'V', 'Variants': ''}
No entry for {'ID': 'L2-0292', 'Traditional': '月台', 'Simplified': '月台', 'Pinyin': 'yuètái', 'POS': 'N', 'Variants': ''}
No entry for {'ID': 'L3-0326', 'Traditional': '還要', 'Simplified': '还要', 'Pinyin': 'háiyào', 'POS': 'Adv', 'Variants': ''}
No entry for {'ID': 'L3-0335', 'Traditional': '好了', 'Simplified': '好了', 'Pinyin': 'hǎole', 'POS': 'Ptc', 'Variants': ''}
No entry for {'ID': 'L3-0809', 'Traditional': '說起來', 'Simplified': '说起来', 'Pinyin': 'shuōqǐlái', 'POS': 'Adv', 'Variants': ''}
Ambigous simplified: {'ID': 'L3-1144', 'Traditional': '著', 'Simplified': '着', 'Pinyin': 'zhuó', 'POS': '

## Generate Anki deck

*Taiwan TOCFL 2023 wordlist with audio (Traditional)*

Complete wordlist for TOCFL (Test of Chinese as a Foreign Language), the Taiwanese equivalent of the HSK.

Parsed from the official Excel sheets on the [TOCFL](https://tocfl.edu.tw/) website. This is the new 2022/2023 version ([8000zhuyin_202307.zip](https://tocfl.edu.tw/assets/files/vocabulary/8000zhuyin_202307.zip)) of the list, with 7517 entries (the previous 2018 list had 7945 entries).

Columns:
  * `ID`: term's level + index (row number in the original Excel file, which has one sheet per level):
    * `L0-1nnn` = Novice 1 (準備級一級), `L0-2nnn` = Novice 2 (準備級二級), both pre-A1, `L1-nnnn`..`L5-nnnn` = Level 1..5 (入門級/基礎級/進階級/高階級/流利級) = CEFR A1/A2/B1/B2/C1+.
    * Levels are also added as tags.
  * `Traditional`: term in traditional characters per TOCFL.
  * `Simplified`: term converted to simplified characters.
  * `Pinyin`: pinyin with diacritics, slightly cleaned up from TOCFL sheets, e.g. missing apostrophes added and a few clear errors corrected. Tone changes are not indicated.
  * `POS`: part of speech, `/`-separated. See the [description](https://tocfl.edu.tw/assets/files/vocabulary/8000_description_202204.pdf) on the TOCFL website for the meaning of the abbreviations (the 202204 list is essentially the same).
  * `Meaning`: definitions from [CC-CEDICT](https://www.mdbg.net/chinese/dictionary?page=cedict) for convenience. Note that it mainly lists mainland pronunciations, which may differ from the Taiwanese ones in some cases. [CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) licensed.
  * `Audio`: good-quality neural TTS audio with a Taiwanese Mandarin voice.
  * `Variants`: for entries where TOCFL gives multiple variants of a term, an expanded, disambiguated JSON list of objects holding the alternative column values. If you use this deck for automated analysis (such as merging with other sources or your own Anki decks), you may find this field useful, as the original source formats variants inconsistently.

In [15]:
df = pd.read_csv('tocfl-cedict.csv', dtype='str').fillna('')
df['Audio'] = ''

!mkdir -p downloads/media
if os.path.exists('downloads/media/_MoeStandardKai.ttf'):
    !cp -f ../downloads/fonts/MoeStandardKai.ttf downloads/media/_MoeStandardKai.ttf
if os.path.exists('../Anki2/hypertts.tsv'):
    tts_mp = pd.read_csv('../Anki2/hypertts.tsv', sep='\t').set_index('Text').Hash.to_dict()
    for row in df.itertuples():
        text = json.loads(row.Variants)[0]['Traditional'] if row.Variants else row.Traditional
        dst = 'downloads/media/tocfl-tts-%s.mp3' % text
        if not os.path.exists(dst) and text in tts_mp:
            shutil.copy('../Anki2/tts/collection.media/hypertts-%s.mp3' % tts_mp[text], dst)
        df.loc[row.Index, 'Audio'] = '[sound:tocfl-tts-%s.mp3]' % text

cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Variants', 'Audio']

model = genanki.Model(
    1698016000,
    'TOCFL',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'TOCFL',
        'qfmt': open('../dangdai/dangdai-qfmt.html').read(),
        'afmt': open('../dangdai/dangdai-afmt.html').read().replace(
            'if (pinyinEl && hanziEl)',
            'if (pinyinEl && hanziEl {{#Variants}}&& false{{/Variants}})'),
    }],
    css=open('../dangdai/dangdai.css').read(),
)

deck = genanki.Deck(1698016001, name='tocfl')

for row in df.reset_index().to_dict(orient='records'):
    note = genanki.Note(
        model=model,
        fields=[row[c] for c in cols],
        guid=genanki.guid_for('tocfl', row['ID']),
        tags=[row['ID'][:2]],
    )
    deck.add_note(note)

!rm -f tocfl.apkg
genanki.Package(deck, media_files=glob.glob('downloads/media/*')).write_to_file('tocfl.apkg')
!ls -l tocfl.apkg

-rw-r--r-- 1 jovyan users 154530539 Apr 15 00:37 tocfl.apkg
