# TBCL parser

In [1]:
import os, re, glob, requests, io, urllib, json
import urllib.parse
import pandas as pd
import opencc

# Show up to 2000 rows when a DataFrame is displayed.
pd.options.display.max_rows = 2000

# OpenCC converter built from the 'tw2s' configuration; used by to_simplified() to turn
# Traditional headwords into Simplified ones.
opencc_tw2s = opencc.OpenCC('tw2s')

Download files from TBCL home page: https://coct.naer.edu.tw/download/tech_report/

In [2]:
![[ ! -d downloads && -d ../downloads/tbcl ]] && ln -s ../downloads/tbcl downloads
!mkdir -p downloads

if not os.path.exists('downloads/.done'):
    home_url = 'https://coct.naer.edu.tw/download/tech_report/'
    resp = requests.get(home_url).content.decode('utf-8')
    for url in sorted(re.findall('<a href="([^"]+[.](?:xlsx|docx))"', resp)):
        url = os.path.join(home_url, url)
        !cd downloads && wget -nc "{url}"
    !chmod a-w downloads/*.xlsx; touch downloads/.done

Symlinks for convenience and checksums:

In [3]:
%%bash -e
# -e is presumably forwarded to bash, so the cell stops at the first failing command.
cd downloads
# Remove write permission from the source files.
# NOTE(review): the download cell only fetches .xlsx/.docx; if no .ods/.pdf files are
# present this chmod fails — confirm they come from the shared ../downloads/tbcl directory.
chmod a-w *.xlsx *.ods *.pdf
# Short ASCII aliases for the dated official file names (-f replaces an existing link).
ln -sf '臺灣華語文能力基準詞語表_111-11-14.xlsx' words.xlsx
ln -sf '臺灣華語文能力基準詞語表_111-11-14.ods' words.ods
ln -sf '臺灣華語文能力基準漢字表_111-09-20.docx' chars.docx
ln -sf '臺灣華語文能力基準漢字表_111-09-20.xlsx' chars.xlsx
ln -sf '臺灣華語文能力基準類詞綴表_111-09-20.docx' affixes.docx
ln -sf '臺灣華語文能力基準類詞綴表_111-09-20.xlsx' affixes.xlsx
ln -sf '臺灣華語文能力基準語法點表_112-01-04.xlsx' grammar.xlsx
ln -sf '臺灣華語文能力基準語法點表_112-01-04.docx' grammar.docx
ln -sf '臺灣華語文能力基準基礎詞彙表_111-09-20.xlsx' glossary.xlsx
# Print SHA-256 checksums of every spreadsheet, aliases and originals alike.
sha256sum *.xlsx

5e92ac49c5bb203e16fea29c53a2b2cb790033fb332a699c7689adde21528b8f  affixes.xlsx
6329e2516c5dbe416b85f6a94d200ebe95493f24f233a63dc10d85aa257a088f  chars.xlsx
b6ce3747a06c8482ce5f4059689463de01a45d2b78707feb85917404ccffae62  glossary.xlsx
c587989cf89992d55d97a2f932289ef071648ca80339ccfaff7bb823914e5bcf  grammar.xlsx
cb16dcd262eb3e499273f972c9a3a404c40042a7def35631fc40b2a64dd50eb0  words.xlsx
b6ce3747a06c8482ce5f4059689463de01a45d2b78707feb85917404ccffae62  臺灣華語文能力基準基礎詞彙表_111-09-20.xlsx
6329e2516c5dbe416b85f6a94d200ebe95493f24f233a63dc10d85aa257a088f  臺灣華語文能力基準漢字表_111-09-20.xlsx
cb16dcd262eb3e499273f972c9a3a404c40042a7def35631fc40b2a64dd50eb0  臺灣華語文能力基準詞語表_111-11-14.xlsx
c587989cf89992d55d97a2f932289ef071648ca80339ccfaff7bb823914e5bcf  臺灣華語文能力基準語法點表_112-01-04.xlsx
5e92ac49c5bb203e16fea29c53a2b2cb790033fb332a699c7689adde21528b8f  臺灣華語文能力基準類詞綴表_111-09-20.xlsx


## Parse wordlist

In [4]:
# Load the glossary sheet and give its columns the English names used in the CSV output.
glossary_df = (
    pd.read_excel('downloads/glossary.xlsx')
    .rename(columns={
        '序號': 'ID',
        '詞語': 'Traditional',
        '注音': 'Zhuyin',
        '漢拼': 'Pinyin',
        '詞類/性質': 'POS',
        '詞彙英譯': 'Meaning',
        '語義/義項': 'Meaning2',
        '用法-常用搭配詞': 'Compounds',
        '例句': 'Examples',
        '級別': 'Level',
    })
)

# IDs must run 1..N in row order.
assert (glossary_df['ID'] - 1).tolist() == glossary_df.index.tolist()

# Keep only the level code ("3" or "3*"); anything that does not look like "第N級"
# becomes NaN and is caught by the assertion below.
glossary_df['Level'] = glossary_df['Level'].str.extract('^第([1-7][*]?)級$', expand=False)
assert not glossary_df['Level'].isnull().any()

glossary_df.to_csv('glossary.csv', index=False)
print('glossary.csv: %d rows' % len(glossary_df))

glossary.csv: 1518 rows


In [5]:
# Hand-written expansions for (headword, pinyin) pairs whose variant markup cannot be
# split mechanically: unequal numbers of spellings and readings, or optional "(...)" parts.
VARIANTS_EXC = {
    ('姊姊/姐姐/姊/姐', 'jiějie/jiě'): [['姊姊','jiějie'], ['姐姐','jiějie'], ['姊','jiě'], ['姐','jiě']],
    ('那/那裡/那裏/那兒', 'nà/nàlǐ/nàr'): [['那', 'nà'], ['那裡', 'nàlǐ'], ['那裏', 'nàlǐ'], ['那兒', 'nàr']],
    ('這/這裡/這裏/這兒', 'zhè/zhèlǐ/zhèr'): [['這', 'zhè'], ['這裡', 'zhèlǐ'], ['這裏', 'zhèlǐ'], ['這兒', 'zhèr']],
    ('手錶/手表/錶/表', 'shǒubiǎo/biǎo'): [['手錶', 'shǒubiǎo'], ['手表', 'shǒubiǎo'], ['錶', 'biǎo'], ['表', 'biǎo']],
    ('新台幣/新臺幣/台幣/臺幣', 'xīntáibì/táibì'): [['新台幣', 'xīntáibì'], ['新臺幣', 'xīntáibì'], ['台幣', 'táibì'], ['臺幣', 'táibì']],
    ('慾望/欲望/慾', 'yùwàng/yù'): [['慾望', 'yùwàng'], ['欲望', 'yùwàng'], ['慾', 'yù']],
    ('侄子/姪子/侄兒/姪兒', 'zhízi/zhír'): [['侄子', 'zhízi'], ['姪子', 'zhízi'], ['侄兒', 'zhír'], ['姪兒', 'zhír']],
    ('嘴脣/嘴唇/脣/唇', 'zuǐchún/chún'): [['嘴脣', 'zuǐchún'], ['嘴唇', 'zuǐchún'], ['脣', 'chún'], ['唇', 'chún']],
    ('沒(有)用', 'méi(yǒu)yòng'): [['沒用', 'méiyòng'], ['沒有用', 'méiyǒuyòng']],
    ('一邊(兒)', 'yìbiān(r)'): [['一邊', 'yìbiān'], ['一邊兒', 'yìbiānr']],
}

def get_variants(vocab, pinyin):
    """Split a headword / pinyin pair into its [spelling, reading] variants.

    Returns [] when neither side contains '/', '(' or ')'. Pairs listed in
    VARIANTS_EXC return their hand-written expansion. Otherwise '/' separates the
    alternatives: one spelling with several readings, several equal-length spellings
    with one reading, or equally many spellings and readings paired by position.
    Any other markup trips an assertion.
    """
    vocab = vocab.strip()
    pinyin = re.sub(' */ *', '/', pinyin.strip())

    # Reduce both sides to their structural characters only.
    vocab_marks = re.sub('[^()/]', '', vocab)
    pinyin_marks = re.sub('[^()/]', '', pinyin)
    if not vocab_marks and not pinyin_marks:
        return []

    key = (vocab, pinyin)
    if key in VARIANTS_EXC:
        return VARIANTS_EXC[key]

    if not vocab_marks:
        # One spelling, several readings.
        assert set(pinyin_marks) == {'/'}
        return [[vocab, reading.strip()] for reading in pinyin.split('/')]

    spellings = vocab.split('/')
    if not pinyin_marks:
        # Several spellings sharing one reading.
        assert set(vocab_marks) == {'/'}
        assert len({len(spelling) for spelling in spellings}) == 1, vocab  # all terms same length
        return [[spelling, pinyin] for spelling in spellings]

    # Both sides are split: they must line up one-to-one.
    assert vocab_marks == pinyin_marks and set(pinyin_marks) == {'/'}, (vocab, pinyin)
    return [
        [spelling.strip(), reading.strip()]
        for spelling, reading in zip(spellings, pinyin.split('/'))
    ]

def get_variants_str(vocab, pinyin):
    """Return get_variants(vocab, pinyin) as a JSON string, or '' when there are no variants."""
    pairs = get_variants(vocab, pinyin)
    if not pairs:
        return ''
    return json.dumps(pairs, ensure_ascii=False)

def fix_traditional(s):
    """Normalize a headword taken from the TBCL sheets.

    Removes ASCII digits and turns the fullwidth slash into '/'. The result must
    consist only of characters in U+4E00..U+9FFF plus '/', '(' and ')'.

    Raises:
        AssertionError: if other characters remain; the message carries the
            original and the cleaned string.
    """
    original = s
    # number suffixes for different pronunciations, +3 weird duplicate entries 空檔 道 來往
    s = re.sub('[0-9]', '', s)
    s = s.replace('／', '/')
    # Report only values that exist inside this function; it is called through
    # Series.map, where no per-row context is available.
    assert re.match('^[\u4E00-\u9FFF/()]+$', s), (original, s)
    return s

# Manual corrections, keyed by (space-free pinyin as found in the sheet, headword).
PINYIN_MP = {
    ('nǚér', '女兒'): "nǚ'ér",
    ('wǎnān', '晚安'): "wǎn'ān",
    ('zǎoān', '早安'): "zǎo'ān",
    ('kěài', '可愛'): "kě'ài",
    ('xiǎpéngyǒu', '小朋友'): 'xiǎopéngyǒu',
    ('dáàn', '答案'): "dá'àn",
    ('jú', '橘子'): 'júzi',
    ('pèngchù', '碰觸/觸碰'): 'pèngchù/chùpèng',
    ('wányèr', '玩意兒'): "wányìr",
    ('qīněr', '親耳'): "qīn'ěr",
    ('yāgēr', '壓根兒'): "yāgēnr",
    ('yìdiǎn/yìdiǎndiǎn/yìdiǎr', '一點/一點點/一點兒'): 'yìdiǎn/yìdiǎndiǎn/yìdiǎnr',
    ('xiáchí', '挾持'): 'xiéchí', #https://dict.revised.moe.edu.tw/dictView.jsp?ID=105985&word=%E6%8C%BE%E6%8C%81
}

def fix_pinyin(py, trad=''):
    """Clean up a pinyin cell and return it without spaces.

    After whitespace/slash cleanup, a (space-free pinyin, trad) pair found in
    PINYIN_MP returns its manual correction. Otherwise the space-separated parts
    are glued together, with an apostrophe inserted before a part that starts with
    a/e/o unless the preceding character is '/', '(' or ')'.

    Raises:
        AssertionError: on upper-case letters, a pre-existing apostrophe, or any
            character outside the accepted pinyin alphabet.
    """
    cleanups = (
        ('ɑ', 'a'),          # look-alike ɑ glyph -> plain a
        (' */ *', '/'),      # no spaces around variant slashes
        (r'\s+', ' '),       # collapse whitespace runs
        (' */$', ''),        # drop a trailing slash
        ('^/ *', ''),        # drop a leading slash
    )
    for pattern, replacement in cleanups:
        py = re.sub(pattern, replacement, py).strip()

    override_key = (py.replace(' ', ''), trad)
    if override_key in PINYIN_MP:
        return PINYIN_MP[override_key]

    # Pinyin spaces are not meaningful in TBCL lists, mostly just syllable spaces there.
    # Remove to make more mergeable with TOCFL. Also no upper letters.
    assert py == py.lower() and "'" not in py
    pieces = []
    for part in py.split():
        if pieces and pieces[-1][-1] not in '/()' and part[0] in 'aeoāáǎàēéěèōóǒò':
            pieces.append("'" + part)
        else:
            pieces.append(part)
    py = ''.join(pieces)
    assert re.match("^[a-zāáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ/()']+$", py), (py, repr(py))
    return py

def to_simplified(trad):
    """Convert Traditional text to Simplified with opencc_tw2s, then patch two characters."""
    converted = opencc_tw2s.convert(trad)
    patches = (
        ('擡', '抬'),  # opencc bug
        ('砲', '炮'),  # TGH char
    )
    for wrong, right in patches:
        converted = converted.replace(wrong, right)
    return converted


# Load the main word list and map the sheet's column headers to English names.
df = pd.read_excel('downloads/words.xlsx').rename(columns={
    '序號': 'ID',
    '詞語': 'Traditional',
    '等別': 'Grade',
    '級別': 'Level',
    '情境': 'Context',
    '書面字頻(每百萬字)': 'WritingFreq',
    '口語字頻(每百萬字)': 'SpeakingFreq',
    '簡編本系統號': 'MOE', # MOE dict IDs, https://dict.concised.moe.edu.tw/dictView.jsp?ID=.
    '參考注音': 'Zhuyin',
    '參考漢語拼音': 'Pinyin'
})

# IDs are expected to run 1..N in row order.
assert list(df.ID - 1) == list(df.index)
df = df.drop(columns=['Grade'])

# Keep only the level code ("3" or "3*") from strings such as "第3級"; values that do not
# match become NaN and fail the assertion.
df['Level'] = df.Level.str.extract('^第([1-7][*]?)級$')[0]
assert sum(df.Level.isnull()) == 0

# Join key = first character of the level (a trailing "*" is dropped) + headword.
# It is built from the raw headword: df.Traditional is only cleaned by fix_traditional()
# further down, so this statement has to stay ahead of that step.
df['glossary_key'] = (df.Level.str.slice(0, 1) + df.Traditional)
glossary_df = glossary_df.fillna('')
glossary_df['glossary_key'] = (glossary_df.Level.str.slice(0, 1) + glossary_df.Traditional)
# glossary_key -> list of glossary_df row labels that share the key.
glossary_mp = glossary_df.assign(idx=glossary_df.index).groupby('glossary_key').idx.apply(list)

# Normalize headwords, derive Simplified forms, clean the pinyin, and store the
# per-row variant list as a JSON string ('' when the row has no variants).
df['Traditional'] = df.Traditional.map(fix_traditional)
df.insert(2, 'Simplified', df.Traditional.map(to_simplified))
df['Pinyin'] = [fix_pinyin(row.Pinyin, row.Traditional) for row in df.itertuples()]
df['Variants'] = [get_variants_str(row.Traditional, row.Pinyin) for row in df.itertuples()]
# Swap single quotes for double quotes so the MOE cell can be parsed by json.loads below.
df['MOE'] = df['MOE'].str.replace("'", '"')

# Sanity check: every headword named in the MOE column must be the row's headword
# or one of its variants.
for row in df.itertuples():
    variants = [v for v,p in json.loads(row.Variants)] if row.Variants else [row.Traditional]
    for v, ids in json.loads(row.MOE):
        assert v in variants

# Join with glossary_df: for each word, copy POS / Meaning / Compounds / Examples from the
# glossary rows that share its glossary_key (level digit + raw headword).
for col in ['POS', 'Meaning', 'Compounds', 'Examples']:
    df[col] = ''
    for row in df.itertuples():
        text = [glossary_df.loc[i, col] for i in glossary_mp.get(row.glossary_key, [])]
        text = [s.strip() for s in text if s.strip()]
        # ' / ' is used as the separator below, so it must not occur inside a value.
        assert ' / ' not in ''.join(text), text
        if not text: continue
        # Keep each distinct value once, in first-seen order.
        text = ' / '.join(dict.fromkeys(text))
        if col == 'POS':
            text = text.replace(' ', '')
            text = text.replace('Phrase', 'Ph')
        elif col == 'Compounds':
            text = text.replace(';', '')
            parts = text.replace(' / ', '，').split('，')
            # Drop repeated compounds while keeping first-seen order, then re-join
            # the de-duplicated list.
            text = '，'.join(dict.fromkeys(parts))
            if text: text += '。'
        # Tidy spacing around parentheses and slashes.
        for x, y in [(' *[(] +', ' ('), (' +[)]', ')'), (' +/ +', ' / ')]:
            text = re.sub(x, y, text).strip()
        assert '\n' not in text
        df.loc[row.Index, col] = text.strip()

# Glossary keys that found no partner in the word list.
print('Unjoined glossary vocab: %s' % ' '.join(set(glossary_df.glossary_key) - set(df.glossary_key)))
df = df.drop(columns=['glossary_key'])

df.to_csv('tbcl.csv', index=False)
print('tbcl.csv: %d rows' % len(df))

Unjoined glossary vocab: 1應該/應 1臺灣/台灣 3韓國 2德國 1車/車子 1還 1美國 3亞洲 1日本 1法國 3歐洲 3月台/月臺 1中國 2罐 1英國 3義大利 2瓶子/瓶瓶 3非洲 3美洲
tbcl.csv: 14425 rows


In [6]:
# Generate version with variants expanded.

# One output row per spelling/reading variant; rows without variants pass through as-is.
expanded_rows = []
for row in df.fillna('').to_dict(orient='records'):
    row['Simplified'] = to_simplified(row['Traditional'])
    if not row['Variants']:
        expanded_rows.append(row)
        continue

    moe_entries = json.loads(row['MOE'])
    for trad, pinyin in json.loads(row['Variants']):
        # Each variant keeps only the MOE entries filed under its own spelling.
        own_moe = [entry for entry in moe_entries if entry[0] == trad]
        expanded_rows.append({
            **row,
            'Traditional': trad,
            'Simplified': to_simplified(trad),
            'Pinyin': pinyin,
            'MOE': json.dumps(own_moe, ensure_ascii=False),
        })

expanded_df = pd.DataFrame(expanded_rows).drop(columns=['Variants'])
expanded_df.to_csv('tbcl-expanded.csv', index=False)
print('tbcl-expanded.csv: %d entries' % len(expanded_df))

tbcl-expanded.csv: 14868 entries


In [7]:
EAC1_EX  = '\uEAC1\uEC00\uEC05\uECAA\uECFF'  # examples, #05aaff blue
EAC1_HL  = '\uEAC1\uEC00\uEC00\uECCC\uECCC'  # term highlight in examples, teal

tbcl_indexed_df = pd.read_csv('tbcl.csv', dtype='str').fillna('').set_index('ID')

with open('tbcl-pleco.txt', 'w') as fout:
    last_level = ''
    for row in pd.read_csv('tbcl-expanded.csv', dtype='str').fillna('').itertuples():
        level = row.Level
        if last_level != level:
            last_level = level
            fout.write(f'//TBCL/L{level}\n')
        text = f'{row.Simplified}[{row.Traditional}]\t{row.Pinyin}\t'
        trow = tbcl_indexed_df.loc[row.ID]
        if trow.Variants:
            text += f'{trow.Traditional} [{trow.Pinyin}] '
        if trow.POS:
            text += f'({trow.POS}) '
        if trow.Meaning:
            text += f'{trow.Meaning} '
        text += f'[TBCL{level}]'
        # Compounds and examples in light blue on separate lines
        for ex in [row.Compounds, row.Examples]:
            if not ex: continue
            text += (
                f'\uEAB1{EAC1_EX}' +
                ex.replace(' / ', '\uEAB1').replace(
                    row.Traditional,
                    f'\uEAC2{EAC1_HL}{row.Traditional}\uEAC2{EAC1_EX}'
                ) +
                '\uEAC2'
            )
        fout.write(text + '\n')

!ls -l tbcl-pleco.txt

-rw-r--r-- 1 jovyan users 852010 Nov  4 14:58 tbcl-pleco.txt


## Convert other files

In [8]:
# Load the character list and give its columns English names.
df = (
    pd.read_excel('downloads/chars.xlsx')
    .rename(columns={
        '序號': 'ID',
        '漢字': 'Traditional',
        '等別': 'Grade',
        '級別': 'Level',
        '情境': 'Context',
        '書面字頻（每百萬字）': 'WritingFreq',
        '口語字頻（每百萬字）': 'SpeakingFreq',
    })
)

# IDs must run 1..N in row order.
assert (df['ID'] - 1).tolist() == df.index.tolist()
df = df.drop(columns='Grade')

# Keep only the level code; values that are not "第N級" / "第N*級" become NaN and fail below.
df['Level'] = df['Level'].str.extract('^第([1-7][*]?)級$', expand=False)
assert not df['Level'].isnull().any()

df['Traditional'] = df['Traditional'].map(fix_traditional)
df.to_csv('chars.csv', index=False)
print('chars.csv: %d rows' % len(df))

chars.csv: 3100 rows


In [9]:
# One row per individual character: entries such as "A/B" are split on '/', and each
# piece is stored in a new leading 'char' column next to the unchanged source row.
rows = [
    {**record, 'char': ch}
    for record in df.to_dict(orient='records')
    for ch in record['Traditional'].split('/')
]

column_order = ['char'] + list(df.columns)
expanded_df = pd.DataFrame(rows)[column_order]
expanded_df.to_csv('chars-expanded.csv', index=False)
print('chars-expanded.csv: %d rows' % len(expanded_df))

chars-expanded.csv: 3133 rows


In [10]:
# Load the grammar-point list and give its columns English names.
df = (
    pd.read_excel('downloads/grammar.xlsx')
    .rename(columns={
        '序號': 'ID',
        '語法點': 'Grammar',
        '等別': 'Grade',
        '級別': 'Level',
        '例句': 'Example',
    })
)

# IDs must run 1..N in row order.
assert (df['ID'] - 1).tolist() == df.index.tolist()
df = df.drop(columns='Grade')

# Keep only the level code; values that are not "第N級" / "第N*級" become NaN and fail below.
df['Level'] = df['Level'].str.extract('^第([1-7][*]?)級$', expand=False)
assert not df['Level'].isnull().any()

df.to_csv('grammar.csv', index=False)
print('grammar.csv: %d rows' % len(df))

grammar.csv: 496 rows


In [11]:
# Load the affix list and give its columns English names.
df = (
    pd.read_excel('downloads/affixes.xlsx')
    .rename(columns={
        '序號': 'ID',
        '類詞綴': 'Affix',
        '語法點': 'Grammar',
        '級別': 'Level',
        '說明': 'Explanation',
        '相關詞彙': 'Words',
    })
)
# IDs must run 1..N in row order.
assert (df['ID'] - 1).tolist() == df.index.tolist()

# Keep only the level code; values that are not "第N級" / "第N*級" become NaN and fail below.
df['Level'] = df['Level'].str.extract('^第([1-7][*]?)級$', expand=False)
assert not df['Level'].isnull().any()

df.to_csv('affixes.csv', index=False)
print('affixes.csv: %d rows' % len(df))

affixes.csv: 73 rows


## Readings check

In [12]:
# Reload the exported word list with every column read as a string and missing cells as ''.
df = pd.read_csv('tbcl.csv', dtype='str').fillna('')

In [13]:
# Check readings: print every entry whose pinyin cannot be composed from the per-character
# readings in ../cedict/syllables.csv. The whole check is skipped when that file is absent.
if os.path.exists('../cedict/syllables.csv'):
    # character -> set of readings; two extra readings for 一 are seeded by hand.
    readings_mp = {'一': set(['yì','yí'])}
    syll_df = pd.read_csv('../cedict/syllables.csv', dtype='str').fillna('')
    for row in syll_df.itertuples():
        readings_mp.setdefault(row.Traditional, set()).add(row.Pinyin.lower())
        readings_mp.setdefault(row.Simplified, set()).add(row.Pinyin.lower())
    # Trim the readings and drop empty ones.
    readings_mp = {x: set([y.strip().lower() for y in readings_mp[x] if y.strip()]) for x in readings_mp}

    def gen_readings(trad):
        """Yield every pinyin string obtainable by picking one known reading per character.

        Characters without an entry in readings_mp, or with a code point below 0x3E00,
        are skipped. An apostrophe is inserted before a continuation that starts with
        a, e or o.
        """
        if trad == '':
            yield ''
        elif trad[0] not in readings_mp or ord(trad[0]) < 0x3E00:
            yield from gen_readings(trad[1:])
        else:
            for x in readings_mp[trad[0]]:
                for y in gen_readings(trad[1:]):
                    yield x.lower() + ("'" if y and y[0] in 'aāáǎàeēéěèoōóǒò' else '') + y

    for row in df.itertuples():
        variants = json.loads(row.Variants) if row.Variants else [[row.Traditional, row.Pinyin]]
        for trad, pinyin in variants:
            readings = list(gen_readings(trad))
            # Compare on space-free pinyin, as the CEDICT merge cell does.
            if pinyin.replace(' ', '') not in readings:
                # Show at most ten candidate readings to keep the output readable.
                print(list(row._asdict().values())[1:9], ':\t', trad, pinyin, 'vs.', readings[:10])

['1533', '噢', '噢', '4', '核心詞', '80', '19', '[["噢", ["44379"]]]'] :	 噢 yǔ vs. ['ō']
['1572', '阿嬤/阿媽', '阿嬷/阿妈', '4', '1.個人資料', '41', '301', '[]'] :	 阿嬤 āma vs. ['āmo', 'āmā', 'āmó', 'àmo', 'àmā', 'àmó', 'ēmo', 'ēmā', 'ēmó']
['2889', '尺寸', '尺寸', '5', '9.購物、商店', '13', '13', '[["尺寸", ["31455"]]]'] :	 尺寸 chícun vs. ['chǐcun', 'chǐcùn', 'chěcun', 'chěcùn']
['3331', '古跡/古蹟', '古迹/古迹', '5', '5.交通、旅遊', '7', '26', '[]'] :	 古蹟 gǔjī vs. ['gǔjì']
['4063', '奇蹟', '奇迹', '5', '核心詞', '31', '24', '[["奇蹟", ["24507"]]]'] :	 奇蹟 qíjī vs. ['qíjì', 'jījì']
['5329', '磅', '磅', '6', '', '12', '3', '[["磅", ["2839", "1146"]]]'] :	 磅 pāng vs. ['páng', 'bàng']
['7155', '摟', '搂', '6', '', '9', '1', '[["摟", ["13652", "13656", "13677"]]]'] :	 摟 lóu vs. ['lǒu', 'lōu', 'lou']
['7486', '鋪/舖', '铺/舖', '6', '', '32', '26', '[["鋪", ["3515", "3440"]]]'] :	 舖 pū vs. ['pù']
['7542', '齊', '齐', '6', '', '33', '22', '[["齊", ["21691", "24545", "36690"]]]'] :	 齊 zī vs. ['qí']
['7946', '事蹟', '事迹', '6', '', '15', '5', '[["事蹟", ["33653"]]]

## Merge with CEDICT and generate anki deck

In [14]:
# Attach CC-CEDICT definitions to every TBCL row and record a 'Flag' for suspicious matches.
df = pd.read_csv('tbcl.csv', dtype='str').fillna('')
cedict_df = pd.read_csv('../cedict/cedict.csv')
# Traditional headword -> list of cedict_df row labels with that headword.
cedict_idx_mp = cedict_df.assign(idx=cedict_df.index).groupby('Traditional').idx.apply(list)

rows = []

for row in df.fillna('').to_dict(orient='records'):
    # py: every pinyin reading of this entry; variants: (traditional, pinyin, simplified) triples.
    py = set([row['Pinyin']])
    variants = json.loads(row['Variants']) if row['Variants'] else [[row['Traditional'], row['Pinyin']]]
    if len(variants) == 1:
        variants[0].append(row['Simplified'])
    else:
        variants = [(t,p,to_simplified(t)) for t,p in variants]
        for var in variants:
            py.add(var[1])

    # Look up the headword as written; only when that finds nothing, try each variant spelling.
    matches = cedict_idx_mp.get(row['Traditional'], [])
    if len(matches) == 0:
        for v in variants:
            matches.extend(cedict_idx_mp.get(v[0], []))

    flag = ''
    if len(matches) == 0:
        # NOTE(review): Level is a string, so this is a lexicographic comparison; a starred
        # level such as '5*' compares greater than '5' and is not flagged — confirm intended.
        if row['MOE'] != '[]' and row['Level'] <= '5':
            flag = 'missing'
        row['CEDICT'] = ''
    else:
        # NOTE(review): this loop currently has no effect — its only statement is commented out.
        ce_py = set([cedict_df.Pinyin[i].lower() for i in matches])
        for v in variants:
            if re.sub(' ', '', v[1]) not in ce_py:
                #flag += ' py%s/%s' % (ce_py, v[1])
                break

        # Flag the first variant that exists in CEDICT but whose Simplified form differs
        # from every matched CEDICT entry.
        ce_s = set([cedict_df.Simplified[i] for i in matches])
        for trad,p,simp in variants:
            if trad in cedict_idx_mp and simp not in ce_s:
                flag += ' simp%s/%s' % (ce_s, simp)
                break

        # Prioritize pronunciation matches
        # TODO extract to cedict-lib + taiwan pr.
        if len(matches) > 1:
            m = [i for i in matches if cedict_df.Pinyin[i].lower() in py]
            if len(m) > 0:
                matches = m + [i for i in matches if i not in m]

        defs = []
        for i in matches:
            # py1: the entry's single reading, or '' when there are several.
            py1 = list(py)[0] if len(py) == 1 else ''
            defn = cedict_df.Definitions[i]
            if row['Variants']:
                # Multi-variant entries: always show which CEDICT headword/reading a definition belongs to.
                defn = '%s [%s] %s' % (cedict_df.Traditional[i], cedict_df.Pinyin[i], defn)
            elif cedict_df.Pinyin[i].lower() != py1:
                # Single entries: show the CEDICT reading only when it differs from the TBCL one.
                defn = '[%s] %s' % (cedict_df.Pinyin[i], defn)
            # Strip a trailing "CL:個|个[ge4]" classifier note.
            defn = re.sub(r'/CL:個\|个\[ge4\]$', '', defn)
            defs.append(defn)

        row['CEDICT'] = '<br> '.join(defs)

    row['Flag'] = flag
    rows.append(row)

merged_df = pd.DataFrame(rows).drop(columns=['Context', 'Zhuyin'])

*Taiwan TBCL wordlist (Traditional)*

TBCL (Taiwan Benchmarks for the Chinese Language) wordlist: 14425 words over 7 levels. Parsed from the official Excel sheets on the [TBCL](https://coct.naer.edu.tw/TBCL/) website, including the definitions and examples they provide for about 1500 lower-level words. CC-CEDICT definitions for the rest.

In [15]:
import genanki, shutil

df = merged_df.copy()
df['Audio'] = ''

!mkdir -p data/media
!cp -f ../downloads/fonts/MoeStandardKai.ttf data/media/_MoeStandardKai.ttf

cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'Level', 'WritingFreq', 'SpeakingFreq', 'MOE',
        'POS', 'Meaning', 'Compounds', 'Examples', 'CEDICT', 'Variants', 'Audio']

model = genanki.Model(
    1698579990,
    'TBCL',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'TBCL',
        'qfmt': open('../dangdai/dangdai-qfmt.html').read().replace('{{ID}}', 'TBCL L{{Level}}'),
        # TODO fix template
        'afmt': '''{{FrontSide}}
<hr id=answer>
<div lang="en"><span id="ddzw-pinyin">{{Pinyin}}</span></div><br>
<div lang="en">{{#POS}}({{POS}}) {{/POS}}{{Meaning}}</div><br>
<div lang="en">{{CEDICT}}</div><br>
<div>{{Compounds}}<br>{{Examples}}</div><br>
<div>{{Audio}}</div>
''' + re.sub('^.*<script>', '<script>', open('../dangdai/dangdai-afmt.html').read()).replace(
            'if (pinyinEl && hanziEl)',
            'if (pinyinEl && hanziEl {{#Variants}}&& false{{/Variants}})'),
    }],
    css=open('../dangdai/dangdai.css').read(),
)

deck = genanki.Deck(1698579991, name='tbcl')

for row in df.reset_index().to_dict(orient='records'):
    note = genanki.Note(
        model=model,
        fields=[row[c] for c in cols],
        guid=genanki.guid_for('tbcl', row['ID']),
        tags=['L%s' % row['Level'][0]],
    )
    deck.add_note(note)

!rm -f tbcl.apkg
genanki.Package(deck, media_files=glob.glob('data/media/*')).write_to_file('tbcl.apkg')
!ls -l tbcl.apkg

-rw-r--r-- 1 jovyan users 17426742 Nov  4 14:58 tbcl.apkg
