# Modern Chinese (時代華語) vocabulary deck

Anki flashcard deck for the vocabulary from *Modern Chinese (時代華語)*, a recent (2019-2021) Mandarin textbook series by the Tamkang University Chinese Language Center et al. in Taiwan. It's a popular textbook in many language schools in Taiwan and one of TOCFL's recommended textbooks. The textbook uses Traditional Chinese characters and is biased towards Taiwanese Mandarin.

Terms are from .pptx slides from the book's website: [https://sites.google.com/clc.tku.edu.tw/modernchinese-official/](https://sites.google.com/clc.tku.edu.tw/modernchinese-official/)

In total the series now has 7 books with around 8000 terms. However, slides are currently available only for the first four books, so that is all this deck covers. The deck differs from the books in places because of errors/omissions in the slides.

`ID`: a unique sortable term key, e.g. `B1L16-3-01` = book 1, lesson 16, vocab part 3, term 1. You can use it to filter terms down to the particular lessons you need. Terms inside a section are numbered simply in slide order, so the numbers usually differ from the books', doubly so because the books skip numbering some terms.

`POS`: part of speech.

`Audio`: neural-TTS-generated audio using a Taiwanese voice configuration; it might still mispronounce some terms when a hanzi has multiple readings. The book's website also provides [recorded audio](https://sites.google.com/clc.tku.edu.tw/modernchinese-official/%E9%9F%B3%E6%AA%94), but only for whole vocab sections rather than split up by term, so it wouldn't be easy to add it to the deck.

TODO: perhaps transcribe with whisper and use some anki subtitles/srt tools to load them.

In [1]:
!pip install -q python-pptx genanki opencc

import glob
import os, os.path
import re
import pandas as pd
import genanki
import json
from pptx import Presentation
from opencc import OpenCC

# Converter used further down to derive each term's `Simplified` field from
# its `Traditional` field (OpenCC 'tw2s' configuration).
opencc_tw2s = OpenCC('tw2s')

# Media files for the deck, point to Anki's collection.media dir
MEDIA_DIR = '/home/ivan/zhongwen/Anki2/ivan/collection.media'
# Path template of a term's TTS clip; the `%s` slot is filled with the term ID.
TTS_MP3_PATTERN = f'{MEDIA_DIR}/modernchinese-tts-%s.mp3'

# Modify to point to your downloaded copy of the slides if you need to rerun slides parser.
# Maps a lesson key (the directory name, e.g. 'B1L01') to the path of the one
# .pptx in that directory that is parsed for vocabulary.
PPTX_PATHS = {}
# .pptx files whose path matches any of these alternatives are skipped
# (presumably a dated revision, reading passages and worksheets -- verify
# against the downloaded files). The dot is escaped so only a literal
# "_0617.pptx" is excluded.
EXCLUDED_PPTX_RE = re.compile(r'_0617\.pptx|短文速讀|學習單1|學習單2附件')
for d in sorted(glob.glob('/home/ivan/zhongwen/data/modernchinese/B?L??')):
    pp = [s for s in glob.glob(d + '/*.pptx') if not EXCLUDED_PPTX_RE.search(s)]
    # Exactly one deck must remain per lesson; otherwise the exclusion list
    # above needs updating for the offending directory.
    assert len(pp) == 1, f'expected exactly one vocabulary .pptx in {d}, found {pp}'
    PPTX_PATHS[os.path.basename(d)] = pp[0]

# Download ex.
# url="https://sites.google.com/clc.tku.edu.tw/modernchinese-official/%E7%AC%AC%E4%B8%80%E5%86%8A/b1-l1"
# for driveid in $(curl "$url" |
#                  egrep -o '<iframe[^>]*drive.google.com[^>]*preview[^>]*>' |
#                  sed -Ee 's|.*https://drive.google.com/file/d/([^/]+)/preview.*|\1|'); do
#   rclone backend copyid drive: "$driveid" ./
# done
#
# rclone via docker/podman:
# podman run -v ~/.config/rclone:/config/rclone:rw -v "$PWD:/pwd:rw" docker.io/rclone/rclone backend copyid drive: "$driveid" /pwd/

In [2]:
# Explore slides layout

# Debug switch: change the 0 to 1 to print, per book, how often each slide
# layout name occurs across the lesson decks.
if 0:
    rows = []
    for book in ['B1', 'B2', 'B3', 'B4']:
        book_lessons = [(name, path) for name, path in PPTX_PATHS.items() if name.startswith(book)]
        for lesson, filepath in book_lessons:
            for page_no, slide in enumerate(Presentation(filepath).slides, start=1):
                rows.append({
                    'Book': book,
                    'Lesson': lesson,
                    'Page': page_no,
                    'Layout': slide.slide_layout.name,
                })
    df = pd.DataFrame(rows)
    print(df.groupby('Book').Layout.value_counts())

def explore(lesson, page=None):
    """Dump the non-empty text paragraphs of a lesson deck for inspection.

    Args:
        lesson: Key into ``PPTX_PATHS`` selecting the deck to open.
        page: 1-based slide number to restrict the dump to; any falsy value
            (the default ``None``) dumps every slide.

    Returns:
        A list with one dict per slide that has text, holding ``'page'``
        (1-based), ``'layout'`` (the slide layout name) and ``'paragraphs'``,
        a list of ``(text, placeholder idx or None)`` tuples.
    """
    presentation = Presentation(PPTX_PATHS[lesson])
    found = []
    for page_no, slide in enumerate(presentation.slides, start=1):
        if page and page_no != page:
            continue
        paragraphs = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                text = ''.join(run.text for run in para.runs).strip()
                if text:
                    # Text outside a placeholder is reported with idx None.
                    pidx = shape.placeholder_format.idx if shape.is_placeholder else None
                    paragraphs.append((text, pidx))
        if paragraphs:
            found.append({
                'page': page_no,
                'layout': slide.slide_layout.name,
                'paragraphs': paragraphs,
            })

    return found

#explore('B4L02', 44)

In [3]:
# Extract text from slides

# (book or lesson, slide layout name, placeholder idx) -> field name.
# Declared once per (scope, layout) and flattened below. A scope is either a
# whole book ('B1') or a single lesson ('B1L00'); idx None is the key used for
# text found in shapes that are not placeholders.
_PLACEHOLDER_FIELDS = {
    ('B1L00', ' 生詞'): {10: 'Traditional', 11: 'Pinyin', 12: 'POS', 13: 'Meaning'},
    ('B1', ' 生詞'): {
        1: 'Traditional', 2: 'Pinyin', 3: 'Example', 4: 'ExampleP', 5: 'POS', 6: 'Meaning',
        10: 'Traditional', 11: 'Pinyin', 12: 'Example', 13: 'ExampleP', 14: 'POS', 15: 'Meaning',
        None: 'Example2',
    },
    ('B2', '自訂版面配置'): {1: 'Traditional', 2: 'Meaning', 3: 'Pinyin'},
    ('B2', '生詞'): {
        1: 'Example', 2: 'Traditional', 3: 'Meaning', 4: 'Pinyin',
        10: 'Example', 13: 'Traditional', 15: 'Meaning', 17: 'Pinyin',
    },
    ('B2', '生詞_1'): {10: 'Example', 13: 'Traditional', 15: 'Meaning', 17: 'Pinyin'},
    ('B2', '1_生詞'): {13: 'Traditional', 15: 'Meaning', 17: 'Pinyin'},
    ('B2', '2_生詞_1'): {13: 'Traditional', 15: 'Meaning', 17: 'Pinyin'},
    ('B2', '1_生詞_1'): {13: 'Traditional', 15: 'Meaning', 17: 'Pinyin'},
    ('B3', '標題及內容'): {None: 'Freetext'},
    ('B3', 'OBJECT'): {None: 'Freetext'},
    ('B3', '標題投影片'): {None: 'TitleSlide'},
    ('B3', '4_標題投影片'): {None: 'TitleSlide'},
    ('B3', 'TITLE'): {None: 'TitleSlide'},
    ('B4', '生詞_有例句'): {
        1: 'Example', 2: 'Traditional', 3: 'Meaning', 4: 'Pinyin',
        10: 'Example', 13: 'Traditional', 15: 'Meaning', 17: 'Pinyin',
    },
    ('B4', '生詞_無例句'): {
        1: 'Traditional', 2: 'Meaning', 3: 'Pinyin',
        13: 'Traditional', 15: 'Meaning', 17: 'Pinyin',
    },
    ('B4', '短語'): {
        1: 'Traditional', 2: 'Meaning', 3: 'Pinyin', 4: 'Example',
        13: 'Traditional', 15: 'Meaning', 17: 'Pinyin',
    },
}

PLACEHOLDER_MAP = {
    (scope, layout, idx): field
    for (scope, layout), fields in _PLACEHOLDER_FIELDS.items()
    for idx, field in fields.items()
}

entries = []

# Walk every slide of every lesson deck and collect the text of the mapped
# placeholders into one dict ("row") per slide.
for lesson, filepath in PPTX_PATHS.items():
    book = lesson[:2]

    for page_no, slide in enumerate(Presentation(filepath).slides, start=1):
        layout = slide.slide_layout.name
        row = {'Book': book, 'Lesson': lesson, 'Page': page_no, 'Layout': layout}

        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue

            for paragraph in shape.text_frame.paragraphs:
                text = ''.join(run.text for run in paragraph.runs).strip()
                assert '\n' not in text
                if not text:
                    continue

                pidx = shape.placeholder_format.idx if shape.is_placeholder else None

                # A lesson-specific mapping wins over the book-wide one.
                field = (PLACEHOLDER_MAP.get((lesson, layout, pidx))
                         or PLACEHOLDER_MAP.get((book, layout, pidx)))
                # Unmapped text is ignored; 'ExampleP' is mapped but never stored.
                if not field or field == 'ExampleP':
                    continue

                if field not in row:
                    row[field] = text
                    continue
                # Further paragraphs of the same field are joined with '<br>',
                # so the text itself must not contain markup already.
                assert '<' not in text
                row[field] = row[field] + '<br>' + text

        # Number of fields collected on top of the four bookkeeping keys.
        extra = len(row) - 4
        if extra <= 0:
            continue
        # A slide whose only content is a very short free-text blob is skipped.
        if extra == 1 and 'Freetext' in row and len(row['Freetext']) < 10:
            continue
        entries.append(row)

In [4]:
# Detects spans of consecutive pages with defs and assign IDs.

# A span is a run of rows on consecutive pages of one lesson, stored as
# [lesson, first page, last page, rows]. `spans` maps lesson -> list of spans.
span = None
spans = {}

for i, row in enumerate(entries):
    # Extend the open span while this row is the very next page of the same lesson.
    if span and row['Lesson'] == span[0] and row['Page'] == entries[i-1]['Page']+1:
        span[2] = row['Page']
        span[3].append(row)
        continue
    if span:
        spans.setdefault(span[0], []).append(span)
    span = [row['Lesson'], row['Page'], row['Page'], [row]]
# Flush the last open span. With no entries at all there is nothing to flush
# (previously this line raised TypeError on `None[0]`).
if span:
    spans.setdefault(span[0], []).append(span)

# Lesson + headword combinations that occur on two successive rows; the second
# row only contributes another example and is then dropped.
merged_terms = {'B1L12事(情)', 'B1L15代表', 'B1L13壞', 'B1L12工作'}

for lesson in spans:
    #print(lesson, [s[1:3] for s in spans[lesson]])
    # No lesson is expected to have more than three vocabulary spans.
    assert len(spans[lesson]) <= 3
    for span_i, span in enumerate(spans[lesson]):
        title = None
        k = 0  # running term number inside this span
        prev_row = None
        for row in span[3]:
            # A title slide only labels the rows that follow it in the span.
            if 'TitleSlide' in row:
                title = row['TitleSlide']
                continue
            if title:
                row['Title'] = title
            row['Span'] = span_i + 1

            is_repeat = (
                lesson + row.get('Traditional', '') in merged_terms
                and prev_row
                and prev_row.get('Traditional') == row['Traditional']
            )
            if is_repeat:
                prev_row['Example'] = prev_row.get('Example', '') + '<br>' + row.get('Example', '')
                row['Drop'] = 1
                continue
            prev_row = row

            k += 1
            # e.g. 'B1L16-3-01': lesson, span number, zero-padded term number.
            row['ID'] = '%s-%d-%02d' % (row['Lesson'], row['Span'], k)

# Merged duplicates and title slides do not become flashcards.
entries = [row for row in entries if 'Drop' not in row]
entries = [row for row in entries if 'TitleSlide' not in row]

In [5]:
# Parse free text on B3 slides which don't use placeholders

def cn_score(text):
    """Score how "Chinese" a string is, from 0 to 1.

    Tabs, digits, spaces, brackets and a few punctuation marks are removed
    first. The score is the share of the remaining characters whose code
    point is at least U+4E00. The ints 0 and 1 are returned for "none" and
    "all" (and 0 for an empty remainder); anything in between is a float.
    """
    stripped = re.sub('([\t0-9 （）(  ) …、/‧-]|-$)', '', text)
    total = len(stripped)
    high = len([ch for ch in stripped if ord(ch) >= 0x4E00])
    if total == 0 or high == 0:
        return 0
    if high == total:
        return 1
    return high / total

# Part-of-speech labels accepted verbatim when they appear as their own
# paragraph in the free text of a slide.
ACC_POS = {
    'Adv', 'Adv/ Vs-attr', 'Adv/Vs', 'adv',
    'Conj', 'Conj/Prep',
    'Det',
    'M', 'M/N',
    'N', 'N/V', 'N/Vi', 'N/Vs',
    'Phrases',
    'Prep',
    'Ptc',
    'V', 'V-sep', 'V/N', 'V/Vi', 'V/Vs',
    'Vaux',
    'Vi', 'Vi/N', 'Vi/Vs',
    'Vp', 'Vp-sep', 'Vp/N',
    'Vpt', 'Vpt/N',
    'Vs', 'Vs-attr', 'Vs-attr/Adv', 'Vs-attr/Vi', 'Vs-pred', 'Vs-sep',
    'Vs/Adv', 'Vs/N', 'Vs/Vst',
    'Vst',
}

ss = []

# Split the free text of each book 3 slide into Traditional / Pinyin / POS /
# Meaning (and Example). The working list `text` is rearranged step by step
# into the order [hanzi, pinyin, POS, meaning...]; several steps handle
# slides whose paragraphs come in a different order or are split up.
for row in entries:
    if row['Book'] != 'B3': continue
    # One list item per slide paragraph (paragraphs were joined with '<br>').
    text = row['Freetext'].split('<br>')

    sc = [cn_score(s) for s in text]
    if sc[:3] == [1, 1, 0]:
        # Headword spread over two all-Chinese paragraphs: glue them together.
        text = [text[0] + text[1]] + text[2:]
    elif sc[:2] == [0, 0]:
        # The slide does not start with the headword. Look the headword up by
        # the first paragraph (hard-coded per affected slide), then move it
        # to the front of the list.
        # NOTE(review): 'yǐlái' -> '老家' looks mismatched (老家 is normally read
        # lǎojiā); it may deliberately mirror an error in the source slide,
        # since the value must occur in the slide text -- confirm before changing.
        hanzi = {'guāi': '乖', 'dǎo': '倒', 'diàochá': '調查', 'shìyě': '視野', 'shǔ': '數',
                 'xiōngdì': '兄弟', 'wúguān': '無關', 'jiàoxué': '教學', 'bìng': '病',
                 'wǎngyǒu': '網友', 'kòng': '空', 'kōng': '空', 'ài': '愛', 'tiānzhēn': '天真',
                 'yǐlái': '老家', 'réngrán': '仍然'}[text[0]]
        i = text.index(hanzi)
        text = [hanzi] + text[:i] + text[(i+1):]
    sc = [cn_score(s) for s in text]
    # Expected by now: an all-Chinese headword followed by two paragraphs that
    # score 0 (no Chinese at all).
    assert sc[:3] == [1, 0, 0]

    # The POS is in slot 3 instead of slot 2: the pinyin was split over two
    # paragraphs, so join slots 1 and 2 with a space.
    if len(text) >= 4 and text[2] not in ACC_POS and text[3] in ACC_POS:
        text = text[:1] + [text[1] + ' ' + text[2]] + text[3:]
    # Slot 2 is not a POS, but exactly one paragraph elsewhere is: reorder.
    if text[2] not in ACC_POS and sum(s in ACC_POS for s in text) == 1:
        i = [int(s in ACC_POS) for s in text].index(1)
        assert i != 0
        # A tone-marked vowel is used to recognise the paragraph holding the pinyin.
        if i == 1 and any(c in 'āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ' for c in text[-1]):
            # POS sits in the pinyin slot and the pinyin comes last: move the
            # last paragraph into slot 1.
            text = [text[0], text[-1]] + text[1:-1]
        elif i == 1 and any(c in 'āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ' for c in text[3]):
            # Same situation, but the pinyin is the fourth paragraph.
            text = [text[0], text[3], text[1], text[2]] + text[4:]
        else:
            assert i !=1
            # The POS appears after slot 2: pull it forward into slot 2.
            text = text[:2] + [text[i]] + text[2:i] + text[(i+1):]

    # Two specific strings that the slides split into separate paragraphs:
    # rejoin each pair with a space.
    if text[2] not in ACC_POS:
        for x,y in [('Wángmǔ', 'Niángniang'), ('shèqún', 'wǎngzhàn')]:
            if x in text:
                i = text.index(x)
                assert i+1 == text.index(y)
                text = text[:i] + [text[i] + ' ' + text[i+1]] + text[(i+2):]

    # Still no recognised POS: insert an empty one so the meaning starts at slot 3.
    if text[2] not in ACC_POS:
        text = text[:2] + [''] + text[2:]

    # Paragraphs after the POS that are mostly Chinese (score > 0.5) are
    # treated as the example; the first three slots are never considered.
    sc = [0,0,0] + [int(cn_score(s) > 0.5) for s in text[3:]]
    if 1 in sc:
        i = sc.index(1)
        if i == 3:
            # The example comes right after the POS; the last paragraph must
            # then be non-Chinese and is kept as the meaning.
            assert sc[-1] == 0
            row['Example'] = ' '.join(text[i:-1])
            text = text[:i] + [text[-1]]
        else:
            # Everything from the first mostly-Chinese paragraph on is the example.
            row['Example'] = ' '.join(text[i:])
            text = text[:i]

    row['Traditional'] = text[0]
    row['Pinyin'] = text[1]
    row['POS'] = text[2]
    row['Meaning'] = ' '.join(text[3:]).strip()

In [6]:
# Extract and normalize POS

# Prefix found at the start of a Meaning text -> normalized part of speech.
# The loop that consumes this table tries every key in insertion order with
# `startswith` and strips each match, so the order of the entries matters.
# NOTE(review): '(N, V)' is listed three times and '(Vs-attr/Vp)' twice. If
# the keys are truly identical the later entries are redundant, but they may
# differ by invisible characters (e.g. a non-breaking space) -- verify the raw
# bytes before removing any of them.
POS_MAP = {
  '(Prep )': 'Prep',
  '(N, V)': 'N/V',
  '(Vi, N)': 'Vi/N',
  '(N, Vs)': 'N/Vs',
  '(N, M)': 'N/M',
  '(N, V)': 'N/V',
  '(N, V)': 'N/V',
  '(Adv)': 'Adv',
  '(Adv.)': 'Adv',
  '(Adv.,Vs)': 'Adv/Vs',
  '(Adv./Vs)': 'Adv/Vs',
  '(Conj)': 'Conj',
  '(Conj.)': 'Conj',
  '(Det)': 'Det',
  '(M)': 'M',
  '(M,N)': 'M/N',
  '(M/N)': 'M/N',
  '(M/V)': 'M/V',
  '(N)': 'N',
  '(N,V)': 'N/V',
  '(N,Vs-attr)': 'N/Vs-attr',
  '(N/M)': 'N/M',
  '(N/V)': 'N/V',
  '(N/V-sep)': 'N/V-sep',
  '(N/Vi)': 'N/Vi',
  '(N/Vp-sep)': 'N/Vp-sep',
  '(N/Vs)': 'N/Vs',
  '(N/Vst)': 'N/Vst',
  '(N/Vst/V)': 'N/Vst/V',
  '(Phrase)': 'Phrase',
  '(Prep)': 'Prep',
  '(Prep/V/Vst)': 'Prep/V/Vst',
  '(Ptc)': 'Ptc',
  '(V)': 'V',
  '(V-sep)': 'V-sep',
  '(V-sep/N)': 'V-sep/N',
  '(V/Adv.)': 'V/Adv',
  '(V/N)': 'V/N',
  '(V/Vs)': 'V/Vs',
  '(Vaux)': 'Vaux',
  '(Vaux/V)': 'Vaux/V',
  '(Vi)': 'Vi',
  '(Vi,N)': 'Vi/N',
  '(Vi/Adv)': 'Vi/Adv',
  '(Vi/N)': 'Vi/N',
  '(Vi/V-sep)': 'Vi/V-sep',
  '(Vp)': 'Vp',
  '(Vp-sep)': 'Vp-sep',
  '(Vp/Vpt)': 'Vp/Vpt',
  '(Vp/Vs)': 'Vp/Vs',
  '(Vpt)': 'Vpt',
  '(Vpt/N)': 'Vpt/N',
  '(Vpt/Vp)': 'Vpt/Vp',
  '(Vs)': 'Vs',
  '(Vs-attr / Vi)': 'Vs-attr/Vi',
  '(Vs-attr/Vp)': 'Vs-attr/Vp',
  '(Vs-attr)': 'Vs-attr',
  '(Vs-attr/Adv.)': 'Vs-attr/Adv',
  '(Vs-attr/Vp)': 'Vs-attr/Vp',
  '(Vs-pred)': 'Vs-pred',
  '(Vs-sep)': 'Vs-sep',
  '(Vs/V)': 'Vs/V',
  '(Vs/Vst)': 'Vs/Vst',
  '(Vst)': 'Vst',
  '(Vst/N)': 'Vst/N',
  '(Vst/Prep)': 'Vst/Prep',
  '(Vst/Vs/Adv./Vi)': 'Vst/Vs/Adv/Vi',
  # Opening bracket below is the full-width '（'.
  '（Det)': 'Det',
  '(Vs-attr, N)': 'Vs-attr/N',

  # Unterminated markers, kept last so the complete forms above are tried first.
  # NOTE(review): '(Vs-attr' is tried before '(Vs-attr,' and strips the shared
  # prefix, so the '(Vs-attr,' entry looks unreachable -- confirm.
  '(Vs-attr': 'Vs-attr',
  '(Vs-attr,': 'Vs-attr,',
}
# Final spelling fixes applied to every POS value after extraction (exact match only).
POS_REMAP = {
  'Names': 'Name',
  'Phrase': 'Ph',
  'phrase': 'Ph',
  'Phrases': 'Ph',
  'VS': 'Vs',
  'Vs-attr,': 'Vs-attr',
  'adv': 'Adv',
  'Cong': 'Conj',
  'Adv/ Vs-attr': 'Adv/Vs-attr',
  'V-sep/ N': 'V-sep/N',
}

# Move a leading "(POS)" marker out of each Meaning into the POS field and
# normalize the POS spelling.
for row in entries:
    text = row.get('Meaning', '').replace('<br>', ' ').strip()
    pos = ''
    # Every key is tried in dict order against the progressively shortened
    # text; there is deliberately no early exit, matching the table's design.
    for pref in POS_MAP:
        if text.startswith(pref):
            pos = POS_MAP[pref]
            text = text[len(pref):]

    # Fallbacks, used only when no prefix was recognised. The parentheses
    # matter: `and` binds tighter than `or`, so without them any meaning
    # containing "classifier" overwrote a POS that had just been extracted.
    lowered = text.lower()
    if not pos and ('measure' in lowered or 'classifier' in lowered):
        pos = 'M'
    # The title of the preceding title slide labels phrase / name sections.
    if not pos and not row.get('POS') and row.get('Title') == 'Phrase':
        pos = 'Phrase'
    if not pos and not row.get('POS') and row.get('Title') == 'Names':
        pos = 'Name'

    if pos:
        # A POS that already came from the slide must agree with the derived one.
        assert row.get('POS', pos) in (pos, ''), (row, pos)
        row['POS'] = pos

    pos = row.get('POS', '').strip()
    row['POS'] = POS_REMAP.get(pos, pos)
    row['Meaning'] = text.strip()

In [7]:
# Cleanup/normalize entries

# Normalize every field of every entry and derive the remaining columns
# (Examples, Simplified, Tags, TraditionalExpanded, Audio). Each field is
# validated against a whitelist of characters so that unexpected slide content
# stops the run instead of silently ending up in the deck.
for row in entries:
    # --- Manual patches for slides with a missing field ---------------------
    # `.get` keeps a row that has no 'Traditional' yet (patched two steps
    # below) from raising KeyError here.
    if not row.get('Meaning') and row.get('Traditional') == '保護':
        row['POS'], row['Meaning'] = 'V', 'to protect'
    if not row.get('Pinyin'):
        row['Pinyin'] = {'隻': 'zhī', 'KTV': 'KTV'}[row['Traditional']]
    if not row.get('Traditional'):
        row['Traditional'] = {'yìnzhāng/túzhāng': '印章/圖章'}[row['Pinyin']]

    # --- Traditional --------------------------------------------------------
    trad = row['Traditional']
    for x, y in (('<br>', ''), (' ', ''), (',', '，'), ('（', '('), ('）', ')'), ('／', '/'), ('∕', '/'), ('。', '')):
        trad = trad.replace(x, y).strip()
    # Spell out abbreviated variants and drop alternative character forms.
    trad = {
        '你/妳好': '你好/妳好',
        '夜裡(裏)': '夜裡',
        '裡頭/裏頭': '裡頭',
        '計畫/劃': '計畫/計劃',
        '部分/份': '部分/部份',
        '健康檢查(健檢)': '健康檢查/健檢',
    }.get(trad, trad)
    assert re.match(r'^([\u4e00-\u9fff()，/]|KTV|101|卡拉OK|3C產品|……|？$)+$', trad), \
        f'unexpected characters in Traditional: {trad!r}'
    row['Traditional'] = trad

    # --- Pinyin -------------------------------------------------------------
    pinyin = row['Pinyin']
    # Plain (non-regex) replacements applied in order: whitespace and
    # punctuation cleanup, breve -> caron tone marks, then per-term fixes.
    # The backslash entry is written as '\\' because a bare '\ ' is an
    # invalid escape sequence (same runtime value, no SyntaxWarning).
    for x, y in (('<br>', ' '), ('\t', ' '), (' ', ' '), ('\u200b', ' '), ('\ufeff', ' '),
                 ('\u00a0', ' '), ('  ', ' '), (' , ', ', '), ('’', "'"), (' / ', '/'),
                 ('/ ', '/'), (' /', '/'), (' \\ ', '/'), (' ?', '?'), ('‑', ' '),
                 ('-', ' '), ('ă', 'ǎ'), ('ĕ', 'ě'), ('ĭ', 'ǐ'), ('ŏ', 'ǒ'), ('ŭ', 'ǔ'),
                 ('ǎ', 'ǎ'), ('a\u030c', 'ǎ'), ('e\u030c', 'ě'),
                 (' (miàn)', '(miàn)'), (' (qíng)', '(qíng)'), (' (zi)', '(zi)'),
                 (' (jī)', '(jī)'), ('yèlǐ (lǐ)', 'yèlǐ'), (' (zhe)', '(zhe)'),
                 (' (cháng)', '(cháng)'), ('Xīngqí tiān', 'xīngqítiān'),
                 ('Jiànkāngjiǎnchá(jiànjiǎn)', 'Jiànkāngjiǎnchá/jiànjiǎn'),
                ):
        pinyin = pinyin.replace(x, y).strip()
    m = re.match(r"^([a-zāáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ/ '()]|\b, |3c |[?]$|……)+$", pinyin.lower())
    assert m, f'unexpected characters in Pinyin: {pinyin!r}'
    row['Pinyin'] = pinyin

    # --- Examples -----------------------------------------------------------
    ex = row.get('Examples', row.get('Example', ''))
    if row.get('Example2'):
        # 'Example2' is text from non-placeholder shapes; keep only the lines
        # that are mostly Chinese.
        for s in row['Example2'].split('<br>'):
            s = s.strip()
            if len(s) >= 2 and cn_score(s) > 0.5:
                if ex: ex += '<br>'
                ex += s
    ex = re.sub(r'(<br>)+', '<br>', ex).strip()
    ex = re.sub(r'^(<br>)+', '', ex).strip()
    ex = re.sub(r'(<br>)+$', '', ex).strip()
    ex = re.sub(r'([\t \u200b\ufeff\u00a0])+', ' ', ex).strip()
    row['Examples'] = ex
    if 'Example' in row: del row['Example']
    if 'Example2' in row: del row['Example2']

    row.setdefault('POS', '')

    for col in ['Traditional', 'Pinyin', 'Meaning', 'POS', 'Examples']:
        row[col] = row[col].strip()
    for col in ['Traditional', 'Pinyin', 'Meaning']:
        assert row[col], f'empty {col} in {row}'

    # --- Meaning ------------------------------------------------------------
    text = row['Meaning']
    # Regex substitutions applied in order: typographic quotes, ligatures,
    # full-width punctuation, spacing around brackets and punctuation, breve
    # -> caron tone marks, and a few one-off fixes.
    for x, y in [
        (r'([\t \u200b\ufeff\u00a0])+', ' '),
        (r'“(.*)’’', r'"\1"'),
        ('’', "'"), ('‘', "'"), ('’', "'"),
        ('”;', '";'), ('”[.]', '".'), ('”=', '"='), ('“of ”', '"of"'), ('“', ' "'), ('”', '" '),
        ('ﬃ', 'ffi'), ('ﬄ', 'ffl'), ('ﬁ', 'fi'), ('ﬂ', 'fl'), ('ﬀ', 'ff'),
        ('…[.]{3}', '...'), ('…', '...'),
        (' *[(] *M[:：] *', ' (M: '), (' *： *', ': '), (' *； *', '; '),
        ('、 *', ', '),
        ('  +', ' '),
        ('（', ' ('), ('）', ') '),
        (' [)] , ', '), '), (' ([:;,]) ', r'\1 '),
        (' ,([a-z])', r', \1'),
        (r'\.\.\., *', '..., '),
        ('([a-z)])([,;:])([a-z])', r'\1\2 \3'),
        ('([a-z])([(])([a-z])', r'\1 (\3'), ('([a-z])([)])([a-z])', r'\1) \3'),
        ('[(] *', '('), (' *[)]', ')'),
        ('！,ah', '! ah'), ('！', '! '), (' *! *', '! '), ('! "', '!"'), ('!,', '!, '),
        (r'\(someone~\)', '(someone)'), (r'\[foolishly\]', '(foolishly)'),
        ('^N[)] ', ''),
        ('ă', 'ǎ'), ('ĕ', 'ě'), ('ĭ', 'ǐ'), ('ŏ', 'ǒ'), ('ŭ', 'ǔ'), ('ǎ', 'ǎ'), ('a\u030c', 'ǎ'), ('e\u030c', 'ě'),
        ('^[)] *', ''),
        ('^To ', 'to '),
        (' +', ' '),
    ]:
        text = re.sub(x, y, text).strip()
    m = re.match(r'''^([-a-z0-9āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ ,.:;!?+&/=()"'\u4e00-\u9fff])+$''', text.lower())
    assert m, f'unexpected characters in Meaning: {text!r}'
    row['Meaning'] = text

    # --- Derived columns ----------------------------------------------------
    row['Simplified'] = opencc_tw2s.convert(row['Traditional'])

    row['Tags'] = ''
    if re.match('.*([/()]).*', row['Traditional']):
        # Headword with alternatives ('a/b') or optional parts ('a(b)'):
        # list every concrete spelling in TraditionalExpanded.
        row['Tags'] = 'Variants'
        trad = row['Traditional']
        expanded = []
        for s in trad.split('/'):
            if '(' in s:
                # Generic rule: once without and once with the bracketed part.
                expanded.extend([re.sub('[(][^)]*[)]', '', s), re.sub('[()]', '', s)])
            else:
                expanded.append(s)
        # Hand-written expansions that replace the generic result.
        # NOTE(review): the keys '你/妳好', '計畫/劃' and '部分/份' were already
        # rewritten by the Traditional normalization above, so they look
        # unreachable here -- confirm before relying on them.
        expanded = {
            '你/妳好': ['你好', '妳好'],
            '有(一)點(兒)': ['有點', '有一點','有點兒', '有一點兒'],
            '(一)點(兒)': ['一點', '點', '一點兒', '點兒'],
            '計畫/劃': ['計畫', '計劃'],
            '部分/份': ['部分', '部份'],
            '白白/白(白)': ['白白', '白'],
        }.get(trad, expanded)
        row['TraditionalExpanded'] = '/'.join(expanded)
    else:
        row['TraditionalExpanded'] = row['Traditional']

    # Reference the TTS clip only if it has already been generated.
    audio = ''
    if os.path.exists(TTS_MP3_PATTERN % row['ID']):
        audio = '[sound:%s]' % os.path.basename(TTS_MP3_PATTERN % row['ID'])
    row['Audio'] = audio

df = pd.DataFrame(entries)
# The entries must already be in ascending ID order.
id_list = list(df.ID)
assert id_list == sorted(id_list)
cols = [
    'ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning',
    'Examples', 'Audio', 'TraditionalExpanded', 'Tags',
]
#cols += [c for c in df if c not in cols]
# Keep only the deck columns, keyed by ID, and save them as a TSV file.
df = df.loc[:, cols].set_index('ID').copy()
df.to_csv('modernchinese.tsv', sep='\t', index=True)
len(df)

4061

In [8]:
# Generate anki package

cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Examples', 'Audio']

model = genanki.Model(
    1696565462,
    'ModernChinese',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'ModernChinese',
        'qfmt': open('../dangdai/dangdai-qfmt.html').read(),
        'afmt': open('../dangdai/dangdai-afmt.html').read().replace(
            '<div>{{Audio}}</div>',
            '<div>{{Audio}}</div>\n<br>\n<div>{{Examples}}</div>')
    }],
    css=open('../dangdai/dangdai.css').read(),
)

deck = genanki.Deck(1696565463, name='modernchinese', description='Modern Chinese vocabulary deck')

for row in df.reset_index()[cols].to_records(index=False):
    tags = df.loc[row[0], 'Tags'].split()
    deck.add_note(genanki.Note(model=model, fields=row, tags=tags))

media = list(set(df.Audio.str.extract('sound:(.*.mp3)')[0])) + ['_MoeStandardKai.ttf']
media = [os.path.join(MEDIA_DIR, s) for s in media]
!rm -f modernchinese.apkg
genanki.Package(deck, media_files=media).write_to_file('modernchinese.apkg')
!ls -l modernchinese.apkg

-rw-r--r-- 1 jovyan users 100363485 Oct  7 08:32 modernchinese.apkg


In [9]:
# Also export in pleco's flashcard / user dictionary format

with open('modernchinese_pleco.txt', 'w') as fout:
    last_header = ''
    for row in df.itertuples():
        m = re.match('^B([1-4])(L[0-9]{2}-[1-3])-.*', row.Index)
        header = f'//時代華語/Book {m[1]}/{m[2]}'
        if header != last_header:
            fout.write(header + '\n')
            last_header = header

        if row.POS:
            defn = f'({row.POS}) {row.Meaning}'
        else:
            defn = row.Meaning

        text = f'{row.Simplified}[{row.Traditional}]\t{row.Pinyin}\t{defn}'
        if row.Examples:
            text += '\uEAB1'  # new line
            text += '\uEAC1\uEC00\uEC05\uECAA\uECFF'  # text color 05AAFF, light blue
            text += row.Examples \
                .replace('<br>', '\uEAB1') \
                .replace(
                    row.Traditional,
                    # highlight term with #05BBBB teal
                    '\uEAC2\uEAC1\uEC00\uEC05\uECBB\uECBB' +
                    row.Traditional +
                    '\uEAC2\uEAC1\uEC00\uEC05\uECAA\uECFF'
                )
            text += '\uEAC2' # end of text color
            assert '\n' not in row.Examples
        fout.write(text + '\n')

!ls -l modernchinese_pleco.txt

-rw-r--r-- 1 jovyan users 617940 Oct  7 08:32 modernchinese_pleco.txt
