# Modern Chinese (時代華語)

Parser that extracts the book's vocabulary from the slides published on its supplementary website: https://sites.google.com/clc.tku.edu.tw/modernchinese-official/

In [1]:
!pip install -q python-pptx genanki opencc

import glob, os, re, json
import pandas as pd
import genanki
import opencc
from pptx import Presentation

In [2]:
# Modify to point to a copy of the slides if you need to rerun slides parser.
# Lesson directory name (matching 'B?L??') -> path of the one .pptx file in
# that directory that survives the file-name exclusion pattern below.
PPTX_PATHS = {}
for lesson_dir in sorted(glob.glob('downloads/B?L??')):
    candidates = []
    for path in glob.glob(lesson_dir + '/*.pptx'):
        # Files whose names match any of these fragments are ignored.
        if re.match('.*(_0617.pptx|短文速讀|學習單1|學習單2附件).*', path):
            continue
        candidates.append(path)
    # Exactly one deck must remain per lesson directory.
    assert len(candidates) == 1
    PPTX_PATHS[os.path.basename(lesson_dir)] = candidates[0]
assert len(PPTX_PATHS) == 16*4+1

# Example shell commands for downloading one lesson's files (kept for reference):
# url="https://sites.google.com/clc.tku.edu.tw/modernchinese-official/%E7%AC%AC%E4%B8%80%E5%86%8A/b1-l1"
# for driveid in $(curl "$url" |
#                  egrep -o '<iframe[^>]*drive.google.com[^>]*preview[^>]*>' |
#                  sed -Ee 's|.*https://drive.google.com/file/d/([^/]+)/preview.*|\1|'); do
#   rclone backend copyid drive: "$driveid" ./
# done
#
# rclone via docker/podman:
# podman run -v ~/.config/rclone:/config/rclone:rw -v "$PWD:/pwd:rw" docker.io/rclone/rclone backend copyid drive: "$driveid" /pwd/

In [3]:
# OpenCC converter loaded with the 'tw2s' configuration; to_simplified() below
# runs every traditional-script term through it.
opencc_tw2s = opencc.OpenCC('tw2s')

# Character levels from Table of General Standard Chinese Characters for verification.
# Maps each value of the CSV's `char` column to its `level` value.
tgh_level = pd.read_csv('../chars/tgh.csv').set_index('char').level.to_dict()

# Convert to simplified characters + verify
def to_simplified(trad):
    """Convert a traditional-script term to simplified script and sanity-check it.

    The OpenCC output is patched with a few fixed character substitutions.
    If the result is a '/'-separated list whose alternatives are all
    identical, it is collapsed to a single form.

    Every character of the result must either have a level of at most 2 in
    ``tgh_level`` or occur in a small whitelist string; otherwise an
    AssertionError carrying ``(trad, simp, char, level)`` is raised.
    """
    simp = opencc_tw2s.convert(trad)
    for pair in ('擡抬', '砲炮', '妳你'):
        simp = simp.replace(pair[0], pair[1])

    alternatives = simp.split('/')
    if len(alternatives) > 1 and len(set(alternatives)) == 1:
        simp = alternatives[0]

    whitelist = '/（），？…101 3C KTV BBC OK 蚵'
    for ch in simp:
        level = tgh_level.get(ch, 9)  # characters missing from the table count as level 9
        assert level <= 2 or ch in whitelist, (trad, simp, ch, tgh_level.get(ch))
    return simp

In [4]:
# Explore slides layout

if 0:  # switch to 1 to print, per book, how often each slide layout name occurs
    rows = [
        {'Book': book, 'Lesson': lesson, 'Page': page_no, 'Layout': slide.slide_layout.name}
        for book in ('B1', 'B2', 'B3', 'B4')
        for lesson, filepath in PPTX_PATHS.items()
        if lesson.startswith(book)
        for page_no, slide in enumerate(Presentation(filepath).slides, start=1)
    ]
    df = pd.DataFrame(rows)
    print(df.groupby('Book').Layout.value_counts())

def explore(lesson, page=None):
    """Collect the non-empty text paragraphs of a lesson's slides for inspection.

    Args:
        lesson: key into ``PPTX_PATHS``.
        page: optional 1-based page number; when truthy only that page is
            examined.

    Returns:
        A list with one dict per slide that has text: ``'page'`` (1-based),
        ``'layout'`` (slide layout name) and ``'paragraphs'``, a list of
        ``(text, placeholder_idx)`` tuples where the index is ``None`` for
        shapes that are not placeholders.
    """
    book = lesson[:2]  # only referenced by the optional filter commented out below
    found = []
    for page_no, slide in enumerate(Presentation(PPTX_PATHS[lesson]).slides, start=1):
        if page and page_no != page:
            continue
        paragraphs = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                text = ''.join(run.text for run in paragraph.runs).strip()
                if text:
                    pidx = shape.placeholder_format.idx if shape.is_placeholder else None
                    #if (book, slide.slide_layout.name, pidx) in placeholders_mapping: continue
                    paragraphs.append((text, pidx))
        if paragraphs:
            found.append({'page': page_no, 'layout': slide.slide_layout.name, 'paragraphs': paragraphs})

    return found

#explore('B2L07', 82)

In [5]:
# Extract text from slides

# book/lesson, layout, placeholder idx -> field
#
# The extraction loop looks a paragraph up first by (lesson, layout, idx) and
# then by (book, layout, idx).  An idx of None stands for a shape that is not
# a placeholder.  Paragraphs with no entry here are skipped by the extractor
# (apart from bare name/phrase labels), and 'ExamplePinyin' paragraphs are
# recognised but discarded.
PLACEHOLDER_MAP = {
    # Lesson-specific entries for B1L00 (tried before the book-wide B1 ones).
    ('B1L00', ' 生詞', 10): 'Traditional',
    ('B1L00', ' 生詞', 11): 'Pinyin',
    ('B1L00', ' 生詞', 12): 'POS',
    ('B1L00', ' 生詞', 13): 'Meaning',
    # Book 1: two placeholder numberings (1-6 and 10-15) for the same layout.
    ('B1', ' 生詞', 1): 'Traditional',
    ('B1', ' 生詞', 2): 'Pinyin',
    ('B1', ' 生詞', 3): 'Example',
    ('B1', ' 生詞', 4): 'ExamplePinyin',
    ('B1', ' 生詞', 5): 'POS',
    ('B1', ' 生詞', 6): 'Meaning',
    ('B1', ' 生詞', 10): 'Traditional',
    ('B1', ' 生詞', 11): 'Pinyin',
    ('B1', ' 生詞', 12): 'Example',
    ('B1', ' 生詞', 13): 'ExamplePinyin',
    ('B1', ' 生詞', 14): 'POS',
    ('B1', ' 生詞', 15): 'Meaning',
    # Text in non-placeholder shapes on Book 1 vocabulary slides.
    ('B1', ' 生詞', None): 'Example2',
    # Book 2: several layout names, each with its own placeholder numbering.
    ('B2', '自訂版面配置', 1): 'Traditional',
    ('B2', '自訂版面配置', 2): 'Meaning',
    ('B2', '自訂版面配置', 3): 'Pinyin',
    ('B2', '生詞', 1): 'Example',
    ('B2', '生詞', 2): 'Traditional',
    ('B2', '生詞', 3): 'Meaning',
    ('B2', '生詞', 4): 'Pinyin',
    ('B2', '生詞', 10): 'Example',
    ('B2', '生詞', 13): 'Traditional',
    ('B2', '生詞', 15): 'Meaning',
    ('B2', '生詞', 17): 'Pinyin',
    ('B2', '生詞_1', 10): 'Example',
    ('B2', '生詞_1', 13): 'Traditional',
    ('B2', '生詞_1', 15): 'Meaning',
    ('B2', '生詞_1', 17): 'Pinyin',
    ('B2', '1_生詞', 13): 'Traditional',
    ('B2', '1_生詞', 15): 'Meaning',
    ('B2', '1_生詞', 17): 'Pinyin',
    ('B2', '2_生詞_1', 13): 'Traditional',
    ('B2', '2_生詞_1', 15): 'Meaning',
    ('B2', '2_生詞_1', 17): 'Pinyin',
    ('B2', '1_生詞_1', 13): 'Traditional',
    ('B2', '1_生詞_1', 15): 'Meaning',
    ('B2', '1_生詞_1', 17): 'Pinyin',
    # Book 3: only non-placeholder text is mapped; 'Freetext' is split into
    # fields by a later cell and 'TitleSlide' rows supply section titles.
    ('B3', '標題及內容', None): 'Freetext',
    ('B3', 'OBJECT', None): 'Freetext',
    ('B3', '標題投影片', None): 'TitleSlide',
    ('B3', '4_標題投影片', None): 'TitleSlide',
    ('B3', 'TITLE', None): 'TitleSlide',
    # Book 4: with-example / without-example layouts plus a phrase layout,
    # each in two placeholder numberings.
    ('B4', '生詞_有例句', 10): 'Example',
    ('B4', '生詞_有例句', 13): 'Traditional',
    ('B4', '生詞_有例句', 15): 'Meaning',
    ('B4', '生詞_有例句', 17): 'Pinyin',
    ('B4', '生詞_無例句', 13): 'Traditional',
    ('B4', '生詞_無例句', 15): 'Meaning',
    ('B4', '生詞_無例句', 17): 'Pinyin',    
    ('B4', '生詞_無例句', 1): 'Traditional',
    ('B4', '生詞_無例句', 2): 'Meaning',
    ('B4', '生詞_無例句', 3): 'Pinyin',    
    ('B4', '生詞_有例句', 1): 'Example',
    ('B4', '生詞_有例句', 2): 'Traditional',
    ('B4', '生詞_有例句', 3): 'Meaning',
    ('B4', '生詞_有例句', 4): 'Pinyin',    
    ('B4', '短語', 1): 'Traditional',
    ('B4', '短語', 2): 'Meaning',
    ('B4', '短語', 3): 'Pinyin',
    ('B4', '短語', 4): 'Example',
    ('B4', '短語', 13): 'Traditional',
    ('B4', '短語', 15): 'Meaning',
    ('B4', '短語', 17): 'Pinyin',
}

# One dict per slide that yielded at least one mapped field, in deck order.
entries = []

for lesson, filepath in PPTX_PATHS.items():
    book = lesson[:2]

    for page_no, slide in enumerate(Presentation(filepath).slides, start=1):
        layout = slide.slide_layout.name
        # Four bookkeeping keys; anything beyond them is extracted content.
        row = {'Book': book, 'Lesson': lesson, 'Page': page_no, 'Layout': layout}

        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue

            for paragraph in shape.text_frame.paragraphs:
                text = ''.join(run.text for run in paragraph.runs).strip()
                assert '\n' not in text
                if not text:
                    continue

                pidx = shape.placeholder_format.idx if shape.is_placeholder else None

                # A lesson-specific mapping wins over the book-wide one.
                field = (PLACEHOLDER_MAP.get((lesson, layout, pidx))
                         or PLACEHOLDER_MAP.get((book, layout, pidx)))
                # Unmapped paragraphs consisting of a bare name/phrase label are kept as POS.
                if not field and (text.lower() in ('names', 'name', 'phrase', 'phrases')):
                    field = 'POS'
                # Unmapped text and the examples' pinyin are both dropped.
                if not field or field == 'ExamplePinyin':
                    continue

                if field not in row:
                    row[field] = text
                else:
                    # Further paragraphs of the same field are joined with <br>.
                    assert '<' not in text
                    row[field] += '<br>' + text

        # Slides whose only content is a free-text blob shorter than 10 characters are ignored.
        only_short_freetext = len(row) == 5 and 'Freetext' in row and len(row['Freetext']) < 10
        if len(row) > 4 and not only_short_freetext:
            entries.append(row)

In [6]:
# Detects spans of consecutive pages with defs and assign IDs.
#
# A span is a list [lesson, first_page, last_page, rows]; `spans` maps each
# lesson to its spans in page order.  Every kept row receives
#   row['Span']  - 1-based index of its span within the lesson
#   row['ID']    - '<lesson>-<span>-<two-digit sequence number within the span>'
#   row['Title'] - text of the most recent TitleSlide row in the span, if any

span = None
spans = {}

for i, row in enumerate(entries):
    if span:
        # Same lesson and page directly following the previous entry: extend the open span.
        if row['Lesson'] == span[0] and row['Page'] == entries[i-1]['Page']+1:
            span[2] = row['Page']
            span[3].append(row)
            continue
    # Otherwise close the open span (if any) and start a new one at this row.
    if span:
        spans.setdefault(span[0], []).append(span)
    span = [row['Lesson'], row['Page'], row['Page'], [row]]
# Flush the last open span.  NOTE(review): raises TypeError when `entries` is empty.
spans.setdefault(span[0], []).append(span)

for lesson in spans:
    #print(lesson, [s[1:3] for s in spans[lesson]])
    assert len(spans[lesson]) <= 3
    for span_i, span in enumerate(spans[lesson]):
        title = None
        k = 0  # running sequence number of kept rows in this span
        prev_row = None
        for row in span[3]:
            # Title slides only set the title for the rows after them; they get no ID.
            if 'TitleSlide' in row:
                title = row['TitleSlide']
                continue
            if title:
                row['Title'] = title
            row['Span'] = span_i + 1

            # For these specific lesson+word pairs, a row repeating the previous
            # row's word is folded into it: its example is appended and the
            # row itself is marked for removal.
            if lesson + row.get('Traditional', '') in ['B1L12事(情)', 'B1L15代表', 'B1L13壞', 'B1L12工作'] and prev_row and \
                prev_row['Traditional'] == row['Traditional']:
                prev_row['Example'] = prev_row.get('Example', '') + '<br>' + row.get('Example', '')
                row['Drop'] = 1
                continue
            prev_row = row

            k += 1
            row['ID'] = '%s-%d-%02d' % (row['Lesson'], row['Span'], k)

# Remove the folded duplicates and the title slides.
entries = [row for row in entries if 'Drop' not in row]
entries = [row for row in entries if 'TitleSlide' not in row]

In [7]:
# Parse free text on B3 slides which don't use placeholders

def cn_score(text):
    """Return the share of characters in ``text`` at or above U+4E00.

    Tabs, spaces, digits, brackets and a handful of punctuation marks are
    removed before counting.  The result is the integer 0 when nothing is
    left or no character qualifies, the integer 1 when every remaining
    character qualifies, and the float ratio otherwise.
    """
    significant = re.sub('([\t0-9 （）(  ) …、/‧-]|-$)', '', text)
    total = len(significant)
    chinese = len([ch for ch in significant if ord(ch) >= 0x4E00])
    if total == 0 or chinese == 0:
        return 0
    if chinese == total:
        return 1
    return chinese / total

# Part-of-speech labels accepted as the POS line of a Book 3 slide, spelled
# exactly as the parser below expects to find them.
ACC_POS = set(['Adv', 'Adv/ Vs-attr', 'Adv/Vs', 'Conj', 'Conj/Prep', 'Det', 'M', 'M/N', 'N',
               'N/V', 'N/Vi', 'N/Vs', 'Phrases', 'Prep', 'Ptc', 'V', 'V-sep', 'V/N', 'V/Vi',
               'V/Vs', 'Vaux', 'Vi', 'Vi/N', 'Vi/Vs', 'Vp', 'Vp-sep', 'Vp/N', 'Vpt', 'Vpt/N',
               'Vs', 'Vs-attr', 'Vs-attr/Adv', 'Vs-attr/Vi', 'Vs-pred', 'Vs-sep', 'Vs/Adv',
               'Vs/N', 'Vs/Vst', 'Vst', 'adv'])

ss = []  # not used by any code in this cell

# Split each Book 3 'Freetext' blob (paragraphs joined by <br>) into
# Traditional / Pinyin / POS / Meaning and, when present, Example.
# The target order of `text` is [hanzi, pinyin, POS, meaning...].
for row in entries:
    if row['Book'] != 'B3': continue
    text = row['Freetext'].split('<br>')

    # Step 1: get the headword into text[0].
    sc = [cn_score(s) for s in text]
    if sc[:3] == [1, 1, 0]:
        # Headword split over two paragraphs: join them.
        text = [text[0] + text[1]] + text[2:]
    elif sc[:2] == [0, 0]:
        # Slide starts with pinyin: look the headword up by that pinyin and
        # move it to the front.  Unknown pinyin raises KeyError.
        # NOTE(review): 'yǐlái' maps to '老家', which is not the reading of
        # that word - presumably matches a slide with wrong pinyin; confirm.
        hanzi = {'guāi': '乖', 'dǎo': '倒', 'diàochá': '調查', 'shìyě': '視野', 'shǔ': '數',
                 'xiōngdì': '兄弟', 'wúguān': '無關', 'jiàoxué': '教學', 'bìng': '病',
                 'wǎngyǒu': '網友', 'kòng': '空', 'kōng': '空', 'ài': '愛', 'tiānzhēn': '天真',
                 'yǐlái': '老家', 'réngrán': '仍然'}[text[0]]
        i = text.index(hanzi)
        text = [hanzi] + text[:i] + text[(i+1):]
    sc = [cn_score(s) for s in text]
    # From here on: one all-Chinese paragraph followed by two non-Chinese ones.
    assert sc[:3] == [1, 0, 0]

    # Step 2: get the POS label into text[2].
    if len(text) >= 4 and text[2] not in ACC_POS and text[3] in ACC_POS:
        # Pinyin split over two paragraphs directly before the POS: join with a space.
        text = text[:1] + [text[1] + ' ' + text[2]] + text[3:]
    if text[2] not in ACC_POS and sum(s in ACC_POS for s in text) == 1:
        # Exactly one POS label somewhere else: reorder around it.
        i = [int(s in ACC_POS) for s in text].index(1)
        assert i != 0
        if i == 1 and any(c in 'āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ' for c in text[-1]):
            # POS right after the headword, pinyin (tone-marked) in the last paragraph.
            text = [text[0], text[-1]] + text[1:-1]
        elif i == 1 and any(c in 'āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ' for c in text[3]):
            # POS right after the headword, pinyin (tone-marked) in the fourth paragraph.
            text = [text[0], text[3], text[1], text[2]] + text[4:]
        else:
            assert i != 1
            # POS found later in the list: pull it forward to index 2.
            text = text[:2] + [text[i]] + text[2:i] + text[(i+1):]

    if text[2] not in ACC_POS:
        # Two known multi-word pinyin strings that arrive as separate paragraphs.
        for x,y in [('Wángmǔ', 'Niángniang'), ('shèqún', 'wǎngzhàn')]:
            if x in text:
                i = text.index(x)
                assert i+1 == text.index(y)
                text = text[:i] + [text[i] + ' ' + text[i+1]] + text[(i+2):]

    if text[2] not in ACC_POS:
        # Still no POS label: insert an empty one so the layout stays fixed.
        text = text[:2] + [''] + text[2:]

    # Step 3: split off the example.  Among the paragraphs after the POS,
    # those that are mostly Chinese (score > 0.5) are treated as example text.
    sc = [0,0,0] + [int(cn_score(s) > 0.5) for s in text[3:]]
    if 1 in sc:
        i = sc.index(1)
        if i == 3:
            # Example comes before the meaning: the last paragraph is the meaning.
            assert sc[-1] == 0
            row['Example'] = ' '.join(text[i:-1])
            text = text[:i] + [text[-1]]
        else:
            # Example comes after the meaning: everything from i on is example.
            row['Example'] = ' '.join(text[i:])
            text = text[:i]

    row['Traditional'] = text[0]
    row['Pinyin'] = text[1]
    row['POS'] = text[2]
    row['Meaning'] = ' '.join(text[3:]).strip()

In [8]:
# Extract and normalize POS

# Bracketed POS prefix as found at the start of a Meaning string -> POS label.
# The normalisation loop tests every key with str.startswith in the order
# listed here, so the order of overlapping keys matters.
# NOTE(review): a few keys look identical (e.g. '(N, V)' appears three times,
# '(Vs-attr/Vp)' twice); presumably they differ in invisible whitespace
# characters - verify byte-for-byte before deduplicating or retyping them.
POS_MAP = {
  '(Prep )': 'Prep',
  '(N, V)': 'N/V',
  '(Vi, N)': 'Vi/N',
  '(N, Vs)': 'N/Vs',
  '(N, M)': 'N/M',
  '(N, V)': 'N/V',
  '(N, V)': 'N/V',
  '(Adv)': 'Adv',
  '(Adv.)': 'Adv',
  '(Adv.,Vs)': 'Adv/Vs',
  '(Adv./Vs)': 'Adv/Vs',
  '(Conj)': 'Conj',
  '(Conj.)': 'Conj',
  '(Det)': 'Det',
  '(M)': 'M',
  '(M,N)': 'M/N',
  '(M/N)': 'M/N',
  '(M/V)': 'M/V',
  '(N)': 'N',
  '(N,V)': 'N/V',
  '(N,Vs-attr)': 'N/Vs-attr',
  '(N/M)': 'N/M',
  '(N/V)': 'N/V',
  '(N/V-sep)': 'N/V-sep',
  '(N/Vi)': 'N/Vi',
  '(N/Vp-sep)': 'N/Vp-sep',
  '(N/Vs)': 'N/Vs',
  '(N/Vst)': 'N/Vst',
  '(N/Vst/V)': 'N/Vst/V',
  '(Phrase)': 'Phrase',
  '(Prep)': 'Prep',
  '(Prep/V/Vst)': 'Prep/V/Vst',
  '(Ptc)': 'Ptc',
  '(V)': 'V',
  '(V-sep)': 'V-sep',
  '(V-sep/N)': 'V-sep/N',
  '(V/Adv.)': 'V/Adv',
  '(V/N)': 'V/N',
  '(V/Vs)': 'V/Vs',
  '(Vaux)': 'Vaux',
  '(Vaux/V)': 'Vaux/V',
  '(Vi)': 'Vi',
  '(Vi,N)': 'Vi/N',
  '(Vi/Adv)': 'Vi/Adv',
  '(Vi/N)': 'Vi/N',
  '(Vi/V-sep)': 'Vi/V-sep',
  '(Vp)': 'Vp',
  '(Vp-sep)': 'Vp-sep',
  '(Vp/Vpt)': 'Vp/Vpt',
  '(Vp/Vs)': 'Vp/Vs',
  '(Vpt)': 'Vpt',
  '(Vpt/N)': 'Vpt/N',
  '(Vpt/Vp)': 'Vpt/Vp',
  '(Vs)': 'Vs',
  '(Vs-attr / Vi)': 'Vs-attr/Vi',
  '(Vs-attr/Vp)': 'Vs-attr/Vp',
  '(Vs-attr)': 'Vs-attr',
  '(Vs-attr/Adv.)': 'Vs-attr/Adv',
  '(Vs-attr/Vp)': 'Vs-attr/Vp',
  '(Vs-pred)': 'Vs-pred',
  '(Vs-sep)': 'Vs-sep',
  '(Vs/V)': 'Vs/V',
  '(Vs/Vst)': 'Vs/Vst',
  '(Vst)': 'Vst',
  '(Vst/N)': 'Vst/N',
  '(Vst/Prep)': 'Vst/Prep',
  '(Vst/Vs/Adv./Vi)': 'Vst/Vs/Adv/Vi',
  '（Det)': 'Det',
  '(Vs-attr, N)': 'Vs-attr/N',
  # Unterminated forms; the second yields 'Vs-attr,', which POS_REMAP cleans up.
  '(Vs-attr': 'Vs-attr',
  '(Vs-attr,': 'Vs-attr,',
}
# Final spelling fixes applied to whatever POS label a row ends up with.
POS_REMAP = {
  'name': 'Name',
  'Names': 'Name',
  'Phrase': 'Ph',
  'phrase': 'Ph',
  'phrases': 'Ph',
  'Phrases': 'Ph',
  'VS': 'Vs',
  'Vs-attr,': 'Vs-attr',
  'adv': 'Adv',
  'Cong': 'Conj',
  'Adv/ Vs-attr': 'Adv/Vs-attr',
  'V-sep/ N': 'V-sep/N',
}

# Move a leading POS marker out of each row's Meaning into row['POS'] and
# normalise the label's spelling.
for row in entries:
    text = row.get('Meaning', '').replace('<br>', ' ').strip()
    pos = ''
    # Every matching prefix is stripped, in table order; the loop does not
    # stop at the first hit.
    for pref in POS_MAP:
        if text.startswith(pref):
            pos = POS_MAP[pref]
            text = text[len(pref):]
    # Fallbacks apply only when no prefix supplied a POS.  The parentheses
    # matter: without them `and` binds tighter than `or`, so a gloss containing
    # "classifier" would overwrite a POS already taken from the prefix.
    lowered = text.lower()
    if not pos and ('measure' in lowered or 'classifier' in lowered):
        pos = 'M'
    if not pos and not row.get('POS') and row.get('Title') == 'Phrase':
        pos = 'Phrase'
    if not pos and not row.get('POS') and row.get('Title') == 'Names':
        pos = 'Name'
    if pos:
        # A POS already on the row must agree with the derived one (or be empty).
        assert row.get('POS', pos) in (pos, ''), (row, pos)
        row['POS'] = pos
    pos = row.get('POS', '').strip()
    row['POS'] = POS_REMAP.get(pos, pos)
    row['Meaning'] = text.strip()

In [9]:
# Cleanup/normalize entries

# Variant table indexed by entry ID; every cell is read as a string and blanks
# become ''.  The cleanup loop below reads its 'Traditional' and 'Variants' columns.
variants_df = pd.read_csv('variants.csv', dtype='str').fillna('').set_index('ID')

def variants_to_json(variants):
    """Turn a ' / '-separated list of 'hanzi [pinyin]' items into a JSON array.

    Each item becomes an object with 'Traditional', 'Simplified' (derived via
    ``to_simplified``) and 'Pinyin'.  An item that does not have the expected
    shape raises AssertionError with the whole input as its message.
    Non-ASCII characters are written as-is rather than escaped.
    """
    item_re = re.compile(r'^([^ ()\[\]]+) \[([^()\[\]]+)\]$')
    parsed = []
    for item in variants.split(' / '):
        match = item_re.match(item)
        assert match, variants
        hanzi, pinyin = match.groups()
        parsed.append({
            'Traditional': hanzi,
            'Simplified': to_simplified(hanzi),
            'Pinyin': pinyin,
        })
    return json.dumps(parsed, ensure_ascii=False)

# Normalise every entry in place: fill known gaps, clean up Traditional,
# Pinyin, Examples and Meaning, validate each against a whitelist regex,
# derive Simplified and attach variants.
# NOTE(review): several replacement pairs below look like no-ops or duplicates
# on screen (e.g. (' ', ' '), ('ǎ', 'ǎ'), the repeated ('’', "'")); presumably
# the left-hand sides are distinct look-alike characters - do not retype them.
for row in entries:
    # Hard-coded repairs for rows whose slide lacked a field.
    if not row.get('Meaning') and row['Traditional'] == '保護':
        row['POS'], row['Meaning'] = 'V', 'to protect'
    if not row.get('Pinyin'):
        row['Pinyin'] = {'隻': 'zhī', 'KTV': 'KTV'}[row['Traditional']]
    if not row.get('Traditional'):
        row['Traditional'] = {'yìnzhāng/túzhāng': '印章/圖章'}[row['Pinyin']]

    # --- Traditional: drop markup/spaces, use full-width punctuation, ASCII slash.
    trad = row['Traditional']
    for x, y in (('<br>', ''), (' ', ''), (',', '，'), ('(', '（'), (')', '）'), ('／', '/'), ('∕', '/'), ('。', ''),
                 ('髪', '髮'),
                ):
        trad = trad.replace(x, y).strip()

    # Whole-string rewrites of abbreviated or inconsistent headwords.
    trad = {
        '你/妳好': '你好/妳好',
        '計畫/劃': '計畫/計劃',
        '部分/份': '部分/部份',
        '健康檢查（健檢）': '健康檢查/健檢',
        '潮溼，潮濕': '潮溼/潮濕',
        '差（一）點兒': '差（一）點',
        '白白/白（白）': '白白/白',
        # 裏: HK variant - not in TW font, dropping due to rendering problem. These variants are in TOCFL though
        '夜裡（裏）': '夜裡',  #'夜裡/夜裏', 
        '裡頭/裏頭': '裡頭',
    }.get(trad, trad)
    # Only CJK characters, full-width brackets/comma, '/' and a few literal tokens may remain.
    assert re.match(r'^([\u4e00-\u9fff（），/]|KTV|101|卡拉OK|3C產品|……|？$)+$', trad)
    row['Traditional'] = trad

    # --- Pinyin: unify whitespace, slashes, apostrophes, hyphens and tone-mark glyphs.
    # NOTE(review): ' \ ' contains the unrecognised escape '\ '; it currently
    # evaluates to space-backslash-space but triggers an invalid-escape warning
    # on recent Python versions.
    pinyin = row['Pinyin']
    for x, y in (('<br>', ' '), ('\t', ' '), (' ', ' '), ('\u200b', ' '), ('\ufeff', ' '),
                 ('\u00a0', ' '), ('  ', ' '), (' , ', ', '), ('’', "'"), (' / ', '/'),
                 ('/ ', '/'), (' /', '/'), (' \ ', '/'), (' ?', '?'), ('‑', ' '),
                 ('-', ' '), ('ă', 'ǎ'), ('ĕ', 'ě'), ('ĭ', 'ǐ'), ('ŏ', 'ǒ'), ('ŭ', 'ǔ'),
                 ('ǎ', 'ǎ'), ('a\u030c', 'ǎ'), ('e\u030c', 'ě'),
                 (' (miàn)', '(miàn)'), (' (qíng)', '(qíng)'), (' (zi)', '(zi)'),
                 (' (jī)', '(jī)'), ('yèlǐ (lǐ)', 'yèlǐ'), (' (zhe)', '(zhe)'),
                 (' (cháng)', '(cháng)')):
        pinyin = pinyin.replace(x, y).strip()
    # Validate the lower-cased result; the offending string is printed before the assert fires.
    m = re.match(r"^([a-zāáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ/ '()]|\b, |3c |[?]$|……)+$", pinyin.lower())
    if not m: print(pinyin)
    assert m
    row['Pinyin'] = pinyin

    # --- Examples: start from Example(s), then add the mostly-Chinese lines of Example2.
    ex = row.get('Examples', row.get('Example', ''))
    if row.get('Example2'):
        for s in row['Example2'].split('<br>'):
            s = s.strip()
            if len(s) >= 2 and cn_score(s) > 0.5:
                if ex: ex += '<br>'
                ex += s
    # Regex rewrites, applied in order (each pass also strips the string).
    # NOTE(review): the '。<br>([AB])' rule replaces the full stop with '？';
    # possibly a copy-paste slip from the rule above it - confirm before changing.
    # NOTE(review): the last rule's range starts at \u4200, unlike the \u4e00
    # used elsewhere in this file - confirm whether that is intended.
    for x, y in [
        (r'(<br>)+', '<br>'), (r'^(<br>)+', ''), (r'(<br>)+$', ''),
        (r'([\t \u200b\ufeff\u00a0])+', ' '),
        ('[?]','？'), ('!', '！'), ('([^a-z]),', r'\1，'),
        ('⑴', '(1)'), ('⑵', '(2)'), ('⑶', '(3)'),
        (r'\bA[：:]', 'A:'), (r'\bB[：:]', 'B:'),
        (r' *\(A\)[：:]', ' A:'), (r' *\(B\)[：:]', ' B:'),
        ('，<br>', '，'),
        ('？<br>([AB])[：:]', r'？ \1:'),
        ('。<br>([AB])[：:]', r'？ \1:'),
        ('^1[.]', '(1)'), ('<br>2[.]', '<br>(2)'), ('<br>3[.]', '<br>(3)'),
        ('lǎoshī.*shēngxiào.<br>', ''),
        (r'([\u4200-\u9fff]) +([\u4200-\u9fff])', r'\1\2'),
    ]:
        ex = re.sub(x, y, ex).strip()
    if '(2)' in ex:
        # Numbered examples: drop the leading number, break at each later
        # number, and emit one example per <br>-separated line.
        ex = re.sub('^[(][12][)]', '', ex).strip()
        ex = re.sub('<br>[(][2-9][)]', '\n', ex)
        ex = re.sub('([。！]|和體力) *[(][2-9][)]', r'\n', ex)
        ex = re.sub('<br>', '', ex).strip()
        ex = '<br>'.join([x.strip() for x in ex.split('\n') if x.strip()])
    elif '<br>' in ex:
        # Unnumbered: the line breaks are removed and the text becomes one line.
        ex = re.sub('<br>', '', ex).strip()
    row['Examples'] = ex
    if 'Example' in row: del row['Example']
    if 'Example2' in row: del row['Example2']

    row.setdefault('POS', '')

    for col in ['Traditional', 'Pinyin', 'Meaning', 'POS', 'Examples']:
        row[col] = row[col].strip()
    # These three fields are mandatory for every entry.
    for col in ['Traditional', 'Pinyin', 'Meaning']:
        assert row[col]

    # --- Meaning: ASCII quotes/punctuation, expanded ligatures, tidy spacing,
    # and a uniform '(M: <measure word><pinyin>)' notation.
    text = row['Meaning']
    for x, y in [
        (r'([\t \u200b\ufeff\u00a0])+', ' '),
        (r'“(.*)’’', r'"\1"'),
        ('’', "'"), ('‘', "'"), ('’', "'"), 
        ('”;', '";'), ('”[.]', '".'), ('”=', '"='), ('“of ”', '"of"'), ('“', ' "'), ('”', '" '),
        ('ﬃ', 'ffi'), ('ﬄ', 'ffl'), ('ﬁ', 'fi'), ('ﬂ', 'fl'), ('ﬀ', 'ff'),
        ('…[.]{3}', '...'), ('…', '...'),
        (' *[(] *M[:：] *', ' (M: '), (' *： *', ': '), (' *； *', '; '),
        ('、 *', ', '), ('  +', ' '), ('（', ' ('), ('）', ') '),
        (' [)] , ', '), '), (' ([:;,]) ', r'\1 '), (' +; *', '; '),
        (' ,([a-z])', r', \1'),
        (r'\.\.\., *', '..., '),
        ('([a-z)])([,;:])([a-z])', r'\1\2 \3'),
        ('([a-z])([(])([a-z])', r'\1 (\3'), ('([a-z])([)])([a-z])', r'\1) \3'),
        ('[(] *', '('), (' *[)]', ')'),
        ('！,ah', '! ah'), ('！', '! '), (' *! *', '! '), ('! "', '!"'), ('!,', '!, '),
        (r'\(someone~\)', '(someone)'), (r'\[foolishly\]', '(foolishly)'),
        ('^N[)] ', ''),
        ('ă', 'ǎ'), ('ĕ', 'ě'), ('ĭ', 'ǐ'), ('ŏ', 'ǒ'), ('ŭ', 'ǔ'), ('ǎ', 'ǎ'), ('a\u030c', 'ǎ'), ('e\u030c', 'ě'),
        ('^[)] *', ''), ('^To ', 'to '), (' +', ' '),
        (' *[(]M: 個 *(ge|)[)]$', ''),
        ('M: 間zhāng', 'M: 間jiān'),
        (r'[(]M: ([\u4E00-\uFFFF])[; ]*([a-zāáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ]+)[)]', r'(M: \1\2)'),
        (r'[(]M: ([\u4E00-\uFFFF])[,;/ ]+ *([\u4E00-\uFFFF])[)]', r'(M: \1,\2)'),
        ('M: 片;朵',        'M: 片,朵'),
        ('M: 部/輛;liàng',  'M: 部,輛liàng'),
        ('M: 個/間jiān',    'M: 個,間jiān'),
        ('M: 棟dòng/個',    'M: 棟dòng,個'),
        ('M: 盤Pán, 道dào', 'M: 盤pán,道dào'),
        ('M: 根gēn, 支',    'M: 根gēn,支'),
        ('M: 種/股gǔ',      'M: 種,股gǔ'),
        ('M: 張, 幅fú',     'M: 張,幅fú'),
        ('M: 塊;份;客',     'M: 塊,份,客'),
    ]:
        text = re.sub(x, y, text).strip()
    # Validate the lower-cased result; the offending string is printed before the assert fires.
    m = re.match(r'''^([-a-z0-9āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜ ,.:;!?+&/=()"'\u4e00-\u9fff\[\]])+$''', text.lower())
    if not m: print(text)
    assert m
    row['Meaning'] = text

    row['Simplified'] = to_simplified(row['Traditional'])

    # Attach variants as a JSON string; the variants table must agree on the headword.
    row['Variants'] = ''
    if row['ID'] in variants_df.index:
        assert variants_df.loc[row['ID'], 'Traditional'] == row['Traditional'], (row, variants_df.loc[row['ID'], 'Traditional'])
        row['Variants'] = variants_to_json(variants_df.loc[row['ID'], 'Variants'])

In [10]:
# Corrections table indexed by (entry ID, column name); it provides the
# 'Original' and 'Corrected' values for that cell.
errata_df = pd.read_csv('errata.csv', dtype='str').fillna('').set_index(['ID', 'Column'])

# IDs of entries whose Meaning gets its first character lower-cased.
lower_case_meaning_ids = set('''
B1L02-1-24 B1L02-3-02 B1L04-1-23 B1L05-1-01 B1L05-2-06 B1L05-3-04 B1L06-1-06 B1L06-2-02 B1L06-2-11 B1L06-2-14
B1L06-2-17 B1L06-3-07 B1L06-3-10 B1L08-1-01 B1L08-1-09 B1L08-1-21 B1L08-2-01 B1L08-2-02 B1L09-1-01 B1L09-2-08
B1L09-3-13 B1L14-3-04 B2L01-1-21 B2L02-1-35 B2L02-2-20 B2L02-2-23 B2L02-2-24 B2L04-1-15 B2L04-1-24 B2L04-1-34
B2L04-1-35 B2L04-2-30 B2L05-1-15 B2L05-1-29 B2L06-1-33 B2L06-1-37 B2L08-1-01 B2L08-1-03 B2L08-1-21 B2L10-1-32
B2L11-1-08 B2L13-1-23 B2L13-1-33 B3L02-2-04 B3L06-1-01 B3L06-1-07 B3L06-1-12 B3L06-1-13 B3L06-1-18 B3L06-1-22
B3L06-1-30 B3L06-1-31 B3L06-1-32 B3L06-1-38 B3L06-1-40 B3L06-2-04 B3L06-2-08 B3L06-2-10 B3L06-2-15 B3L06-2-18
B3L07-1-03 B3L07-1-11 B3L07-2-17 B3L08-1-08 B3L10-1-23 B3L15-1-41 B4L01-1-01 B4L02-1-28 B4L02-2-24 B4L06-1-04
B4L06-2-03 B4L06-2-31 B4L12-1-48 B4L13-1-23 B4L14-1-02 B1L04-1-13
'''.split())

for row in entries:
    for col in row:
        key = (row['ID'], col)
        if key not in errata_df.index:
            continue
        old = errata_df.loc[key, 'Original']
        corr = errata_df.loc[key, 'Corrected']
        # The cell must hold either the recorded original or the corrected value already.
        assert row[col] in (old, corr), (row, col, old, row[col])
        row[col] = corr
    if row['ID'] not in lower_case_meaning_ids:
        continue
    meaning = row['Meaning']
    row['Meaning'] = meaning[0].lower() + meaning[1:]

In [11]:
# Write the final table; entries must already be in ascending ID order.
df = pd.DataFrame(entries)
ids = list(df.ID)
assert ids == sorted(ids)
cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Examples', 'Variants']
#cols += [c for c in df if c not in cols]
df = df.loc[:, cols].set_index('ID').copy()
df.to_csv('modernchinese.csv', index=True)
# Cell output: (number of entries, number of distinct headwords).
len(df), len(set(df.Traditional))

(4061, 3753)

## Expand variants

In [12]:
# Emit one row per variant: fields of the variant object override the base
# row's fields, and the 'Variants' column itself is dropped.
base_rows = pd.read_csv('modernchinese.csv', dtype='str').fillna('').to_dict(orient='records')

expanded_rows = []
for row in base_rows:
    # Rows without variants are expanded to themselves via a single empty override.
    overrides = json.loads(row['Variants']) if row['Variants'] else json.loads('[{}]')
    for var_dict in overrides:
        var = {key: value for key, value in {**row, **var_dict}.items() if key != 'Variants'}
        assert len(var['Simplified'].split()) == 1  # not empty and not spaces
        assert '/' not in var['Simplified']
        expanded_rows.append(var)

expanded_df = pd.DataFrame(expanded_rows)
expanded_df.to_csv('modernchinese-expanded.csv', index=False)
print('modernchinese-expanded.csv: %d rows\n' % len(expanded_df))

modernchinese-expanded.csv: 4218 rows



## Export in Pleco format

In [13]:
# Export in pleco's flashcards format
#
# Output: one '//時代華語/Book N/Lxx-s' header line whenever the
# book/lesson/span changes, then one tab-separated line per headword form:
#   simplified[traditional] <TAB> pinyin <TAB> definition + lesson tag + examples
#
# The strings below are private-use code points embedded in the text as inline
# markup.  NOTE(review): presumably Pleco's formatting escapes (colour start,
# '\uEAC2' as span end, '\uEAB1' as line break) - confirm against Pleco's docs.
EAC1_TAG = '\uEAC1\uEC00\uEC00\uECCC\uEC99'  # lesson tag color, #00cc99 green
EAC1_EX = '\uEAC1\uEC00\uEC05\uECAA\uECFF'   # examples, #05aaff blue
EAC1_HL = '\uEAC1\uEC00\uEC00\uECCC\uECCC'   # term highlight in examples, teal

df = pd.read_csv('modernchinese.csv', dtype='str').fillna('')

# The encoding is given explicitly: the file holds CJK and private-use
# characters, which the platform default encoding may be unable to write.
with open('modernchinese-pleco.txt', 'w', encoding='utf-8') as fout:
    last_header = ''
    for row in df.itertuples():
        m = re.match('^B([1-4])L([0-9]{2})-([1-3])-.*', row.ID)
        assert m

        header = f'//時代華語/Book {m[1]}/L{m[2]}-{m[3]}'
        if header != last_header:
            fout.write(header + '\n')
            last_header = header

        tag = f' {EAC1_TAG}[M{m[1]}L{int(m[2])}]\uEAC2'  # lesson tag [MxLxx]

        defn = f'({row.POS}) {row.Meaning}' if row.POS else row.Meaning
        # remove brackets around MW so pleco doesn't enlarge char+brackets
        defn = re.sub(r' *[(](M: [^()]+)[)]', r'; \1', defn)

        # Each variant is the base row with the variant's fields overriding it;
        # rows without variants yield just the base row.
        variants = []
        trad_variants = set()
        for var_dict in json.loads(row.Variants or '[{}]'):
            var = dict(row._asdict())
            var.update(var_dict)
            variants.append(var)
            trad_variants.add(var['Traditional'])

        ex = ''
        if row.Examples:
            # highlight the term in example sentences
            ex = row.Examples
            assert '\n' not in ex
            masked = trad_variants
            found = any(s in ex for s in masked)
            if not found and '-sep' in row.POS and len(trad_variants) <= 1:
                # '-sep' words may appear split in the example: highlight the
                # first character and the remainder separately, but only when
                # each occurs exactly once and in that order.
                a, b = row.Traditional[0], row.Traditional[1:]
                if ex.count(a) == 1 and ex.count(b) == 1 and ex.index(a) < ex.index(b):
                    masked = [a, b]
                    found = 2
            if found:
                # Longest alternative first so a longer form wins over its own
                # prefix; terms are escaped so they always match literally.
                alternatives = '|'.join(re.escape(s) for s in sorted(masked, key=lambda s: -len(s)))
                ex = re.sub('(%s)' % alternatives,
                            f'\uEAC2{EAC1_HL}\\1\uEAC2{EAC1_EX}', ex)
            ex = f'\uEAB1{EAC1_EX}{ex}\uEAC2'.replace('<br>', '\uEAB1')

        # When variant headwords exist, lead the definition with the main form and its pinyin.
        if set(trad_variants) != set([row.Traditional]):
            defn = f'{row.Traditional} [{row.Pinyin}]\uEAB1{defn}'

        trad_seen = set()
        for var in variants:
            # expand hanzi variants to separate entries, but for pinyin variants emit 'variant (variant)'
            trad = var['Traditional']
            if trad in trad_seen:
                continue
            trad_seen.add(trad)

            pinyin = var['Pinyin']
            pinyin_variants = [x['Pinyin'] for x in variants if x['Traditional'] == var['Traditional']]
            assert len(pinyin_variants) <= 2, variants
            if len(pinyin_variants) > 1:
                pinyin = '%s (%s)' % tuple(pinyin_variants)

            simp = var['Simplified']
            fout.write(f'{simp}[{trad}]\t{pinyin}\t{defn}{tag}{ex}\n')

## Generate anki package

In [14]:
!mkdir -p data/media
!cp -f ../downloads/fonts/MoeStandardKai.ttf data/media/_MoeStandardKai.ttf

import os, glob, pandas as pd

import subprocess

# Cut one mp3 clip per audio.csv row out of its source recording.
# A row is skipped when the source file is missing, the clip already exists,
# or its OK column is '0'.
for row in pd.read_csv('audio.csv', dtype='str').fillna('').itertuples():
    src = f'downloads/shengzi/{row.Source}'
    dst = f'data/media/modernchinese-{row.ID}.mp3'
    if os.path.exists(src) and not os.path.exists(dst) and row.OK != '0':
        # Arguments are passed as a list, without a shell, so quotes or other
        # shell metacharacters in the CSV values cannot break or alter the command.
        cmd = ['ffmpeg', '-v', 'error', '-i', src, '-ss', row.Start, '-to', row.End,
               '-c', 'copy', '-vn', '-sn', '-dn', '-y', dst]
        assert subprocess.run(cmd).returncode == 0, cmd

In [15]:
# Note fields, in the order they are stored on every Anki note.
cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Examples', 'Variants', 'Audio']


def _read_text(path):
    """Return the contents of a text file, closing the handle right away."""
    with open(path) as f:
        return f.read()


# The card templates and CSS are reused from the ../dangdai directory; the
# answer template gets an extra block showing the examples after the audio.
model = genanki.Model(
    1696565462,
    'ModernChinese',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'ModernChinese',
        'qfmt': _read_text('../dangdai/dangdai-qfmt.html'),
        'afmt': _read_text('../dangdai/dangdai-afmt.html').replace(
            '<div>{{Audio}}</div>',
            '<div>{{Audio}}</div>\n<br>\n<div>{{Examples}}</div>')
    }],
    css=_read_text('../dangdai/dangdai.css'),
)

deck = genanki.Deck(
    1696565463,
    name='modernchinese',
    description='Modern Chinese vocabulary deck'
)

audio_df = pd.read_csv('audio.csv', dtype='str').set_index('ID')

for row in pd.read_csv('modernchinese.csv', dtype='str').fillna('').to_dict(orient='records'):
    # A sound reference is added unless the audio row's OK value is 0.
    # NOTE(review): this raises KeyError for an ID missing from audio.csv and
    # ValueError for a blank OK cell, whereas the clip-extraction cell treats a
    # blank OK as usable - confirm which rule is intended.
    row['Audio'] = ''
    if int(audio_df.loc[row['ID'], 'OK']) != 0:
        row['Audio'] = f"[sound:modernchinese-{row['ID']}.mp3]"
    note = genanki.Note(
        model=model,
        fields=[row[c] for c in cols],
        # The GUID is derived from the entry ID and headword.
        guid=genanki.guid_for('modernchinese', row['ID'], row['Traditional']),
    )
    deck.add_note(note)

!rm -f modernchinese.apkg
genanki.Package(deck, media_files=glob.glob('data/media/*')).write_to_file('modernchinese.apkg')