# PAVC

Source: https://www.plecoforums.com/threads/practical-audio-visual-chinese-dictionary-and-flashcards.2403/

In [1]:
import os, re, glob, opencc
import pandas as pd
# Shared OpenCC converter built once with the 'tw2s' configuration;
# to_simplified() below delegates to its convert() method.
opencc_tw2s = opencc.OpenCC('tw2s')

In [2]:
# Tone-marked variants of every pinyin vowel. Each string is indexed by tone
# number: index 0 is the unmarked vowel, indices 1-4 carry tones 1-4.
TONE_CHARS = {
  'a': 'aāáǎà',
  'A': 'AĀÁǍÀ',
  'e': 'eēéěè',
  'E': 'EĒÉĚÈ',
  'i': 'iīíǐì',
  'I': 'IĪÍǏÌ',
  'o': 'oōóǒò',
  'O': 'OŌÓǑÒ',
  'u': 'uūúǔù',
  'U': 'UŪÚǓÙ',
  'ü': 'üǖǘǚǜ',
  'Ü': 'ÜǕǗǙǛ',
}


def to_pinyin_with_diacritics(text):
    """Convert numbered pinyin (e.g. ``'ni3hao3'``) to tone-marked pinyin.

    Each syllable must end in a tone digit 1-5; ``u:``/``U:`` is accepted as
    a spelling of ``ü``/``Ü``. Hyphens are treated as syllable breaks.
    Besides syllables, the only tokens accepted are single ASCII letters,
    ``,`` and ``·`` (the latter is rendered as a plain space).

    Output spacing is derived from the syllables themselves, not from the
    spaces in ``text``: a space is inserted before a capitalised syllable and
    around special tokens, an apostrophe before a lowercase syllable starting
    with a/e/o, and all other syllables are joined directly.

    Args:
        text: Pinyin string with tone numbers.

    Returns:
        The pinyin string with diacritics; tone 5 syllables are left unmarked.

    Raises:
        AssertionError: if a token is neither a numbered syllable nor one of
            the accepted single-character tokens.
    """
    res = ''
    spec = None  # True when the previously processed token was a special one

    # Guarantee exactly one break after every tone digit so that split()
    # yields one token per syllable even for run-together input.
    text = re.sub('([0-9]) *', r'\1 ', text)
    text = text.replace('-', ' ')  # Jin1-Xia4, Ou1-Mei3, Wei4-Jin4, Chun1-Qiu1 Shi2dai4

    for syll in text.split():
        m = re.match('^((?:[A-Za-züÜ]|[Uu]:)+)([1-5])', syll)
        if not m:
            # Special token: a lone letter, a comma, or a middle dot.
            assert len(syll) == 1 and re.match('[A-Za-z,·]', syll), syll
            if res and not spec and syll != ',' and not res.endswith(' '):
                res += ' '
            if syll != '·':
                res += syll
            spec = True
            continue

        syll, tone = m[1], m[2]
        syll = syll.replace('u:', 'ü').replace('U:', 'Ü')
        syll_l = syll.lower()

        # Determine index of vowel receiving the tone mark.
        # Priority: a, e, o, last vowel (https://pinyin.info/rules/where.html)
        # `tone` is a string, so it must be compared against '5', not 5.
        i = -1
        if tone != '5':
            for vowel in 'aeo':
                if vowel in syll_l:
                    i = syll_l.index(vowel)
                    break
            else:
                i = max(syll_l.rfind(c) for c in 'iuü')

        if res:
            if spec or syll[0] != syll_l[0]:
                # A space separates this syllable from what precedes it; never
                # fall through to the apostrophe rule when the space is
                # already there (e.g. right after a '·' separator).
                if not res.endswith(' '):
                    res += ' '
            elif syll_l[0] in 'aeo':
                res += "'"

        if i != -1:
            res += syll[:i] + TONE_CHARS[syll[i]][int(tone)] + syll[i+1:]
        else:
            res += syll
        spec = False

    return res

def to_simplified(trad):
    """Return `trad` converted by the module-level `opencc_tw2s` converter."""
    simplified = opencc_tw2s.convert(trad)
    return simplified

In [3]:
# Parse the flashcard export into one row per unique
# (traditional, definition) pair and write the result to pavc.csv.
rows = []
defs_seen = set()

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding; the `with` block closes the file when parsing is done.
with open("pavc-flashcards.txt", encoding="utf-8") as flashcards:
    for line in flashcards:
        line = line.strip()
        # Skip '//' comment lines (the first one may be preceded by a BOM)
        # and any line containing a '[TOP-' marker.
        if line.startswith(('// ', '\ufeff//')) or '[TOP-' in line:
            continue

        # Expected layout: <group 1>[<group 2>] TAB <pinyin> TAB <definition>
        m = re.match(r'^([^[]+)\[([^\]]+)\]\t([^\t]+)\t(.*)$', line)
        assert m, line

        trad = m[2]
        simp = m[1]
        pinyin = m[3]
        defn = m[4]

        # Drop repeated entries. A tuple key cannot collide the way the plain
        # concatenation trad+defn could.
        key = (trad, defn)
        if key in defs_seen:
            continue
        defs_seen.add(key)

        if '…' in defn:
            # Remove space-delimited parenthesised groups of a few lowercase
            # letters, but only those that actually contain an ellipsis.
            for match in re.findall(r' \(…?[a-z]{1,2}…?[a-z]{0,2}…?\) ', defn):
                if '…' in match:
                    defn = defn.replace(match, ' ').strip()

        # Disabled debugging checks, kept for reference:
        #if '～' in line and len(m[1]) == 4:
        #    exp = re.findall(r' (\([\u4300-\u9FFFF]*～[\u4300-\u9FFFF]*\))', line)
        #    if len(exp) == 1 and (f'{exp[0]} [PAVC' in line or f': {exp[0]}' in line):
        #        print(line, '\n', exp[0])

        #if re.match('.*PAVC.*PAVC.*', defn):
        #    print(line)

        #if row['Simplified'] != to_simplified(row['Traditional']):
        #    print(line[:min(len(line), 50)], '\t', to_simplified(row['Traditional']), row['Simplified'])

        row = {
            'Traditional': trad,
            'Simplified': simp,
            'Pinyin': to_pinyin_with_diacritics(pinyin),
            'Meaning': defn,
        }
        rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('pavc.csv', index=False)

!wc pavc.csv

  5768  42854 417242 pavc.csv
