# Download and parse CC-CEDICT

### Download

Download and unpack the latest version from upstream:

In [1]:
!curl -s -o cedict.txt.gz "https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz"
!gzip -d <cedict.txt.gz | tr -d '\r' >cedict.txt && rm -f cedict.txt.gz
!grep '^#' cedict.txt

# CC-CEDICT
# Community maintained free Chinese-English dictionary.
# 
# Published by MDBG
# 
# License:
# Creative Commons Attribution-ShareAlike 4.0 International License
# https://creativecommons.org/licenses/by-sa/4.0/
# 
# Referenced works:
# CEDICT - Copyright (C) 1997, 1998 Paul Andrew Denisowski
# 
# CC-CEDICT can be downloaded from:
# https://www.mdbg.net/chinese/dictionary?page=cc-cedict
# 
# Additions and corrections can be sent through:
# https://cc-cedict.org/editor/editor.php
# 
# For more information about CC-CEDICT see:
# https://cc-cedict.org/wiki/
# 
#! version=1
#! subversion=0
#! format=ts
#! charset=UTF-8
#! entries=122111
#! publisher=MDBG
#! license=https://creativecommons.org/licenses/by-sa/4.0/
#! date=2023-11-04T07:10:40Z
#! time=1699081840


### Parse

In [2]:
import pandas as pd
import re

# Read the dictionary explicitly as UTF-8 (the file declares charset=UTF-8 in
# its header) and close the handle deterministically; relying on the locale's
# default encoding breaks on platforms where it is not UTF-8.
with open('cedict.txt', encoding='utf-8') as cedict_file:
    cedict_text = cedict_file.read()
cedict_lines = cedict_text.split('\n')
cedict_entries = []
kexpect = 0  # entry count announced by the "#! entries=N" header line

for lineno, line in enumerate(cedict_lines, start=1):
    if not line:
        # Tolerate blank lines, e.g. the empty string that split('\n')
        # yields after a trailing newline.
        continue

    if line.startswith('#'):
        # Header/comment line; only the declared entry count is of interest.
        m = re.match(r'^#! entries=([0-9]+)$', line)
        if m:
            kexpect = int(m[1])
        continue

    # Entry format: "<traditional> <simplified> [<pinyin>] /<def>/<def>/.../"
    m = re.match(r'^([^[\] ]+) ([^[\] ]+) \[([^[\]]+)\] /(.*)/$', line)
    assert m, f'line {lineno}: cannot parse {line!r}'

    entry = {
        'Traditional': m[1],
        'Simplified': m[2],
        'Pinyin': m[3],
        'Definitions': m[4],  # individual definitions stay "/"-separated
    }
    cedict_entries.append(entry)

df = pd.DataFrame(cedict_entries)

# The number of parsed entries must match what the file header announces.
assert len(df) == kexpect, f'parsed {len(df)} entries, header declares {kexpect}'
assert all(df[['Traditional', 'Simplified', 'Pinyin']].value_counts() == 1)  # Trad+Simp+Pinyin is unique entry key

### Convert pinyin to the standard form with diacritics

In [3]:
# For each vowel: the plain letter (index 0) followed by its tone 1-4 variants.
TONE_CHARS = {
    'a': 'aāáǎà', 'e': 'eēéěè', 'o': 'oōóǒò', 'i': 'iīíǐì', 'u': 'uūúǔù', 'ü': 'üǖǘǚǜ',
    'A': 'AĀÁǍÀ', 'E': 'EĒÉĚÈ', 'O': 'OŌÓǑÒ', 'I': 'IĪÍǏÌ', 'U': 'UŪÚǓÙ', 'Ü': 'ÜǕǗǙǛ',
}


def to_pinyin_with_diacritics(text):
    """Convert CEDICT's space-separated numbered pinyin to pinyin with tone marks.

    Syllables are joined without spaces; a space is inserted before a
    capitalised syllable and around lone letters/separators, and an
    apostrophe is inserted before a syllable starting with a, e or o that
    directly follows another syllable (e.g. ``xi1 an1`` -> ``xī'ān``).
    ``u:`` is rendered as ``ü``; tone 5 (neutral) gets no mark; the ``·``
    separator is replaced by a space.

    Args:
        text: numbered pinyin, e.g. ``'biang2 biang2 mian4'``.

    Returns:
        The pinyin string with diacritics, e.g. ``'biángbiángmiàn'``.

    Raises:
        AssertionError: if a token is neither a numbered syllable nor a
            single letter, comma or ``·``.
    """
    res = ''
    spec = None  # True when the previous token was a lone letter or separator

    for syll in text.split():
        m = re.match('^((?:[A-Za-z]|[Uu]:)+)([1-5])', syll)
        if not m:
            # Not a numbered syllable: only single letters and separators are allowed.
            assert len(syll) == 1 and re.match('[A-Za-z,·]', syll)
            if res and not spec and syll != ',' and not res.endswith(' '):
                res += ' '
            if syll != '·':
                res += syll
            spec = True
            continue

        syll, tone = m[1], m[2]
        syll = syll.replace('u:', 'ü').replace('U:', 'Ü')
        syll_l = syll.lower()

        # Determine index of vowel receiving the tone mark.
        # Priority: a, e, o, last vowel (https://pinyin.info/rules/where.html)
        i = -1
        if tone != '5':  # tone is a string; the neutral tone carries no mark
            if 'a' in syll_l:
                i = syll_l.index('a')
            elif 'e' in syll_l:
                i = syll_l.index('e')
            elif 'o' in syll_l:
                i = syll_l.index('o')
            else:
                for j, c in enumerate(syll_l):
                    if c in 'iuü':
                        i = j

        # A joiner is only needed when the result does not already end with a
        # space; otherwise a syllable starting with a/e/o that follows a "·"
        # separator would get a stray apostrophe after the space.
        if res and not res.endswith(' '):
            if spec or syll[0] != syll_l[0]:
                # New word: after a lone letter/separator or before a capital.
                res += ' '
            elif syll_l[0] in 'aeo':
                # Disambiguate the syllable boundary inside a word.
                res += "'"

        if i != -1 and tone in '1234':
            res += syll[:i] + TONE_CHARS[syll[i]][int(tone)] + syll[i+1:]
        else:
            res += syll
        spec = False

    return res

# Always convert from the numbered pinyin. On a re-run of this cell the
# 'Pinyin' column already holds diacritic pinyin, so take the source from
# 'PinyinNumbered' when it exists; this keeps the cell idempotent.
numbered_pinyin = df['PinyinNumbered'] if 'PinyinNumbered' in df.columns else df['Pinyin']
df = df.assign(
    PinyinNumbered=numbered_pinyin,
    Pinyin=[to_pinyin_with_diacritics(s) for s in numbered_pinyin],
)[['Traditional', 'Simplified', 'PinyinNumbered', 'Pinyin', 'Definitions']]
df.to_csv('cedict.csv', index=False)

df.tail()

Unnamed: 0,Traditional,Simplified,PinyinNumbered,Pinyin,Definitions
122106,𨭆,𬭶,hei1,hēi,hassium (chemistry)
122107,𨭎,𬭳,xi3,xǐ,seaborgium (chemistry)
122108,𩧢,𱅒,cheng3,chěng,variant of 騁|骋[cheng3]
122109,𰻞,𰻝,biang2,biáng,used in 𰻞𰻞麵|𰻝𰻝面[biang2 biang2 mian4]
122110,𰻞𰻞麵,𰻝𰻝面,biang2 biang2 mian4,biángbiángmiàn,"broad, belt-shaped noodles, popular in Shaanxi"


### Syllable stats

Create `syllables.csv`, which maps each character (traditional/simplified pair) to its possible pronunciations, together with how often each pronunciation occurs across dictionary entries.

TODO: merge pronunciations from variant characters.

In [4]:
# Maps (traditional char, simplified char, numbered syllable) -> output row.
syllables_data = {}

# Phrases used in definitions to introduce alternative pronunciations.
pr_prefix = r'((?:also|colloquial|coll\.|always|old|Beijing|Taiwan|in this sense, commonly) pr\.|/pr\.)(?: for this sense is|)'
# One prefix followed by a bracketed reading and any number of further
# readings joined by "," or " or", e.g.
#   also pr. [lian2 lei3] or [lian2 lei4]
#   Taiwan pr. [pang2], [bang1], [bang4]
# A single pattern collects each reading exactly once; running separate
# overlapping patterns for 1, 2 and 3 readings would re-add the earlier
# readings and inflate their frequencies.
alt_pinyin_re = re.compile(pr_prefix + r' \[([^]/]+)\]((?:(?:,| or) \[[^]/]+\])*)')
bracketed_re = re.compile(r'\[([^]/]+)\]')
# A numbered pinyin syllable: letters (optionally "u:"-style) plus a tone digit.
syllable_re = re.compile('((?:[A-Za-z]|[aeoui]:)+)([1-5])')
# Lone foreign letters and separators that may stand between syllables.
non_syllable_re = re.compile('[A-Za-z,·]')

# Read every column as text and disable NA detection so that no headword or
# reading is turned into a number or NaN by pandas' type inference.
for entry in pd.read_csv('cedict.csv', dtype=str, keep_default_na=False).itertuples():
    trad = entry.Traditional
    simp = entry.Simplified
    defs = entry.Definitions

    # The entry's own reading plus every alternative reading in its definitions.
    all_pinyin = [entry.PinyinNumbered]
    for m in alt_pinyin_re.finditer(defs):
        all_pinyin.append(m[2])
        all_pinyin.extend(bracketed_re.findall(m[3]))

    assert len(simp) == len(trad)

    for pinyin in all_pinyin:
        syllables = pinyin.lower().split(' ')
        for syll in syllables:
            if not syllable_re.fullmatch(syll):
                # only foreign letters, one by one, and separators
                assert len(syll) == 1 and non_syllable_re.fullmatch(syll)

        if len(syllables) != len(trad):
            # Characters and syllables cannot be paired one-to-one
            # (e.g. digits or Latin letters in the headword, or a single
            # character read as several syllables); report and skip.
            if 'single-character equivalent' in defs or defs == 'centiwatt (old)':
                assert len(trad) == 1
            print(f'Exception: {trad} [{pinyin}]')
            continue

        for t, s, p in zip(trad, simp, syllables):
            if not syllable_re.fullmatch(p):
                continue  # lone letter or separator, not a pronunciation
            key = t, s, p
            syllables_data.setdefault(key, {
                'Traditional': t,
                'Simplified': s,
                'PinyinNumbered': p,
                'Pinyin': to_pinyin_with_diacritics(p),
                'Freq': 0
            })
            syllables_data[key]['Freq'] += 1

syllables_df = pd.DataFrame(syllables_data.values()).sort_values('Freq', ascending=False)
syllables_df.to_csv('syllables.csv', index=False)
print('syllables.csv: %d rows' % len(syllables_df))

syllables_df.head()

Exception: 21三體綜合症 [er4 shi2 yi1 san1 ti3 zong1 he2 zheng4]
Exception: PO [po1]
Exception: PO [pou1]
Exception: PO文 [po1 wen2]
Exception: TA [ta1]
Exception: ㄅㄧㄤˋ [biang4]
Exception: 兙 [shi2 ke4]
Exception: 兛 [qian1 ke4]
Exception: 兝 [fen1 ke4]
Exception: 兞 [hao2 ke4]
Exception: 兡 [bai3 ke4]
Exception: 兣 [li2 ke4]
Exception: 瓧 [shi2 wa3]
Exception: 瓩 [qian1 wa3]
Exception: 瓰 [fen1 wa3]
Exception: 瓱 [hao2 wa3]
Exception: 瓸 [bai3 wa3]
Exception: 瓼 [li3 wa3]
Exception: 粨 [bai3 mi3]
Exception: 美國51區 [Mei3 guo2 Wu3 shi2 yi1 Qu1]
syllables.csv: 14015 rows


Unnamed: 0,Traditional,Simplified,PinyinNumbered,Pinyin,Freq
333,不,不,bu4,bù,2154
1058,縣,县,xian4,xiàn,1856
409,大,大,da4,dà,1854
43,人,人,ren2,rén,1703
326,一,一,yi1,yī,1254
