In [1]:
import re
from collections import Counter
from utils import load_json, get_toned_bpm

# Matches one character in the range 一 (U+4E00) .. 龜 (U+9F9C); used below to
# decide whether a corpus entry contains any Chinese character.
pat_chr = re.compile(r'[一-龜]')

# Load the three JSON resources in a single pass (same order as before):
#   asbc_fq   - iterated below as (entry, frequency) pairs
#   char_phon - single character -> list of pronunciations
#   word_phon - word -> list of pronunciations
asbc_fq, char_phon, word_phon = (
    load_json(filename)
    for filename in (
        "ASBC_unigrams.json",
        "moe_char_phon.json",
        "moe_word_phon.json",
    )
)

## Number of pronunciations in Moe Dict (Word)

In [2]:
# Histogram: number of pronunciations listed per word -> how many words have that many
Counter(map(len, word_phon.values()))

Counter({1: 150948, 2: 619, 3: 9})

## Number of pronunciations in Moe Dict (Character)

In [3]:
# Histogram: number of pronunciations listed per character -> how many characters have that many
Counter(map(len, char_phon.values()))

Counter({1: 8643, 2: 1541, 3: 220, 4: 35, 6: 3, 5: 9})

## Frequency of phones in ASBC

In [4]:
# Three frequency tallies built from the corpus counts in asbc_fq:
#   char_phon_fq  - phone -> summed corpus frequency
#   guessed_fq    - character -> summed frequency of occurrences whose phone was
#                   taken as the first of several listed pronunciations
#   unresolved_fq - character -> summed frequency of occurrences with no entry
#                   in char_phon
char_phon_fq, guessed_fq, unresolved_fq = Counter(), Counter(), Counter()

for word, freq in asbc_fq.items():
    # Entries without a single Chinese character are ignored entirely.
    if pat_chr.search(word) is None:
        continue

    if word in word_phon:
        # Whole-word lookup: always take the first listed pronunciation.
        readings = word_phon[word]
        has_alternatives = len(readings) > 1
        # NOTE(review): word[pos] assumes the 'bpm' sequence lines up one-to-one
        # with the characters of the word - TODO confirm against the data.
        for pos, syllable in enumerate(readings[0]['bpm']):
            char_phon_fq[syllable] += freq
            # Counted as a guess only when the word itself has several
            # pronunciations AND the character at this position does too.
            if has_alternatives and len(char_phon.get(word[pos], [])) > 1:
                guessed_fq[word[pos]] += freq
    else:
        # No word-level entry: resolve each character on its own.
        for char in word:
            if char not in char_phon:
                unresolved_fq[char] += freq
                continue
            readings = char_phon[char]
            # get_toned_bpm comes from utils; presumably it yields the same
            # toned form as the word-level 'bpm' items - verify.
            char_phon_fq[get_toned_bpm(readings[0])] += freq
            if len(readings) > 1:
                guessed_fq[char] += freq

In [8]:
# The 20 phones with the highest accumulated frequency, most frequent first
char_phon_fq.most_common(n=20)

[('ㄉㄧˋ', 666127),
 ('ㄕˋ', 354501),
 ('ㄧ', 230911),
 ('ㄅㄨˋ', 204599),
 ('ㄗㄞˋ', 161114),
 ('ㄧㄡˇ', 155121),
 ('ㄕˊ', 154902),
 ('ㄖㄣˊ', 138061),
 ('ㄧˇ', 115095),
 ('ㄊㄚ', 114845),
 ('ㄨㄛˇ', 102508),
 ('ㄓㄨㄥ', 98136),
 ('ㄌㄧㄠˇ', 97431),
 ('ㄨㄟˊ', 95407),
 ('ㄧˋ', 94025),
 ('ㄉㄠˋ', 93724),
 ('ㄉㄚˋ', 93322),
 ('ㄍㄜˋ', 92693),
 ('ㄌㄧˋ', 92592),
 ('ㄓ', 90190)]

In [5]:
# Total character occurrences over all corpus entries containing at least one
# Chinese character (every character of such an entry is counted).
corp_size = sum(
    fq * len(w)
    for w, fq in asbc_fq.items()
    if pat_chr.search(w)
)

# Totals of the three tallies built above.
resolved_size = sum(char_phon_fq.values())
guessed_size = sum(guessed_fq.values())
unresolved_size = sum(unresolved_fq.values())

# Share of the corpus that received a phone, and the share of those phones
# that were picked from several candidates.
print('Resolved:', resolved_size / corp_size)
print('Guessed:', guessed_size / resolved_size)

Resolved: 0.9969062128057027
Guessed: 0.19880567237306349
