In [1]:
import json, os
from pypinyin import pinyin, lazy_pinyin, Style
from pypinyin_dict.pinyin_data import kxhc1983
# NOTE(review): presumably registers the kXHC1983 single-character pinyin data
# with pypinyin so later pinyin()/lazy_pinyin() calls use it — confirm against
# the pypinyin-dict documentation.
kxhc1983.load()

In [2]:

def load_json(json_path):
    """Read a JSON file and return the decoded Python object.

    The file is opened explicitly as UTF-8: the corpus lists contain Chinese
    text, and relying on the platform default encoding breaks on systems
    whose locale is not UTF-8.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Concatenate the entries of the three corpus lists, in this order, into one list.
pair_list = [
    entry
    for corpus_json in (
        './data_pre/biaobei_list.json',
        './data_pre/aishell3_list.json',
        './data_pre/thchs30_list.json',
    )
    for entry in load_json(corpus_json)
]


In [3]:
# Peek at the first three entries; each one is [wav path, text, list of labeled pinyin].
pair_list[:3]

[['/home/xingxing/tts/dataset/BZNSYP/Wave/000001.wav',
  '卡尔普陪外孙玩滑梯。',
  ['ka2', 'er2', 'pu3', 'pei2', 'wai4', 'sun1', 'wan2', 'hua2', 'ti1']],
 ['/home/xingxing/tts/dataset/BZNSYP/Wave/000002.wav',
  '假语村言别再拥抱我。',
  ['jia2', 'yu3', 'cun1', 'yan2', 'bie2', 'zai4', 'yong1', 'bao4', 'wo3']],
 ['/home/xingxing/tts/dataset/BZNSYP/Wave/000003.wav',
  '宝马配挂跛骡鞍，貂蝉怨枕董翁榻。',
  ['bao2',
   'ma3',
   'pei4',
   'gua4',
   'bo3',
   'luo2',
   'an1',
   'diao1',
   'chan2',
   'yuan4',
   'zhen3',
   'dong3',
   'weng1',
   'ta4']]]

In [4]:
import re

def filter_punctuations(text):
    """Return ``text`` with punctuation and other symbols removed.

    A character is kept when it is a word character (letter, digit or
    underscore), whitespace, or lies in the range U+4E00..U+9FA5; every
    other character is deleted.
    """
    unwanted = re.compile(r'[^\w\s\u4e00-\u9fa5]')
    return unwanted.sub('', text)

# Example usage: the full-width comma, the exclamation mark and the trailing
# dots are removed, while the Chinese characters are kept.
chinese_text = "这是一个示例文本，包含一些标点符号！..."
filtered_text = filter_punctuations(chinese_text)
print(filtered_text)

这是一个示例文本包含一些标点符号


In [15]:
# Polyphone statistics over the whole corpus.
#   total_chars      - number of entries returned by pinyin() over all sentences
#   total_polyphones - entries that have more than one candidate reading
#   total_pp_sents   - sentences containing at least one such entry
#   pp_char_stats    - character -> number of times it appeared with several readings
#   total_uni_chars  - distinct characters seen
total_chars = 0
total_polyphones = 0
total_pp_sents = 0

pp_char_stats = {}
total_uni_chars = set()

for pair in pair_list:
    text = filter_punctuations(pair[1])
    readings = pinyin(text, heteronym=True, style=Style.TONE3, neutral_tone_with_five=True)
    total_chars += len(readings)

    # NOTE(review): characters and entries are paired by position. This assumes
    # pinyin() yields exactly one entry per character of `text`; a run of
    # non-Chinese characters (letters, digits, whitespace survive the filter)
    # would presumably come back as a single entry and shift the pairing —
    # confirm against the data.
    sentence_polyphones = 0
    for char, candidates in zip(text, readings):
        total_uni_chars.add(char)
        if len(candidates) > 1:
            sentence_polyphones += 1
            pp_char_stats[char] = pp_char_stats.get(char, 0) + 1

    total_polyphones += sentence_polyphones
    if sentence_polyphones > 0:
        total_pp_sents += 1

In [7]:
# Number of pinyin() entries seen, and how many of them have several candidate readings.
total_chars, total_polyphones

(1599493, 263743)

In [8]:
# Share of pinyin() entries that have more than one candidate reading.
total_polyphones/total_chars

0.16489162503368254

In [10]:
# Share of sentences containing at least one entry with several candidate readings.
total_pp_sents / len(pair_list)

0.8616161978783364

In [13]:
# Number of distinct characters recorded with more than one candidate reading.
len(pp_char_stats)

700

In [16]:
# Number of distinct characters collected in total_uni_chars.
len(total_uni_chars)

4844

In [17]:
# Share of distinct characters that were seen with more than one candidate
# reading. Computed from the live values rather than numbers copied from
# earlier outputs, so it stays correct when the data changes.
len(pp_char_stats) / len(total_uni_chars)

0.14450867052023122

In [19]:
# Statistics over the labeled pinyin (last element of every entry).
#   total_pys - number of labeled syllables
#   r_pys     - syllables whose second-to-last character is 'r'
#   er_pys    - syllables that start with 'er'
total_pys = 0
r_pys = 0
er_pys = 0

for pair in pair_list:
    syllables = pair[-1]
    total_pys += len(syllables)
    for syllable in syllables:
        # Slice instead of indexing: a token shorter than two characters then
        # simply does not match instead of raising IndexError.
        if syllable[-2:-1] == 'r':
            r_pys += 1
        if syllable[:2] == 'er':
            er_pys += 1
    

In [20]:
# Labeled syllables in total, those with 'r' as second-to-last character, and those starting with 'er'.
total_pys, r_pys, er_pys

(1597455, 17998, 15955)

In [23]:
# Share of the 'r'-penultimate syllables that remain after subtracting those
# starting with 'er' (presumably the erhua-merged syllables — confirm).
(r_pys - er_pys) / r_pys

0.11351261251250139

In [24]:
# Number of 'r'-penultimate syllables that do not start with 'er'. Uses the
# computed counters instead of values copied from an earlier output.
r_pys - er_pys

2043

In [35]:
# Count labeled readings that do not appear among the candidate readings
# pinyin() returns for the same position.
ts_pys = 0

for entry in pair_list:
    text = filter_punctuations(entry[1])
    labeled = entry[2]
    candidates = pinyin(text, heteronym=True, style=Style.TONE3, neutral_tone_with_five=True)
    # Only sentences whose label and candidate lists have the same length are
    # compared position by position; the others are skipped.
    if len(labeled) == len(candidates):
        ts_pys += sum(
            1 for label, options in zip(labeled, candidates) if label not in options
        )

In [36]:
# Labeled readings missing from pypinyin's candidates, and their ratio to total_chars.
# NOTE(review): total_chars also counts sentences that the counting loop skipped
# for a length mismatch, so the denominator is not the number of positions
# actually compared.
ts_pys, ts_pys/total_chars

(81792, 0.05113620378457424)

In [42]:
# Compare lazy_pinyin's single prediction per position with the labeled pinyin.
#   pypinyin_err - positions where prediction and label differ
#   p_total_pys  - positions compared
#   err_chars    - character -> number of mismatches at that character
pypinyin_err = 0
p_total_pys = 0
err_chars = {}

for entry in pair_list:
    text = filter_punctuations(entry[1])
    labeled = entry[2]
    predicted = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True, tone_sandhi=True)
    # Sentences whose label and prediction lengths differ cannot be aligned and are skipped.
    if len(labeled) == len(predicted):
        for idx, (label, guess) in enumerate(zip(labeled, predicted)):
            if label != guess:
                pypinyin_err += 1
                wrong_char = text[idx]
                err_chars[wrong_char] = err_chars.get(wrong_char, 0) + 1
            p_total_pys += 1

In [43]:
# Mismatch count, number of compared positions, and the resulting mismatch rate.
pypinyin_err, p_total_pys, pypinyin_err/p_total_pys

(158336, 1573494, 0.10062701224154652)

In [45]:
# Characters ranked by number of mismatches between lazy_pinyin and the labels, most frequent first.
sorted(err_chars.items(), key=lambda x: x[1], reverse=True)

[('一', 10274),
 ('上', 3731),
 ('市', 3153),
 ('要', 3099),
 ('有', 3053),
 ('万', 2901),
 ('着', 2573),
 ('个', 2406),
 ('与', 2361),
 ('说', 2250),
 ('作', 2160),
 ('已', 2159),
 ('什', 1966),
 ('我', 1820),
 ('为', 1667),
 ('从', 1644),
 ('那', 1522),
 ('并', 1520),
 ('看', 1518),
 ('得', 1458),
 ('无', 1457),
 ('区', 1378),
 ('过', 1269),
 ('子', 1236),
 ('更', 1190),
 ('行', 1117),
 ('五', 1107),
 ('吗', 1106),
 ('不', 1083),
 ('百', 1078),
 ('其', 919),
 ('产', 914),
 ('场', 907),
 ('种', 905),
 ('九', 898),
 ('影', 897),
 ('点', 889),
 ('给', 874),
 ('你', 866),
 ('育', 862),
 ('约', 846),
 ('教', 759),
 ('据', 755),
 ('几', 750),
 ('管', 734),
 ('化', 704),
 ('格', 694),
 ('指', 691),
 ('厂', 689),
 ('地', 671),
 ('称', 645),
 ('保', 642),
 ('罗', 631),
 ('也', 623),
 ('南', 612),
 ('各', 611),
 ('所', 592),
 ('小', 590),
 ('智', 579),
 ('只', 552),
 ('台', 551),
 ('吧', 546),
 ('处', 544),
 ('片', 537),
 ('正', 520),
 ('打', 514),
 ('以', 473),
 ('红', 450),
 ('派', 447),
 ('长', 440),
 ('跑', 438),
 ('十', 435),
 ('量', 432),
 ('很', 416),
 ('曲', 