# Prepare for G2PA

In [2]:
import json, re, os, shutil
from pypinyin_dict.phrase_pinyin_data import large_pinyin
from pypinyin_dict.pinyin_data import zdic
large_pinyin.load()
zdic.load()
from pypinyin import pinyin, lazy_pinyin, Style
import jieba.posseg as psg
import jieba
jieba.enable_parallel()
from g2pM import G2pM


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.541 seconds.
Prefix dict has been built successfully.


In [3]:
with open('../line_list.json', 'r') as of:
    line_list = json.load(of)
    
line_list[:2]

[['000001', '卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1'],
 ['000002', '假语村言别再拥抱我。', 'jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3']]

In [4]:
def filter_punc(text):
    return re.sub(r'[,.!?;:、。？！ 《》<>，。_…“”：]', '', text).replace(' ', '')

In [5]:
g2pm = G2pM()

In [6]:
line_list[-1]

['010000',
 '在狱中，张明宝悔恨交加，写了一份忏悔书。',
 'zai4 yu4 zhong1 zhang1 ming2 bao2 hui3 hen4 jiao1 jia1 xie3 le5 yi2 fen4 chan4 hui3 shu1']

In [7]:
test_py_list = []
cp_dir = '/home/xingxing/tts/dataset/BZNSYP/Wave/'
base_dir = 'lab_wav_pairs'

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

for l in line_list[8000:]:
    file_name, file_text, py = l
    # print(file_name, file_text, py)
    line_py = lazy_pinyin(list(jieba.cut(file_text)), style=Style.TONE3, tone_sandhi=True, neutral_tone_with_five=True)
    line_py = list(filter(lambda x: len(filter_punc(x)) == len(x), line_py))
    g2pm_py = g2pm(file_text, tone=True)
    g2pm_py = list(filter(lambda x: len(filter_punc(x)) == len(x), g2pm_py))
    # g2pw_py = g2pw(file_text)[0]
    # g2pw_py = list(filter(lambda x: x != None and len(filter_punc(x)) == len(x), g2pw_py))
    hetero_py = [pinyin(char, heteronym=True, style=Style.TONE3, neutral_tone_with_five=True)[0] for char in filter_punc(file_text)]
    # print(line_py, hetero_py)
    assert len(line_py) == len(hetero_py)
    if len(line_py) == len(py.split(' ')):
        test_py_list.append({
            'file_name': file_name,
            'line_text': file_text,
            'gt_py': py,
            'line_py': line_py,
            'hetero_py': hetero_py,
            'g2pm_py': g2pm_py,
        })
        shutil.copyfile(cp_dir + file_name + '.wav', base_dir + '/' + file_name + '.wav')
        save_path = os.path.join(base_dir, file_name + '.lab')
        f = open(save_path, "w")
        f.write(' '.join(line_py))
        f.close()

In [8]:
test_py_list

[{'file_name': '008002',
  'line_text': '喜欢把李敖的书插在牛仔裤的前面？',
  'gt_py': 'xi3 huan1 ba2 li3 ao2 de5 shu1 cha1 zai4 niu2 zai3 ku4 de5 qian2 mian4',
  'line_py': ['xi3',
   'huan5',
   'ba3',
   'li3',
   'ao2',
   'de5',
   'shu1',
   'cha1',
   'zai4',
   'niu2',
   'zai3',
   'ku4',
   'de5',
   'qian2',
   'mian4'],
  'hetero_py': [['xi3'],
   ['huan1'],
   ['ba3', 'ba4'],
   ['li3'],
   ['ao2'],
   ['de5', 'di2', 'di4'],
   ['shu1'],
   ['cha1'],
   ['zai4'],
   ['niu2'],
   ['zi1', 'zi3', 'zai3'],
   ['ku4'],
   ['de5', 'di2', 'di4'],
   ['qian2'],
   ['mian4']],
  'g2pm_py': ['xi3',
   'huan1',
   'ba3',
   'li3',
   'ao2',
   'de5',
   'shu1',
   'cha1',
   'zai4',
   'niu2',
   'zai3',
   'ku4',
   'de5',
   'qian2',
   'mian4']},
 {'file_name': '008003',
  'line_text': '乍眼一看，想起的是广州的区号“零二零”。',
  'gt_py': 'zha4 yan3 yi2 kan4 xiang2 qi3 de5 shi4 guang3 zhou1 de5 qu1 hao4 ling2 er4 ling2',
  'line_py': ['zha4',
   'yan3',
   'yi1',
   'kan4',
   'xiang2',
   'qi3',
   'de5',
   'shi4

In [9]:
for l in test_py_list:
    pypinyin_py = l['line_py']
    file_name = l['file_name']
    save_path = os.path.join('lab_wav_pairs', file_name + '.lab')
    f = open(save_path, "w")
    f.write(py)

In [10]:
def write_to_json(data, output_file):
    with open(output_file, 'w') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

write_to_json(test_py_list, 'test_py_list.json')

# Evaluate

In [11]:
total = 0
correct = 0

for l in test_py_list:
    gt_py_list = l['gt_py'].split(' ')
    # print(gt_py_list)
    pypinyin_py = l['line_py']
    if len(gt_py_list) != len(pypinyin_py):
        # print(gt_py_list, pypinyin_py)
        continue
    for i in range(len(gt_py_list)):
        if gt_py_list[i][:-1] == pypinyin_py[i][:-1]:
            correct += 1
        total += 1

print(correct/total)

0.9954270103337689


In [12]:
total = 0
correct = 0

for l in test_py_list:
    gt_py_list = l['gt_py'].split(' ')
    # print(gt_py_list)
    g2pm_py = l['g2pm_py']
    if len(g2pm_py) != len(gt_py_list):
        # print(gt_py_list, pypinyin_py)
        continue
    for i in range(len(gt_py_list)):
        if gt_py_list[i][:-1] == g2pm_py[i][:-1]:
            correct += 1
        total += 1

print(correct/total)

0.9927433795118273


In [24]:

with open('g2pa_pairs4.json', 'r') as of:
    g2pa_pairs = json.load(of)
    
g2pa_pairs.keys()


dict_keys(['009419', '008493', '009112', '009334', '008763', '009497', '009668', '009872', '008073', '008822', '008846', '008336', '009713', '009274', '008895', '009302', '009510', '009856', '008547', '009227', '008300', '008707', '009453', '008661', '008482', '009976', '008673', '008729', '008618', '009104', '008517', '008553', '009592', '009457', '009776', '009723', '009705', '009398', '009350', '008454', '008701', '009670', '008225', '008243', '008170', '009115', '008610', '008605', '009451', '008163', '009684', '008451', '009589', '008337', '008543', '008007', '009476', '009149', '009638', '009809', '008640', '009793', '008190', '009247', '008360', '008676', '008124', '009676', '008565', '008119', '008563', '008233', '009093', '008276', '009077', '008017', '008039', '008371', '008603', '009932', '009516', '009975', '009584', '009060', '008704', '009771', '008509', '009595', '009963', '009161', '009139', '009525', '008599', '008718', '009677', '008143', '008970', '008237', '009824',

In [25]:
total = 0
correct = 0

for l in test_py_list:
    gt_py_list = l['gt_py'].split(' ')
    # print(gt_py_list)
    print(l['file_name'])
    if l['file_name'] not in g2pa_pairs:
        print(l['file_name'])
        continue
    g2pa_py = g2pa_pairs[l['file_name']].split(' ')
    if len(g2pa_py) != len(gt_py_list):
        # print(gt_py_list, pypinyin_py)
        continue
    for i in range(len(gt_py_list)):
        if gt_py_list[i][:-1] == g2pa_py[i][:-1]:
            correct += 1
        total += 1

print(correct/total)

008002
008003
008004
008005
008006
008007
008008
008009
008010
008011
008012
008013
008014
008015
008016
008017
008018
008019
008020
008021
008022
008023
008024
008025
008027
008028
008029
008030
008031
008032
008033
008034
008035
008036
008037
008038
008039
008040
008041
008042
008043
008044
008045
008046
008047
008048
008049
008050
008051
008052
008053
008054
008055
008056
008057
008058
008060
008061
008062
008063
008064
008065
008066
008068
008069
008070
008071
008073
008074
008075
008076
008077
008078
008079
008080
008081
008082
008083
008084
008085
008086
008087
008088
008089
008090
008092
008093
008094
008095
008096
008097
008098
008099
008100
008101
008102
008103
008104
008105
008106
008107
008108
008109
008110
008111
008112
008113
008114
008115
008116
008117
008118
008119
008120
008121
008122
008123
008124
008125
008126
008127
008128
008129
008130
008131
008132
008133
008134
008135
008136
008137
008138
008139
008140
008141
008142
008143
008144
008145
008146
008147
008148
008149