In [1]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from pathlib import Path
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from espnet2.bin.tts_inference import Text2Speech

from prosody.pinyin import *
from prosody.en_to_zh import PinyinArpaSpeech, all_tones

# Notebook working directory, obtained with plain Python rather than the
# IPython-only `%pwd` magic so this cell is also valid outside IPython, and
# bound once as a Path instead of first as a string.
PWD = Path.cwd()
# LJSpeech TTS recipe directory, located relative to the notebook directory.
LJSPEECH_DIR = (PWD / '../egs2/ljspeech/tts1/').resolve()
# Device string passed to model loading and to audio generation below.
device = 'cuda'

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [3]:
# Load the pretrained model from inside the recipe directory (presumably
# because the config refers to files by relative path -- TODO confirm).
# try/finally guarantees the working directory is restored to PWD even when
# loading raises, so a failed load cannot leave the kernel in the recipe dir.
os.chdir(LJSPEECH_DIR)
try:
    pretrained_dir = LJSPEECH_DIR / "exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space"
    pretrained_model_file = pretrained_dir / "train.total_count.ave_5best.pth"
    pretrained_tts = Text2Speech.from_pretrained(
        train_config=pretrained_dir / "config_prosody.yaml",
        model_file=pretrained_model_file,
        device=device
    )
    # Underlying model object; its generator's inference function is used later.
    pretrained_model = pretrained_tts.model
finally:
    os.chdir(PWD)

 - discriminator_params.follow_official_norm
 - discriminator_params.scale_discriminator_params.use_weight_norm
 - discriminator_params.scale_discriminator_params.use_spectral_norm
 See also: https://github.com/espnet/espnet/pull/5240


In [4]:
# Phoneme tokenizer configured with the 'g2p_en_no_space' G2P type.
arpa_tokenizer = PhonemeTokenizer(
    g2p_type="g2p_en_no_space",
)
# Token <-> id converter built from the pretrained model's training token list.
id_converter = TokenIDConverter(
    pretrained_tts.train_args.token_list,
)

In [5]:
# Directory passed to gen_audio as its save location.
save_dir = PWD / "tts_train_jets_raw_phn_tacotron_g2p_en_no_space/mandarin"
# Speech generator wired to the pretrained generator's inference function and
# to the token-id converter of the same model.
pas = PinyinArpaSpeech(
    token_id_converter=id_converter,
    tts_inference_fn=pretrained_model.tts.generator.inference,
)

In [6]:
# Pinyin syllables listed under the initial 'zh', expanded by all_tones into
# tone-numbered variants; the joined string is the cell's displayed value.
initial_pinyins = INITIAL_TO_PINYINS["zh"]
initial_pinyin_tones = all_tones(initial_pinyins)
" ".join(initial_pinyin_tones)

'zhɨ1 zhɨ2 zhɨ3 zhɨ4 zha1 zha2 zha3 zha4 zhan1 zhan2 zhan3 zhan4 zhang1 zhang2 zhang3 zhang4 zhai1 zhai2 zhai3 zhai4 zhao1 zhao2 zhao3 zhao4 zhe1 zhe2 zhe3 zhe4 zhen1 zhen2 zhen3 zhen4 zheng1 zheng2 zheng3 zheng4 zhei1 zhei2 zhei3 zhei4 zhong1 zhong2 zhong3 zhong4 zhou1 zhou2 zhou3 zhou4 zhu1 zhu2 zhu3 zhu4 zhua1 zhua2 zhua3 zhua4 zhuo1 zhuo2 zhuo3 zhuo4 zhuai1 zhuai2 zhuai3 zhuai4 zhui1 zhui2 zhui3 zhui4 zhuan1 zhuan2 zhuan3 zhuan4 zhun1 zhun2 zhun3 zhun4 zhuang1 zhuang2 zhuang3 zhuang4'

In [7]:
# Pinyin syllables listed under the rime 'uang', expanded by all_tones into
# tone-numbered variants; the joined string is the cell's displayed value.
rime_pinyins = RIME_TO_PINYINS["uang"]
rime_pinyin_tones = all_tones(rime_pinyins)
" ".join(rime_pinyin_tones)

'duang1 duang2 duang3 duang4 guang1 guang2 guang3 guang4 kuang1 kuang2 kuang3 kuang4 huang1 huang2 huang3 huang4 zhuang1 zhuang2 zhuang3 zhuang4 chuang1 chuang2 chuang3 chuang4 shuang1 shuang2 shuang3 shuang4'

In [8]:
# Input to synthesize: the tone variants of 'zhuang' followed by a period.
chinese = all_tones('zhuang') + ['.']
# Alternative inputs; uncomment exactly one line to switch.
# chinese = rime_pinyin_tones[0:20] + ['.']
# chinese = rime_pinyin_tones[20:40] + ['.']
# chinese = rime_pinyin_tones[40:60] + ['.']
# chinese = rime_pinyin_tones[60:80] + ['.']
# chinese = '我提到的这个问题并不难处理.'
# chinese = '这个议会代表着欧洲民众.'
# chinese = '我相信你在那裡涉及到了某個要點.'
# chinese = '各項報導反映出這種主流態度.'
# chinese = initial_pinyin_tones[0:20] + ['.']
# chinese = initial_pinyin_tones[20:40] + ['.']
# chinese = initial_pinyin_tones[40:60] + ['.']
# chinese = '他晕倒了.'

# Settings handed to gen_audio through its `pac_update_dict` argument.
pas_update_dict = dict(
    pinyin_to_arpa_durations={},
    tone_duration_split={},
    tone_contours={},
    nucleus_tone_only=False,
    max_pitch_change=2.5,
)
# Inference overrides: intentionally empty. Candidate keys left disabled:
#   'd_split_factor', 'd_factor', 'p_factor', 'e_factor',
#   'd_mod_fns', 'p_mod_fns', 'e_mod_fns'  (each was set to None)
infer_overrides = dict()

inputs = pas.gen_audio(
    chinese,
    save_dir,
    inputs=None,
    pac_update_dict=pas_update_dict,
    infer_overrides=infer_overrides,
    overall_d_factor=1.0,
    vowel_duration=(9.0, 15.0),
    # True only when the input has fewer than 10 items/characters.
    arpa_in_filename=(len(chinese) < 10),
    device=device,
)
# gen_audio's result unpacks into six values, printed as a table in the output.
(arpas, tones, d_factor, d_split_factor, pitch_values, p_mod_fns) = inputs

      -1         0    1    2    3    4    5    6    7         8        9    10    11    12         13         14    15    16    17         18          19     20
    arpas        T   SH    W   AH1  NG    T   SH    W        AH1      NG    T     SH    W         AH1         NG    T     SH    W          AH1         NG     .
   d_factor     1.0  0.0  1.0  1.0  1.0  1.0  0.0  1.0       1.0      1.0  1.0   0.0   1.0        1.0        1.0   1.0   0.0   1.0         1.0         1.0   1.0
d_split_factor   1    1    1    2    1    1    1    1         3        1    1     1     1          3          1     1     1     1           3           1     1
    tones        1    1    1    1    1    2    2    2         2        2    3     3     3          3          3     4     4     4           4           4     0
 pitch_values    2    2    2    2    2   -1   -1   -0.4  0.2|0.8|1.4   2    -1    -1   -1.4  -1.8|-1.8|-1.4   -1   1.5   1.52  0.94  0.36|-0.22|-0.8  -1.38   -1
Duration pred: tensor([ 8.5702,  9.48