In [1]:
import os
import re
from pathlib import Path

# Set before the espnet2 imports below so the variable is already in the
# environment when the CUDA runtime is first initialised.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from espnet2.bin.tts_inference import Text2Speech

from prosody.pinyin import *
from prosody.en_to_zh import PinyinArpaSpeech, all_tones
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local `prosody` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Working directory at the time this cell runs, as a Path. Later cells chdir
# away and back to it, so re-running this cell mid-session may capture a
# different directory.
PWD = %pwd
PWD = Path(PWD)
# Absolute path of the LJSpeech TTS recipe, resolved relative to PWD.
LJSPEECH_DIR = (PWD / '../egs2/ljspeech/tts1/').resolve()
# Device string passed to Text2Speech.from_pretrained and pas.gen_audio below.
device = 'cuda'

In [2]:
# Load the pretrained JETS checkpoint from the LJSpeech recipe directory.
# The load runs with the recipe directory as the working directory
# (presumably because the training config refers to recipe-relative paths --
# TODO confirm). The try/finally guarantees we return to PWD even when
# loading fails, so a failed run does not leave the kernel in the wrong
# directory and poison a later re-run of the `PWD = %pwd` cell.
pretrained_dir = LJSPEECH_DIR / "exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space"
pretrained_model_file = pretrained_dir / "train.total_count.ave_5best.pth"
os.chdir(LJSPEECH_DIR)
try:
    pretrained_tts = Text2Speech.from_pretrained(
        train_config=pretrained_dir / "config_prosody.yaml",
        model_file=pretrained_model_file,
        device=device
    )
    # Underlying model object; its generator's inference function is used later.
    pretrained_model = pretrained_tts.model
finally:
    os.chdir(PWD)

Vocabulary size: 78
encoder self-attention layer type = self-attention
encoder self-attention layer type = self-attention
Initialize encoder.encoders.0.self_attn.linear_q.bias to zeros
Initialize encoder.encoders.0.self_attn.linear_k.bias to zeros
Initialize encoder.encoders.0.self_attn.linear_v.bias to zeros
Initialize encoder.encoders.0.self_attn.linear_out.bias to zeros
Initialize encoder.encoders.0.feed_forward.w_1.bias to zeros
Initialize encoder.encoders.0.feed_forward.w_2.bias to zeros
Initialize encoder.encoders.0.norm1.bias to zeros
Initialize encoder.encoders.0.norm2.bias to zeros
Initialize encoder.encoders.1.self_attn.linear_q.bias to zeros
Initialize encoder.encoders.1.self_attn.linear_k.bias to zeros
Initialize encoder.encoders.1.self_attn.linear_v.bias to zeros
Initialize encoder.encoders.1.self_attn.linear_out.bias to zeros
Initialize encoder.encoders.1.feed_forward.w_1.bias to zeros
Initialize encoder.encoders.1.feed_forward.w_2.bias to zeros
Initialize encoder.encoder

In [3]:
# Phoneme tokenizer configured with the 'g2p_en_no_space' G2P backend.
arpa_tokenizer = PhonemeTokenizer(g2p_type='g2p_en_no_space')
# Converter built from the token list stored in the pretrained model's
# training arguments.
id_converter = TokenIDConverter(
    token_list=pretrained_tts.train_args.token_list,
)

In [47]:
# Output directory for the toned AISHELL-3 syntheses; created if missing.
save_dir = PWD / 'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3'
save_dir.mkdir(parents=True, exist_ok=True)

# Speech generator wired to the pretrained generator's inference function and
# to the id converter built from the model's token list.
pas = PinyinArpaSpeech(
    tts_inference_fn=pretrained_model.tts.generator.inference,
    token_id_converter=id_converter,
)

In [None]:
# Smoke test on the first transcript line of the AISHELL-3 dataset.
line = 'SSB06930002.wav	武 wu3 术 shu4 始 shi3 终 zhong1 被 bei4 看 kan4 作 zuo4 我 wo3 国 guo2 的 de5 国 guo2 粹 cui4'
# First whitespace-separated field is the wav name; the rest is the transcript.
custom_filename, chinese = line.strip().split(maxsplit=1)
# Remove spaces, lowercase letters and digits (the interleaved pinyin),
# leaving only the characters, then terminate the sentence with '。'.
chinese = f"{re.sub(r'[ a-z0-9]', '', chinese)}。"

inputs = pas.gen_audio(
    chinese,
    save_dir,
    custom_filename=custom_filename,
    arpa_in_filename=False,
    overall_d_factor=8.0,
    fix_durations=True,
    verbose=True,
    device=device,
)

In [49]:
# Now run on the entire AISHELL-3 test transcript.
transcript_file = (PWD / '../../datasets/data_aishell3/test/content.txt').resolve()
# The transcript contains Chinese characters, so decode it explicitly as UTF-8
# rather than relying on the platform's default encoding.
with open(transcript_file, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file); they
        # would otherwise fail the two-field unpack below.
        if not line.strip():
            continue
        # First whitespace-separated field is the wav name; the rest is the
        # transcript with interleaved pinyin.
        custom_filename, chinese = line.strip().split(maxsplit=1)
        # Remove spaces, lowercase letters and digits (the pinyin), keeping
        # only the characters, then terminate the sentence with '。'.
        chinese = re.sub(r'[ a-z0-9]', '', chinese) + '。'

        # Built fresh for every utterance, as in the original, so that
        # gen_audio never sees a dict that a previous call may have touched.
        pas_update_dict = {
            'pinyin_to_arpa_durations': {},
            'tone_duration_split': {},
            'tone_contours': {},
            'nucleus_tone_only': False,
            'max_pitch_change': 2.5,
        }
        # Every inference override is left as None here.
        infer_overrides = {
            'd_split_factor': None,
            'd_factor': None,
            'p_factor': None,
            'e_factor': None,
            'd_mod_fns': None,
            'p_mod_fns': None,
            'e_mod_fns': None,
        }
        inputs = pas.gen_audio(
            chinese,
            save_dir,
            verbose=True,
            inputs=None,
            pac_update_dict=pas_update_dict,
            infer_overrides=infer_overrides,
            overall_d_factor=8.0,
            fix_durations=True,
            arpa_in_filename=False,
            custom_filename=custom_filename,
            device=device,
        )


In [None]:
# Try again without tones, writing to a separate output directory.
# NOTE: this rebinds `save_dir`; re-running the toned cell afterwards would
# write into the "nopitch" directory unless `save_dir` is reset first.
save_dir = PWD / 'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3_nopitch'
os.makedirs(save_dir, exist_ok=True)

# Define the settings in this cell instead of relying on the variables that
# leak out of the previous cell's loop body; those do not exist on a fresh
# kernel or when the transcript is empty. Values mirror the toned run.
pas_update_dict = {
    'pinyin_to_arpa_durations': {},
    'tone_duration_split': {},
    'tone_contours': {},
    'nucleus_tone_only': False,
    'max_pitch_change': 2.5,
}
infer_overrides = {
    'd_split_factor': None,
    'd_factor': None,
    'p_factor': None,
    'e_factor': None,
    'd_mod_fns': None,
    'p_mod_fns': None,
    'e_mod_fns': None,
}

# The transcript contains Chinese characters, so decode it explicitly as UTF-8
# rather than relying on the platform's default encoding.
with open(transcript_file, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines; they would otherwise fail the two-field unpack.
        if not line.strip():
            continue
        # First whitespace-separated field is the wav name; the rest is the
        # transcript with interleaved pinyin.
        custom_filename, chinese = line.strip().split(maxsplit=1)
        # Remove spaces, lowercase letters and digits (the pinyin), keeping
        # only the characters, then terminate the sentence with '。'.
        chinese = re.sub(r'[ a-z0-9]', '', chinese) + '。'

        inputs = pas.gen_audio(
            chinese,
            save_dir,
            verbose=True,
            disable_tones=True,
            inputs=None,
            pac_update_dict=pas_update_dict,
            infer_overrides=infer_overrides,
            overall_d_factor=8.0,
            fix_durations=True,
            arpa_in_filename=False,
            custom_filename=custom_filename,
            device=device,
        )
