In [None]:

import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is a debugging aid — it makes CUDA kernel
# launches synchronous so an error is reported at the call that caused it, at
# the cost of slower GPU execution. It is set here, before `torch` is imported
# below, so it is already in the environment when CUDA gets initialised.
# Consider removing it once debugging is finished.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from pathlib import Path

import soundfile as sf
import torch
from prosody.utils.text import TextEffectProcessor
from prosody.utils.utils import get_p_mod_fns, print_table
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from espnet2.bin.tts_inference import Text2Speech

%load_ext autoreload
PWD = %pwd
PWD = Path(PWD)
LJSPEECH_DIR = (PWD / '../egs2/ljspeech/tts1/').resolve()
device = 'cuda'

In [None]:
# Locate the pretrained checkpoint and its training config. Both paths are
# built from the absolute LJSPEECH_DIR, so they do not depend on the current
# working directory.
pretrained_dir = LJSPEECH_DIR / "exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space"
pretrained_model_file = pretrained_dir / "train.total_count.ave_5best.pth"

# The model is loaded from inside the recipe directory.
# NOTE(review): presumably because the training config refers to files via
# paths relative to the recipe root — confirm.
# The try/finally guarantees the kernel's working directory is restored even
# when loading fails; otherwise a failed load would leave the notebook in
# LJSPEECH_DIR and a re-run of the setup cell would compute a wrong PWD.
cwd = os.getcwd()
os.chdir(LJSPEECH_DIR)
try:
    pretrained_tts = Text2Speech.from_pretrained(
        train_config=pretrained_dir / "config_prosody.yaml",
        model_file=pretrained_model_file,
        device=device
    )
    pretrained_model = pretrained_tts.model
finally:
    os.chdir(cwd)

In [None]:
# Phoneme tokenizer configured with the 'g2p_en_no_space' G2P backend.
arpa_tokenizer = PhonemeTokenizer(g2p_type='g2p_en_no_space')
# Maps tokens to ids using the token list stored with the pretrained model.
id_converter = TokenIDConverter(pretrained_tts.train_args.token_list)
# Text-effect processor; it is given the converter's tokens->ids function.
tep = TextEffectProcessor(tokens2ids_fn=id_converter.tokens2ids)

# Synthesised audio goes into a folder next to the notebook, named after the
# experiment directory. Create it up front so the `sf.write` calls in the
# synthesis cells do not fail on a fresh checkout.
save_dir = PWD / pretrained_dir.stem
save_dir.mkdir(parents=True, exist_ok=True)

In [19]:
# Synthesise a sentence whose prosody is driven purely by inline text markup.
orig_text = "THIS *is* ^ABso^LLLUUUTTTE_ly cr^aaaaaazy~~~!!"

# NOTE(review): the keyword is spelled `print_aligment` here but
# `print_alignment` in the "Suuuuuuuuure." cell — confirm which one
# TextEffectProcessor.get_inputs actually accepts.
(
    phonemes,
    phone_ids,
    d_factor,
    p_factor,
    e_factor,
    d_split_factor,
) = tep.get_inputs(orig_text, print_aligment=True)

# Show the per-phoneme inputs side by side before running the model.
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
)

# Arguments for the inference call, gathered in one place.
inference_kwargs = dict(
    text=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    verbose=True,
)
with torch.no_grad():
    output_dict = pretrained_model.tts.inference(**inference_kwargs)

# Write the waveform as 16-bit PCM, using the input text as the file name.
# NOTE(review): 22050 is presumably the model's sampling rate — confirm.
wav = output_dict['wav']
filename = f'{orig_text}.wav'
sf.write(save_dir / filename, wav.cpu().numpy(), 22050, "PCM_16")

      -1         0    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16    17    18    19    20
   phonemes     DH   IH1   S   IH1   Z   AE2   B    S   AH0   L   UW1    T     L    IY0    K     R    EY1    Z    IY0    !     !
  phone_ids      9   12    6   12   11   50   25    6    2    8    28    4     8     29    10    7     31    11    29    72    72
   d_factor      1    1    1    1    1    1    1    1    1    1    1     1     1     1     1     1    1.17   1    1.25   1     1
   p_factor      0    0    0    0    0    1    0    0    0   0.5   1     0    -0.5   -1    0     0     1     0     0     0     0
   e_factor      1    2    1    2    1    2    1    0    0    1    2     1     0     0     0     0     0     0     0     0     0
d_split_factor   1    1    1    1    1    1    1    1    1    2    2     1     1     1     1     1     3     1     2     1     1
Duration pred: tensor([ 6.6845,  6.1176,  8.6422,  6.7246,  9.4612, 10.6020,  7.9712,  9.6626,


In [12]:
# Synthesise a word with an explicit pitch contour applied to one phoneme.
orig_text = "Suuuuuuuuure."
phonemes, phone_ids, d_factor, p_factor, e_factor, d_split_factor = tep.get_inputs(orig_text, print_alignment=True)

# One independent (initially empty) list of pitch offsets per phoneme.
# A comprehension is used instead of `[[]] * n`, which would put the *same*
# list object in every slot, so an in-place edit of one entry would silently
# change all of them.
pitch_values = [[] for _ in phonemes]
# Pitch offsets for the phoneme at index 1 (the stretched vowel in this text).
pitch_values[1] = [-0.5, +1, -0.5]


def combine_fn(x, y):
    """Combine two values by adding them (passed to get_p_mod_fns as `combine_fns`)."""
    return x + y


p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(phonemes=phonemes, phone_ids=phone_ids, d_factor=d_factor, p_factor=p_factor, e_factor=e_factor, d_split_factor=d_split_factor, pitch_values=pitch_values)

# p_factor / e_factor are not passed here; pitch is controlled via p_mod_fns.
with torch.no_grad():
    output_dict = pretrained_model.tts.inference(text=phone_ids,
                                                 d_factor=d_factor,
                                                 d_split_factor=d_split_factor,
                                                 p_mod_fns=p_mod_fns,
                                                 verbose=True
                                                 )
    wav = output_dict['wav']
    # File name = input text followed by the flattened, comma-separated pitch
    # offsets, so different contours for the same text do not overwrite each other.
    pitch_tag = ",".join(str(p) for pitch_value in pitch_values for p in pitch_value)
    filename = f'{orig_text}{pitch_tag}.wav'
    # NOTE(review): 22050 is presumably the model's sampling rate — confirm.
    sf.write(save_dir / filename, wav.cpu().numpy(), 22050, "PCM_16")

      -1         0        1        2    3
   phonemes     SH       UH1       R    .
  phone_ids     35       44        7   33
   d_factor      1        1        1    1
   p_factor      0        0        0    0
   e_factor      0        0        0    0
d_split_factor   1        5        1    1
 pitch_values        -0.5|1|-0.5
Duration pred: tensor([19.4989, 13.2591, 10.9338,  9.1970], device='cuda:0')
Pitch pred: tensor([ 0.8972,  0.1140, -0.8402, -0.9685], device='cuda:0')
Energy pred: tensor([ 0.1597,  0.8255, -0.3954, -1.0230], device='cuda:0')
Duration (new): tensor([19, 13, 13, 13, 13, 13, 11,  9], device='cuda:0')
Pitch (new): tensor([ 0.8972, -0.3860,  0.3640,  1.1140,  0.3640, -0.3860, -0.8402, -0.9685],
       device='cuda:0')
Energy (new): tensor([ 0.1597,  0.8255,  0.8255,  0.8255,  0.8255,  0.8255, -0.3954, -1.0230],
       device='cuda:0')
