## Installing and using the zm-text-tts pretrained model

```
git clone https://github.com/iamanigeeit/zm-text-tts
# Install espnet
cd zm-text-tts/tools
./setup_anaconda.sh ${CONDA_PREFIX} zm-text-tts 3.10
conda activate zm-text-tts
make TH_VERSION=1.13.1 CUDA_VERSION=11.7
cd ..
pip install -e .
# Download the pretrained model
cd egs2
git clone https://huggingface.co/saefro991/tts_ipa_css10_7lang_textpretrain_residual_freeze
# Move the downloaded files into tts1 so that the dump and exp folders sit alongside conf, scripts, utils, etc., as in the standard ESPnet recipe layout
mv tts_ipa_css10_7lang_textpretrain_residual_freeze/* tts1
cd tts1
```

## To install vocoder packages
```
conda activate zm-text-tts
git clone https://github.com/kan-bayashi/ParallelWaveGAN.git
cd ParallelWaveGAN
pip install -e .
```

Download the vocoder checkpoint `hifigan16k_libritts_css10_vctk` from [here](https://drive.google.com/drive/folders/1pemypbNBYJPf_rT2pzcnVk7GjK8WtX4E?usp=sharing).


In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from pathlib import Path

import re
import soundfile as sf
import torch
from espnet2.text.token_id_converter import TokenIDConverter
from parallel_wavegan.utils import load_model as load_vocoder
from espnet2.bin.tts_inference import Text2Speech
from py2ipa import pinyin2ipa

# Working directory of the notebook kernel. Later cells resolve the recipe's
# `exp/` and `dump/` folders relative to it, so start the notebook from the
# recipe directory. Plain `os.getcwd()` is used instead of the `%pwd` magic so
# the cell is also valid outside IPython.
PWD = os.getcwd()

# Change the following paths
VOCODER_CKPT = '/home/perry/PycharmProjects/vocoders/hifigan16k_libritts_css10_vctk/checkpoint-2000000steps.pkl'
SAVE_DIR = Path('/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3')
TRANSCRIPT_FILE = '/home/perry/PycharmProjects/datasets/data_aishell3/test/content.txt'
# Prefer the GPU, but fall back to CPU so the notebook still runs on machines
# without CUDA instead of failing when the models are moved to the device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# Locate the experiment folder under the working directory and load the
# checkpoint together with its training config onto the selected device.
pretrained_dir = Path(PWD).joinpath("exp/tts_train_raw_phn_none")
pretrained_model_file = pretrained_dir.joinpath("latest.pth")
pretrained_tts = Text2Speech.from_pretrained(
    device=device,
    model_file=pretrained_model_file,
    train_config=pretrained_dir.joinpath('config.yaml'),
)
# Keep a direct handle on the underlying model; later cells toggle its flags.
pretrained_model = pretrained_tts.model

In [6]:
# Load the vocoder checkpoint, strip weight normalisation, switch to eval
# mode and move it to the same device as the TTS model.
vocoder = load_vocoder(VOCODER_CKPT)
vocoder.remove_weight_norm()
vocoder.eval()
vocoder = vocoder.to(device)

In [7]:
# Token-to-id converter built from the token list stored in the pretrained
# model's training args; `save_wav` uses it to turn phone tokens into input ids.
id_converter = TokenIDConverter(pretrained_tts.train_args.token_list)

In [8]:
# Switch off the `use_gst` flag on the TTS module for every later inference call.
# NOTE(review): presumably this bypasses the global-style-token reference
# encoder so no reference audio has to be supplied — confirm against the model code.
pretrained_model.tts.use_gst = False

In [9]:
from kaldiio import ReadHelper
def read_xvectors(tts_dir, filename):
    """Read a Kaldi ark/scp file of x-vectors into a dictionary.

    Args:
        tts_dir: Directory to read from. The process temporarily changes into
            it while reading (presumably because the file may reference other
            files by paths relative to the recipe directory — TODO confirm).
        filename: Path of the file relative to `tts_dir`. Its extension
            (the text after the last '.') is used as the Kaldi specifier
            type, e.g. ``ark`` or ``scp``.

    Returns:
        dict mapping each key yielded by the reader to its vector.

    The original working directory is always restored, even when reading
    fails, so a bad path cannot leave the notebook kernel in `tts_dir`.
    """
    filetype = filename.split('.')[-1]
    original_cwd = os.getcwd()
    os.chdir(tts_dir)
    try:
        with ReadHelper(f'{filetype}:{filename}') as reader:
            xvector_dict = {key: xvector for key, xvector in reader}
    finally:
        # Restore the caller's working directory on success and on error.
        os.chdir(original_cwd)
    return xvector_dict

In [10]:
# Load the speaker x-vectors from the recipe's dump folder; the relative path
# is resolved against PWD because read_xvectors changes into that directory.
spk_xvectors = read_xvectors(PWD, 'dump/xvector/test/spk_xvector.ark')

In [11]:
# Maps a language id to the key of the speaker x-vector in `spk_xvectors`
# that `save_wav` uses when synthesising with that language id.
# NOTE(review): the numeric ids presumably follow the pretrained model's
# language-id table (names suggest the CSS10 de/el/fi/fr/hu/nl/ru speakers) — confirm.
lid2spk = {
    2: 'css10_de', 3: 'css10_el', 8: 'css10_fi', 9: 'css10_fr', 11: 'css10_hu', 14: 'css10_nl', 17: 'css10_ru',
}

In [12]:
def save_wav(phones, filename='', lid=None, save_dir=SAVE_DIR, **kwargs):
    """Synthesise `phones` with the pretrained TTS model and write a wav file.

    Args:
        phones: Sequence of phone tokens; converted to ids with `id_converter`.
        filename: Output file name inside `save_dir`. When empty, a name is
            built from the concatenated phones followed by `lid`, with every
            '<sos/eos>' replaced by '_'.
        lid: Language id, or None to run the encoder without language-id
            conditioning. When `lid` is a key of `lid2spk`, the matching
            x-vector from `spk_xvectors` is passed as the speaker embedding.
        save_dir: Output directory (str or Path); created if it is missing.
        **kwargs: Forwarded unchanged to `pretrained_model.tts.inference`.

    Returns:
        None. If the target file already exists, nothing is synthesised.

    Side effects:
        Prints the output path, and mutates `use_encoder_w_lid` and
        `spk_embed_dim` on the shared `pretrained_model.tts` module.
    """
    if not filename:
        filename = f'{"".join(phones)}{lid}.wav'
        filename = filename.replace('<sos/eos>', '_')
    save_dir = Path(save_dir)  # also accept plain string directories
    filepath = save_dir / filename
    if filepath.exists():
        # Already synthesised on a previous run; skip the expensive inference.
        return
    print(filepath)

    tts = pretrained_model.tts
    with torch.no_grad():
        phone_ids = torch.IntTensor(id_converter.tokens2ids(phones)).to(device)
        if lid is None:
            tts.use_encoder_w_lid = False
            tts.spk_embed_dim = None
            output_dic = tts.inference(text=phone_ids, **kwargs)
        else:
            tts.use_encoder_w_lid = True
            lids = torch.tensor([lid]).to(device)
            if lid in lid2spk:
                spk = lid2spk[lid]
                spembs = torch.tensor(spk_xvectors[spk]).to(device).squeeze()
                # Must be `spk_embed_dim`, the same attribute the other
                # branches reset to None. The former `spk_embed_dims` typo
                # left it at None after any call without a speaker, so the
                # embedding passed below was presumably ignored by the model.
                tts.spk_embed_dim = len(spembs)
                output_dic = tts.inference(text=phone_ids, lids=lids, spembs=spembs, **kwargs)
            else:
                tts.spk_embed_dim = None
                output_dic = tts.inference(text=phone_ids, lids=lids, **kwargs)
        mel = output_dic['feat_gen']
        wav = vocoder.inference(mel, normalize_before=False).view(-1)
    save_dir.mkdir(parents=True, exist_ok=True)

    # save as PCM 16 bit wav file
    # NOTE(review): 16000 Hz presumably matches the 16 kHz vocoder checkpoint — confirm.
    sf.write(
        filepath,
        wav.detach().cpu().numpy(),
        16000,
        "PCM_16",
    )

In [13]:
# Smoke test: synthesise one short phone sequence first without a language
# id, then once for every language id that has a speaker x-vector.
phones = 'p i t a <sos/eos>'.split()
for lid in [None, *lid2spk]:
    save_wav(phones, lid=lid)

/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_None.wav
/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_2.wav
/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_3.wav
/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_8.wav
/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_9.wav
/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_11.wav
/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_14.wav
/home/perry/PycharmProjects/present/prosody/outputs/zm-text-tts/aishell3/pita_17.wav


In [3]:
# Synthesise every utterance listed in the transcript file.
# Each line is expected to be "<wav name> <transcript>". Everything in the
# transcript except lowercase letters, the tone digits 1-5 and spaces is
# stripped, leaving pinyin syllables that are converted to IPA phones.
# The file is opened as UTF-8 explicitly (the transcript presumably contains
# Chinese characters) so decoding does not depend on the platform locale.
with open(TRANSCRIPT_FILE, encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split(maxsplit=1)
        if len(fields) != 2:
            # Blank line or entry without a transcript: nothing to synthesise.
            continue
        wav_file, transcript = fields
        pinyins = re.sub(r'[^a-z1-5 ]', '', transcript).split()
        ipa = ' '.join(pinyin2ipa(py) for py in pinyins)
        print(wav_file, ipa)
        save_wav(phones=ipa.split(), filename=wav_file, lid=None)

SSB06930002.wav w u ʂ u ʂ ɨ tʃ ʊ ŋ p eɪ k h a n ts w ɔ w ɔ k w ɔ t ə k w ɔ ts h w eɪ
SSB06930003.wav ɕ i n tʃ ə ŋ t ə t h w eɪ tʃ h u ʂ ɨ j i ɕ j a ŋ tʃ h a ŋ j y ɛ n t ə tʃ ɨ t u ʔ a n p h aɪ
SSB06930004.wav tɕ h i ʂ ɨ w u l w ə n ʂ ɨ t h ʊ ŋ k w ɔ x ə ʂ ə n m ə tʃ h a ŋ ʂ a ŋ x ə ts w ɔ
SSB06930005.wav tɕ j a ŋ s u ɕ j oʊ ɕ j ɛ n aɪ n aɪ w a n tɕ y tɕ y ɛ n k h w a n j i ɹ ə n j a ŋ x w ɔ x w a n p i ŋ l aʊ p a n x ə ʔ ɚ ts ɨ
SSB06930006.wav j i tɕ i j i t h a ŋ tɕ h i t w eɪ tɕ h i m eɪ ʂ ɨ k a n l a n tɕ h j oʊ t ə tɕ j a ŋ tɕ j ɛ k h ə
SSB06930007.wav ʂ ɨ ts ɨ tɕ i t ə t aʊ k ə ŋ t w ɔ t ə ʂ ɨ x w eɪ
SSB06930008.wav x aɪ ʂ ɨ tɕ h y tʃ ə ŋ ts h ə t h w eɪ tʃ h u t ə l j a ŋ x aʊ tɕ h i tɕ i x ə ʂ ɨ tʃ h a ŋ t w eɪ k aɪ k ə t ə ɕ i n ɹ ə n
SSB06930010.wav t w eɪ k ʊ ŋ j i tɕ i n p u ʂ ɨ tɕ h i t aʊ f a n p h aɪ ts w ɔ j ʊ ŋ x aɪ ʂ ɨ f u ts w ɔ ts w ɔ j ʊ ŋ n ə
SSB06930011.wav s oʊ x u j y l ə ɕ y n x w a n tɕ h j oʊ k w ɔ tɕ i j i ŋ j ə j y ɕ y ɛ n p u
SSB06930012.wav ɹ u f a ɕ j ɛ 