# Imports & Settings

In [6]:
%matplotlib inline
import IPython.display as ipd

import os.path as osp
import torch
import commons
import utils
from text import cleaned_text_to_sequence
from models import SynthesizerTrn

DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.


In [7]:
def get_text(text, text_symbols, add_blank=None):
    """Convert an already-cleaned text string into a tensor of symbol ids.

    Args:
        text: Input text; passed unchanged to ``cleaned_text_to_sequence``.
        text_symbols: Symbol table handed to ``cleaned_text_to_sequence``.
        add_blank: When truthy, the id sequence is passed through
            ``commons.intersperse(sequence, 0)``. ``None`` (the default) keeps
            the original behaviour of reading the notebook-level
            ``hps.data.add_blank``, in which case ``hps`` must already be
            defined when the function is called.

    Returns:
        A ``torch.LongTensor`` built from the resulting id sequence.
    """
    if add_blank is None:
        # Resolved at call time: `hps` is created in a later notebook cell.
        add_blank = hps.data.add_blank
    sequence = cleaned_text_to_sequence(text, text_symbols)
    if add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)

# Model

In [8]:
# Run directory and the checkpoint file name to load from it.
MODEL_DIR = "logs/vits2_NeuOl_nosdp"
MODEL = "G_669000-1992.pth"

# Hyper-parameters are read from the config.json inside MODEL_DIR.
hps = utils.get_hparams_from_file(osp.join(MODEL_DIR, "config.json"))

# Symbol table: the pad symbol first, then every punctuation mark, then every
# character, in the order they are listed in the config.
text_symbols = [hps.data.pad, *hps.data.punctuation, *hps.data.characters]

In [9]:
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# crashing on a hard-coded .cuda() call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# spec_channels is 80 when the config contains a truthy
# `use_mel_posterior_encoder` entry, otherwise filter_length // 2 + 1.
if "use_mel_posterior_encoder" in hps.model and hps.model.use_mel_posterior_encoder:
    spec_channels = 80
else:
    spec_channels = hps.data.filter_length // 2 + 1

# Build the synthesizer from the config and move it to the selected device.
net_g = SynthesizerTrn(
    len(text_symbols),
    spec_channels,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
).to(device)
net_g.eval()  # inference only: switch the module to evaluation mode

# Load the trained weights into net_g; None is passed as the third argument
# (presumably the optimizer slot, which is not needed here — confirm in utils).
# The return value is bound to `_` so the cell does not echo it.
_ = utils.load_checkpoint(osp.join(MODEL_DIR, MODEL), net_g, None)

INFO:root:Loaded checkpoint 'logs/vits2_NeuOl_nosdp/G_669000-1992.pth' (iteration 1992)


# Inference

In [10]:
# Encode the test sentence into a tensor of symbol ids.
t_tst = get_text("$ tohle je pokus. $", text_symbols)

# Follow the model: place the inputs on whatever device its weights live on,
# rather than assuming CUDA.
infer_device = next(net_g.parameters()).device

with torch.no_grad():
    # unsqueeze(0) adds a leading dimension of size 1 in front of the id sequence.
    x_tst = t_tst.unsqueeze(0).to(infer_device)
    x_tst_lengths = torch.tensor([t_tst.size(0)], dtype=torch.long, device=infer_device)
    outputs = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
    )
    # Take the first returned item and index it at [0, 0]; detach() replaces
    # the legacy `.data` accessor before converting to a float NumPy array.
    audio = outputs[0][0, 0].detach().cpu().float().numpy()

# normalize=False plays the samples as produced, without rescaling.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))