## Set up imports

In [1]:
import scipy
import scipy.io.wavfile
import torch

from models.models import (
    SynthesizerTrn,
)
from text import text_to_sequence, cleaned_text_to_sequence
from text.symbols import symbols
from utils import commons
from utils import utils

## Auxiliary functions

In [2]:
def get_text(text, hps):
    """Turn an input string into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: The string to convert. It is handed to ``text_to_sequence``
            with the ``"no_cleaners"`` cleaner list.
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read.

    Returns:
        A ``torch.LongTensor`` built from the id sequence, with ``0``
        interspersed via ``commons.intersperse`` when
        ``hps.data.add_blank`` is truthy.
    """
    # NOTE: the cleaners listed in hps.data.text_cleaners are not applied
    # here; the "no_cleaners" list is always used instead.
    symbol_ids = text_to_sequence(text, ["no_cleaners"])
    padded_ids = commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    return torch.LongTensor(padded_ids)

In [3]:
def get_phn_text(phn, hps):
    """Wrap an already-encoded id sequence in a ``torch.LongTensor``.

    Args:
        phn: Sequence of integer ids accepted by ``torch.LongTensor``.
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read.

    Returns:
        A ``torch.LongTensor`` of ``phn``, with ``0`` interspersed via
        ``commons.intersperse`` when ``hps.data.add_blank`` is truthy.
    """
    ids = commons.intersperse(phn, 0) if hps.data.add_blank else phn
    return torch.LongTensor(ids)

## Inference

In [4]:
# --- Input settings ---------------------------------------------------------
# Hyper-parameter JSON read by utils.get_hparams_from_file.
CONFIG = "configs/ljs_reproduce.json"
# Generator checkpoint passed to utils.load_checkpoint.
MODEL = "./logs/exp3/G_420.pth"
# Text to synthesize; given as an already phonemized (IPA-style) string.
TEXT = "mˈoːɹ sˈiəɹɪəsli, ðə fˈækts ʌv hɪz dɪfˈɛkʃən hɐd bɪkˌʌm nˈoʊn, lˈiːvɪŋ hˌɪm ˈoʊpən tʊ ˈɔːlmoʊst ʌnˈænsɚɹəbəl ɐtˈæk baɪ ðoʊz hˌuː əpˈoʊzd hɪz vjˈuːz."

# --- Inference settings -----------------------------------------------------
# Forwarded unchanged to net_g.infer as noise_scale / length_scale / max_len.
NOISE_SCALE = 0.667
LENGTH_SCALE = 1.0
MAX_LEN = 1200

In [5]:
# Load hyper-parameters and bind the inputs chosen in the settings cell.
hps = utils.get_hparams_from_file(CONFIG)
model_path = MODEL
text = TEXT

# Use the first CUDA device when one is available, otherwise fall back to CPU
# so the notebook does not crash on machines without a GPU. On a CUDA host
# this selects the same device the previous hard-coded .cuda(0) did.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the generator from the config.
# NOTE(review): the 2nd/3rd arguments are presumably the spectrogram channel
# count and the segment length in frames -- confirm against SynthesizerTrn.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    hps.models,
).to(device)

# The memory bank is attached before the checkpoint is loaded (order kept
# from the original cell).
net_g.attach_memory_bank(hps.models)

# No optimizer is restored (third argument is None); only the last returned
# value is kept.
_, _, _, epoch_str = utils.load_checkpoint(model_path, net_g, None)

net_g.eval()

# Encode the text, add a leading batch dimension of size 1 and place the
# tensors on the same device as the model.
x = get_text(text, hps).unsqueeze(0).to(device)
x_lengths = torch.LongTensor([x.size(1)]).to(device)

with torch.no_grad():
    y_hat, mask, *_ = net_g.infer(
        x,
        x_lengths,
        noise_scale=NOISE_SCALE,
        length_scale=LENGTH_SCALE,
        max_len=MAX_LEN,
    )
    # Take index 0 on the first two axes and move the result to host memory
    # as a NumPy array for writing.
    audio = y_hat[0, 0, :].cpu().numpy()

# Write the waveform at the sampling rate given in the config.
scipy.io.wavfile.write(
    filename="result.wav",
    rate=hps.data.sampling_rate,
    data=audio,
)

INFO:root:Loaded checkpoint './logs/exp3/G_420.pth' (iteration 420)
