In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from text import cleaned_text_to_sequence

from scipy.io.wavfile import write

from g2s_with_question import pyopenjtalk_g2p_prosody_with_questioin



def get_text(text, hps):
    """Turn an already-cleaned symbol sequence into a tensor of symbol IDs.

    Args:
        text: Cleaned symbols, passed unchanged to ``cleaned_text_to_sequence``.
        hps: Hyperparameter object; only ``hps.data.add_blank`` is read here.

    Returns:
        A ``torch.LongTensor`` built from the ID sequence. When
        ``hps.data.add_blank`` is truthy the sequence is first passed through
        ``commons.intersperse(ids, 0)``.
    """
    symbol_ids = cleaned_text_to_sequence(text)
    # Optional blank handling is delegated to commons.intersperse with item 0.
    symbol_ids = commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    return torch.LongTensor(symbol_ids)

## MB-iSTFT-VITS

In [6]:
# Specify the JSON config file that was used.
# NOTE(review): presumably this must be the config the checkpoint was trained with — confirm.
config_path = "./path/to/config.json"
hps = utils.get_hparams_from_file(config_path)

In [7]:
# Pick the compute device once: use CUDA when it is available, otherwise fall
# back to CPU so this cell does not crash on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the generator from the loaded hyperparameters and move it to `device`.
# NOTE(review): `filter_length // 2 + 1` looks like the number of spectrogram
# frequency bins and `segment_size // hop_length` the segment length in frames
# — confirm against SynthesizerTrn's signature.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
# Switch to evaluation mode; the return value is bound to `_` only to keep it
# out of the cell output.
_ = net_g.eval()

# Specify the path to the generator checkpoint.
G_model_path = "./path/to/G_xxx.pth"
# Load the checkpoint into net_g; None is passed for the third argument.
_ = utils.load_checkpoint(G_model_path, net_g, None)

Multi-stream iSTFT VITS


In [31]:
input_sentence = "特許許可する東京特許許可局。"

# Run the G2P front end on the sentence, then map the resulting symbols to an
# ID tensor with get_text. Both intermediate results are printed for inspection.
stn_phn = pyopenjtalk_g2p_prosody_with_questioin(input_sentence)
print(stn_phn)
stn_tst = get_text(stn_phn, hps)
print(stn_tst)

# Follow whatever device the model's parameters are on instead of assuming
# CUDA, so the inputs always end up on the same device as the model.
model_device = next(net_g.parameters()).device

with torch.no_grad():
    # Add a leading batch dimension of size 1.
    x_tst = stn_tst.to(model_device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(model_device)
    infer_out = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
    )
    # Take element [0, 0] of the first returned value and bring it to host
    # memory as a float32 NumPy array for playback.
    audio = infer_out[0][0, 0].detach().cpu().float().numpy()

# normalize=False plays the samples as produced rather than rescaling them.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

['^', 't', 'a', '[', 'n', 'i', 'N', 'n', 'i', '#', 'y', 'a', '[', 'r', 'a', 's', 'a', 'r', 'e', 't', 'e', '#', 'i', '[', 'r', 'u', '#', 'k', 'o', '[', 't', 'o', ']', 'o', '_', 'd', 'o', ']', 'ry', 'o', 'k', 'u', 't', 'o', 'w', 'a', '#', 'i', '[', 'w', 'a', 'n', 'e', 'e', '$']
tensor([ 1, 36,  7,  2, 26, 20, 27, 26, 20,  3, 42,  7,  2, 32,  7, 34,  7, 32,
        14, 36, 14,  3, 20,  2, 32, 39,  3, 22, 29,  2, 36, 29,  4, 29,  0, 12,
        29,  4, 33, 29, 22, 39, 36, 29, 41,  7,  3, 20,  2, 41,  7, 26, 14, 14,
         6])


