In [None]:
# %matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import numpy as np

import commons
import utils
# from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import  cleaned_text_to_sequence
from utils import load_wav_to_torch
from mel_processing import spectrogram_torch
from scipy.io.wavfile import write

from text.cleaners import _clean_text


# Hyper-parameters are read from the JSON config in the working directory.
config_path = 'configs/config.json'
hps = utils.get_hparams_from_file(config_path)

# Build the generator network.
# Positional arguments after the symbol count are derived from the data/train
# hparams: `filter_length // 2 + 1` and `segment_size // hop_length`.
# No device is hard-coded here: the model is placed by `net_g.to(dev)` further
# down, so a second, unconditional `.cuda()` call would only duplicate that
# step and bypass the `dev` setting.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
# Inference only: put the module into evaluation mode.
_ = net_g.eval()

def get_text(text, hps):
    """Convert a raw prompt into model inputs.

    The prompt is cleaned with `_clean_text` (which also yields per-token
    language ids), the cleaned text is printed, and the result is mapped to a
    symbol-id sequence. When `hps.data.add_blank` is truthy, both sequences go
    through `commons.intersperse_with_language_id(..., 0)`.

    Args:
        text: Raw prompt string.
        hps: Hyper-parameter object; only `hps.data.add_blank` is read.

    Returns:
        Tuple `(token_ids, language_ids, cleaned_text)` where the first two
        are `torch.LongTensor`s and the last is the cleaned string.
    """
    cleaned, lang_ids = _clean_text(text)
    print(cleaned)

    token_ids = cleaned_text_to_sequence(cleaned)
    if hps.data.add_blank:
        token_ids, lang_ids = commons.intersperse_with_language_id(token_ids, lang_ids, 0)

    return torch.LongTensor(token_ids), torch.LongTensor(lang_ids), cleaned

# Device used by every `.to(dev)` call in this notebook. Prefer the GPU, but
# fall back to the CPU when CUDA is unavailable so those calls do not fail.
dev = "cuda" if torch.cuda.is_available() else "cpu"
net_g = net_g.to(dev)

In [None]:
# stn_tst = get_text("VITS is Awesome!", hps)
# with torch.no_grad():
#     x_tst = stn_tst.cuda().unsqueeze(0)
#     x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
#     sid = torch.LongTensor([4]).cuda()
#     audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
# ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
# Locate the newest generator checkpoint matching "G_*.pth" under logs/snac/
# and load it into net_g. The two trailing None arguments are passed through
# unchanged (presumably optimizer-related slots - TODO confirm against
# utils.load_checkpoint).
checkpoint_path = utils.latest_checkpoint_path("logs/snac/", "G_*.pth")
_ = utils.load_checkpoint(checkpoint_path, net_g, None, None)


In [None]:
# Candidate prompts of decreasing length. They are kept in a dict instead of
# being assigned to `text` one after another, which left only the last
# assignment in effect.
PROMPTS = {
    "long": "[ZH]你说的对，但是《原神》是由米哈游自主研发的一款全新开放世界冒险游戏。游戏发生在一个被称作提瓦特的幻想世界，在这里，被神选中的人将被授予「神之眼」，导引元素之力。你将扮演一位名为「旅行者」的神秘角色，在自由的旅行中邂逅性格各异、能力独特的同伴们，和他们一起击败强敌，找回失散的亲人[ZH]",
    "medium": "[ZH]你说的对，但是《原神》是由米哈游自主研发的一款全新开放世界冒险游戏。游戏发生在一个被称作提瓦特的幻想世界，在这里，被神选中的人将被授予「神之眼」，导引元素之力。[ZH]",
    "short": "[ZH]你说的对，但是原神是由米哈游自主研发的一款全新开放世界冒险游戏。[ZH]",
}
text = PROMPTS["short"]
# NOTE(review): `spk` is not used to look up the speaker below; `speaker_id`
# is hard-coded to 0 - confirm that index 0 is the intended speaker.
spk = "paimon"

# get_text already returns LongTensors, so no re-wrapping is needed here.
text_norm, lang, _ = get_text(text, hps)

# Add a leading batch axis and move the model inputs to the target device.
x_tst = text_norm.to(dev).unsqueeze(0)
lang = lang.to(dev).unsqueeze(0)
x_tst_lengths = torch.LongTensor([text_norm.size(0)]).to(dev)
speaker_id = torch.LongTensor([0]).to(dev)

In [None]:
# Run inference four times with predict_style=True and style_noise_scale=1.4,
# play each result, and keep the fifth return value of every run in `latents`.
latents = []
with torch.no_grad():
    for i in range(4):
        y_hat, _, _, _, latent = net_g.infer(
            x_tst, x_tst_lengths, lang, None,
            sid=speaker_id, predict_style=True, style_noise_scale=1.4,
        )
        audio = y_hat[0].cpu().numpy()
        player = ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)
        ipd.display(player)
        latents.append(latent)

In [None]:
# Interpolation start point: the first latent collected in `latents`.
# Reading from the list instead of the loop variable `latent` makes this cell
# independent of which cell last rebound `latent`, so a top-to-bottom run does
# not end up with both interpolation endpoints being the same tensor.
# Change the index to pick a different sample.
za = latents[0]

In [None]:
# Interpolation end point: the last latent collected in `latents`.
# Right after the sampling loop this is the same tensor as `latent`, but unlike
# `latent` it is not overwritten by later inference cells, so re-running this
# cell always yields the same value. Change the index to pick another sample.
zb = latents[-1]

In [None]:
# Linearly blend the two stored latents and synthesize one utterance per blend
# weight: ratio 0 uses only za, ratio 1 uses only zb.
for ratio in [0, 0.2, 0.4, 0.6, 0.8, 1]:
    z = za * (1 - ratio) + zb * ratio
    with torch.no_grad():
        # All return values except the waveform are discarded on purpose. In
        # particular, the global `latent` is NOT rebound here, so the cells
        # that read `latent` keep seeing the value from the sampling loop.
        y_hat, _, _, _, _ = net_g.infer(
            x_tst, x_tst_lengths, lang, None,
            sid=speaker_id, predict_style=True, manual_latent=z,
        )
        audio = y_hat[0, :, :].cpu().numpy()
        ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [None]:
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text
import librosa
import wav2ssl
# Identifier of the SSL model; passed to both wav2ssl.get_ssl_model and
# wav2ssl.get_ssl_content.
ssl_type = "chinese_hubert"

# Instantiate the SSL model, then move it to the inference device.
ssl_model = wav2ssl.get_ssl_model(ssl_type)
ssl_model = ssl_model.to(dev)

def get_audio(filename):
    """Extract SSL content features for an audio file, aligned to its spectrogram length.

    The file is loaded at 16 kHz and passed to `wav2ssl.get_ssl_content`. It is
    loaded a second time at 44.1 kHz to obtain (or read from cache) a linear
    spectrogram, whose last dimension is used as the target length when the
    features are resized with nearest-neighbour interpolation.

    Side effect: when no cached spectrogram exists, one is computed and saved
    next to the audio file with the extension replaced by ".spec.pt".

    Args:
        filename: Path to the audio file.

    Returns:
        Tuple `(feats, None)`. `feats` is the interpolated feature tensor with
        its leading axis squeezed; the second slot is always `None`.

    Raises:
        ValueError: If the 44.1 kHz load does not match
            `hps.data.sampling_rate`.
    """
    # SSL features are computed from a 16 kHz version of the signal.
    wav16k, _ = librosa.load(filename, sr=16000)
    wav16k = torch.from_numpy(wav16k).to(dev)
    feats = wav2ssl.get_ssl_content(ssl_type, ssl_model, wav16k)

    audio, sampling_rate = librosa.load(filename, sr=44100)
    if sampling_rate != hps.data.sampling_rate:
        # Three placeholders need three arguments; with only two, str.format
        # raised IndexError and hid the intended ValueError.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            filename, sampling_rate, hps.data.sampling_rate))

    # splitext swaps only the real extension. str.replace(".wav", ...) returned
    # the input path unchanged for non-.wav files, so the audio file itself was
    # then treated as the spectrogram cache.
    spec_filename = os.path.splitext(filename)[0] + ".spec.pt"
    if os.path.exists(spec_filename):
        # NOTE(review): torch.load unpickles the file; only load caches that
        # were produced locally by this code.
        spec = torch.load(spec_filename)
    else:
        # NOTE(review): librosa.load returns floating-point samples; dividing
        # by max_wav_value again may under-scale the cached spectrogram.
        # Only its frame count is used below - confirm against the training
        # data loader before reusing the cache elsewhere.
        audio_norm = torch.FloatTensor(audio) / hps.data.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        spec = spectrogram_torch(audio_norm, hps.data.filter_length,
                                 hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                 center=False)
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_filename)

    # Resize the features along the last axis to the spectrogram frame count.
    # Assumes feats is 3-D with a leading batch axis of size 1 - TODO confirm.
    feats = F.interpolate(feats, size=spec.shape[-1], mode="nearest").squeeze(0)

    return feats, None

In [None]:
# Source recording (content to convert); alternatives are kept commented out.
# src = "dataset/SSB1125/SSB11250140.wav"
src = "test.wav"
# src = "dataset/paimon/vo_XMAQ014_6_paimon_11.wav"
# Target recording (reference).
tgt = "dataset/paimon/vo_XMAQ014_6_paimon_11.wav"


def _load_features(path):
    """Return (batched features, length tensor) for one audio file, both on `dev`."""
    feats = get_audio(path)[0].unsqueeze(0).to(dev)
    lengths = torch.LongTensor([feats.size(-1)]).to(dev)
    return feats, lengths


# NOTE: despite the `spec_` prefix, these hold the first element returned by
# get_audio (SSL features), not spectrograms.
spec_tgt, spec_tgt_lengths = _load_features(tgt)
spec_src, spec_src_lengths = _load_features(src)


In [None]:
# Synthesize the prompt while passing the source features as the fourth
# argument and disabling style prediction (predict_style=False), then play it.
with torch.no_grad():
    y_hat, _, _, _, latent = net_g.infer(
        x_tst, x_tst_lengths, lang, spec_src,
        sid=speaker_id, predict_style=False,
    )
    audio = y_hat[0].cpu().numpy()
    player = ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)
    ipd.display(player)

In [None]:
# Run voice conversion from the source features towards the target features
# and play the first item of the returned batch.
with torch.no_grad():
    y_hat, _, _ = net_g.voice_conversion(spec_src, spec_src_lengths, spec_tgt)
    audio = y_hat[0].cpu().numpy()
    player = ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)
    ipd.display(player)

In [None]:
# Show the source length tensor as this cell's output.
spec_src_lengths