In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import (
    TextAudioLoader,
    TextAudioCollate,
    TextAudioSpeakerLoader,
    TextAudioSpeakerCollate,
)
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence, _symbol_to_id

from scipy.io.wavfile import write


def get_text(text, hps):
    """Turn raw text into a 1-D ``torch.LongTensor`` of symbol IDs.

    The text is converted with ``text_to_sequence`` using the cleaners listed
    in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy the
    resulting ID list is additionally passed through
    ``commons.intersperse(..., 0)`` before being wrapped in a tensor.

    Args:
        text: Input string to synthesize.
        hps: Hyper-parameter object; only ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read here.

    Returns:
        ``torch.LongTensor`` built from the (optionally interspersed) ID list.
    """
    ids = text_to_sequence(text, hps.data.text_cleaners)
    ids = commons.intersperse(ids, 0) if hps.data.add_blank else ids
    return torch.LongTensor(ids)

def get_phone(phone_str, hps):
    """Map a phoneme string to a 1-D ``torch.LongTensor`` of symbol IDs.

    Each character of ``phone_str`` is looked up in ``_symbol_to_id``.
    Characters that have no entry in that table are skipped silently, so the
    result may be shorter than the input.

    Args:
        phone_str: String of phoneme symbols, processed one character at a time.
        hps: Not read by this function; kept so the call signature mirrors
            ``get_text``.

    Returns:
        ``torch.LongTensor`` of the IDs of all recognised characters, in input
        order (empty when nothing matches).
    """
    # The leftover debug print of the whole symbol table was removed: it ran on
    # every call and flooded the cell output.
    # Unknown characters are dropped rather than raising, preserving the
    # original best-effort behaviour.
    sequence = [
        _symbol_to_id[symbol] for symbol in phone_str if symbol in _symbol_to_id
    ]
    return torch.LongTensor(sequence)

DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=1039)
           2	LOAD_FAST(arg=0, lineno=1042)
           4	LOAD_CONST(arg=1, lineno=1042)
           6	BINARY_SUBSCR(arg=None, lineno=1042)
           8	LOAD_FAST(arg=0, lineno=1042)
          10	LOAD_CONST(arg=2, lineno=1042)
          12	BINARY_SUBSCR(arg=None, lineno=1042)
          14	COMPARE_OP(arg=4, lineno=1042)
          16	LOAD_FAST(arg=0, lineno=1042)
          18	LOAD_CONST(arg=1, lineno=1042)
          20	BINARY_SUBSCR(arg=None, lineno=1042)
          22	LOAD_FAST(arg=0, lineno=1042)
          24	LOAD_CONST(arg=3, lineno=1042)
          26	BINARY_SUBSCR(arg=None, lineno=1042)
          28	COMPARE_OP(arg=5, lineno=1042)
          30	BINARY_AND(arg=None, lineno=1042)
          32	RETURN_VALUE(arg=None, lineno=1042)
DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
DEBUG:numba.core.byteflow:stack: []
DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=0 nstack_

In [2]:
# hps.data.text_cleaners

In [3]:
# Torch device handle selecting CPU execution.
device = torch.device("cpu")

In [4]:
# Load the hyper-parameters from the run's config.json.
# NOTE(review): this is an absolute, environment-specific path; adjust it when
# running outside the original Kaggle workspace.
hps = utils.get_hparams_from_file("/kaggle/repo/vits2-pytorch/logs/vlsp2023emo_base/config.json")

In [5]:
# symbols
# hps.data.filter_length // 2 + 1

In [6]:
# Build the generator, place it on the notebook-wide `device`, and switch it to
# eval mode for inference.
net_g = SynthesizerTrn(
    len(symbols),
    # Hard-coded in place of hps.data.filter_length // 2 + 1.
    # NOTE(review): presumably the mel-channel count expected by the posterior
    # encoder; confirm it matches the training config.
    80,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
).to(device)  # use `device` instead of hand-toggling .cpu() / .cuda()
_ = net_g.eval()

# Restore the generator weights from the checkpoint (third argument passed as None).
# NOTE(review): this path is relative to the working directory, while the
# config above is loaded from an absolute path.
_ = utils.load_checkpoint("./logs/vlsp2023emo_base/G_98000.pth", net_g, None)
# _ = utils.load_checkpoint("./logs/vlsp2023emo_base_r1/G_48000.pth", net_g, None)

In [7]:
# Imported in this cell to report which phonemizer version is installed.
import phonemizer

# Last expression of the cell: displays the version string.
phonemizer.__version__

'2.2.1'

In [25]:
# Synthesize a single utterance and play it inline.
# Alternative inputs for quick experiments (swap in one at a time):
# stn_tst = get_text("cái gì", hps)
# stn_tst = get_phone("ɡaːɪɜ zi2", hps)
stn_tst = get_text("chiều nay có rảnh không", hps)
# stn_tst = get_phone("tɕiʊ2 naɪ1 ɡɔɜ zaː4ɲ xo1", hps)
with torch.no_grad():
    # Inputs follow the notebook-wide `device` rather than a hand-edited
    # .cpu() / .cuda() toggle.
    x_tst = stn_tst.to(device).unsqueeze(0)  # add a leading batch dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    # infer(...)[0][0, 0]: first returned item, then element [0, 0].
    # NOTE(review): presumably (batch, channel) -> 1-D waveform; confirm.
    audio = (
        net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0, 0]
        .data.cpu()
        .float()
        .numpy()
    )
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
# ləː5 baːɜn zɛ4 waːɜ ve2 bi6 tɕyɪ4 ti2 ʂaːʊ7

# Incorrect transcript in the emo data: 003795.wav | ba nói -> nghĩ gì vậy

tɕiʊ2 naɪ1 ɡɔɜ zaː4ɲ xo1
t
ɕ
i
ʊ
 
n
a
ɪ
 
ɡ
ɔ
ɜ
 
z
a
ː
ɲ
 
x
o


In [34]:
import pandas as pd
# Batch synthesis: render the first MAX_UTTERANCES transcript rows to WAV files.
MAX_UTTERANCES = 10

# dtype=str keeps every field exactly as written in the file. Without it pandas
# infers numeric-looking IDs as integers, so a zero-padded ID such as "003795"
# would become 3795 and the output file name would no longer match the source ID.
df = pd.read_csv("/kaggle/repo/vlsp2023-ess/text.txt", delimiter="|", header=None, dtype=str)
df.columns = ["id", "text"]

save_dir = "./infer"
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(save_dir, exist_ok=True)

for i, row in df.iterrows():
    # Stop after the first MAX_UTTERANCES rows (the index is the default 0-based row number).
    if i >= MAX_UTTERANCES:
        break
    print(row["text"])
    stn_tst = get_text(row["text"], hps)
    with torch.no_grad():
        # Inputs follow the notebook-wide `device` rather than a hand-edited
        # .cpu() / .cuda() toggle.
        x_tst = stn_tst.to(device).unsqueeze(0)  # add a leading batch dimension
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
    # ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
    # Save as "<id>-vits2_G_98000.wav" inside save_dir.
    write(os.path.join(save_dir, f"{row['id']}-vits2_G_98000.wav"), hps.data.sampling_rate, audio)

chị rất là hạnh phúc
tɕi6 zəɜt̪ laː2 haː6ɲ fuɜkh
t
ɕ
i
 
z
ə
ɜ
t
 
l
a
ː
 
h
a
ː
ɲ
 
f
u
ɜ
k
h
chị rất là hạnh phúc với cuộc sống của mình
tɕi6 zəɜt̪ laː2 haː6ɲ fuɜkh vəːɪɜ ɡuə6kh ʂoɜ ɡuə4 mi2ɲ
t
ɕ
i
 
z
ə
ɜ
t
 
l
a
ː
 
h
a
ː
ɲ
 
f
u
ɜ
k
h
 
v
ə
ː
ɪ
ɜ
 
ɡ
u
ə
k
h
 
ʂ
o
ɜ
 
ɡ
u
ə
 
m
i
ɲ
chị rất là hạnh phúc với cuộc sống của mình nói chung là trong cuộc sống thì
tɕi6 zəɜt̪ laː2 haː6ɲ fuɜkh vəːɪɜ ɡuə6kh ʂoɜ ɡuə4 mi2ɲ nɔɪɜ tɕu1ŋ laː2 tʃɔ1 ɡuə6kh ʂoɜ ti2
t
ɕ
i
 
z
ə
ɜ
t
 
l
a
ː
 
h
a
ː
ɲ
 
f
u
ɜ
k
h
 
v
ə
ː
ɪ
ɜ
 
ɡ
u
ə
k
h
 
ʂ
o
ɜ
 
ɡ
u
ə
 
m
i
ɲ
 
n
ɔ
ɪ
ɜ
 
t
ɕ
u
ŋ
 
l
a
ː
 
t
ʃ
ɔ
 
ɡ
u
ə
k
h
 
ʂ
o
ɜ
 
t
i
quá dễ dàng từ bỏ một cái
waːɜ ze5 zaː2ŋ t̪y2 bɔ4 mo6t̪ ɡaːɪɜ
w
a
ː
ɜ
 
z
e
 
z
a
ː
ŋ
 
t
y
 
b
ɔ
 
m
o
t
 
ɡ
a
ː
ɪ
ɜ
quá dễ dàng từ bỏ một cái kế hoạch của mình
waːɜ ze5 zaː2ŋ t̪y2 bɔ4 mo6t̪ ɡaːɪɜ keɜ hoə6c ɡuə4 mi2ɲ
w
a
ː
ɜ
 
z
e
 
z
a
ː
ŋ
 
t
y
 
b
ɔ
 
m
o
t
 
ɡ
a
ː
ɪ
ɜ
 
k
e
ɜ
 
h
o
ə
c
 
ɡ
u
ə
 
m
i
ɲ
quá dễ dàng từ bỏ một cái kế hoạch của mình cái điều đó không được rồi sau này m

# LJSpeech

In [9]:
# hps = utils.get_hparams_from_file("./configs/vits2_ljs_base.json")

In [10]:
# net_g = SynthesizerTrn(
#     len(symbols),
#     hps.data.filter_length // 2 + 1,
#     hps.train.segment_size // hps.data.hop_length,
#     **hps.model).cuda()
# _ = net_g.eval()

# _ = utils.load_checkpoint("/path/to/pretrained_ljs.pth", net_g, None)

In [11]:
# stn_tst = get_text("VITS is Awesome!", hps)
# with torch.no_grad():
#     x_tst = stn_tst.cuda().unsqueeze(0)
#     x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
#     audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
# ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

# VCTK

In [12]:
# hps = utils.get_hparams_from_file("./configs/vits2_vctk_base2.json")

In [13]:
# if hps.model.use_mel_posterior_encoder == True:
#     print("Using mel posterior encoder for VITS2")
#     posterior_channels = 80  # vits2
#     hps.data.use_mel_posterior_encoder = True
# else:
#     print("Using lin posterior encoder for VITS1")
#     posterior_channels = hps.data.filter_length // 2 + 1
#     hps.data.use_mel_posterior_encoder = False

# net_g = SynthesizerTrn(
#     len(symbols),
#     hps.data.n_mel_channels,
#     None,
#     n_speakers=hps.data.n_speakers,
#     **hps.model,
# ).to(device)
# _ = net_g.eval()

# _ = utils.load_checkpoint("/path/to/the/pretrained.pth", net_g, None)

In [14]:
# text = """VITS2 is Awesome!"""
# sid = 4

# stn_tst = get_text(text, hps)
# with torch.no_grad():
#     x_tst = stn_tst.to(device).unsqueeze(0)
#     x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
#     sid = torch.LongTensor([int(sid)]).to(device)
#     audio = (
#         net_g.infer(
#             x_tst,
#             x_tst_lengths,
#             sid=sid,
#             noise_scale=0.667,
#             noise_scale_w=0.8,
#             length_scale=1,
#         )[0][0, 0]
#         .data.cpu()
#         .float()
#         .numpy()
#     )
# ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

# Voice Conversion

In [15]:
# dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
# collate_fn = TextAudioSpeakerCollate()
# loader = DataLoader(dataset, num_workers=0, shuffle=False,
#     batch_size=1, pin_memory=False,
#     drop_last=True, collate_fn=collate_fn)
# data_list = list(loader)

In [16]:
# with torch.no_grad():
#     x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.to(device) for x in data_list[0]]
#     sid_tgt1 = torch.LongTensor([1]).to(device)
#     sid_tgt2 = torch.LongTensor([2]).to(device)
#     sid_tgt3 = torch.LongTensor([4]).to(device)
#     audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy()
#     audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy()
#     audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy()
# print("Original SID: %d" % sid_src.item())
# ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
# print("Converted SID: %d" % sid_tgt1.item())
# ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False))
# print("Converted SID: %d" % sid_tgt2.item())
# ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False))
# print("Converted SID: %d" % sid_tgt3.item())
# ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False))