## VCTK MSTTS

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import numpy as np
from scipy.io.wavfile import write, read


def get_text(text, hps):
    """Encode a text string as a tensor of symbol IDs.

    Args:
        text: The sentence to encode.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        A ``torch.LongTensor`` built from the ID sequence produced by
        ``text_to_sequence`` (after optional blank insertion).
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Mix the value 0 into the sequence; the exact placement is whatever
        # commons.intersperse implements.
        symbol_ids = commons.intersperse(symbol_ids, 0)
    return torch.LongTensor(symbol_ids)

### Load Models

In [2]:
# Hyper-parameters for the two systems compared below: the baseline config and
# the "spkemb1" config.
hps_base, hps_spkemb1 = (
    utils.get_hparams_from_file(config_path)
    for config_path in (
        "./configs/vctk_base.json",
        "./configs/vctk_spkemb1_b16.json",
    )
)

In [3]:
# Baseline generator, conditioned on an integer speaker ID (n_speakers is passed).
base_spec_dim = hps_base.data.filter_length // 2 + 1
# Training segment length expressed in hops.
base_segment_hops = hps_base.train.segment_size // hps_base.data.hop_length
net_g_base = SynthesizerTrn(
    len(symbols),
    base_spec_dim,
    base_segment_hops,
    n_speakers=hps_base.data.n_speakers,
    **hps_base.model,
)  # .cuda()
_ = net_g_base.eval()

# NOTE(review): absolute, machine-specific checkpoint path; the notebook only
# runs where this location exists.
_ = utils.load_checkpoint(
    "/fsx/home/1123/vits/vits/logs/vctk_base/G_350000.pth",
    net_g_base,
    None,
)

In [4]:
# Generator built with use_spk_enc=True (no n_speakers argument is passed here).
spkemb1_spec_dim = hps_spkemb1.data.filter_length // 2 + 1
# Training segment length expressed in hops.
spkemb1_segment_hops = hps_spkemb1.train.segment_size // hps_spkemb1.data.hop_length
net_g_spkemb1 = SynthesizerTrn(
    len(symbols),
    spkemb1_spec_dim,
    spkemb1_segment_hops,
    use_spk_enc=True,
    **hps_spkemb1.model,
)  # .cuda()
_ = net_g_spkemb1.eval()

# NOTE(review): absolute, machine-specific checkpoint path; the notebook only
# runs where this location exists.
_ = utils.load_checkpoint(
    "/fsx/home/1123/vits/vits/logs/vctk_spkemb1/G_350000.pth",
    net_g_spkemb1,
    None,
)

### Load Data

In [5]:
# Test file list wrapped with the speaker-aware dataset and collate helpers.
collate_fn = TextAudioSpeakerCollate()
dataset = TextAudioSpeakerLoader(
    'filelists/vctk2_audio_sid_text_test_filelist.txt',
    hps_spkemb1.data,
)
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)
# The cells below index the dataset's own [path, speaker id, text] records
# instead of materialising every batch from the loader.
# data_list = list(loader)
data_list = dataset.audiopaths_sid_text



In [6]:
# Speaker label -> integer speaker ID as it appears in the file list.
spk2spkid = {
    '351': 33,
    '343': 21,
    '364': 88,
    '376': 71,
}
# Speaker label -> positions in data_list whose speaker-ID field (index 1,
# stored as a string) matches that speaker.
spk2filelist = {
    spk_label: [
        row_idx
        for row_idx, record in enumerate(data_list)
        if record[1] == str(speaker_id)
    ]
    for spk_label, speaker_id in spk2spkid.items()
}

In [7]:
# Sanity check: show the first test record collected for speaker 351.
first_row_351 = spk2filelist['351'][0]
print(data_list[first_row_351])

['DUMMY2/p351/p351_299.wav', '33', 'I started it, but then forgot all about it.']


### Speech Synthesis

In [8]:
# Target selection: speaker 343, entry 6 of that speaker's test records.
# NOTE: the next cells reassign these same names, so on a top-to-bottom run
# only the last selection cell is in effect for the synthesis cells.
tgt_spk = '343'
# src_phrase_idx picks the utterance (per-speaker index) that the
# speaker-encoder synthesis cell feeds in as its reference.
tgt_phrase_idx, src_phrase_idx = 6, 0
tgt_record = data_list[spk2filelist[tgt_spk][tgt_phrase_idx]]
tgt_phrase = tgt_record[2]
tgt_sid = spk2spkid[tgt_spk]
print(f'tgt_spk={tgt_spk} tgt_phrase={tgt_phrase}')
# Uncomment to list this speaker's records with their per-speaker index:
# for a, b in enumerate(spk2filelist[tgt_spk]):
#     print(a, data_list[b])

tgt_spk=343 tgt_phrase=I've done nothing wrong, and that's the truth.


In [9]:
# Target selection: speaker 351, entry 3 of that speaker's test records.
# NOTE: this overwrites the previous selection and is itself overwritten by
# the selection cells that follow when the notebook is run top to bottom.
tgt_spk = '351'
# src_phrase_idx picks the utterance (per-speaker index) that the
# speaker-encoder synthesis cell feeds in as its reference.
tgt_phrase_idx, src_phrase_idx = 3, 0
tgt_record = data_list[spk2filelist[tgt_spk][tgt_phrase_idx]]
tgt_phrase = tgt_record[2]
tgt_sid = spk2spkid[tgt_spk]
print(f'tgt_spk={tgt_spk} tgt_phrase={tgt_phrase}')
# Uncomment to list this speaker's records with their per-speaker index:
# for a, b in enumerate(spk2filelist[tgt_spk]):
#     print(a, data_list[b])

tgt_spk=351 tgt_phrase=It was the crowning point of my career.


In [10]:
# Target selection: speaker 364, entry 18 of that speaker's test records.
# NOTE: this overwrites the previous selection and is itself overwritten by
# the selection cell that follows when the notebook is run top to bottom.
tgt_spk = '364'
# src_phrase_idx picks the utterance (per-speaker index) that the
# speaker-encoder synthesis cell feeds in as its reference.
tgt_phrase_idx, src_phrase_idx = 18, 0
tgt_record = data_list[spk2filelist[tgt_spk][tgt_phrase_idx]]
tgt_phrase = tgt_record[2]
tgt_sid = spk2spkid[tgt_spk]
print(f'tgt_spk={tgt_spk} tgt_phrase={tgt_phrase}')
# Uncomment to list this speaker's records with their per-speaker index:
# for a, b in enumerate(spk2filelist[tgt_spk]):
#     print(a, data_list[b])

tgt_spk=364 tgt_phrase=The Greeks used to imagine that it was a sign from the gods to foretell war or heavy rain.


In [11]:
# Target selection: speaker 376, entry 18 of that speaker's test records.
# NOTE: this is the last selection cell, so these are the values the GT and
# synthesis cells below see on a top-to-bottom run.
tgt_spk = '376'
# src_phrase_idx picks the utterance (per-speaker index) that the
# speaker-encoder synthesis cell feeds in as its reference.
tgt_phrase_idx, src_phrase_idx = 18, 0
tgt_record = data_list[spk2filelist[tgt_spk][tgt_phrase_idx]]
tgt_phrase = tgt_record[2]
tgt_sid = spk2spkid[tgt_spk]
print(f'tgt_spk={tgt_spk} tgt_phrase={tgt_phrase}')
# Uncomment to list this speaker's records with their per-speaker index:
# for a, b in enumerate(spk2filelist[tgt_spk]):
#     print(a, data_list[b])

tgt_spk=376 tgt_phrase="The Norsemen considered the rainbow as a bridge over which the gods passed from earth to their home in the sky. "


#### GT

In [12]:
# Play and save the recorded audio of the selected target utterance.
audio_file = data_list[spk2filelist[tgt_spk][tgt_phrase_idx]][0]
fs, s = read(audio_file)
# Scale by the int16 full-scale value; assumes the file is 16-bit PCM — TODO confirm.
s = s.astype(np.float32)/(2**15-1)
ipd.display(ipd.Audio(s, rate=fs, normalize=False))
# Create the output directory idempotently (no separate exists() check needed).
os.makedirs('./audios', exist_ok=True)
filename = 'audios/tts_spk%s_p%d_gt.wav' % (tgt_spk,tgt_phrase_idx)
write(filename, fs, s)

#### Embedding Look-up Table, Spk seen in training

In [13]:
# Synthesize the target phrase with the baseline model, conditioned on the
# target speaker's integer ID.
stn_tst = get_text(tgt_phrase, hps_base)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])#.cuda()
    sid = torch.LongTensor([tgt_sid])#.cuda()
    audio = net_g_base.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
# Use the model's own sampling rate for both playback and the saved file,
# rather than a rate left over from reading an unrelated WAV in another cell.
sr_base = hps_base.data.sampling_rate
ipd.display(ipd.Audio(audio, rate=sr_base, normalize=False))
filename = 'audios/tts_spk%s_p%d_base.wav' % (tgt_spk,tgt_phrase_idx)
write(filename, sr_base, audio)

#### Spk Encoder Embedding, Spk unseen in training

In [14]:
# Synthesize the target phrase with the speaker-encoder model, conditioned on
# the spectrogram of a reference utterance from the target speaker.
stn_tst = get_text(tgt_phrase, hps_spkemb1)
global_phrase_idx = spk2filelist[tgt_spk][src_phrase_idx]
# Keep the speaker ID that comes back with the reference sample so the check
# below validates this sample, not a value left behind by another cell.
_, _, spec, spec_lengths, y, _, sid = collate_fn([dataset[global_phrase_idx]])
src_audio = y.float().numpy()[0,0]
assert sid[0] == tgt_sid, "reference utterance does not belong to the target speaker"
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])#.cuda()
    audio = net_g_spkemb1.infer(x_tst, x_tst_lengths, y_spk=spec, y_spk_lengths=spec_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
# Use the model's own sampling rate for playback and for the saved files,
# rather than a rate left over from reading an unrelated WAV in another cell.
sr_spkemb1 = hps_spkemb1.data.sampling_rate
ipd.display(ipd.Audio(src_audio, rate=sr_spkemb1, normalize=False))
ipd.display(ipd.Audio(audio, rate=sr_spkemb1, normalize=False))
filename = 'audios/tts_spk%s_p%d_spkemb1_srcp%d.wav' % (tgt_spk,tgt_phrase_idx,src_phrase_idx)
write(filename, sr_spkemb1, audio)
filename = 'audios/tts_spk%s_p%d_spkemb1_srcp%d_ref.wav' % (tgt_spk,tgt_phrase_idx,src_phrase_idx)
write(filename, sr_spkemb1, src_audio)

### Voice Conversion

In [15]:
# Voice-conversion setup: source speaker 343, entry 6 of that speaker's test
# records. Note that src_phrase_idx is reassigned here.
src_spk = '343'
# spkemb_phrase_idx picks the per-speaker utterance loaded for each target
# speaker in the conversion loop.
src_phrase_idx, spkemb_phrase_idx = 6, 0
src_record = data_list[spk2filelist[src_spk][src_phrase_idx]]
src_phrase = src_record[2]
src_sid = spk2spkid[src_spk]
print(f'src_spk={src_spk} src_phrase={src_phrase}')
# Speakers to convert the source utterance to.
tgt_spks = ['351', '364', '376']

src_spk=343 src_phrase=I've done nothing wrong, and that's the truth.


#### Source Audio

In [16]:
# Load the source utterance (spectrogram, waveform, speaker ID), then play and save it.
global_phrase_idx = spk2filelist[src_spk][src_phrase_idx]
_, _, spec, spec_lengths, y, y_lengths, sid_src = collate_fn([dataset[global_phrase_idx]])
src_audio = y.float().numpy()[0,0]
# Save with the same configured rate used for playback, rather than a rate
# left over from reading an unrelated WAV in another cell.
sr_src = hps_spkemb1.data.sampling_rate
ipd.display(ipd.Audio(src_audio, rate=sr_src, normalize=False))
filename = 'audios/vc_src_spk%s_p%d.wav' % (src_spk,src_phrase_idx)
write(filename, sr_src, src_audio)

#### Conversion: Look-up Table model and Spk Encoder model

In [21]:
# Convert the source utterance to every target speaker with both models, then
# play and save the target reference plus the two conversions.
vc_rate = hps_base.data.sampling_rate
for tgt_spk in tgt_spks:
    global_phrase_idx = spk2filelist[tgt_spk][spkemb_phrase_idx]
    with torch.no_grad():
        # Utterance of the target speaker: supplies the target speaker ID for
        # the baseline and the target spectrogram for the speaker-encoder model.
        _, _, spk_spec, spk_spec_lengths, y, y_lengths, sid_tgt = collate_fn([dataset[global_phrase_idx]])
        audio_base = net_g_base.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][0,0].data.cpu().float().numpy()
        audio_spkemb1 = net_g_spkemb1.voice_conversion(spec, spec_lengths, y_tgt=spk_spec, y_tgt_lengths=spk_spec_lengths)[0][0,0].data.cpu().float().numpy()
    # "GT" is the waveform of the target speaker's loaded utterance.
    # NOTE(review): its file name carries src_phrase_idx although the audio is
    # the target's spkemb_phrase_idx utterance — confirm this naming is intended.
    audio_gt = y[0,0].cpu().numpy()
    clips = (
        ("GT SID: %s", 'audios/vc_gt_src%s_spk%s_p%d.wav', audio_gt),
        ("Converted Baseline SID: %s", 'audios/vc_base_src%s_spk%s_p%d.wav', audio_base),
        ("Converted SpkEmb1 SID: %s", 'audios/vc_spkemb1_src%s_spk%s_p%d.wav', audio_spkemb1),
    )
    for banner, path_fmt, wav in clips:
        print(banner % tgt_spk)
        ipd.display(ipd.Audio(wav, rate=vc_rate, normalize=False))
        filename = path_fmt % (src_spk, tgt_spk, src_phrase_idx)
        write(filename, vc_rate, wav)

GT SID: 351


Converted Baseline SID: 351


Converted SpkEmb1 SID: 351


GT SID: 364


Converted Baseline SID: 364


Converted SpkEmb1 SID: 364


GT SID: 376


Converted Baseline SID: 376


Converted SpkEmb1 SID: 376
