In [55]:
import json
import os
import sys

import numpy as np
import torch
import torchaudio
from torch.utils.data import DataLoader

import IPython.display as ipd

sys.path.append('../testing')
from hifigan.generator import HifiganGenerator
from hifigan.vctk_loader import VCTK_dataset, LogMelSpectrogram

sys.path.append('../training')
from networks.discriminator import Discriminator
from hyface import Nansy, BShall
from datasets.loader import Dataset

from utils import utils

# HiFiGAN

## Setting

In [14]:
# Load the HiFi-GAN configuration and wrap it in utils.HParams.
# The JSON is parsed straight from the file handle so that no temporary `data`
# string is left in the namespace (`data` is used for the batch further down).
config_path = '/home/jaejun/temp_jaejun/hifigan/logs/config.json'
with open(config_path, "r") as f:
    config = json.load(f)
hifigan_args = utils.HParams(**config)

In [154]:
# Log-mel front end configured from the `data` section of the HiFi-GAN config.
hifigan_data = hifigan_args.data
melspectrogram = LogMelSpectrogram(
    sample_rate=hifigan_data.sample_rate,
    n_fft=hifigan_data.filter_length,
    hop_length=hifigan_data.hop_length,
    win_length=hifigan_data.win_length,
    n_mels=hifigan_data.n_mel_channels,
    center=False,
)

In [155]:
# Build the HiFi-GAN generator with its default constructor arguments;
# its weights are restored from a checkpoint in a following cell.
hfgan = HifiganGenerator()

In [25]:
# Restore the generator weights from a saved checkpoint.
checkpoint_iter = 500
checkpoint_path = f'/home/jaejun/temp_jaejun/hifigan/checkpoints/G_{checkpoint_iter}.pth'
# load_checkpoint returns four values; only the first (the model) is used here.
hfgan, _, _, _ = utils.load_checkpoint(checkpoint_path, hfgan)
# The notebook only runs inference, so put the model in evaluation mode
# (disables training-only behaviour such as dropout / batch-norm updates).
hfgan = hfgan.eval()

Loaded checkpoint '/home/jaejun/temp_jaejun/hifigan/checkpoints/G_500.pth' (Epoch 500)


# HyFace

## Setting

In [37]:
# Load the HyFace (BShall) configuration and wrap it in utils.HParams.
# The JSON is parsed straight from the file handle so that no temporary `data`
# string is left in the namespace (`data` is used for the batch further down).
config_path = '/home/jaejun/temp_jaejun/hyface/bshall/logs/config.json'
with open(config_path, "r") as f:
    config = json.load(f)
bshall_args = utils.HParams(**config)

# Root directory that contains the `filelists` folder used by the dataset.
meta_root = "/home/jaejun/hyface/training"

In [67]:
# VCTK test split, resampled to the rate given in the BShall config.
testset = Dataset(
    bshall_args,
    meta_root=os.path.join(meta_root, 'filelists'),
    mode='test',
    datasets=['vctk'],
    sample_rate=bshall_args.data.sample_rate,
)

# Batches of four shuffled utterances; NumPy is re-seeded per worker from the torch seed.
valid_loader = DataLoader(
    testset,
    batch_size=4,
    shuffle=True,
    collate_fn=testset.collate,
    worker_init_fn=lambda _: np.random.seed(int(torch.initial_seed()) % (2**32 - 1)),
)

In [68]:
# Draw one batch; the rest of the notebook works on this `data` dict.
batch_iter = iter(valid_loader)
data = next(batch_iter)
print(data['audio'].shape, data['hubert'].shape, data['frame_lengths'])

(4, 75840) torch.Size([4, 256, 237]) [408 438 474 324]


In [50]:
# Play the first clip of the batch at the rate the dataset was built with,
# instead of a hard-coded 16000.
ipd.Audio(data['audio'][0], rate=bshall_args.data.sample_rate)

In [58]:
# Build the BShall model from the hyper-parameters loaded into `bshall_args`;
# its weights are restored from a checkpoint in a following cell.
hyface = BShall(bshall_args)

In [60]:
# Restore the BShall weights from a saved checkpoint.
checkpoint_iter = 1800
checkpoint_path = f'/home/jaejun/temp_jaejun/hyface/bshall/checkpoints/G_{checkpoint_iter}.pth'
# load_checkpoint returns four values; only the first (the model) is used here.
hyface, _, _, _ = utils.load_checkpoint(checkpoint_path, hyface)
# The notebook only runs inference, so put the model in evaluation mode
# (disables training-only behaviour such as dropout / batch-norm updates).
hyface = hyface.eval()

Loaded checkpoint '/home/jaejun/temp_jaejun/hyface/bshall/checkpoints/G_1800.pth' (Epoch 1800)


## HiFiGAN test

In [52]:
# Convert the batch audio to a tensor and compute its log-mel spectrogram.
audio = data['audio']
audio_tensor = torch.tensor(audio)
melspec = melspectrogram(audio_tensor)
print(audio.shape, melspec.shape)

(4, 54400) torch.Size([4, 80, 340])


In [53]:
# Vocode the ground-truth mel spectrogram.
# Inference only: no autograd graph is needed, so run under no_grad.
with torch.no_grad():
    pred_audio = hfgan(melspec)
pred_audio.shape

torch.Size([4, 1, 54400])

In [57]:
# Play the first vocoded clip at the HiFi-GAN config sample rate (not a hard-coded 16000);
# .cpu() keeps the conversion to NumPy valid if the model is ever moved to a GPU.
ipd.Audio(pred_audio[0].detach().cpu().numpy(), rate=hifigan_args.data.sample_rate)

## HyFace test

In [87]:
# Split the batch into its four clips and keep the HuBERT features at hand.
hubert = data['hubert']
audio1, audio2, audio3, audio4 = data['audio']
print(*(clip.shape for clip in (audio1, audio2, audio3, audio4)))
print(hubert.shape)

(75840,) (75840,) (75840,) (75840,)
torch.Size([4, 256, 237])


In [88]:
# Play every source clip of the batch in one cell (replaces four copy-pasted cells).
# The playback rate comes from the config the dataset was built with rather than a literal 16000.
for clip in (audio1, audio2, audio3, audio4):
    ipd.display(ipd.Audio(clip, rate=bshall_args.data.sample_rate))

In [97]:
# Extract the global timbre embedding and the timbre bank for each clip.
# Inference only: no autograd graph is needed, so run under no_grad.
with torch.no_grad():
    timbre_global, timbre_bank = hyface.analyze_timbre(torch.tensor(data['audio']))
print(timbre_global.shape, timbre_bank.shape)

torch.Size([4, 192]) torch.Size([4, 128, 50])


In [98]:
# Reconstruction: every utterance is synthesised with its own timbre (order 0, 1, 2, 3).
# Inference only: no autograd graph is needed, so run under no_grad.
with torch.no_grad():
    mel_synth = hyface.synthesize(hubert, timbre_global, timbre_bank)
print(mel_synth.shape)

torch.Size([4, 80, 474])


In [80]:
# Vocode the synthesised mel spectrograms.
# Inference only: no autograd graph is needed, so run under no_grad.
with torch.no_grad():
    audio_synth = hfgan(mel_synth)
print(audio_synth.shape)

torch.Size([4, 1, 75840])


In [81]:
# Play every synthesised clip in one cell (replaces four copy-pasted cells).
# The rate comes from the HiFi-GAN config instead of a literal 16000;
# .cpu() keeps the conversion to NumPy valid if the model is ever moved to a GPU.
for clip in audio_synth:
    ipd.display(ipd.Audio(clip.detach().cpu().numpy(), rate=hifigan_args.data.sample_rate))

=> Reconstruction works well (though the audio quality sounds somewhat off).

In [125]:
# Voice conversion: reverse the batch order of the global timbre embedding
# (0-based indices 3, 2, 1, 0) while the HuBERT content stays in place.
indices = [3, 2, 1, 0]
timbre_global_perm = timbre_global[indices]
# NOTE(review): the timbre bank is left in its original order here; the variant
# that also permutes it would be `timbre_bank[indices]` - confirm which is intended.
timbre_bank_perm = timbre_bank
# Inference only: no autograd graph is needed, so run under no_grad.
with torch.no_grad():
    mel_synth = hyface.synthesize(hubert, timbre_global_perm, timbre_bank_perm)
print(mel_synth.shape)

torch.Size([4, 80, 474])


In [126]:
# Vocode the converted mel spectrograms.
# Inference only: no autograd graph is needed, so run under no_grad.
with torch.no_grad():
    audio_synth = hfgan(mel_synth)
print(audio_synth.shape)

torch.Size([4, 1, 75840])


In [127]:
# Play every converted clip in one cell (replaces four copy-pasted cells).
# The rate comes from the HiFi-GAN config instead of a literal 16000;
# .cpu() keeps the conversion to NumPy valid if the model is ever moved to a GPU.
for clip in audio_synth:
    ipd.display(ipd.Audio(clip.detach().cpu().numpy(), rate=hifigan_args.data.sample_rate))

=> Voice conversion (VC) seems to work well too.

In [141]:
# Load the first 60000 samples of one VoxCeleb2 test utterance.
# torchaudio comes from the notebook's import cell, so this cell also runs on a
# fresh kernel; the unused `i = 0` assignment has been dropped.
filedir = '/data2/VoxCeleb2/test/n000017/id00017+bonafide+01dfn2spqyE_00001.wav'
y, sr = torchaudio.load(filedir, num_frames=60000)
print(y.shape, sr)

torch.Size([1, 60000]) 16000


In [132]:
import torchaudio

In [148]:
# Load three seconds' worth of 16000-sample chunks when i == 0, otherwise a single chunk.
i = 0
num_frames = 16000 * 3 if i == 0 else 16000
y, sr = torchaudio.load(filedir, num_frames=num_frames)
y.shape, sr

(torch.Size([1, 48000]), 16000)

In [151]:
# num_frames must be an int: passing the string "16000" made torchaudio raise
# a RuntimeError (expected Optional[int], got str).
wave_length = 16000
y, sr = torchaudio.load(filedir, num_frames=wave_length)
y.shape, sr

In [152]:
# Scratch arithmetic: 16000 / 320 = 50.0. Presumably the number of 320-sample
# frames per second at a 16 kHz sample rate - TODO confirm what this was checking.
16000/320

50.0

In [153]:
# Scratch arithmetic: 800 / 1000 = 0.8.
# NOTE(review): the purpose is not evident from the notebook - confirm or remove.
800 / 1000

0.8