In [1]:
import os
import json
import wandb
import random
import argparse
import itertools
import numpy as np
from time import gmtime, strftime

import soundfile as sf

import torch
import torchaudio
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
import PIL

import sys
sys.path.append('../training')
sys.path.append('../training/networks/nansypp')
sys.path.append('../testing')
from hifigan.generator import HifiganGenerator
from hifigan.vctk_loader import VCTK_dataset, LogMelSpectrogram

from datasets.loader import Dataset
from networks.discriminator import Discriminator
# from networks.f2v import F2V_Ecapa
from nansypp import Nansypp
from hyface import Nansy, BShall_Nimbre, BShall_Ecapa
# import loss_funcs

from utils import audio_utils
from utils.data_utils import phoneme_inventory, decollate_tensor, combine_fixed_length
from utils import utils

import IPython.display as ipd

  from .autonotebook import tqdm as notebook_tqdm


# Setting

## Argument setting

In [2]:
# main_dir = '/home/jaejun/nansy/
# Load the BShall_Nimbre training configuration and wrap it in HParams.
# json.load() parses directly from the file handle, so the raw file text is not
# left behind in a generic global named `data` (a later cell indexes `data` as a
# batch dict).
config_path = '/disk3/jaejun/hyface/bshall_nimbre/logs/config.json'
with open(config_path, "r") as f:
    config = json.load(f)
args = utils.HParams(**config)

# Point meta_root at the training directory; the dataset cell joins it with 'filelists'.
args.meta_root = '/home/jaejun/hyface/training'
# args.base_dir = '/home/jaejun/temp_jaejun/hyface/230810/checkpoints'

# Data

## VCTK

In [3]:
# Sample rate printed next to every clip loaded in the cells that follow.
sr = 16000

# Each entry directly under vctk_dir is treated as one speaker.
vctk_dir = '/disk2/vctk/modified/wav16_cleaned'
vctk_spkrs = os.listdir(vctk_dir)
print(len(vctk_spkrs))

108


In [4]:
# Female 1 (VCTK p225): random crop of at most 4*16000 samples plus the
# feature frames that line up with it.
vctk_f1_spk, vctk_f1_file = 'p225', 'p225_001.wav'
vctk_f1_path = os.path.join(vctk_dir, vctk_f1_spk, vctk_f1_file)

# With pos='random', load_wav hands back the crop together with its start sample.
vctk_f1, pos = audio_utils.load_wav(path=vctk_f1_path, max_len=4*16000, pos='random')

# Convert the start sample into a frame index (the code assumes 320 samples
# per feature frame — TODO confirm against the feature extractor).
hubert_pos = int(np.round(pos / 320))
vctk_f1_emb_path = vctk_f1_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')
vctk_f1_hubert = torch.load(vctk_f1_emb_path).squeeze(0)
vctk_f1_hubert = vctk_f1_hubert[hubert_pos:hubert_pos+4*50]

print("Hubert:", vctk_f1_hubert.shape)
print(vctk_f1.shape, sr, pos)
ipd.Audio(vctk_f1, rate=16000)

Hubert: torch.Size([102, 256])
(32825,) 16000


In [5]:
32825 / 102

321.8137254901961

In [11]:
int(160.6)

160

In [12]:
# Female 2 (VCTK p238): random crop of at most 4*16000 samples plus the
# feature frames that line up with it.
vctk_f2_spk, vctk_f2_file = 'p238', 'p238_003.wav'
vctk_f2_path = os.path.join(vctk_dir, vctk_f2_spk, vctk_f2_file)

# With pos='random', load_wav hands back the crop together with its start sample.
vctk_f2, pos = audio_utils.load_wav(path=vctk_f2_path, max_len=4*16000, pos='random')

# Convert the start sample into a frame index (the code assumes 320 samples
# per feature frame — TODO confirm against the feature extractor).
hubert_pos = int(np.round(pos / 320))
vctk_f2_emb_path = vctk_f2_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')
vctk_f2_hubert = torch.load(vctk_f2_emb_path).squeeze(0)
vctk_f2_hubert = vctk_f2_hubert[hubert_pos:hubert_pos+4*50]

print("Hubert:", vctk_f2_hubert.shape)
print(vctk_f2.shape, sr, pos)
ipd.Audio(vctk_f2, rate=16000)

Hubert: torch.Size([200, 256])
(64000,) 16000 22602


In [13]:
# Male 1 (VCTK p226): clip capped at 4*16000 samples, paired with the first
# 4*50 frames of its precomputed features.
vctk_m1_spk, vctk_m1_file = 'p226', 'p226_002.wav'
vctk_m1_path = os.path.join(vctk_dir, vctk_m1_spk, vctk_m1_file)
vctk_m1 = audio_utils.load_wav(path=vctk_m1_path, max_len=4*16000)

# The feature file sits next to the wav tree: same relative path, '.emb' suffix.
vctk_m1_emb_path = vctk_m1_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')
vctk_m1_hubert = torch.load(vctk_m1_emb_path).squeeze(0)[:4*50]

print("Hubert:", vctk_m1_hubert.shape)
print(vctk_m1.shape, sr)
ipd.Audio(vctk_m1, rate=16000)

Hubert: torch.Size([200, 256])
(64000,) 16000


In [14]:
# Male 2 (VCTK p237): clip capped at 4*16000 samples, paired with the first
# 4*50 frames of its precomputed features.
vctk_m2_spk, vctk_m2_file = 'p237', 'p237_004.wav'
vctk_m2_path = os.path.join(vctk_dir, vctk_m2_spk, vctk_m2_file)
vctk_m2 = audio_utils.load_wav(path=vctk_m2_path, max_len=4*16000)

# The feature file sits next to the wav tree: same relative path, '.emb' suffix.
vctk_m2_emb_path = vctk_m2_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')
vctk_m2_hubert = torch.load(vctk_m2_emb_path).squeeze(0)[:4*50]

print("Hubert:", vctk_m2_hubert.shape)
print(vctk_m2.shape, sr)
ipd.Audio(vctk_m2, rate=16000)

Hubert: torch.Size([200, 256])
(64000,) 16000


In [15]:
# VCTK training split. Its collate function is also reused further down to
# batch hand-picked clips.
trainset = Dataset(
    args,
    meta_root=os.path.join(args.meta_root, 'filelists'),
    mode='train',
    datasets=['vctk'],
    sample_rate=args.data.sample_rate,
)
train_loader = DataLoader(
    trainset,
    batch_size=4,
    collate_fn=trainset.collate,
    shuffle=True,
)

## VoxCeleb2

In [32]:
# Dataset roots used by the VoxCeleb2 sample cells.
voxdir = '/disk2/VoxCeleb2/VoxCeleb2/original'
vox_ecapa_dir = '/disk2/VoxCeleb2/VoxCeleb2/modified/ecapa_16000'
vggdir = '/disk2/VGG_Face2/data/original'

# Speakers are counted from the entries of the ECAPA directory.
vox_spkrs = os.listdir(vox_ecapa_dir)
print(len(vox_spkrs))

6112


In [53]:
# male 1
# Pick one clip for speaker n000012. os.listdir() returns entries in arbitrary
# order, so sort before the seeded shuffle; otherwise the same seed can select
# a different file on another machine.
random.seed(1)
spk = 'n000012'
rnd_aud_paths = sorted(os.listdir(os.path.join(voxdir, spk)))
random.shuffle(rnd_aud_paths)
rnd_aud_path = os.path.join(voxdir, spk, rnd_aud_paths[0])
vox_m1 = audio_utils.load_wav(path=rnd_aud_path, max_len=4*16000)
# NOTE(review): the linguistic features come from the VCTK m1 utterance, not
# from the VoxCeleb clip loaded above — confirm this pairing is intentional.
vox_m1_hubert = torch.load(vctk_m1_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')).squeeze(0)[:4*50]
# Despite its name, ecapa_dir is the path of a single per-speaker .npy file.
ecapa_dir = os.path.join(voxdir, spk).replace('original','modified/ecapa_avg') + '.npy'
vox_m1_ecapa = torch.tensor(np.load(ecapa_dir))
print(vox_m1_ecapa.shape)
ipd.Audio(vox_m1, rate=16000)

torch.Size([1, 192])


In [52]:
# male 2
# Pick one clip for speaker n000029. os.listdir() returns entries in arbitrary
# order, so sort before the seeded shuffle; otherwise the same seed can select
# a different file on another machine.
random.seed(2)
spk = 'n000029'
rnd_aud_paths = sorted(os.listdir(os.path.join(voxdir, spk)))
random.shuffle(rnd_aud_paths)
rnd_aud_path = os.path.join(voxdir, spk, rnd_aud_paths[0])
vox_m2 = audio_utils.load_wav(path=rnd_aud_path, max_len=4*16000)
# NOTE(review): the linguistic features come from the VCTK m2 utterance, not
# from the VoxCeleb clip loaded above — confirm this pairing is intentional.
vox_m2_hubert = torch.load(vctk_m2_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')).squeeze(0)[:4*50]
# Despite its name, ecapa_dir is the path of a single per-speaker .npy file.
ecapa_dir = os.path.join(voxdir, spk).replace('original','modified/ecapa_avg') + '.npy'
vox_m2_ecapa = torch.tensor(np.load(ecapa_dir))
print(vox_m2_ecapa.shape)
ipd.Audio(vox_m2, rate=16000)

torch.Size([1, 192])


In [51]:
# female 1
# Pick one clip for speaker n000024. os.listdir() returns entries in arbitrary
# order, so sort before the seeded shuffle; otherwise the same seed can select
# a different file on another machine.
random.seed(2)
spk = 'n000024'
rnd_aud_paths = sorted(os.listdir(os.path.join(voxdir, spk)))
random.shuffle(rnd_aud_paths)
rnd_aud_path = os.path.join(voxdir, spk, rnd_aud_paths[0])
vox_f1 = audio_utils.load_wav(path=rnd_aud_path, max_len=4*16000)
# NOTE(review): the linguistic features come from the VCTK f1 utterance, not
# from the VoxCeleb clip loaded above — confirm this pairing is intentional.
vox_f1_hubert = torch.load(vctk_f1_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')).squeeze(0)[:4*50]
# Despite its name, ecapa_dir is the path of a single per-speaker .npy file.
ecapa_dir = os.path.join(voxdir, spk).replace('original','modified/ecapa_avg') + '.npy'
vox_f1_ecapa = torch.tensor(np.load(ecapa_dir))
print(vox_f1_ecapa.shape)
ipd.Audio(vox_f1, rate=16000)

torch.Size([1, 192])


In [50]:
# female 2
# Pick one clip for speaker n000042. os.listdir() returns entries in arbitrary
# order, so sort before the seeded shuffle; otherwise the same seed can select
# a different file on another machine.
random.seed(4)
spk = 'n000042'
rnd_aud_paths = sorted(os.listdir(os.path.join(voxdir, spk)))
random.shuffle(rnd_aud_paths)
rnd_aud_path = os.path.join(voxdir, spk, rnd_aud_paths[0])
vox_f2 = audio_utils.load_wav(path=rnd_aud_path, max_len=4*16000)
# NOTE(review): the linguistic features come from the VCTK f2 utterance, not
# from the VoxCeleb clip loaded above — confirm this pairing is intentional.
vox_f2_hubert = torch.load(vctk_f2_path.replace('wav16_cleaned', 'hubert_soft').replace('.wav','.emb')).squeeze(0)[:4*50]
# Despite its name, ecapa_dir is the path of a single per-speaker .npy file.
ecapa_dir = os.path.join(voxdir, spk).replace('original','modified/ecapa_avg') + '.npy'
vox_f2_ecapa = torch.tensor(np.load(ecapa_dir))
print(vox_f2_ecapa.shape)
ipd.Audio(vox_f2, rate=16000)

torch.Size([1, 192])


# Model

## HifiGAN (for vocoding)

In [16]:
# main_dir = '/home/jaejun/nansy/
# Load the HiFi-GAN vocoder configuration. json.load() parses directly from the
# file handle, so the raw file text is not left in a generic global named `data`.
hfgan_config_path = '/disk3/jaejun/hifigan/230807/logs/config.json'
with open(hfgan_config_path, "r") as f:
    config = json.load(f)
hifigan_args = utils.HParams(**config)

In [17]:
# Log-mel transform configured entirely from the HiFi-GAN data settings.
_hfgan_data = hifigan_args.data
melspectrogram = LogMelSpectrogram(
    sample_rate=_hfgan_data.sample_rate,
    n_fft=_hfgan_data.filter_length,
    hop_length=_hfgan_data.hop_length,
    win_length=_hfgan_data.win_length,
    n_mels=_hfgan_data.n_mel_channels,
    center=False,
)



In [18]:
# Build the vocoder generator with its default constructor arguments; the
# checkpoint is applied by a separate utils.load_checkpoint call.
hfgan = HifiganGenerator()

In [20]:
# Restore the vocoder weights from the chosen training iteration.
checkpoint_iter = 500
checkpoint_path = f'/disk3/jaejun/hifigan/230807/checkpoints/G_{checkpoint_iter}.pth'
hfgan, _, _, _ = utils.load_checkpoint(checkpoint_path, hfgan)
# This notebook only runs inference, so switch off training-mode behaviour
# (dropout, batch-statistics normalisation). eval() returns the module itself;
# assigning it keeps the cell from echoing the module repr.
hfgan = hfgan.eval()

Loaded checkpoint '/disk3/jaejun/hifigan/230807/checkpoints/G_500.pth' (Epoch 500)


## BShall_Nimbre

In [21]:
# main_dir = '/home/jaejun/nansy/
# Reload the BShall_Nimbre config before building the model.
# NOTE(review): this rebinds `args`, so the meta_root override applied when the
# config was first loaded is not carried over — confirm nothing below needs it.
config_path = '/disk3/jaejun/hyface/bshall_nimbre/logs/config.json'
with open(config_path, "r") as f:
    # Parse from the handle so the raw text is not left in a global named `data`.
    config = json.load(f)
args = utils.HParams(**config)

# args.meta_root = '/disk3/jaejun/hyface/training'
# args.base_dir = '/disk3/jaejun/hyface/bshall/checkpoints'

In [22]:
# Choose the model class from the config: the generator family first, then
# (for "bshall") the timbre encoder type.
if args.model.generator == "nansy":
    hyface = Nansy(args)
elif args.model.generator == "bshall" and args.model.timbre.type == "nansy":
    hyface = BShall_Nimbre(args)
elif args.model.generator == "bshall" and args.model.timbre.type == "ecapa":
    hyface = BShall_Ecapa(args)
else:
    # Fail here rather than with a NameError on `hyface` in a later cell.
    raise ValueError(
        f"Unsupported model config: generator={args.model.generator!r} "
        "(for 'bshall', model.timbre.type must be 'nansy' or 'ecapa')"
    )

In [24]:
# Restore the model weights from the chosen checkpoint index.
index = 3000
checkpoint_path = f'/disk3/jaejun/hyface/bshall_nimbre/checkpoints/G_{index}.pth'
hyface, _, _, _ = utils.load_checkpoint(checkpoint_path, hyface, None)
# Inference-only notebook: leave training mode so layers such as dropout or
# batch normalisation (if the model has any) behave deterministically and do not
# depend on what else is in the batch. Assigning the result avoids echoing the
# module repr.
hyface = hyface.eval()

Loaded checkpoint '/disk3/jaejun/hyface/bshall_nimbre/checkpoints/G_3000.pth' (Epoch 3000)


# Test

## Recon

### ling : m2, timbre : m2

In [179]:
# Duplicate the m2 clip into a batch of two for the reconstruction test.
m2_wave = torch.tensor(vctk_m2)
timbre_dup = torch.stack([m2_wave, m2_wave])
# The loading cell leaves vctk_m2_hubert 2-D (frames, feature dim), so vstack
# would concatenate the two copies along the frame axis instead of batching
# them. Flatten any leading singleton axis, then add a real batch axis.
m2_frames = vctk_m2_hubert.reshape(-1, vctk_m2_hubert.shape[-1])
hubert_dup = torch.stack([m2_frames, m2_frames])

In [180]:
# NOTE(review): the recorded output of this cell is 3-D, while the cell that
# loads vctk_m2_hubert prints a 2-D [200, 256] shape — this output presumably
# predates the squeeze(0)/slicing in the loading cell.
print(vctk_m2_hubert.shape)

torch.Size([1, 263, 256])


In [181]:
# Extract the global timbre vector and the timbre bank for the duplicated clip.
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    timbre_global, timbre_bank = hyface.analyze_timbre(timbre_dup)
print(timbre_global.shape, timbre_bank.shape)

torch.Size([2, 192]) torch.Size([2, 128, 50])


In [182]:
# Synthesize a mel-spectrogram from the linguistic features (last two axes
# swapped) and the timbre pair. Inference only, so no autograd graph is built.
with torch.no_grad():
    mel_synth = hyface.synthesize(hubert_dup.transpose(-1,-2), (timbre_global, timbre_bank))
print(mel_synth.shape)

torch.Size([2, 80, 526])


In [183]:
# Vocode the synthesized mel-spectrogram to a waveform. Inference only, so no
# autograd graph is built.
with torch.no_grad():
    audio_synth = hfgan(mel_synth)
print(audio_synth.shape)

torch.Size([2, 1, 84160])


In [184]:
# 1500
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [185]:
# 1500
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

In [186]:
# 3000
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [187]:
# 3000
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

In [61]:
# 3900
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [62]:
# 3900
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

=> Reconstruction works well (though the audio quality sounds somewhat off)

### ling : f2, timbre : f2

In [108]:
# Reconstruction with the f2 clip as both linguistic and timbre source.
# vctk_f2 comes straight from audio_utils.load_wav (its printed shape is a plain
# tuple, i.e. a NumPy array), which torch.vstack rejects, so convert it first.
f2_wave = torch.tensor(vctk_f2)
timbre_dup = torch.stack([f2_wave, f2_wave])
# vctk_f2_hubert is 2-D (frames, feature dim) after its loading cell, so vstack
# would concatenate the copies along the frame axis. Flatten any leading
# singleton axis, then add a real batch axis.
f2_frames = vctk_f2_hubert.reshape(-1, vctk_f2_hubert.shape[-1])
hubert_dup = torch.stack([f2_frames, f2_frames])
# Inference only — no autograd graph needed.
with torch.no_grad():
    timbre_global, timbre_bank = hyface.analyze_timbre(timbre_dup)
    print(timbre_global.shape, timbre_bank.shape)
    mel_synth = hyface.synthesize(hubert_dup.transpose(-1,-2), (timbre_global, timbre_bank))
    print(mel_synth.shape)
    audio_synth = hfgan(mel_synth)
    print(audio_synth.shape)

torch.Size([2, 192]) torch.Size([2, 128, 50])
torch.Size([2, 80, 832])
torch.Size([2, 1, 133120])


In [109]:
# 3900
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [110]:
# 3900
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

### VCTK conversion — ling : [f1, f2, m1, m2], timbre : [m2, m1, f2, f1]

In [25]:
# Assemble the four VCTK clips into one batch through the training collate function.
audios = [vctk_f1, vctk_f2, vctk_m1, vctk_m2]
huberts = [vctk_f1_hubert, vctk_f2_hubert, vctk_m1_hubert, vctk_m2_hubert]
# Each item handed to collate is an (audio, hubert) tuple.
virtual_batch = trainset.collate(list(zip(audios, huberts)))

In [26]:
# Voice conversion on the VCTK batch: each item keeps its own linguistic
# features and takes the timbre of the item at the mirrored position.
indices = [3,2,1,0]
# Inference only — no autograd graph needed.
with torch.no_grad():
    timbre_global, timbre_bank = hyface.analyze_timbre(torch.tensor(virtual_batch['audio']))
    print(timbre_global.shape, timbre_bank.shape)
    timbre_global_perm, timbre_bank_perm = timbre_global[indices], timbre_bank[indices]
    mel_synth = hyface.synthesize(virtual_batch['hubert'], (timbre_global_perm, timbre_bank_perm))
    print(mel_synth.shape)
    audio_synth = hfgan(mel_synth)
    print(audio_synth.shape)

torch.Size([4, 192]) torch.Size([4, 128, 50])
torch.Size([4, 80, 400])
torch.Size([4, 1, 64000])


In [27]:
# 1800
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [28]:
# 1800
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

In [29]:
# 1800
ipd.Audio(audio_synth[2].detach().numpy(), rate=16000)

In [30]:
# 1800
ipd.Audio(audio_synth[3].detach().numpy(), rate=16000)

### VoxCeleb — ling : VCTK HuBERT [f1, f2, m1, m2], timbre : VoxCeleb [f1, f2, m1, m2], then reversed

In [63]:
# Assemble the four VoxCeleb clips into one batch through the training collate function.
audios = [vox_f1, vox_f2, vox_m1, vox_m2]
huberts = [vox_f1_hubert, vox_f2_hubert, vox_m1_hubert, vox_m2_hubert]
# Each item handed to collate is an (audio, hubert) tuple.
virtual_batch = trainset.collate(list(zip(audios, huberts)))

In [64]:
# Identity ordering: every item is synthesized with its own timbre.
indices = [0,1,2,3]
# Inference only — no autograd graph needed.
with torch.no_grad():
    timbre_global, timbre_bank = hyface.analyze_timbre(torch.tensor(virtual_batch['audio']))
    print(timbre_global.shape, timbre_bank.shape)
    timbre_global_perm, timbre_bank_perm = timbre_global[indices], timbre_bank[indices]
    mel_synth = hyface.synthesize(virtual_batch['hubert'], (timbre_global_perm, timbre_bank_perm))
    print(mel_synth.shape)
    audio_synth = hfgan(mel_synth)
    print(audio_synth.shape)

torch.Size([4, 192]) torch.Size([4, 128, 50])
torch.Size([4, 80, 400])
torch.Size([4, 1, 64000])


In [68]:
# 1800
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [69]:
# 1800
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

In [70]:
# 1800
ipd.Audio(audio_synth[2].detach().numpy(), rate=16000)

In [71]:
# 1800
ipd.Audio(audio_synth[3].detach().numpy(), rate=16000)

In [77]:
# Reversed ordering: each item takes the timbre of the item at the mirrored position.
indices = [3,2,1,0]
# Inference only — no autograd graph needed.
with torch.no_grad():
    timbre_global, timbre_bank = hyface.analyze_timbre(torch.tensor(virtual_batch['audio']))
    print(timbre_global.shape, timbre_bank.shape)
    timbre_global_perm, timbre_bank_perm = timbre_global[indices], timbre_bank[indices]
    mel_synth = hyface.synthesize(virtual_batch['hubert'], (timbre_global_perm, timbre_bank_perm))
    print(mel_synth.shape)
    audio_synth = hfgan(mel_synth)
    print(audio_synth.shape)

torch.Size([4, 192]) torch.Size([4, 128, 50])
torch.Size([4, 80, 400])
torch.Size([4, 1, 64000])


In [86]:
# 1800
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [87]:
# 1800
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

In [88]:
# 1800
ipd.Audio(audio_synth[2].detach().numpy(), rate=16000)

In [89]:
# 1800
ipd.Audio(audio_synth[3].detach().numpy(), rate=16000)

## Open question: inference fails when the data is loaded directly, but VC works well with batches from the loader

In [225]:
# Recon
indices = [0,1,2,3]
timbre_global, timbre_bank = hyface.analyze_timbre(torch.tensor(data['audio']))
print(timbre_global.shape, timbre_bank.shape)
timbre_global_perm, timbre_bank_perm = timbre_global[indices], timbre_bank
mel_synth = hyface.synthesize(data['hubert'], (timbre_global_perm, timbre_bank_perm))
print(mel_synth.shape)
audio_synth = hfgan(mel_synth)
print(audio_synth.shape)

torch.Size([4, 192]) torch.Size([4, 128, 50])
torch.Size([4, 80, 400])
torch.Size([4, 1, 64000])


In [226]:
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [227]:
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

In [228]:
ipd.Audio(audio_synth[2].detach().numpy(), rate=16000)

In [229]:
ipd.Audio(audio_synth[3].detach().numpy(), rate=16000)

In [230]:
# Conversion
indices = [3,2,1,0]
timbre_global_perm, timbre_bank_perm = timbre_global[indices], timbre_bank[indices]
mel_synth = hyface.synthesize(data['hubert'], (timbre_global_perm, timbre_bank_perm))
print(mel_synth.shape)
audio_synth = hfgan(mel_synth)
print(audio_synth.shape)

torch.Size([4, 80, 400])
torch.Size([4, 1, 64000])


In [231]:
ipd.Audio(audio_synth[0].detach().numpy(), rate=16000)

In [232]:
ipd.Audio(audio_synth[1].detach().numpy(), rate=16000)

In [233]:
ipd.Audio(audio_synth[2].detach().numpy(), rate=16000)

In [234]:
ipd.Audio(audio_synth[3].detach().numpy(), rate=16000)

=> Oh, VC works well too??

In [208]:
# Sanity check: read the f1 wav with torchaudio to compare against audio_utils.load_wav.
torchaudio.load(vctk_f1_path)

(tensor([[-0.0020, -0.0037, -0.0031,  ..., -0.0283, -0.0323,  0.0000]]), 16000)

In [209]:
# Same file through the project loader; called without max_len/pos it returns a bare array.
audio_utils.load_wav(vctk_f1_path)

array([-0.00195312, -0.00366211, -0.00311279, ..., -0.02825928,
       -0.03234863,  0.        ], dtype=float32)

In [153]:
800 / 1000

0.8