In [12]:
from __future__ import absolute_import, division, print_function, unicode_literals

import glob
import os
import argparse
import json
import torch
from scipy.io.wavfile import write
from env import AttrDict
from meldataset import mel_spectrogram, MAX_WAV_VALUE
from models_lightvoc import Generator
from stft import TorchSTFT


# Module-level placeholders for the hyper-parameter object and the torch
# device. Both start as None and are rebound by the configuration cell
# before helpers that read them (e.g. get_mel) are called.
h, device = None, None


def load_checkpoint(filepath, device):
    """Deserialize a checkpoint file with ``torch.load``.

    Progress messages are printed before and after loading.

    Args:
        filepath: Path of the checkpoint; must point at an existing file.
        device: Forwarded to ``torch.load`` as ``map_location``.

    Returns:
        The object that ``torch.load`` reads from ``filepath``.

    Raises:
        AssertionError: If ``filepath`` is not an existing file.
    """
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded


def get_mel(x):
    """Compute the mel spectrogram of ``x`` using the global config ``h``.

    ``x`` is forwarded to ``mel_spectrogram`` followed, positionally, by
    ``h.n_fft``, ``h.num_mels``, ``h.sampling_rate``, ``h.hop_size``,
    ``h.win_size``, ``h.fmin`` and ``h.fmax``.

    Requires the module-level ``h`` to have been set to the loaded config.
    """
    analysis_params = (h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size)
    band_limits = (h.fmin, h.fmax)
    return mel_spectrogram(x, *analysis_params, *band_limits)


def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last path in ``cp_dir`` starting with ``prefix``.

    Args:
        cp_dir: Directory to search.
        prefix: Leading part of the file name; ``'*'`` is appended to form
            the glob pattern.

    Returns:
        The greatest matching path in string order, or ``''`` when nothing
        matches. String order equals numeric step order only if the step
        suffixes are zero-padded to equal width.
    """
    matches = glob.glob(os.path.join(cp_dir, prefix + '*'))
    return max(matches) if matches else ''

In [13]:
# Load the model hyper-parameters from the JSON config next to the checkpoint.
config_file = '/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/lightvoc_vctk_new/config.json'
with open(config_file) as f:
    data = f.read()

# No `global` declarations here: at module (cell) level every assignment is
# already global, and `global h` after the earlier `h = None` becomes a
# SyntaxError once the cells are concatenated into a single script.
json_config = json.loads(data)
h = AttrDict(json_config)

# Seed torch (and CUDA when present) from the config, then pick the device.
torch.manual_seed(h.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the generator and the inverse-STFT module on the selected device.
generator = Generator(h).to(device)
stft = TorchSTFT(filter_length=h.gen_istft_n_fft, hop_length=h.gen_istft_hop_size, win_length=h.gen_istft_n_fft).to(device)

# Restore the generator weights stored under the 'generator' key.
state_dict_g = load_checkpoint('/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/lightvoc_vctk_new/g_00000300', device)
generator.load_state_dict(state_dict_g['generator'])

# Inference-only setup: evaluation mode, then strip weight normalization.
generator.eval()
generator.remove_weight_norm()



Loading '/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/lightvoc_vctk_new/g_00000300'
Complete.
Removing weight norm...


In [14]:
from scipy.io.wavfile import read

def load_wav(full_path):
    """Read a WAV file and return its samples as int16 plus the sampling rate.

    Args:
        full_path: Path passed to ``scipy.io.wavfile.read``.

    Returns:
        ``(data, sampling_rate)`` where ``data`` is the sample array cast to
        ``int16`` and ``sampling_rate`` is the rate reported by the file.
    """
    sampling_rate, data = read(full_path)
    # Heuristic kept from the original: a peak of at most 1 is taken to mean
    # the samples are not yet on the MAX_WAV_VALUE scale (presumably float
    # audio in [-1, 1] -- confirm against the expected inputs).
    # `data.max()` reduces over every axis, so multi-channel arrays work and
    # the scan is vectorized; the size guard lets an empty file pass through
    # instead of raising from the reduction.
    if data.size > 0 and data.max() <= 1:
        # Saturate to the int16 range so a full-scale sample does not wrap
        # around when cast below.
        data = (data * MAX_WAV_VALUE).clip(-32768, 32767)
    return data.astype("int16"), sampling_rate

In [27]:
# Keep the inverse-STFT module on the same device that was selected for the
# generator; a hard-coded 'cuda' fails on hosts without a GPU.
stft = stft.to(device)
def inference(filename):
    """Resynthesize a WAV file through the generator and inverse STFT.

    The file is loaded with ``load_wav``, divided by ``MAX_WAV_VALUE``,
    converted to a mel spectrogram with ``get_mel``, passed through
    ``generator`` to obtain ``spec`` and ``phase``, and turned back into a
    waveform with ``stft.inverse``. No gradients are tracked.

    Args:
        filename: Path of the input WAV file.

    Returns:
        A NumPy ``int16`` array holding the squeezed output waveform.
    """
    with torch.no_grad():
        # NOTE(review): the file's sampling rate is not checked against
        # h.sampling_rate -- confirm inputs already match the model config.
        wav, _sr = load_wav(filename)
        wav = wav / MAX_WAV_VALUE
        wav = torch.FloatTensor(wav).to(device)
        x = get_mel(wav.unsqueeze(0))
        spec, phase = generator(x)
        y_g_hat = stft.inverse(spec, phase)
        audio = y_g_hat.squeeze() * MAX_WAV_VALUE
        # Saturate to the int16 range before the cast; without this,
        # out-of-range samples wrap around instead of saturating.
        audio = torch.clamp(audio, -32768, 32767)
        audio = audio.cpu().numpy().astype('int16')

        return audio

In [16]:
from IPython.display import Audio as Audio 

# Run the vocoder over each sample file, in order, and bind the resulting
# waveforms to y1..y6 for the playback cells.
y1, y2, y3, y4, y5, y6 = [
    inference(sample_path)
    for sample_path in (
        "/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/sample/si965.wav",
        "/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/sample/p225_001.wav",
        "/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/sample/p226_002.wav",
        "/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/sample/raw.wav",
        "/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/sample/raw1.wav",
        "/home/jovyan/voice-chung/tuht/vocoder/iSTFTNet-pytorch/sample/raw2.wav",
    )
]



In [17]:
# Play y1, the vocoder output for sample/si965.wav.
# NOTE(review): rate is hard-coded to 16000 -- confirm it matches the audio's sampling rate.
Audio(y1, rate=16000)

In [18]:
# Play y2, the vocoder output for sample/p225_001.wav.
# NOTE(review): rate is hard-coded to 16000 -- confirm it matches the audio's sampling rate.
Audio(y2, rate=16000)

In [19]:
# Play y3, the vocoder output for sample/p226_002.wav.
# NOTE(review): rate is hard-coded to 16000 -- confirm it matches the audio's sampling rate.
Audio(y3, rate=16000)

In [20]:
# Play y4, the vocoder output for sample/raw.wav.
# NOTE(review): rate is hard-coded to 16000 -- confirm it matches the audio's sampling rate.
Audio(y4, rate=16000)

In [21]:
# Play y5, the vocoder output for sample/raw1.wav.
# NOTE(review): rate is hard-coded to 16000 -- confirm it matches the audio's sampling rate.
Audio(y5, rate=16000)

In [22]:
# Play y6, the vocoder output for sample/raw2.wav.
# NOTE(review): rate is hard-coded to 16000 -- confirm it matches the audio's sampling rate.
Audio(y6, rate=16000)