In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
import librosa
import numpy as np
import os
import glob
import json

import torch
from text.text_id import text_to_sequence
from text.symbols import symbols
import commons
import attentions
import modules
import models
import utils



DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.


In [5]:
def plot_waveform(waveform, sr, title="Waveform"):
    """Plot every channel of an audio signal against time in seconds.

    Args:
        waveform: Audio samples laid out as ``(num_channels, num_frames)``.
            May be a ``torch.Tensor`` (any device, with or without grad) or
            anything ``np.asarray`` accepts. A 1-D signal is treated as a
            single channel.
        sr: Sampling rate in Hz; frame indices are divided by it to obtain
            the time axis.
        title: Figure-level title.

    Raises:
        ValueError: If ``waveform`` is neither 1-D nor 2-D.
    """
    if torch.is_tensor(waveform):
        # A bare .numpy() raises for CUDA tensors and for tensors that
        # require grad, so detach and move to host memory first.
        waveform = waveform.detach().cpu().numpy()
    else:
        waveform = np.asarray(waveform)

    if waveform.ndim == 1:
        waveform = waveform[None, :]
    if waveform.ndim != 2:
        raise ValueError(
            "waveform must have shape (num_channels, num_frames); "
            f"got an array with {waveform.ndim} dimensions"
        )

    num_channels, num_frames = waveform.shape
    time_axis = np.arange(num_frames) / sr

    # squeeze=False makes plt.subplots return a 2-D array of Axes for any
    # channel count; without it a single Axes comes back for one channel and
    # an ndarray (which has no .plot) for several.
    figure, axes = plt.subplots(num_channels, 1, squeeze=False)
    for channel, ax in enumerate(axes[:, 0]):
        ax.plot(time_axis, waveform[channel], linewidth=1)
        ax.grid(True)
        if num_channels > 1:
            ax.set_ylabel(f"Channel {channel + 1}")
    figure.suptitle(title)
    plt.show(block=False)


def plot_spectrogram(specgram, title=None, ylabel="freq_bin"):
    """Show a spectrogram as a dB-scaled image with a colorbar.

    Args:
        specgram: Spectrogram values handed to ``librosa.power_to_db`` before
            display. Presumably a 2-D ``(freq_bin, frame)`` array, judging by
            the axis labels -- TODO confirm against callers.
        title: Axes title; a falsy value selects ``"Spectrogram (db)"``.
        ylabel: Label for the vertical axis.
    """
    heading = title if title else "Spectrogram (db)"

    figure, axis = plt.subplots(1, 1)
    axis.set_title(heading)
    axis.set_ylabel(ylabel)
    axis.set_xlabel("frame")

    # origin="lower" puts row 0 of the image at the bottom of the plot.
    spec_db = librosa.power_to_db(specgram)
    image = axis.imshow(spec_db, origin="lower", aspect="auto")
    figure.colorbar(image, ax=axis)
    plt.show(block=False)

In [10]:
# If you are using your own trained model
# NOTE(review): absolute, machine-specific path -- point this at your own run
# directory (the one holding the config and the G_*.pth checkpoints).
model_dir = "/media/caijb/data_drive/glowtts_KR/logs/multi"
hps = utils.get_hparams_from_dir(model_dir)
checkpoint_path = utils.latest_checkpoint_path(model_dir)

# If you are using a provided pretrained model
# hps = utils.get_hparams_from_file("./configs/any_config_file.json")
# checkpoint_path = "/path/to/pretrained_model"

# Prefer the GPU, but fall back to the CPU so that building the model does
# not raise on hosts without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The symbol table grows by one entry when the config enables "add_blank"
# (the boolean adds 0 or 1 to len(symbols)).
# NOTE(review): n_speakers / gin_channels are passed explicitly and
# **hps.model is expanded as well; this relies on hps.model not also
# containing keys with those exact names -- confirm against the config.
model = models.FlowGenerator(
    len(symbols) + getattr(hps.data, "add_blank", False),
    out_channels=hps.data.n_mel_channels,
    n_speakers=hps.model.n_speaker,
    gin_channels=256,
    **hps.model
).to(device)

utils.load_checkpoint(checkpoint_path, model)
model.decoder.store_inverse()  # do not calculate jacobians for fast decoding
_ = model.eval()


# normalizing & type casting
def normalize_audio(x, max_wav_value=hps.data.max_wav_value):
    """Peak-normalize ``x`` to ``max_wav_value`` and cast it to integers.

    The signal is divided by its largest absolute sample, multiplied by
    ``max_wav_value``, clipped to ``[-32768, 32767]`` and returned as an
    ``int32`` array.

    Args:
        x: Audio samples; converted with ``np.asarray``.
        max_wav_value: Target peak amplitude. Defaults to the value from the
            loaded hyper-parameters.

    Returns:
        ``int32`` array with the same shape as ``x``. Silent (all-zero) or
        empty input yields zeros of that shape instead of dividing by a zero
        peak.
    """
    x = np.asarray(x)
    peak = np.abs(x).max() if x.size else 0
    if peak == 0:
        # Dividing by a zero peak would produce NaN, and casting NaN to an
        # integer type gives undefined values; silence stays silence.
        return np.zeros(x.shape, dtype="int32")
    # NOTE(review): values are clipped to the int16 range but stored as
    # int32 -- confirm whether int16 was intended before changing the dtype.
    return np.clip((x / peak) * max_wav_value, -32768, 32767).astype("int32")

/media/caijb/data_drive/glowtts_KR/logs/multi/G_1000.pth
INFO:root:Loaded checkpoint '/media/caijb/data_drive/glowtts_KR/logs/multi/G_1000.pth' (iteration 1000)


In [24]:
tst_stn = "똑바로 말하는 것은 컴퓨터에겐 너무나도 어려운 일입니다."

# The getattr default must agree with the one used when the model's symbol
# table was sized (False): a config without "add_blank" must not get blank ids
# (== len(symbols)) injected, because that id would be outside the table.
if getattr(hps.data, "add_blank", False):
    text_norm = text_to_sequence(tst_stn.strip(), ['english_cleaners'])
    # Insert the blank id between (and around) all symbol ids.
    text_norm = commons.intersperse(text_norm, len(symbols))
else: # If not using "add_blank" option during training, adding spaces at the beginning and the end of utterance improves quality
    tst_stn = " " + tst_stn.strip() + " "
    # Pass the padded sentence as-is; stripping it again here would remove
    # the spaces that were just added.
    text_norm = text_to_sequence(tst_stn, ['english_cleaners'])
print(text_norm)

# Add a leading batch axis: (1, num_symbols).
sequence = np.array(text_norm)[None, :]
print("".join([symbols[c] if c < len(symbols) else "<BNK>" for c in sequence[0]]))

# Place the inputs on whatever device the model lives on instead of
# hard-coding CUDA.
device = next(model.parameters()).device
x_tst = torch.from_numpy(sequence).long().to(device)
x_tst_lengths = torch.tensor([x_tst.shape[1]], device=device)

[147, 16, 147, 39, 147, 52, 147, 19, 147, 31, 147, 17, 147, 39, 147, 11, 147, 18, 147, 31, 147, 59, 147, 30, 147, 31, 147, 14, 147, 49, 147, 55, 147, 11, 147, 12, 147, 35, 147, 70, 147, 23, 147, 49, 147, 55, 147, 11, 147, 27, 147, 35, 147, 67, 147, 29, 147, 48, 147, 28, 147, 35, 147, 23, 147, 36, 147, 12, 147, 36, 147, 55, 147, 11, 147, 14, 147, 35, 147, 18, 147, 44, 147, 14, 147, 31, 147, 15, 147, 39, 147, 11, 147, 23, 147, 35, 147, 17, 147, 37, 147, 23, 147, 44, 147, 55, 147, 11, 147, 23, 147, 51, 147, 59, 147, 23, 147, 51, 147, 68, 147, 14, 147, 51, 147, 15, 147, 31, 147, 7, 147]
<BNK>F<BNK>c<BNK>p<BNK>I<BNK>U<BNK>G<BNK>c<BNK>A<BNK>H<BNK>U<BNK>w<BNK>T<BNK>U<BNK>D<BNK>m<BNK>s<BNK>A<BNK>B<BNK>Y<BNK>@AE2<BNK>M<BNK>m<BNK>s<BNK>A<BNK>Q<BNK>Y<BNK>@AE<BNK>S<BNK>l<BNK>R<BNK>Y<BNK>M<BNK>Z<BNK>B<BNK>Z<BNK>s<BNK>A<BNK>D<BNK>Y<BNK>H<BNK>h<BNK>D<BNK>U<BNK>E<BNK>c<BNK>A<BNK>M<BNK>Y<BNK>G<BNK>a<BNK>M<BNK>h<BNK>s<BNK>A<BNK>M<BNK>o<BNK>w<BNK>M<BNK>o<BNK>@AE0<BNK>D<BNK>o<BNK>E<BNK>U<BNK>.<BNK>


In [39]:
import audio_processing as ap

# Synthesis settings.
noise_scale = .667
length_scale = 1.0
speaker_id = 3
mel_dir = "./hifi-gan/test_mel_files"

# Keep the speaker id on the same device as the model.
device = next(model.parameters()).device
sid = torch.tensor([speaker_id], device=device)

with torch.no_grad():
    # noise_scale / length_scale were previously defined but never handed to
    # the model, so changing them had no effect.
    # NOTE(review): assumes FlowGenerator.forward accepts these two keyword
    # arguments -- confirm against models.py.
    (y_gen, *_), *_, (attn_gen, *_) = model(
        x_tst,
        x_tst_lengths,
        g=sid,
        gen=True,
        noise_scale=noise_scale,
        length_scale=length_scale,
    )

# Save the generated mel (as returned by the model, before decompression),
# named after the speaker id so it tracks `speaker_id`.
os.makedirs(mel_dir, exist_ok=True)
np.save(f"{mel_dir}/sample_{speaker_id}.npy", y_gen.cpu().detach().numpy())

# Undo the dynamic-range compression, then move the mel to NumPy.
y_gen = ap.dynamic_range_decompression(y_gen)
mel = y_gen.detach().cpu().numpy()

# Mel -> linear-frequency magnitudes via non-negative least squares.
# NOTE(review): mel_fmin / mel_fmax are read only if the config defines them
# (librosa's defaults are used otherwise); this presumes the training
# front-end built its mel basis from the same values -- confirm.
mel_basis = librosa.filters.mel(
    sr=hps.data.sampling_rate,
    n_fft=hps.data.filter_length,
    n_mels=hps.data.n_mel_channels,
    fmin=getattr(hps.data, "mel_fmin", None) or 0.0,
    fmax=getattr(hps.data, "mel_fmax", None),
)
covered_mel = librosa.util.nnls(mel_basis, mel)

# Griffin-Lim phase reconstruction. Use the configured hop/window lengths when
# present; None falls back to librosa's defaults (n_fft // 4 and n_fft).
audio = librosa.griffinlim(
    covered_mel,
    n_iter=60,
    hop_length=getattr(hps.data, "hop_length", None),
    win_length=getattr(hps.data, "win_length", None),
)

ipd.Audio(audio, rate=hps.data.sampling_rate)