In [1]:
import os
import time

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use("Agg")
import IPython.display as ipd

import sys
sys.path.append('/workspace/tacotron2-gst/hifi_gan/')

import numpy as np
from scipy.io.wavfile import write

#import glow
import torch

from sklearn.manifold import TSNE

from utils import load_wav_to_torch
from tqdm import tqdm
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from train import load_model
from text import text_to_sequence

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/root/anaconda3/envs/tf13/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/root/anaconda3/envs/tf13/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/root/anaconda3/envs/tf13/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/root/anaconda3/envs/tf13/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/root/anaconda3/envs/tf13/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/root/anaconda3/envs/tf13/lib/python3.6/sit

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/root/anaconda3/envs/tf13/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/root/anaconda3/envs/tf13/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/root/anaconda3/envs/tf13/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/root/anaconda3/envs/tf13/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/root/anaconda3/envs/tf13/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/root/anaconda3/envs/tf13/lib/python3.6/sit

In [2]:
# Shell escape: print the directory the notebook kernel is running in.
!pwd

/workspace/tacotron2-gst


In [3]:
# Show the module search path, including the hifi_gan directory appended in the import cell.
print(sys.path)

['/root/anaconda3/envs/tf13/lib/python36.zip', '/root/anaconda3/envs/tf13/lib/python3.6', '/root/anaconda3/envs/tf13/lib/python3.6/lib-dynload', '', '/root/anaconda3/envs/tf13/lib/python3.6/site-packages', '/root/anaconda3/envs/tf13/lib/python3.6/site-packages/IPython/extensions', '/root/.ipython', '/workspace/tacotron2-gst/hifi_gan/']


In [4]:
# Hyperparameter object read by the later cells (STFT settings, sampling rate,
# gate threshold, max decoder steps, ...).
hparams = create_hparams()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [5]:
# STFT / mel front-end configured entirely from hparams; used by load_mel below.
stft = TacotronSTFT(
    hparams.filter_length,
    hparams.hop_length,
    hparams.win_length,
    hparams.n_mel_channels,
    hparams.sampling_rate,
    hparams.mel_fmin,
    hparams.mel_fmax,
)

def load_mel(path):
    """Load a wav file and return its mel spectrogram on the GPU.

    Args:
        path: Path of the wav file; passed straight to ``load_wav_to_torch``.

    Returns:
        The result of ``stft.mel_spectrogram`` for the audio scaled by
        ``hparams.max_wav_value`` (with a leading axis added via
        ``unsqueeze(0)``), moved to CUDA.

    Raises:
        ValueError: If the file's sampling rate differs from
            ``hparams.sampling_rate``.
    """
    audio, sampling_rate = load_wav_to_torch(path)
    if sampling_rate != hparams.sampling_rate:
        # Report the same value that was compared against (previously the
        # message read stft.sampling_rate while the check used hparams).
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, hparams.sampling_rate))
    # Scale samples by max_wav_value and add a leading batch axis.
    # The deprecated torch.autograd.Variable wrapper is no longer needed:
    # plain tensors already default to requires_grad=False.
    audio_norm = (audio / hparams.max_wav_value).unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)
    return melspec.cuda()

In [6]:
def plot_data(data, figsize=(16, 4)):
    """Draw ``data`` as an image in a new matplotlib figure.

    Args:
        data: Array-like accepted by ``plt.imshow`` (callers in this notebook
            pass a mel spectrogram as a NumPy array).
        figsize: Figure size in inches, forwarded to ``plt.figure``.

    Returns:
        None. The figure is left open on the pyplot state machine so the
        notebook backend (or an explicit ``plt.show()``) can render it.
    """
    plt.figure(figsize=figsize)
    # 'lower' is the valid spelling for "row 0 at the bottom"; the previous
    # value 'bottom' is not an accepted origin and is rejected by newer
    # matplotlib releases.
    plt.imshow(data, aspect='auto', origin='lower', interpolation='none')

## Load Models

In [7]:
# Checkpoint file whose 'state_dict' entry is restored into the model below.
checkpoint_path = "/workspace/tacotron2-gst/skt_multi_outdir/checkpoint_212500"
# Build the model from hparams, then overwrite its weights with the checkpoint's.
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
# Switch to evaluation mode; binding the result to `_` keeps the cell from
# echoing the module repr as its output.
_ = model.eval()

Device: cuda
Current cuda device: 0
Count of using GPUs: 1


In [None]:
# NOTE(review): this vocoder-loading cell is disabled, but Decoder() below still
# calls `waveglow.infer(...)`. With these lines commented out the global
# `waveglow` never exists, which is the NameError shown at the end of the
# notebook. Re-enable this cell (presumably together with the commented-out
# `import glow` in the first cell, so the pickled model can be restored —
# TODO confirm) or switch Decoder to another vocoder before running synthesis.
# waveglow_path = '/workspace/tacotron2-gst/waveglow_520000'
# waveglow = torch.load(waveglow_path)['model']
# waveglow.cuda()

## Text Encoder

In [8]:
def TextEncoder(text):
    """Run the text side of the model and return the encoder outputs.

    The text is converted to a symbol-id sequence with the
    ``korean_cleaners`` pipeline, given a leading batch axis, moved to CUDA
    as a long tensor, and passed through ``model.parse_input``,
    ``model.embedding`` and ``model.encoder.inference``.

    Args:
        text: Input sentence to synthesize.

    Returns:
        The output of ``model.encoder.inference`` for the embedded sequence.
    """
    sequence = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
    # Plain tensors replace the deprecated torch.autograd.Variable wrapper.
    sequence = torch.from_numpy(sequence).cuda().long()
    # Inference only: skip autograd bookkeeping for the encoder pass.
    with torch.no_grad():
        inputs = model.parse_input(sequence)
        embedded_inputs = model.embedding(inputs).transpose(1, 2)
        transcript_outputs = model.encoder.inference(embedded_inputs)

    return transcript_outputs

## Decoder

In [9]:
def Decoder(encoder_outputs, vocoder=None, sigma=0.666):
    """Autoregressively decode mel frames and vocode them to audio.

    Args:
        encoder_outputs: Encoder outputs (already combined with the style
            embedding by the caller) used to initialize the decoder states.
        vocoder: Object exposing ``infer(mel, sigma=...)``. When omitted, the
            notebook-level global ``waveglow`` is used, as before.
        sigma: Value forwarded to ``vocoder.infer``; defaults to the
            previously hard-coded 0.666.

    Returns:
        Tuple ``(synth, mel_outputs_postnet)``: the vocoder output and the
        decoder mel output with the postnet result added to it.

    Raises:
        NameError: If no vocoder is passed and no global ``waveglow`` exists.
            This is raised up front, before the decode loop runs, instead of
            after all decoding work has already been done.
    """
    if vocoder is None:
        vocoder = globals().get('waveglow')
    if vocoder is None:
        raise NameError(
            "name 'waveglow' is not defined: load the vocoder into a global "
            "`waveglow` (see the disabled WaveGlow cell) or pass `vocoder=` "
            "to Decoder()")

    # Inference only: without no_grad the autograd graph would keep growing
    # with every decoder step (up to hparams.max_decoder_steps).
    with torch.no_grad():
        decoder_input = model.decoder.get_go_frame(encoder_outputs)
        model.decoder.initialize_decoder_states(encoder_outputs, mask=None)
        mel_outputs, gate_outputs, alignments = [], [], []

        while True:
            decoder_input = model.decoder.prenet(decoder_input)
            mel_output, gate_output, alignment = model.decoder.decode(decoder_input)

            mel_outputs.append(mel_output)
            gate_outputs.append(gate_output)
            alignments.append(alignment)

            # Stop once the gate's sigmoid exceeds the configured threshold.
            gate_prob = torch.sigmoid(gate_output.data)
            if gate_prob > hparams.gate_threshold:
                print(gate_prob, gate_output.data)
                break
            if len(mel_outputs) == hparams.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            # Feed the frame just produced back in as the next input.
            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = model.decoder.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)
        # The postnet output is added to the decoder mel output.
        mel_outputs_postnet = mel_outputs + model.postnet(mel_outputs)

        synth = vocoder.infer(mel_outputs_postnet, sigma=sigma)

    return synth, mel_outputs_postnet

## Condition on Ref Audio

In [None]:
def generate_mels_by_ref_audio(text, ref_audio):
    """Synthesize ``text`` in the style of a reference recording and display it.

    Displays, in order: the reference audio, the reference mel spectrogram,
    the synthesized audio, and the synthesized mel spectrogram.

    Args:
        text: Sentence to synthesize.
        ref_audio: Path of the reference wav file; its mel spectrogram is fed
            to ``model.gst`` to obtain the style vector.

    Returns:
        None. Results are shown through IPython display / matplotlib.
    """
    transcript_outputs = TextEncoder(text)
    print("ref_audio")
    ipd.display(ipd.Audio(ref_audio, rate=hparams.sampling_rate))
    ref_audio_mel = load_mel(ref_audio)
    # plot_data returns None, so wrapping it in ipd.display() only displayed
    # "None". Draw the figure and flush it explicitly so it appears right
    # after the audio widget it belongs to.
    plot_data(ref_audio_mel.data.cpu().numpy()[0])
    plt.show()

    # Inference only: no autograd tracking for the style-encoder pass.
    with torch.no_grad():
        latent_vector = model.gst(ref_audio_mel)
        # Broadcast the style vector to the shape of the encoder outputs
        # before adding the two together.
        latent_vector = latent_vector.expand_as(transcript_outputs)
        encoder_outputs = transcript_outputs + latent_vector

    synth, mel_outputs = Decoder(encoder_outputs)

    ipd.display(ipd.Audio(synth[0].data.cpu().numpy(), rate=hparams.sampling_rate))
    plot_data(mel_outputs.data.cpu().numpy()[0])
    plt.show()

In [None]:
# Synthesize the demo sentence conditioned on the 'ang' reference clip.
text = "이 모델을 이용하면 같은 문장을 여러가지 스타일로 말할 수 있습니다."
# Alternative path for the reference clip (presumably its original location — TODO confirm).
#ref_wav = "/DATA2/jinhan/KoreanEmotionSpeech/wav/ang/ang_00000100.wav"
ref_wav = "/workspace/data/Emo_kor/acriil_ang_00000100.wav"
generate_mels_by_ref_audio(text, ref_wav)

In [None]:
# Synthesize the demo sentence conditioned on the 'sad' reference clip.
text = "이 모델을 이용하면 같은 문장을 여러가지 스타일로 말할 수 있습니다."
# Alternative path for the reference clip (presumably its original location — TODO confirm).
#ref_wav = "/DATA2/jinhan/KoreanEmotionSpeech/wav/sad/sad_00000100.wav"
ref_wav = "/workspace/data/Emo_kor/acriil_sad_00000100.wav"
generate_mels_by_ref_audio(text, ref_wav)

In [None]:
# Synthesize the demo sentence conditioned on the 'hap' reference clip.
text = "이 모델을 이용하면 같은 문장을 여러가지 스타일로 말할 수 있습니다."
# NOTE(review): the commented-out path below still names the 'sad' clip (it looks
# copied from the 'sad' cell); the reference actually used here is the 'hap' clip.
#ref_wav = "/DATA2/jinhan/KoreanEmotionSpeech/wav/sad/sad_00000100.wav"
ref_wav = "/workspace/data/Emo_kor/acriil_hap_00000100.wav"
generate_mels_by_ref_audio(text, ref_wav)

In [None]:
# Synthesize the demo sentence conditioned on the 'fea' reference clip.
text = "이 모델을 이용하면 같은 문장을 여러가지 스타일로 말할 수 있습니다."
# NOTE(review): the commented-out path below still names the 'sad' clip (it looks
# copied from the 'sad' cell); the reference actually used here is the 'fea' clip.
#ref_wav = "/DATA2/jinhan/KoreanEmotionSpeech/wav/sad/sad_00000100.wav"
ref_wav = "/workspace/data/Emo_kor/acriil_fea_00000100.wav"
generate_mels_by_ref_audio(text, ref_wav)

In [None]:
# Synthesize the demo sentence conditioned on the 'sur' reference clip.
text = "이 모델을 이용하면 같은 문장을 여러가지 스타일로 말할 수 있습니다."
# NOTE(review): the commented-out path below still names the 'sad' clip (it looks
# copied from the 'sad' cell); the reference actually used here is the 'sur' clip.
#ref_wav = "/DATA2/jinhan/KoreanEmotionSpeech/wav/sad/sad_00000100.wav"
ref_wav = "/workspace/data/Emo_kor/acriil_sur_00000100.wav"
generate_mels_by_ref_audio(text, ref_wav)

In [None]:
# Synthesize the demo sentence conditioned on the 'dis' reference clip.
text = "이 모델을 이용하면 같은 문장을 여러가지 스타일로 말할 수 있습니다."
# NOTE(review): the commented-out path below still names the 'sad' clip (it looks
# copied from the 'sad' cell); the reference actually used here is the 'dis' clip.
#ref_wav = "/DATA2/jinhan/KoreanEmotionSpeech/wav/sad/sad_00000100.wav"
ref_wav = "/workspace/data/Emo_kor/acriil_dis_00000100.wav"
generate_mels_by_ref_audio(text, ref_wav)

## Condition on Style Tokens

In [10]:
def generate_mels_by_style_tokens(text):
    """Synthesize ``text`` once per style token and display each result.

    For every row of the style-token bank (``tanh`` of
    ``model.gst.stl.embed``), a style embedding is obtained by attending
    from an all-zero query to that single token; it is added to the text
    encoder outputs and decoded. The token index, audio and mel
    spectrogram are displayed for each token.

    Args:
        text: Sentence to synthesize.

    Returns:
        None. Results are shown through IPython display / matplotlib.
    """
    transcript_outputs = TextEncoder(text)
    with torch.no_grad():
        GST = torch.tanh(model.gst.stl.embed)

    # Iterate over however many tokens the bank holds instead of a
    # hard-coded 10, so other token counts neither overrun nor get truncated.
    for idx in range(GST.size(0)):
        with torch.no_grad():
            query = torch.zeros(1, 1, hparams.E//2).cuda()
            # Single token reshaped into a one-element key sequence.
            keys = GST[idx].unsqueeze(0).expand(1,-1,-1)
            style_emb = model.gst.stl.attention(query, keys)
            encoder_outputs = transcript_outputs + style_emb

        synth, mel_outputs = Decoder(encoder_outputs)

        print("token {}".format(idx))
        ipd.display(ipd.Audio(synth[0].data.cpu().numpy(), rate=hparams.sampling_rate))
        # plot_data returns None, so ipd.display(plot_data(...)) only showed
        # "None". Draw and flush the figure so it follows its audio widget.
        plot_data(mel_outputs.data.cpu().numpy()[0])
        plt.show()

In [11]:
# Synthesize the demo sentence once per style token.
# NOTE(review): this call reaches Decoder(), which needs the global `waveglow`;
# the cell that loads it is commented out, hence the NameError recorded below.
text = "이 모델을 이용하면 같은 문장을 여러가지 스타일로 말할 수 있습니다."
generate_mels_by_style_tokens(text)



NameError: name 'waveglow' is not defined