## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours, then run the entire notebook to plot the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and set up matplotlib

In [2]:
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/home/hbauerec/dev/venv/pytorch1.4/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/hbauerec/dev/venv/pytorch1.4/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/hbauerec/dev/venv/pytorch1.4/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/hbauerec/dev/venv/pytorch1.4/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/home/hbauerec/dev/venv/pytorch1.4/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File

In [23]:
def plot_data(data, figsize=(16, 4)):
    """Show every array in ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of image-like arrays; one subplot is created per
            element, in order.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is rendered by the active matplotlib backend.
    """
    # squeeze=False keeps ``axes`` two-dimensional (1 x len(data)) even for a
    # single panel; with the default squeeze a lone subplot comes back as a
    # bare Axes object that cannot be indexed.
    _, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # 'lower' places row 0 at the bottom of the panel. The documented
        # values for ``origin`` are 'upper' and 'lower'; 'bottom' is not one
        # of them.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Set up hparams

In [24]:
# Build the hyperparameter object with no overrides.
hparams = create_hparams()
# Sample rate in Hz; the audio cells below pass it to ipd.Audio as the
# playback rate.
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [25]:
# Path to the Tacotron 2 checkpoint; swap in the commented line to use the
# pretrained state dict instead.
checkpoint_path = "outdir_pitch/checkpoint_1000"
#checkpoint_path = "pretrained/tacotron2_statedict.pt"
model = load_model(hparams)
# Deserialize onto the CPU so loading does not depend on the GPU index the
# checkpoint was saved from; load_state_dict copies the values into the
# model's own parameters wherever they live.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
# The CPU copy is no longer needed once the weights are in the model.
del checkpoint
# Move to the GPU, switch to eval mode and cast to half precision. The result
# is bound to ``_`` so the cell does not echo the module repr.
_ = model.cuda().eval().half()

RuntimeError: CUDA error: device-side assert triggered

#### Load WaveGlow for mel2audio synthesis and denoiser

In [5]:
# Load the WaveGlow vocoder and build the denoiser from it.
waveglow_path = 'pretrained/waveglow_256channels_universal_v5.pt'
# NOTE(review): torch.load unpickles the file and the 'model' entry is used
# as a ready-made object, so only load checkpoints from a trusted source.
waveglow = torch.load(waveglow_path)['model']
# Move to the GPU, switch to eval mode and cast to half precision.
waveglow.cuda().eval().half()
# Cast every entry of ``convinv`` back to float after the half() above
# (presumably for numerical stability -- confirm against the WaveGlow code).
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)

#### Prepare text input

In [1]:
# Sentence to synthesize.
text = "hickery dickery duck the mouse went up the clock"
# Encode the text with the 'english_cleaners' pipeline; [None, :] adds a
# leading axis of size 1 in front of the encoded ids.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is a deprecated no-op wrapper; tensors are used
# directly. The ids are moved to the GPU and cast to int64.
sequence = torch.from_numpy(sequence).cuda().long()
sequence


NameError: name 'np' is not defined

In [19]:
from audio import audio_to_seq
# NOTE(review): absolute, machine-specific path -- point this at a local wav
# file before running.
test_file="/mnt/shared_ad2_mt1/hbauerec/data/LJSpeech-1.1/wavs/LJ045-0096.wav"

# Convert the audio file to a sequence, add a leading axis of size 1 and keep
# at most the first 48 entries.
sequence = np.array(audio_to_seq(test_file))[None, :48]
# torch.autograd.Variable is a deprecated no-op wrapper; tensors are used
# directly. The ids are moved to the GPU and cast to int64.
sequence = torch.from_numpy(sequence).cuda().long()
sequence

tensor([[148,  62, 117, 118, 118, 118,  62, 116, 116,  68,  68,  54,  52,  70,
          70,  70,  70,  69,  51, 107, 109,  73,  72,  72,  70,   0,   0, 106,
         125, 124,  57,  76,  78,  79,  79,  75,  58,  76,  75,  74,  83,  83,
          83,  82,  62, 117, 117,  52]], device='cuda:0')

#### Decode text input and plot results

In [20]:
# Run inference without autograd bookkeeping, matching the WaveGlow cell:
# nothing is back-propagated here, so building a graph only costs memory.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
# Cast to float32 and move to host memory for plotting; index 0 selects the
# single batch item, and the alignment matrix is transposed for display.
plot_data((mel_outputs.float().cpu().numpy()[0],
           mel_outputs_postnet.float().cpu().numpy()[0],
           alignments.float().cpu().numpy()[0].T))

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

#### Synthesize audio from spectrogram using WaveGlow

In [8]:
# Turn the post-net mel-spectrogram into audio with WaveGlow; no gradients
# are needed for synthesis.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
# Play the first item of the batch at the configured sample rate.
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

#### (Optional) Remove WaveGlow bias

In [21]:
# Run the WaveGlow output through the denoiser and keep index 0 along the
# second axis of the result.
audio_denoised = denoiser(audio, strength=0.01)[:, 0]
# Play the denoised audio at the configured sample rate.
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate) 

In [None]:
import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

In [None]:


import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser