In [6]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries imported below — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Put the parent directory and its tacotron2/ and waveglow/ sub-folders at the
# front of the module search path — presumably so that the `audio`, `tacotron2`
# and `waveglow` imports below resolve. The paths are relative, so they depend
# on the working directory the kernel was started in.
import sys
sys.path.insert(0, "../")
sys.path.insert(0, "../tacotron2/")
sys.path.insert(0, "../waveglow/")

import json
import torch
import numpy as np
from datetime import datetime
from IPython.display import Audio

from audio.vocoders import griffin_lim
from tacotron2.model import Tacotron2
from tacotron2.text import text_to_sequence, sequence_to_text
from waveglow.glow import WaveGlow

In [7]:
# Tacotron 2: model hyper-parameters (JSON) and checkpoint location.
# The config file is opened in a `with` block so the handle is closed as soon
# as the JSON has been parsed instead of being left for the garbage collector.
with open('../tacotron2/config.json', 'r') as config_file:
    TACOTRON_CONFIG = json.load(config_file)
TACOTRON_CHECKPT = '../checkpoints/tacotron2_statedict.pt'

# Waveglow: configuration (JSON) and checkpoint location.
with open('../waveglow/config.json', 'r') as config_file:
    WAVEGLOW_CONFIG = json.load(config_file)
WAVEGLOW_CHECKPT = '../checkpoints/waveglow_256channels_ljs_v3.pt'

# Essential
ON_GPU = False           # True -> models are moved to CUDA, False -> kept on CPU
MAX_WAV_VALUE = 32768.0  # scale applied to the WaveGlow output (2**15; presumably 16-bit PCM full scale)
SIGMA = 1.1              # forwarded as `sigma` to waveglow.infer()

### **Load models**

In [10]:
def _tacotron_keep_storage(storage, location):
    """map_location hook for torch.load: hand the storage back unchanged."""
    return storage


# Build the network from its config, then restore the weights stored under
# the checkpoint's 'state_dict' key.
tacotron2 = Tacotron2(TACOTRON_CONFIG)
checkpt_state_dict = torch.load(
    TACOTRON_CHECKPT,
    map_location=_tacotron_keep_storage,
)['state_dict']
tacotron2.load_state_dict(checkpt_state_dict)

# Place the model on the requested device and switch it to eval mode; the
# result is bound to `_` so the cell does not echo the module repr.
if ON_GPU:
    _ = tacotron2.cuda().eval()
else:
    _ = tacotron2.cpu().eval()

In [9]:
def _waveglow_keep_storage(storage, location):
    """map_location hook for torch.load: hand the storage back unchanged."""
    return storage


# The checkpoint stores a complete model object under the 'model' key (not
# just a state dict).
# NOTE(review): torch.load unpickles arbitrary objects — only use checkpoints
# from a trusted source.
waveglow = torch.load(
    WAVEGLOW_CHECKPT,
    map_location=_waveglow_keep_storage,
)['model']

# remove_weightnorm is called with the model itself and its return value
# replaces the loaded object.
waveglow = waveglow.remove_weightnorm(waveglow)

# Place the vocoder on the requested device and switch it to eval mode; the
# result is bound to `_` so the cell does not echo the module repr.
if ON_GPU:
    _ = waveglow.cuda().eval()
else:
    _ = waveglow.cpu().eval()

### **SVD compression of linear layers**

In [24]:
# Print the name of every entry in the Tacotron 2 state dict, one per line,
# to see which tensors are available for compression.
for entry_name in tacotron2.state_dict():
    print(entry_name)

embedding.weight
encoder.convolutions.0.0.conv.weight
encoder.convolutions.0.0.conv.bias
encoder.convolutions.0.1.weight
encoder.convolutions.0.1.bias
encoder.convolutions.0.1.running_mean
encoder.convolutions.0.1.running_var
encoder.convolutions.0.1.num_batches_tracked
encoder.convolutions.1.0.conv.weight
encoder.convolutions.1.0.conv.bias
encoder.convolutions.1.1.weight
encoder.convolutions.1.1.bias
encoder.convolutions.1.1.running_mean
encoder.convolutions.1.1.running_var
encoder.convolutions.1.1.num_batches_tracked
encoder.convolutions.2.0.conv.weight
encoder.convolutions.2.0.conv.bias
encoder.convolutions.2.1.weight
encoder.convolutions.2.1.bias
encoder.convolutions.2.1.running_mean
encoder.convolutions.2.1.running_var
encoder.convolutions.2.1.num_batches_tracked
encoder.lstm.weight_ih_l0
encoder.lstm.weight_hh_l0
encoder.lstm.bias_ih_l0
encoder.lstm.bias_hh_l0
encoder.lstm.weight_ih_l0_reverse
encoder.lstm.weight_hh_l0_reverse
encoder.lstm.bias_ih_l0_reverse
encoder.lstm.bias_hh_

In [None]:
# TODO: implement the SVD compression of the Tacotron 2 linear layers here.
# This cell is intentionally a comment-only placeholder so that
# "Restart Kernel -> Run All" executes cleanly until the step is written.

### **Inference**
**Prepare texts**

In [None]:
# Sentences to synthesize; surrounding whitespace is stripped before encoding.
texts = ["Cristiano Ronaldo has won his first UEFA Champions League with Manchester United in two thousand eight.",
         "Implicit learning of the likelihood makes normalizing flows very strong generative tool.",
         "WaveGlow and L P C Net are accelerated derivatives of WaveNet state-of-the-art model."]
assert len(texts) > 0
texts = [text.strip() for text in texts]

# Encode each sentence with the 'english_cleaners' pipeline and add a leading
# axis of size 1 ([None, :]) so every entry has shape (1, sequence_length).
sequences = [np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
             for text in texts]

# Convert to int64 tensors. The torch.autograd.Variable wrapper is deprecated
# and simply returns the tensor it is given, so it is not needed.
sequences = [torch.from_numpy(sequence).long() for sequence in sequences]

# The models are moved to CUDA when ON_GPU is set, so the inputs have to live
# on the same device; otherwise inference hits a CPU/GPU device mismatch.
if ON_GPU:
    sequences = [sequence.cuda() for sequence in sequences]

**Synthesis**

In [None]:
TEXT_IDX = -1  # index into `sequences`; -1 selects the last sentence

# Echo the text that is about to be synthesized. The batch axis is indexed
# ([0]) instead of squeeze()d: squeeze() would turn a one-symbol sequence into
# a 0-d tensor that cannot be iterated. tolist() yields plain Python ints.
print(sequence_to_text(sequences[TEXT_IDX][0].tolist()))

total_start = datetime.now()

with torch.no_grad():
    # Stage 1: text -> mel spectrogram.
    tc_start = datetime.now()
    mel_outputs, mel, gate_outputs, alignments = tacotron2.inference(sequences[TEXT_IDX])
    if ON_GPU:
        # CUDA kernels run asynchronously; wait for them so the timestamp
        # below reflects finished work rather than queued work.
        torch.cuda.synchronize()
    tc_end = datetime.now()

    # Stage 2: mel spectrogram -> waveform, scaled by MAX_WAV_VALUE.
    wg_start = datetime.now()
    wave = MAX_WAV_VALUE*waveglow.infer(mel, sigma=SIGMA)
    if ON_GPU:
        torch.cuda.synchronize()
    wg_end = datetime.now()

total_end = datetime.now()
print('Total inference time:', total_end - total_start)
print('Tacotron 2 inference time:', tc_end - tc_start)
print('Waveglow inference time:', wg_end - wg_start)

In [None]:
# Audio() converts its input with NumPy, which cannot read a CUDA tensor or a
# tensor that requires grad, so hand it a detached host-memory array (both
# calls are no-ops for a plain CPU tensor). `rate` is the playback sampling
# rate in Hz — NOTE(review): 22050 presumably matches the checkpoints; confirm
# against the model configs.
Audio(wave.detach().cpu().numpy(), rate=22050)