## Libs

In [5]:
import torch
from torch.autograd import Variable

import numpy as np
from IPython.display import Audio
import matplotlib.pyplot as plt
%matplotlib inline

import models
from tacotron2.text import text_to_sequence
from common.utils import load_wav_to_torch, to_gpu
from common.layers import TacotronSTFT
from hparams import Hyperparameters as hp

In [6]:
import os

# Restrict the CUDA devices visible to this process to device index 6.
# NOTE(review): presumably this must run before anything initialises CUDA — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="6")

## Paths to checkpoints

In [7]:
# Checkpoint files for the two models that are loaded in the next section.
# NOTE(review): absolute paths — presumably specific to the training
# container; adjust for the machine the notebook runs on.
taco_path = '/workspace/output/gst2/checkpoint_Tacotron2_1500'
wg_path = '/workspace/output/sm_wg/checkpoint_WaveGlow_1750'

## Load models

In [8]:
def _load_on_cpu(checkpoint_path):
    """Deserialize a checkpoint file with all tensors mapped to the CPU."""
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    return torch.load(checkpoint_path, map_location='cpu')


taco_checkpoint = _load_on_cpu(taco_path)
wg_checkpoint = _load_on_cpu(wg_path)

In [5]:
# Build each network from the config stored inside its checkpoint,
# requesting CUDA placement via to_cuda=True.
tacotron_config = taco_checkpoint['config']
t2 = models.get_model('Tacotron2', tacotron_config, to_cuda=True)

waveglow_config = wg_checkpoint['config']
wg = models.get_model('WaveGlow', waveglow_config, to_cuda=True)

In [6]:
# Copy the checkpoint weights into the freshly built models.
# Some keys carry a leading 'module.' (presumably from saving a wrapped
# model — confirm). Strip it only when it is a true prefix: a blanket
# str.replace would also mangle names that merely contain 'module.',
# e.g. 'submodule.weight' -> 'subweight'.
wrapper_prefix = 'module.'
for model, checkpoint in [(t2, taco_checkpoint), (wg, wg_checkpoint)]:
    new_state_dict = {
        (key[len(wrapper_prefix):] if key.startswith(wrapper_prefix) else key): value
        for key, value in checkpoint['state_dict'].items()
    }

    model.load_state_dict(new_state_dict)

In [7]:
# Put both networks into evaluation mode before running inference.
for model in (t2, wg):
    model.eval()
print('Done')

Done


## Set speaker and text

In [8]:
# Sentence to synthesize; it is converted to a symbol sequence in the Infer section.
text = "hello, how are you doing today?"
# Integer id used for the speaker-embedding lookup and passed to t2.infer.
speaker_id = 1

## Select inference type

In [9]:
# Conditioning mode: 'ref' builds a reference mel from `ref_audio`;
# 'token' is recognised by the preparation cell but does nothing there yet.
inf_type = 'ref'

### Reference audio

In [10]:
# Wav file from which the reference mel spectrogram is computed when inf_type == 'ref'.
ref_audio = '/workspace/training_data/blizzard_2013/wavs/CA-MP2-03-013.wav'


#### Listen to the reference audio

In [11]:
# Show an inline audio player for the reference clip.
Audio(ref_audio, rate=hp.sampling_rate)

### Or GST token

In [12]:
# NOTE(review): not read by any later cell visible in this notebook —
# presumably a placeholder for the 'token' inference mode; confirm.
style_token = None

In [13]:
# Prepare the style-conditioning input for the selected inference mode.
if inf_type == 'ref':
    # Mel extractor configured from the project's hyperparameters.
    stft = TacotronSTFT(
        hp.filter_length, hp.hop_length, hp.win_length,
        hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin,
        hp.mel_fmax
    )

    audio, sampling_rate = load_wav_to_torch(ref_audio)

    if sampling_rate != stft.sampling_rate:
        # The message has three placeholders, so it needs three arguments;
        # with only two, str.format raised IndexError and hid this ValueError.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            ref_audio, sampling_rate, stft.sampling_rate))

    # Scale the samples by max_wav_value and add a leading batch axis.
    audio_norm = audio / hp.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = Variable(audio_norm, requires_grad=False)
    ref_mel = stft.mel_spectrogram(audio_norm)

    # Drop dim 0 if it has size 1, then add a leading axis back.
    ref_mel = torch.squeeze(ref_mel, 0)
    ref_mel = ref_mel.unsqueeze(0)

    ref_mel = to_gpu(ref_mel)
elif inf_type == 'token':
    # Token-based conditioning is not implemented here; ref_mel is not assigned.
    pass
else:
    # Fail loudly on a typo instead of silently reusing a stale ref_mel.
    raise ValueError("Unknown inf_type: {!r} (expected 'ref' or 'token')".format(inf_type))

## Infer

In [14]:
# Encode the text as symbol ids, add a leading batch axis, and move it to the GPU as int64.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))
inputs = torch.from_numpy(sequence[None, :]).to(device='cuda', dtype=torch.int64)

# Wrap the speaker id in a one-element int64 tensor on the GPU.
speaker_id = torch.IntTensor([speaker_id]).to(device='cuda', dtype=torch.int64)

In [15]:
# Look up the embedding for the selected speaker. It is only inspected in
# the next cell; t2.infer below is given speaker_id, not this tensor.
# NOTE(review): runs outside torch.no_grad(), so the result carries a grad_fn.
embedded_speaker = t2.speakers_embedding(speaker_id)

In [16]:
# Display the embedding values for inspection.
embedded_speaker

tensor([[ 1.3895e-05,  9.6907e-06,  9.0520e-06, -1.7259e-05, -2.1256e-02,
         -9.3956e-06, -2.8580e-05,  9.6071e-06,  1.1235e-05, -1.8886e-05,
          2.4008e-05,  1.9629e-05,  1.9292e-05, -6.9328e-06, -4.0916e-05,
          8.4590e-06]], device='cuda:0', grad_fn=<EmbeddingBackward>)

In [17]:
# Two-stage synthesis with autograd disabled: t2.infer returns a 4-tuple of
# which only the second element (mel) is kept; wg.infer maps that mel to audio.
# NOTE(review): ref_mel is only assigned by the preparation cell when
# inf_type == 'ref'.
with torch.no_grad():
    _, mel, _, _ = t2.infer(inputs, speaker_id, ref_mel)
    audio = wg.infer(mel)



style_embeddings torch.Size([1, 1, 256])
style_embeddings exp. torch.Size([1, 31, 256])
merged_outputs torch.Size([1, 31, 784])


In [None]:
# Move the predicted mel to host memory as a NumPy array and show it as an image.
mel_image = mel.squeeze(0).detach().cpu().numpy()
plt.imshow(mel_image)

In [None]:
# First item of the synthesized batch, copied to host memory as a NumPy array.
audio_numpy = audio[0].data.cpu().numpy()
# Use the project's configured sampling rate instead of a hard-coded 22050,
# so playback stays consistent with the STFT and the reference-audio player.
rate = hp.sampling_rate

In [None]:
# Show an inline audio player for the synthesized waveform at the chosen rate.
Audio(audio_numpy, rate=rate)