In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import tensorflow as tf
from tensorflow.python.framework import convert_to_constants
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants

from parallel_wavegan.models import TFMelGANGenerator, MelGANGenerator
import torch

import yaml
import numpy as np

In [2]:
class TFMelgan(object):
    """Thin wrapper around a MelGAN generator exported as a TensorFlow SavedModel.

    The default serving signature of the SavedModel is loaded and its variables
    are frozen into constants. Inference is then driven through ``set_mels`` /
    ``run_inference``.
    """

    def __init__(self, saved_path, warmup_steps=2):
        """Load the generator and run a few throw-away inference passes.

        Args:
            saved_path: Path of the SavedModel directory to load.
            warmup_steps: Number of warm-up inference passes executed on a
                random ``[1, 250, 80]`` input. Defaults to 2; pass 0 to skip
                the warm-up entirely.
        """
        self.saved_path = saved_path
        # Declare the state up front so the getters are valid even before the
        # first call to set_mels() / run_inference().
        self.mels = None
        self.audios = None
        self.graph = self._load_model()

        if warmup_steps > 0:
            self.set_mels(tf.random.uniform(shape=[1, 250, 80]))
            for _step in range(warmup_steps):
                self.run_inference()

        # Drop the warm-up input/output so they cannot be mistaken for results.
        self.mels = None
        self.audios = None

    def _load_model(self):
        """Load the SavedModel and return its serving function with frozen variables."""
        saved_model_loaded = tf.saved_model.load(
            self.saved_path, tags=[tag_constants.SERVING])
        graph_func = saved_model_loaded.signatures[
            signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        graph_func = convert_to_constants.convert_variables_to_constants_v2(graph_func)
        return graph_func

    def set_mels(self, values):
        """Store the input that the next ``run_inference`` call will consume."""
        self.mels = values

    def get_mels(self):
        """Return the currently stored input (``None`` if nothing is set)."""
        return self.mels

    def get_audio(self):
        """Return the output of the last ``run_inference`` call (``None`` if none)."""
        return self.audios

    def run_inference(self):
        """Run the generator on the stored input and return the result.

        The first output of the serving function is converted to a NumPy array
        and index 0 of its last (third) axis is selected.
        NOTE(review): the input is presumably (batch, frames, 80) as in the
        warm-up call -- confirm against the exported signature.

        Returns:
            The NumPy array produced by the model, also stored in ``self.audios``.

        Raises:
            ValueError: If no input has been provided through ``set_mels``.
        """
        if self.mels is None:
            raise ValueError(
                "No mel-spectrogram set; call set_mels() before run_inference().")
        tf_mels = tf.constant(self.mels)
        self.audios = self.graph(tf_mels)[0].numpy()[:, :, 0]
        return self.audios

In [3]:
# Instantiate the TensorFlow wrapper; the constructor loads the SavedModel
# and performs its warm-up passes.
tf_melgan = TFMelgan(
    saved_path='./checkpoint/tensorflow_generator/',
)

In [4]:
# Read the vocoder's YAML configuration file into `config`.
vocoder_conf = '../egs/ljspeech/voc1/conf/melgan.v1.long.yaml'
# NOTE(review): yaml.Loader can construct arbitrary Python objects. That is
# acceptable for a repository-local config, but never point this at an
# untrusted file.
# The encoding is explicit so parsing does not depend on the platform default.
with open(vocoder_conf, encoding="utf-8") as f:
    config = yaml.load(f, Loader=yaml.Loader)

In [10]:
# Build the PyTorch generator from the YAML config and restore its weights,
# mapping the checkpoint tensors onto the CPU.
pytorch_melgan = MelGANGenerator(**config["generator_params"])
# NOTE(review): torch.load unpickles the checkpoint -- only load files from a
# trusted source.
pytorch_melgan.load_state_dict(
    torch.load("./checkpoint/pytorch_generator/generator_4000000.pth", map_location='cpu'))
pytorch_melgan.remove_weight_norm()
# One device move is enough (the original applied .to("cpu") twice).
pytorch_melgan.to("cpu")
# eval() returns the module itself, so the cell still displays the model structure.
pytorch_melgan.eval()

MelGANGenerator(
  (melgan): Sequential(
    (0): ReflectionPad1d((3, 3))
    (1): Conv1d(80, 512, kernel_size=(7,), stride=(1,))
    (2): LeakyReLU(negative_slope=0.2)
    (3): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
    (4): ResidualStack(
      (stack): Sequential(
        (0): LeakyReLU(negative_slope=0.2)
        (1): ReflectionPad1d((1, 1))
        (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
        (3): LeakyReLU(negative_slope=0.2)
        (4): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      )
      (skip_layer): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
    )
    (5): ResidualStack(
      (stack): Sequential(
        (0): LeakyReLU(negative_slope=0.2)
        (1): ReflectionPad1d((3, 3))
        (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), dilation=(3,))
        (3): LeakyReLU(negative_slope=0.2)
        (4): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      )
      (skip_layer): Conv1d(256, 256, kernel_size=(1,)

In [11]:
# Random float32 batch of shape (4, 80, 500) used as benchmark input.
fake_mels = np.random.sample((4, 80, 500)).astype(np.float32)
# warmup: two throw-away forward passes. Gradients are not needed for
# inference, so autograd tracking is disabled.
with torch.no_grad():
    y = pytorch_melgan(torch.Tensor(fake_mels))
    y = pytorch_melgan(torch.Tensor(fake_mels))

In [12]:
%%time
y = pytorch_melgan(torch.Tensor(fake_mels))

CPU times: user 20.4 s, sys: 2.17 s, total: 22.6 s
Wall time: 3.91 s


In [13]:
%%time
# Time one TensorFlow forward pass on a random float32 batch of shape (4, 500, 80).
# Note: the measured time also covers generating the batch and the set_mels() call.
tf_melgan.set_mels(np.random.sample((4, 500, 80)).astype(np.float32))
y = tf_melgan.run_inference()

CPU times: user 7.49 s, sys: 951 ms, total: 8.44 s
Wall time: 1.95 s
