<a href="https://colab.research.google.com/github/iamsusiep/slp2019/blob/master/prosody_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook loads a model that generates prosody embeddings (using the Tacotron architecture). The original model code is from https://github.com/syang1993/gst-tacotron/.

This notebook expects that you have already run the `preprocess_utterances` notebook, which generates a folder of mel spectrograms for individual utterances. Those utterances are extracted from audio files that have had the music removed.


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')
import tensorflow as tf
!git clone https://github.com/syang1993/gst-tacotron.git

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Cloning into 'gst-tacotron'...
remote: Enumerating objects: 375, done.[K
remote: Total 375 (delta 0), reused 0 (delta 0), pack-reused 375[K
Receiving objects: 100% (375/375), 421.74 KiB | 763.00 KiB/s, done.
Resolving deltas: 100% (244/244), done.


In [0]:
cd gst-tacotron

/content/gst-tacotron/gst-tacotron/gst-tacotron


In [0]:
%%capture
!pip install -r requirements.txt

In [0]:
# Hyperparameters handed to the Tacotron model and the audio utilities.
# They are collected as a plain keyword mapping and expanded into HParams.
hparams = tf.contrib.training.HParams(**{
  # Text cleaning: comma-separated list of cleaners applied to text before
  # training and eval. For non-English text, "basic_cleaners" or
  # "transliteration_cleaners" may be preferable (see TRAINING_DATA.md).
  'cleaners': 'english_cleaners',

  # Audio
  'num_mels': 80,
  'num_freq': 1025,
  'sample_rate': 16000,
  'frame_length_ms': 50,
  'frame_shift_ms': 12.5,
  'preemphasis': 0.97,
  'min_level_db': -100,
  'ref_level_db': 20,

  # Model
  'outputs_per_step': 2,
  'embed_depth': 256,
  'prenet_depths': [256, 128],
  'encoder_depth': 256,
  'rnn_depth': 256,

  # Attention
  'attention_depth': 256,

  # Training
  'batch_size': 32,
  'adam_beta1': 0.9,
  'adam_beta2': 0.999,
  'initial_learning_rate': 0.002,
  'decay_learning_rate': True,
  # Use CMUDict during training to learn pronunciation of ARPAbet phonemes.
  'use_cmudict': False,

  # Eval
  'max_iters': 1000,
  'griffin_lim_iters': 60,
  # Power to raise magnitudes to prior to Griffin-Lim.
  'power': 1.5,

  # Global style token
  # When False, the script follows the paper "Towards End-to-End Prosody
  # Transfer for Expressive Speech Synthesis with Tacotron".
  'use_gst': False,
  'num_gst': 10,
  # Head number for multi-head attention.
  'num_heads': 4,
  'style_embed_depth': 256,
  'reference_filters': [32, 32, 64, 64, 128, 128],
  'reference_depth': 128,
  # Attention type for the style attention module
  # (dot_attention, mlp_attention).
  'style_att_type': "mlp_attention",
  'style_att_dim': 128,
})

In [0]:
import io
import numpy as np
import tensorflow as tf
from models import create_model
from text import text_to_sequence
from util import audio, plot
import textwrap


class Synthesizer:
  """Builds a Tacotron inference graph, restores a checkpoint, and fetches
  the reference-encoder output for a given reference mel spectrogram.

  Typical use: create an instance, call `load` once, then call `synthesize`
  for each reference file.
  """

  # Checkpoint restored when `load` is called without an explicit path.
  DEFAULT_CHECKPOINT = "/content/gdrive/My Drive/prosody_model/model.ckpt-50000"
  # Name of the graph tensor that `synthesize` evaluates and returns.
  EMBEDDING_TENSOR = "model/inference/ref_encoder/dense/Tanh:0"

  def __init__(self):
    # Set by `load`; None until then so `load` can tell whether an earlier
    # session has to be closed first.
    self.session = None

  def load(self, reference_mel=None, model_name='tacotron', checkpoint_path=DEFAULT_CHECKPOINT):
    """Construct the model graph and restore its weights.

    Args:
      reference_mel: Flag only. Any value other than None adds a
        `reference_mel` placeholder of shape [1, None, hparams.num_mels] to
        the graph; None builds the model without a reference input.
      model_name: Model name passed to `create_model`.
      checkpoint_path: Checkpoint prefix handed to `tf.train.Saver.restore`.
        Defaults to the checkpoint this notebook was written against.
    """
    print('Constructing model: %s' % model_name)
    inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
    # Kept under a separate name so the caller's flag is not overwritten.
    reference_mel_ph = None
    if reference_mel is not None:
      reference_mel_ph = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'reference_mel')

    with tf.variable_scope('model'):
      self.model = create_model(model_name, hparams)
      self.model.initialize(inputs, input_lengths, mel_targets=None, reference_mel=reference_mel_ph)
      self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0])
      self.alignments = self.model.alignments[0]

    print('Loading checkpoint')
    # Release the session from an earlier `load` call instead of leaking it.
    previous_session = getattr(self, 'session', None)
    if previous_session is not None:
      previous_session.close()
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)

  def synthesize(self, text, reference_path=None):
    """Evaluate the reference-encoder output tensor for one input.

    Despite the name, no audio is produced: only the tensor named by
    `EMBEDDING_TENSOR` is fetched.

    Args:
      text: Text converted with `text_to_sequence` (using the cleaners from
        `hparams.cleaners`) and fed as the model's input sequence.
      reference_path: Path to a `.npy` file loaded with `np.load`; a leading
        batch axis is added and the result is fed as the reference mel.
        NOTE(review): the fetched tensor presumably depends on this input,
        so None is unlikely to work for a model loaded with a reference
        placeholder - confirm.

    Returns:
      The value `session.run` produces for the embedding tensor.
    """
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
      self.model.inputs: [np.asarray(seq, dtype=np.int32)],
      self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    if reference_path is not None:
      reference_mel = np.expand_dims(np.load(reference_path), 0)
      feed_dict[self.model.reference_mel] = np.asarray(reference_mel, dtype=np.float32)

    # Only the embeddings are wanted, so ask for that tensor alone rather
    # than the waveform or alignments.
    embedding_tensor = self.session.graph.get_tensor_by_name(self.EMBEDDING_TENSOR)
    return self.session.run(embedding_tensor, feed_dict=feed_dict)

In [0]:
# Clear the default graph first so that re-running this cell builds the
# model into a fresh graph.
tf.reset_default_graph()
synth = Synthesizer()
# `load` only checks that reference_mel is not None; True simply requests
# the reference-mel placeholder that is fed later in `synthesize`.
synth.load(reference_mel=True)

Constructing model: tacotron
Initialized Tacotron model. Dimensions: 
  text embedding:          256
  style embedding:         128
  prenet out:              128
  encoder out:             384
  attention out:           256
  concat attn & out:       640
  decoder cell out:        256
  decoder out (2 frames):  160
  decoder out (1 frame):   80
  postnet out:             256
  linear out:              1025
Loading checkpoint
INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/prosody_model/model.ckpt-50000


In [0]:
# Compute a prosody embedding for every pre-processed utterance that does
# not have one yet, and store it on Drive.
import glob
import os
import time

INPUT_DIR = '/content/gdrive/My Drive/preprocessed_prosody_model_inputs/'
OUTPUT_DIR = '/content/gdrive/My Drive/prosody_embeddings/'
MEL_SUFFIX = '-mel.npy'


def utterance_id(mel_path):
  """Return the file name of `mel_path` with the '-mel.npy' suffix removed."""
  return os.path.basename(mel_path)[:-len(MEL_SUFFIX)]


def embedding_path(utt_id, output_dir=OUTPUT_DIR):
  """Return the exact file that `np.save` writes for utterance `utt_id`."""
  return os.path.join(output_dir, utt_id + '.npy')


# Only files carrying the mel suffix are valid inputs; sorting makes the
# processing order reproducible.
existing_files = sorted(glob.glob(os.path.join(INPUT_DIR, '*' + MEL_SUFFIX)))

# Run inference to get embeddings, only for utterances without an output file.
for f in existing_files:
  yt_link = utterance_id(f)
  out_path = embedding_path(yt_link)
  # Exact-path check: a substring test would wrongly skip e.g. 'abc-1' once
  # 'abc-10.npy' exists.
  if os.path.exists(out_path):
    continue
  embeddings = synth.synthesize('dummy text', reference_path=f)
  np.save(out_path, embeddings, allow_pickle=True)
  # Short pause between writes, kept from the original loop
  # (presumably to pace Drive I/O - TODO confirm).
  time.sleep(0.1)
