In [22]:
import sys
import os
sys.path.append(os.path.expanduser('~/workspace/tacotron/'))

import argparse
from hparams import hparams
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import cm
from matplotlib.colors import ListedColormap
import numpy as np
from PIL import Image
from tacotron.alignment_synthesizer import AlignmentSynthesizer
from tacotron.pml_synthesizer import Configuration, PMLSynthesizer
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm

In [15]:
# Preprocessed training data directory and the 'test.txt' listing stored inside it.
training_data_dir = '/media/josh/Store/tacotron-data/163-lj-training'
metadata_filename = os.path.join(training_data_dir, 'test.txt')

# Synthesizer built from Configuration(16000, 163) -- presumably the sample rate in Hz
# and the PML feature dimension; TODO confirm against Configuration's signature.
# No checkpoint is loaded into this instance (the load call below is disabled).
cfg = Configuration(16000, 163)
synth = PMLSynthesizer(cfg)
# synth.load(checkpoint_path, hparams, gta=gta, model_name=args.variant)

# Every line of the listing is one '|'-separated record.
with open(metadata_filename, encoding='utf-8') as f:
    metadata = [record.strip().split('|') for record in f]

# Field 2 of each record is summed and scaled by the frame shift (in ms) to
# report the total duration of the listed examples in hours.
hours = sum(int(fields[2]) for fields in metadata) * hparams.frame_shift_ms / (3600 * 1000)
print('Loaded metadata for %d examples (%.2f hours)' % (len(metadata), hours))

# Fields 3, 5 and 6 are kept as the per-example PML file, text and wav file.
pml_files = [fields[3] for fields in metadata]
texts = [fields[5] for fields in metadata]
wav_files = [fields[6] for fields in metadata]

Loaded metadata for 150 examples (0.27 hours)


In [3]:
# Shared figure size -- presumably (width, height) in inches for matplotlib figures;
# TODO confirm where it is consumed (it is not referenced in the visible cells).
FIGSIZE = (10, 6)

In [4]:
# Load the evaluation sentences, one per line. The file is decoded explicitly as
# UTF-8 (it contains non-ASCII punctuation such as a typographic apostrophe) so the
# result does not depend on the platform's default encoding, matching how the
# metadata listing is opened. Each entry keeps its trailing newline.
with open('EvalSentences.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

In [5]:
# Display the loaded evaluation sentences for a visual check.
sentences

['Scientists at the CERN laboratory say they have discovered a new particle.\n',
 'There’s a way to measure the acute emotional intelligence that has never gone out of style.\n',
 'President Trump met with other leaders at the Group of 20 conference.\n',
 "The Senate\\'s bill to repeal and replace the Affordable Care Act is now imperiled.\n",
 'Generative adversarial network or variational auto-encoder.\n',
 "The buses aren\\'t the problem, they actually provide a solution.\n",
 'Does the quick brown fox jump over the lazy dog?\n',
 'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.\n']

In [8]:
from tacotron.pml_synthesizer import Configuration, PMLSynthesizer
from lib import sigproc as sp

# Statistics handed to synthesis as mean_norm / std_norm. They are read as raw
# float32 values only when both files are present; otherwise both stay None.
mean_path = os.path.expanduser('~/tacotron/LJSpeech-1.1/pml/mean.dat')
std_path = os.path.expanduser('~/tacotron/LJSpeech-1.1/pml/std.dat')
mean_norm, std_norm = None, None

if all(os.path.isfile(stats_file) for stats_file in (mean_path, std_path)):
    mean_norm = np.fromfile(mean_path, 'float32')
    std_norm = np.fromfile(std_path, 'float32')

# Clear the default TensorFlow graph before building the synthesizer.
tf.reset_default_graph()
cfg = Configuration(16000, 163)
pml_synth = PMLSynthesizer(cfg)

# Override the hyperparameters, then restore the checkpoint into the synthesizer.
hparams.parse('sample_rate=16000,frame_length_ms=20,frame_shift_ms=5,pml_dimension=163,spec_type=fwbnd')
pml_synth.load(
    '/media/josh/Store/remote-logs/163-pmlx-lj-150k/model.ckpt-112000',
    hparams,
    model_name='tacotron',
)

# Synthesise features for every text of the metadata listing.
pml_features = pml_synth.synthesize(
    texts,
    mean_norm=mean_norm,
    std_norm=std_norm,
    spec_type=hparams.spec_type,
)

INFO:tensorflow:Restoring parameters from /media/josh/Store/remote-logs/163-pmlx-lj-150k/model.ckpt-112000


In [11]:
# Synthesise features for the evaluation sentences, passing the same
# normalisation statistics and spectral type as the other synthesis calls.
pml_features_eval = pml_synth.synthesize(
    sentences,
    mean_norm=mean_norm,
    std_norm=std_norm,
    spec_type=hparams.spec_type,
)

In [12]:
# Sanity check: flattening a trajectory to one dimension and folding it back into
# rows of 163 values must reproduce the original array exactly.
for i, pml_trj in enumerate(pml_features):
    pml_cmp = np.reshape(pml_trj, -1)
    pml_trj_test = np.reshape(pml_cmp, (-1, 163))
    assert np.array_equal(pml_trj, pml_trj_test)

In [24]:
import shutil

# Export the synthesised trajectories: each one is flattened and written as a
# '<basename>.cmp' file, its basename is appended to the file id list, and the
# matching original wav is copied alongside.
base_dir = os.path.expanduser('~/workspace/sampleRNN_QDOU/datasets/lj/merlinData/')
cmp_dir = 'wav_PML_cmp_lf0_fwlspec129_fwnm33_nmnoscale'
wav_dir = 'wav_16kHz'

wav_orig_dir = '/media/josh/Store/tacotron-data/LJSpeech-1.1/wav_16kHz'
file_id_list = 'file_id_list.scp'

# Create the output folders up front so a fresh checkout does not fail halfway
# through the export (this also creates base_dir itself if it is missing).
os.makedirs(os.path.join(base_dir, cmp_dir), exist_ok=True)
os.makedirs(os.path.join(base_dir, wav_dir), exist_ok=True)

# The context manager guarantees the id list is flushed and closed even if a
# write or copy below raises.
with open(os.path.join(base_dir, file_id_list), 'w') as f:
    for i, pml_trj in enumerate(pml_features):
        # Flatten to one dimension; tofile() then dumps the raw values in the
        # array's own dtype.
        pml_cmp = pml_trj.reshape(-1)

        filename = wav_files[i]
        basename = os.path.splitext(filename)[0]
        f.write('{}\n'.format(basename))  # python will convert \n to os.linesep

        cmp_name = '{}.cmp'.format(basename)
        pml_cmp.tofile(os.path.join(base_dir, cmp_dir, cmp_name))
        # copy the wav file
        shutil.copyfile(os.path.join(wav_orig_dir, filename), os.path.join(base_dir, wav_dir, filename))

In [25]:
# Run synthesis with to_wav=True over the texts in slices of ten, collecting the
# results of every slice into one flat list.
wav_outputs_163 = []

for i in tqdm(range(0, len(texts), 10)):
    wav_outputs_163.extend(
        pml_synth.synthesize(
            texts[i:i+10],
            to_wav=True,
            num_workers=5,
            mean_norm=mean_norm,
            std_norm=std_norm,
            spec_type=hparams.spec_type,
        )
    )

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcing binary noise mask
    Forcin

In [27]:
from util import audio

# Trim each synthesised waveform at the endpoint reported by audio.find_endpoint
# and write it into a sub-folder of the training data directory, named after the
# corresponding entry of wav_files.
taco_pml_dir = 'taco_163_112k_ppmcep'
os.makedirs(os.path.join(training_data_dir, taco_pml_dir), exist_ok=True)

# Copy waveform by waveform. Calling np.copy on the whole list would try to build
# a single array from sequences of different lengths, which recent NumPy releases
# reject (and older ones turn into a shallow object array).
taco_pml_wavs_orig = [np.copy(wav) for wav in wav_outputs_163]
taco_pml_wavs = []

for i, wav in enumerate(tqdm(taco_pml_wavs_orig)):
    # Keep only the samples before the detected endpoint.
    wav = wav[:audio.find_endpoint(wav, threshold_db=0)]
    taco_pml_wavs.append(wav)
    sp.wavwrite(os.path.join(training_data_dir, taco_pml_dir, wav_files[i]), wav, 16000, norm_max_ifneeded=True)

HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


