This notebook generates mel-spectrograms from a trained TTS model so that they can be used as inputs for vocoder training.

In [4]:
%load_ext autoreload
%autoreload 2
import os
import sys
import torch
import importlib
import numpy as np
import pandas as pd
from tqdm import tqdm as tqdm
from torch.utils.data import DataLoader
from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.layers.losses import L1LossMasked
from TTS.utils.audio import AudioProcessor
from TTS.config import load_config
from TTS.tts.utils.visual import plot_spectrogram
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.models import setup_model
from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes
from TTS.tts.utils.synthesis import inv_spectrogram
from TTS.utils.generic_utils import get_npy_path

%matplotlib inline

import os
# Expose only GPU 0 to this process. This runs after `import torch`, so it must
# execute before the first CUDA call in the notebook to have any effect.
os.environ['CUDA_VISIBLE_DEVICES']='0'
from matplotlib import pyplot as plt
# Use matplotlib's dark theme for every figure drawn below.
plt.style.use('dark_background')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# --- Locations on disk -------------------------------------------------------
# Both roots end with a trailing slash, so derived paths are plain string joins.
BASE_DIR = '/home/perry/PycharmProjects/TTS/recipes/ljspeech/prune/'
DATA_DIR = "/home/perry/PycharmProjects/TTS/recipes/ljspeech/LJSpeech-1.1/"
TEST_CSV = f"{DATA_DIR}splits/test.csv"

# --- Model under evaluation --------------------------------------------------
MODEL_DIR = f"{BASE_DIR}tacotron2_nomask/baseline/"
CONFIG_PATH = f"{MODEL_DIR}config.json"
STEPS = 100000
MODEL_FILE = f"{MODEL_DIR}checkpoint_{STEPS}.pth.tar"
BATCH_SIZE = 32

# --- Output directories (one set per checkpoint step) ------------------------
MEL_OUT_DIR = f"{MODEL_DIR}mel_{STEPS}/"
FULL_MEL_OUT_DIR = f"{MODEL_DIR}full_mel_{STEPS}/"
WAV_OUT_DIR = f"{MODEL_DIR}wav_gl_{STEPS}/"

In [None]:


# The export loop further down only writes files when `not DRY_RUN` holds.
DRY_RUN = False   # if True, does not generate output files, only computes loss and visuals.

# Remember whether a GPU is available; later cells consult this flag before calling .cuda().
use_cuda = torch.cuda.is_available()
print(" > CUDA enabled: ", use_cuda)

# Load the training config and build the audio processor from its `audio` section.
C = load_config(CONFIG_PATH)
C.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files
ap = AudioProcessor(**C.audio)

In [None]:
# Override `max_decoder_steps` on the loaded config before the model is built from it.
# NOTE(review): presumably this caps the number of decoder iterations at inference — confirm.
C.max_decoder_steps = 1000

In [8]:
# Show the `r` entry of the config (the model log below reports it as the reduction rate).
print(C['r'])
# if the vocabulary was passed, replace the default
if 'characters' in C and C['characters']:
    print('Using custom chars')
    symbols, phonemes = make_symbols(**C.characters)
# NOTE(review): this replaces the MODEL_FILE built in the config cell with a
# different (snip/sparsity_20) checkpoint, while CONFIG_PATH and the
# MEL/FULL_MEL/WAV output directories still derive from the baseline MODEL_DIR.
# Confirm that writing this checkpoint's outputs under the baseline directory is intended.
MODEL_FILE = '/home/perry/PycharmProjects/TTS/recipes/ljspeech/prune/tacotron2_nomask/snip/sparsity_20/checkpoint_100000.pth.tar'
# load the model
# NOTE(review): `num_chars` is not read by any visible cell.
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
# TODO: multiple speaker
model = setup_model(C)
# Restore the weights from MODEL_FILE; `eval=True` is passed through to load_checkpoint.
model.load_checkpoint(C, MODEL_FILE, eval=True)

1
 > Using model: tacotron2
 > Model's reduction rate `r` is set to: 1


In [9]:
# Move the model to the GPU only when one is available, consistent with how
# the inference loop guards `char_ids.cuda()` behind `use_cuda`. An
# unconditional `.cuda()` raises on CPU-only machines.
if use_cuda:
    model = model.cuda()

In [10]:
# For every named parameter print: its name, its total number of elements and
# how many of those elements are non-zero (a quick view of the pruning sparsity).
for param_name, param in model.named_parameters():
    total_elements = param.numel()
    nonzero_elements = int(param.count_nonzero())
    print(param_name, total_elements, nonzero_elements)


embedding.weight 102400 22047
encoder.convolutions.0.convolution1d.weight 1310720 1253892
encoder.convolutions.0.convolution1d.bias 512 0
encoder.convolutions.0.batch_normalization.weight 512 512
encoder.convolutions.0.batch_normalization.bias 512 512
encoder.convolutions.1.convolution1d.weight 1310720 1261157
encoder.convolutions.1.convolution1d.bias 512 0
encoder.convolutions.1.batch_normalization.weight 512 512
encoder.convolutions.1.batch_normalization.bias 512 512
encoder.convolutions.2.convolution1d.weight 1310720 1268294
encoder.convolutions.2.convolution1d.bias 512 0
encoder.convolutions.2.batch_normalization.weight 512 512
encoder.convolutions.2.batch_normalization.bias 512 512
encoder.lstm.weight_ih_l0 524288 520618
encoder.lstm.weight_hh_l0 262144 250377
encoder.lstm.bias_ih_l0 1024 1021
encoder.lstm.bias_hh_l0 1024 1018
encoder.lstm.weight_ih_l0_reverse 524288 518723
encoder.lstm.weight_hh_l0_reverse 262144 243394
encoder.lstm.bias_ih_l0_reverse 1024 1016
encoder.lstm.bias_

In [7]:
# Read the test split; every CSV row becomes one item (a plain Python list).
test_frame = pd.read_csv(TEST_CSV)
test_items = test_frame.values.tolist()

# Dataset built from the first dataset config, the shared audio processor and
# the text/mel options copied from the training config.
dataset = TTSDataset(
    C.dataset_configs[0],
    ap,
    test_items,
    use_phonemes=C.use_phonemes,
    use_mel=C.use_mel,
    enable_eos_bos=C.enable_eos_bos,
)
dataset.sort_and_filter_items()

# Deterministic loader: no shuffling and no dropped tail batch, so every test
# utterance is visited exactly once per pass.
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=4,
    collate_fn=dataset.collate_fn,
    shuffle=False,
    drop_last=False,
)
loader = DataLoader(dataset, **loader_options)



 > DataLoader initialization
 | > Inputs used: phonemes, mel
 | > Number of instances : 1310
 | > Max length sequence: 166
 | > Min length sequence: 12
 | > Avg length sequence: 93.27099236641222
 | > Num. instances discarded by max-min (max=2147483647, min=0) seq limits: 0
 | > Batch group size: 0.


In [None]:
from pathlib import Path

# Run inference (text only, no ground-truth mels) over the whole test loader and
# save every predicted mel-spectrogram, trimmed at the predicted stop point,
# into FULL_MEL_OUT_DIR. Also collects, per utterance, the file stem, the
# ground-truth mel length and the predicted length for the summary table below.
with torch.no_grad():
    os.makedirs(FULL_MEL_OUT_DIR, exist_ok=True)
    os.makedirs(MEL_OUT_DIR, exist_ok=True)
    os.makedirs(WAV_OUT_DIR, exist_ok=True)
    wav_filenames = []
    output_lengths_all = []
    mel_gt_lengths_all = []
    for batch in loader:
        char_ids = batch["char_ids"]
        wav_path = batch["wav_path"]
        wav_filenames.extend([Path(wav_file).stem for wav_file in wav_path])
        mel_gts = batch["mel"]
        mel_gt_lengths = batch["mel_lengths"]
        mel_gt_lengths_all.extend(mel_gt_lengths.tolist())

        # dispatch data to GPU
        if use_cuda:
            char_ids = char_ids.cuda()

        results = model.inference(char_ids)

        model_outputs = results['model_outputs']
        decoder_outputs = results['decoder_outputs']
        alignments = results['alignments']
        stop_tokens = results['stop_tokens']

        # Predicted length = number of frames whose stop token is below 0.5.
        # reshape(-1) always yields a 1-D tensor of size `batch`; a bare
        # squeeze() collapses to a 0-d tensor when the batch holds a single
        # utterance, which breaks both `.extend(...)` and the zip() below.
        output_lengths = torch.sum(stop_tokens < 0.5, dim=1).reshape(-1)
        output_lengths_all.extend(output_lengths.tolist())

        for model_output, output_length, mel_gt_length, wav_file in zip(model_outputs, output_lengths, mel_gt_lengths, wav_path):
            # save the model output, cut at the predicted stop frame and transposed
            model_output = model_output[:output_length, :]
            spectrogram_full = model_output.cpu().numpy().squeeze().T
            np.save(get_npy_path(FULL_MEL_OUT_DIR, wav_file), spectrogram_full)
            # model_out = model_output[:mel_gt_length, :]
            # spectrogram = model_out.cpu().numpy().squeeze().T
            # np.save(get_npy_path(MEL_OUT_DIR, wav_file), spectrogram)

            # wav = ap.inv_melspectrogram(spectrogram)
            # ap.save_wav(wav, os.path.join(WAV_OUT_DIR, os.path.basename(wav_file)))


In [None]:
# import matplotlib.pyplot as plt
# plt.imshow(spectrogram, aspect="auto", origin="lower", cmap='viridis')

In [None]:
# One row per test utterance (indexed by wav file stem): ground-truth mel length
# vs. the length predicted at inference. The `*_all` lists accumulated across
# every batch must be used here; `mel_gt_lengths` / `output_lengths` only hold
# the final batch and would not match the length of `wav_filenames`.
length_data = pd.DataFrame(
    data=list(zip(mel_gt_lengths_all, output_lengths_all)),
    columns=['mel_gt_length', 'output_length'],
    index=wav_filenames,
)

In [None]:
# Persist the length table as `length_data.csv` inside the mel output directory.
length_csv_path = os.path.join(MEL_OUT_DIR, 'length_data.csv')
length_data.to_csv(length_csv_path)

In [None]:
# `batch` is the variable left behind by the inference loop, i.e. the last batch the loader yielded.
stop_targets = batch['stop_targets']
# The stop-target lengths are read from the batch's "mel_lengths" entry.
stop_target_lengths = batch['mel_lengths']

In [None]:
# Display the shape of the stop-target tensor taken from the current batch.
stop_targets.shape

In [None]:
# Pull the remaining inputs for a forward pass out of the current batch.
# `mel_lengths` must be read from the batch here: no earlier cell assigns it,
# so calling `.cuda()` on it directly raises NameError on a fresh kernel.
id_lengths = batch["id_lengths"]
mel_lengths = batch["mel_lengths"]

# Move tensors to the GPU only when one is available, consistent with how
# `char_ids` is handled in the inference loop.
if use_cuda:
    id_lengths = id_lengths.cuda()
    mel_gts = mel_gts.cuda()
    mel_lengths = mel_lengths.cuda()

In [None]:
# Forward pass that, unlike `model.inference(char_ids)`, also receives the
# ground-truth mels and their lengths.
# NOTE(review): presumably this is the teacher-forced path — confirm against the model's forward().
outputs = model.forward(char_ids, id_lengths, mel_specs=mel_gts, mel_lengths=mel_lengths)

In [None]:
# Print, per utterance, the sum of (1 - stop_targets) along dim 1, followed by the mel lengths.
# NOTE(review): presumably stop targets are 0 up to the final frame, so the two printouts
# should roughly agree — confirm.
print(torch.sum(1-stop_targets, dim=1))
print(mel_lengths)

### Generate model outputs

In [None]:
import pickle

# Forward pass over the test set with the ground-truth mels supplied:
#   * accumulates the masked L1 loss of the decoder and postnet outputs
#     against the ground-truth mels;
#   * unless DRY_RUN is set, saves each postnet mel (cut to its ground-truth
#     length, transposed) to MEL_OUT_DIR and a Griffin-Lim wav to WAV_OUT_DIR.
file_idxs = []
metadata = []
losses = []
postnet_losses = []
criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)

# Create the output directories once instead of once per batch.
if not DRY_RUN:
    os.makedirs(MEL_OUT_DIR, exist_ok=True)
    os.makedirs(WAV_OUT_DIR, exist_ok=True)

with torch.no_grad():
    for data in tqdm(loader):
        # setup input data
        char_ids = data["char_ids"]
        id_lengths = data["id_lengths"]
        # The ground-truth mels and their lengths have to be read from the
        # batch; they were previously used without ever being assigned.
        mel_input = data["mel"]
        mel_lengths = data["mel_lengths"]
        stop_targets = data["stop_targets"]
        wav_path = data["wav_path"]

        # dispatch data to GPU
        if use_cuda:
            char_ids = char_ids.cuda()
            id_lengths = id_lengths.cuda()
            mel_input = mel_input.cuda()
            mel_lengths = mel_lengths.cuda()

        # The ground-truth mels are passed in so the outputs are frame-aligned
        # with `mel_input`, which the masked loss below requires.
        # NOTE(review): assumes forward() returns a dict with the same keys
        # that inference() returns elsewhere in this notebook — confirm.
        outputs = model.forward(char_ids, id_lengths, mel_specs=mel_input, mel_lengths=mel_lengths)
        mel_outputs = outputs['decoder_outputs']
        postnet_outputs = outputs['model_outputs']
        alignments = outputs['alignments']
        stop_tokens = outputs['stop_tokens']

        # compute loss
        loss = criterion(mel_outputs, mel_input, mel_lengths)
        loss_postnet = criterion(postnet_outputs, mel_input, mel_lengths)
        losses.append(loss.item())
        postnet_losses.append(loss_postnet.item())

        mels = postnet_outputs.detach().cpu().numpy()
        alignments = alignments.detach().cpu().numpy()

        if not DRY_RUN:
            for idx in range(char_ids.shape[0]):
                wav_file = wav_path[idx]

                # save TTS mel
                mel = mels[idx]
                # Plain int so the numpy slice never depends on a (GPU) tensor.
                mel_length = int(mel_lengths[idx])
                mel = mel[:mel_length, :].T
                np.save(get_npy_path(MEL_OUT_DIR, wav_file), mel)

                # save Griffin-Lim wav reconstructed from the TTS mel
                # (replaces the call to the undefined `apply_griffin_lim`).
                wav = ap.inv_melspectrogram(mel)
                ap.save_wav(wav, os.path.join(WAV_OUT_DIR, os.path.basename(wav_file)))

    print(np.mean(losses))
    print(np.mean(postnet_losses))

### Sanity Check

In [None]:

# Load one previously saved spectrogram from disk for a visual sanity check.
# NOTE(review): this path points into a different run directory
# (coqui_tts-20220204_1907-aa986857/sparsity_90) than the output directories configured above.
SANITY_CHECK_MEL_PATH = '/home/perry/PycharmProjects/TTS/recipes/ljspeech/prune/coqui_tts-20220204_1907-aa986857/sparsity_90/full_mel_100000/LJ025-0110.npy'
spec = np.load(SANITY_CHECK_MEL_PATH)


In [None]:
# Plot the loaded spectrogram; it is transposed before being handed to plot_spectrogram.
plot_spectrogram(spec.T, ap)

In [None]:
# plot decoder output
print(postnet_outputs.shape)
plot_spectrogram(postnet_outputs, ap)

In [None]:
# plot GT specgrogram
print(mel_gts[idx].shape)
plot_spectrogram(mel_gts[idx], ap)

In [None]:
# postnet, decoder diff
from matplotlib import pylab as plt
mel_diff = mel_decoder - mel_postnet
plt.figure(figsize=(16, 10))
plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect="auto", origin="lower");
plt.colorbar()
plt.tight_layout()

In [None]:
# PLOT GT SPECTROGRAM diff (ground truth vs. decoder output)
# NOTE(review): `mel_truth` is not assigned in any visible cell — define it
# before running this cell. `mel_decoder` must likewise already be in scope.
# The transpose below implies `mel_truth` is stored with its axes swapped
# relative to `mel_decoder`.
from matplotlib import pylab as plt
mel_diff2 = mel_truth.T - mel_decoder
plt.figure(figsize=(16, 10))
plt.imshow(abs(mel_diff2).T,aspect="auto", origin="lower");
plt.colorbar()
plt.tight_layout()

In [None]:
# PLOT GT SPECTROGRAM diff (ground truth vs. postnet output of utterance `idx`)
# NOTE(review): `mel_truth` is not assigned in any visible cell — define it
# before running this cell.
from matplotlib import pylab as plt
mel = postnet_outputs[idx]
# Cut the model output to the ground truth's second dimension before subtracting.
mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]
plt.figure(figsize=(16, 10))
plt.imshow(abs(mel_diff2).T,aspect="auto", origin="lower");
plt.colorbar()
plt.tight_layout()