In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
sys.path.append('waveglow/')

from itertools import cycle
import numpy as np
import scipy as sp
from scipy.io.wavfile import write
import pandas as pd
import librosa
import torch
import random

from hparams import create_hparams
from model import Tacotron2, load_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence

In [2]:
def panner(signal, angle):
    """Pan a signal into a (left, right) pair of channels.

    The angle is given in degrees and converted to radians. The left channel
    is scaled by sqrt(2)/2 * (cos(angle) - sin(angle)) and the right channel by
    sqrt(2)/2 * (cos(angle) + sin(angle)), so 0 degrees gives both channels
    the same gain.

    Args:
        signal: samples to pan; a 1-D array of n samples yields an (n, 2) result.
        angle: pan angle in degrees.

    Returns:
        The first slice of ``np.dstack`` applied to the left and right channels.
    """
    theta = np.radians(angle)
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    half_sqrt2 = np.sqrt(2) / 2.0
    channels = (
        half_sqrt2 * (cos_t - sin_t) * signal,  # left
        half_sqrt2 * (cos_t + sin_t) * signal,  # right
    )
    return np.dstack(channels)[0]

In [3]:
def plot_mel_f0_alignment(mel_source, mel_outputs_postnet, f0s, alignments, figsize=(16, 16)):
    """Draw four stacked panels: source mel, predicted mel, pitch and rhythm.

    Args:
        mel_source: 2-D array shown as an image in the "Source Mel" panel.
        mel_outputs_postnet: 2-D array shown in the "Predicted Mel" panel.
        f0s: sequence of pitch values, scatter-plotted against their index.
        alignments: 2-D array shown in the "Source rhythm" panel.
        figsize: size passed to ``plt.subplots``.
    """
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    ax_source, ax_predicted, ax_pitch, ax_rhythm = axes.flatten()

    # All three image panels share the same imshow settings.
    image_style = dict(aspect='auto', origin='lower', interpolation='none')
    image_panels = (
        (ax_source, mel_source, "Source Mel"),
        (ax_predicted, mel_outputs_postnet, "Predicted Mel"),
        (ax_rhythm, alignments, "Source rhythm"),
    )
    for panel, image, title in image_panels:
        panel.imshow(image, **image_style)
        panel.set_title(title)

    n_points = len(f0s)
    ax_pitch.scatter(range(n_points), f0s, alpha=0.5, color='red', marker='.', s=1)
    ax_pitch.set_xlim(0, n_points)
    ax_pitch.set_title("Source pitch contour")

    plt.tight_layout()

In [4]:
def load_mel(path):
    """Load an audio file and return its mel spectrogram on the GPU.

    Relies on the notebook-level ``hparams`` and ``stft`` objects.

    Args:
        path: path of the audio file to read.

    Returns:
        The output of ``stft.mel_spectrogram`` for the audio with a leading
        batch axis of size 1, moved to CUDA.

    Raises:
        ValueError: if the sampling rate reported by librosa differs from
            ``hparams.sampling_rate``.
    """
    target_sr = hparams.sampling_rate
    # librosa is asked for the target rate via sr=, so the check below is
    # expected to act only as a safety net.
    audio, sampling_rate = librosa.core.load(path, sr=target_sr)
    if sampling_rate != target_sr:
        # Report the same value the comparison used (the original message
        # read the rate from `stft` instead of `hparams`).
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, target_sr))
    # Add the batch axis. No autograd.Variable wrapper is needed: plain tensors
    # created by from_numpy already have requires_grad=False.
    audio_norm = torch.from_numpy(audio).unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)
    return melspec.cuda()

In [5]:
# Hyperparameters read throughout the notebook (sampling rate, STFT/mel settings, text cleaners).
hparams = create_hparams()

In [6]:
# STFT / mel front end used by load_mel; every setting comes from hparams.
stft = TacotronSTFT(
    hparams.filter_length,
    hparams.hop_length,
    hparams.win_length,
    hparams.n_mel_channels,
    hparams.sampling_rate,
    hparams.mel_fmin,
    hparams.mel_fmax,
)

## Load Models

In [None]:
# Change the checkpoint file here.
checkpoint_path = "models/checkpoint_74500"
mellotron = load_model(hparams).cuda().eval()
# Deserialize the checkpoint onto the CPU: without map_location, torch.load
# restores tensors onto the device they were saved from, which keeps a second
# copy of the weights on the GPU and fails if that device index does not exist
# here. load_state_dict then copies the values into the CUDA model.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
mellotron.load_state_dict(torch.load(checkpoint_path, map_location='cpu')['state_dict'])

In [None]:
# Vocoder: the checkpoint stores a whole model object under the 'model' key.
# NOTE(review): torch.load unpickles that object, so the file must come from a
# trusted source.
waveglow_path = 'models/waveglow_256channels_universal_v4.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow = waveglow.cuda().eval()

# Denoiser built from the vocoder; applied to the synthesized audio later on.
denoiser = Denoiser(waveglow)
denoiser = denoiser.cuda().eval()

## Set up dataloaders

In [None]:
# Load the CMU dictionary file.
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')

# Set audio_paths to the filelist the source utterances should come from.
# Other filelists that have been used here:
#   audio_paths = 'data/examples_filelist.txt'
#   audio_paths = 'sample/filelist_nonparallel.txt'
audio_paths = 'sample/filelist_vctk_val.txt'

# Dataset over the chosen filelist, plus the collate function that turns a
# list of its items into a padded batch.
# NOTE(review): the argument 1 is presumably the number of frames per decoder
# step - confirm against data_utils.TextMelCollate.
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)

## Load data

In [None]:
# Candidate sentences to synthesize. Commented-out entries are disabled; the
# data-loading cell can pick one at random through its commented-out
# `random.sample(sentences, 1)` line.
sentences = [
        "the party has never fully recovered.",
        "we also need a small plastic snake and a big toy frog for the kids.",
        # "this is a librivox recording.",
        # "matthew cuthbert is surprised",
        "ambitious hopes, which had seemed to be extinguished, revived in his bosom.",
        # "after a pause bechamel went back to the dining room.",
        # "in the aftermath of this storm, we were thrown back to the east. away went any hope of",
        "when we first met here we were younger than our girls are now.",
        # "you must know said margolotte when they were all seated together on the broad window seat that my husband foolishly gave away all the powder of life he first made to old mombi the witch who used to live in the country of the gillikins to the north of here.",
        "oh my god, he's lost it. he's totally lost it.",
        "Now it was finished - that is to say the design - she must stitch it together .",
        # "Advanced text to speech models such as Fast Speech can synthesize speech significantly faster than previous auto regressive models with comparable quality. The training of Fast Speech model relies on an auto regressive teacher model for duration prediction and knowledge distillation, which can ease the one to many mapping problem in T T S. However, Fast Speech has several disadvantages, 1, the teacher student distillation pipeline is complicated, 2, the duration extracted from the teacher model is not accurate enough, and the target mel spectrograms distilled from teacher model suffer from information loss due to data simplification, both of which limit the voice quality.",
        # "Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition",
        # "in being comparatively modern.",
        # "For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process",
        # "produced the block books, which were the immediate predecessors of the true printed book,",
        # "the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.",
        "And it is worth mention in passing that, as an example of fine typography,",
        # "the earliest book printed with movable types, the Gutenberg, or \"forty-two line Bible\" of about 1455,",
        # "has never been surpassed.",
        "Printing, then, for our purpose, may be considered as the art of making books by means of movable types.",
        # "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress,"
    ]

In [None]:
# Change file_idx here to pick a different source audio path, and choose the
# text to synthesize (uncomment the random.sample line to use one of
# `sentences` instead of the transcript from the filelist).
file_idx = 0
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
# text = random.sample(sentences, 1)[0]
print(audio_path, text)

# Inputs for synthesis: encoded text, source pitch contour, and the source mel
# (used for the GST reference). Each gets a leading batch axis of size 1.
text_encoded = torch.LongTensor(
    text_to_sequence(text, hparams.text_cleaners)
).unsqueeze(0).cuda()
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)
print(text_encoded.size(), pitch_contour.size(), mel.size())

# Build a one-item batch from the source data so Tacotron 2 can be used as a
# forced aligner to obtain the rhythm.
# alternative: x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))
x, y = mellotron.parse_batch(
    datacollate([
        dataloader.get_data((audio_path, text, sid)),
    ])
)

In [None]:
# Summarize the parsed batch: sizes of every tensor in x and y (x[3] is
# printed as-is, without .size()).
print('\n'.join(
    f'{label}: {value}'
    for label, value in (
        ('text_padded', x[0].size()),
        ('input_lengths', x[1].size()),
        ('mel_padded', x[2].size()),
        ('max_len', x[3]),
        ('output_lengths', x[4].size()),
        ('speaker_ids', x[5].size()),
        ('f0_padded', x[6].size()),
        ('mel_padded', y[0].size()),
        ('gate_padded', y[1].size()),
    )
))

In [None]:
# Audio player widget for the selected source utterance (displayed as the cell output).
ipd.Audio(audio_path, rate=hparams.sampling_rate)

## Define speaker sets

In [None]:
# Speaker-ID lookup exposed by the loader for the LibriTTS training filelist.
speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", hparams).speaker_ids

# Speaker metadata table: '|'-separated columns, ';' starts a comment line.
# The separator regex is a raw string because '\|' inside a normal string
# literal is an invalid escape sequence (DeprecationWarning, SyntaxWarning on
# newer Python versions); the regex itself is unchanged.
speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python', header=None, comment=';',
                       sep=r' *\| *',
                       names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])

# Attach the loader's ID to each speaker; -1 marks speakers the loader does not know.
speakers['MELLOTRON_ID'] = speakers['ID'].apply(
    lambda libritts_id: speaker_ids[libritts_id] if libritts_id in speaker_ids else -1)

# Endless, shuffled iterators over known speakers with more than 20 minutes of audio.
female_speakers = cycle(
    speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
male_speakers = cycle(
    speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
print(next(female_speakers))

In [None]:
# Build the VCTK speaker tables from the speaker-info file.
# NOTE: this rebinds `speakers`, `male_speakers` and `female_speakers`, which
# the LibriTTS cell also assigns; the later inference cell indexes the lists
# built here.
with open('data/vctk-speaker-info.txt') as f:
    # The first line is skipped (header row).
    speaker_lines = f.readlines()[1:]

# Keep column 0 (speaker name) and column 2 (sex) of every row. Each line is
# split once, and blank lines (e.g. a trailing newline at the end of the file)
# are skipped instead of raising IndexError.
speakers = [
    (fields[0], fields[2])
    for fields in (line.split() for line in speaker_lines)
    if fields
]
# Each entry is (speaker name, row index); the row index is what the inference
# cell passes to the model as the speaker id.
male_speakers = [(name, idx) for idx, (name, sex) in enumerate(speakers) if sex == 'M']
female_speakers = [(name, idx) for idx, (name, sex) in enumerate(speakers) if sex == 'F']
speakers_dict = dict(male_speakers + female_speakers)
print(speakers_dict)

# Style Transfer (Rhythm and Pitch Contour)

In [None]:
# Run the source batch through the model without gradients; the fourth output
# is the alignment map, used below as the rhythm.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(x)
    # Swap the first two axes of the 3-D alignment tensor.
    rhythm = rhythm.transpose(0, 1)

In [None]:
# Change the speaker id here, using the speaker table printed above.
# A coin flip picks the first female or the first male entry.
if np.random.randint(2):
    speaker_id = female_speakers[0][1]
else:
    speaker_id = male_speakers[0][1]
speaker_id = torch.LongTensor([speaker_id]).cuda()

# Synthesize with the source rhythm and pitch contour imposed on the chosen speaker.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
        (text_encoded, mel, speaker_id, pitch_contour, rhythm)
    )

# Compare the source with the prediction (first batch item of each tensor).
plot_mel_f0_alignment(
    mel_source=x[2].data.cpu().numpy()[0],
    mel_outputs_postnet=mel_outputs_postnet.data.cpu().numpy()[0],
    f0s=pitch_contour.data.cpu().numpy()[0, 0],
    alignments=rhythm.data.cpu().numpy()[:, 0].T,
)

In [None]:
# Vocode the predicted mel with WaveGlow, then run the denoiser over the result.
# NOTE(review): 0.01 is presumably the denoiser strength - confirm against
# waveglow.denoiser.Denoiser.
with torch.no_grad():
    audio = denoiser(
        waveglow.infer(mel_outputs_postnet, sigma=0.8),
        0.01,
    )[:, 0]

# Audio player for the first item of the batch.
ipd.Audio(
    audio[0].data.cpu().numpy(),
    rate=hparams.sampling_rate,
)