In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
sys.path.append('waveglow/')

from itertools import cycle
import numpy as np
import scipy as sp
from scipy.io.wavfile import write
import pandas as pd
import librosa
import torch

from hparams import create_hparams
from model import Tacotron2, load_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

In [4]:
%reload_ext autoreload
%autoreload 2

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4"

In [6]:
def panner(signal, angle):
    angle = np.radians(angle)
    left = np.sqrt(2)/2.0 * (np.cos(angle) - np.sin(angle)) * signal
    right = np.sqrt(2)/2.0 * (np.cos(angle) + np.sin(angle)) * signal
    return np.dstack((left, right))[0]

In [7]:
def plot_mel_f0_alignment(mel_source, mel_outputs_postnet, f0s, alignments, figsize=(16, 16)):
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    axes = axes.flatten()
    axes[0].imshow(mel_source, aspect='auto', origin='bottom', interpolation='none')
    axes[1].imshow(mel_outputs_postnet, aspect='auto', origin='bottom', interpolation='none')
    axes[2].scatter(range(len(f0s)), f0s, alpha=0.5, color='red', marker='.', s=1)
    axes[2].set_xlim(0, len(f0s))
    axes[3].imshow(alignments, aspect='auto', origin='bottom', interpolation='none')
    axes[0].set_title("Source Mel")
    axes[1].set_title("Predicted Mel")
    axes[2].set_title("Source pitch contour")
    axes[3].set_title("Source rhythm")
    plt.tight_layout()

In [8]:
def load_mel(path):
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cuda()
    return melspec

In [9]:
hparams = create_hparams()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [10]:
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)

## Load Models

In [11]:
checkpoint_path = "models/mellotron_libritts.pt"
mellotron = load_model(hparams).cuda().eval()
mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

<All keys matched successfully>

In [12]:
waveglow_path = 'models/waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()

## Setup dataloaders

In [13]:
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_filelist.txt'
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)

## Load data

In [22]:
file_idx = 0
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()    
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)
print(audio_path, text)

# load source data to obtain rhythm using tacotron 2 as a forced aligner
x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

data/example1.wav exploring the expanses of space to keep our planet safe


In [23]:
ipd.Audio(audio_path, rate=hparams.sampling_rate)

## Define Speakers Set

In [24]:
speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", hparams).speaker_ids
speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python',header=None, comment=';', sep=' *\| *', 
                       names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
speakers['MELLOTRON_ID'] = speakers['ID'].apply(lambda x: speaker_ids[x] if x in speaker_ids else -1)
female_speakers = cycle(
    speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
male_speakers = cycle(
    speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())

# Singing Voice from Music Score

In [25]:
data = get_data_from_musicxml('data/haendel_hallelujah.musicxml', 90, convert_stress=True)
panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}

In [26]:
n_speakers_per_part = 4
frequency_scaling = 0.4
n_seconds = 90
audio_stereo = np.zeros((hparams.sampling_rate*n_seconds, 2), dtype=np.float32)
for i, (part, v) in enumerate(data.items()):
    rhythm = data[part]['rhythm'].cuda()
    pitch_contour = data[part]['pitch_contour'].cuda()
    text_encoded = data[part]['text_encoded'].cuda()
    
    for k in range(n_speakers_per_part):
        pan = np.random.randint(panning[part][0], panning[part][1])
        if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
            speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
        else:
            speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
        print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour*frequency_scaling, rhythm))

            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            audio_stereo[:audio.shape[0]] += audio            
#             write("{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)
audio_stereo = audio_stereo / np.max(np.abs(audio_stereo))
ipd.Audio([audio_stereo[:,0], audio_stereo[:,1]], rate=hparams.sampling_rate)

Soprano MellotronID 30 pan -45
Soprano MellotronID 104 pan -33


KeyboardInterrupt: 

In [108]:
#from mellotron_utils import *

In [17]:
import music21

In [18]:
s = music21.converter.parse('data/haendel_hallelujah.musicxml')

In [19]:
score1 = music21.stream.Score()

part = music21.stream.Part()
for n in list(s.parts[0].flat.getElementsByClass('Note')):
    part.insert(n)
# note = music21.note.Note(50); note.quarterLength = 0.5; note.offset = 0.5; note.lyric = 'Hal-'; part.insert(note)
# note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-le-'; part.append(note)
# note = music21.note.Note(47); note.quarterLength = 0.5; note.lyric = '-lu-'; part.append(note)
# note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-jah'; part.append(note)

score1.insert(part)

In [20]:
score2 = music21.stream.Score()

part = music21.stream.Part()
note = music21.note.Note(50); note.quarterLength = 0.5; note.offset = 0.5; note.lyric = 'Hal-'; part.insert(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-le-'; part.append(note)
note = music21.note.Note(47); note.quarterLength = 0.5; note.lyric = '-lu-'; part.append(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-jah'; part.append(note)
note = music21.note.Note(50); note.quarterLength = 0.5; note.offset = 0.5; note.lyric = 'Hal-'; part.append(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-le-'; part.append(note)
note = music21.note.Note(47); note.quarterLength = 0.5; note.lyric = '-lu-'; part.append(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-jah'; part.append(note)
note = music21.note.Note(50); note.quarterLength = 0.5; note.offset = 0.5; note.lyric = 'Hal-'; part.append(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-le-'; part.append(note)
note = music21.note.Note(47); note.quarterLength = 0.5; note.lyric = '-lu-'; part.append(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-jah'; part.append(note)
note = music21.note.Note(50); note.quarterLength = 0.5; note.offset = 0.5; note.lyric = 'Hal-'; part.append(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-le-'; part.append(note)
note = music21.note.Note(47); note.quarterLength = 0.5; note.lyric = '-lu-'; part.append(note)
note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-jah'; part.append(note)
score2.insert(part)

In [21]:
def create_score_test(one, two, three, four, five):
    score2 = music21.stream.Score()

    part = music21.stream.Part()
    
    note = music21.note.Note('D4'); note.quarterLength = 0.5; note.offset = 0; note.lyric = one; part.insert(note)
    note = music21.note.Note('G4'); note.quarterLength = 0.5; note.lyric = two; part.append(note)
    note = music21.note.Note('F#4'); note.quarterLength = 0.25; note.lyric = three; part.append(note)
    note = music21.note.Note('G4'); note.quarterLength = 0.25; note.lyric = four; part.append(note)
#     note = music21.note.Note('A4'); note.quarterLength = 0.25 + 0.25*1/2;  note.lyric = five; part.append(note)
    note = music21.note.Rest(); note.quarterLength = 0.25;  part.append(note)


    score2.insert(part)
    return score2

In [35]:
# %load mellotron_utils
import re
import numpy as np
import music21 as m21
import torch
import torch.nn.functional as F
from text import text_to_sequence, get_arpabet, cmudict


CMUDICT_PATH = "data/cmu_dictionary"
CMUDICT = cmudict.CMUDict(CMUDICT_PATH)


########################
#  CONSONANT DURATION  #
########################
PHONEMEDURATION = {
    'B': 0.05,
    'CH': 0.1,
    'D': 0.075,
    'DH': 0.05,
    'DX': 0.05,
    'EL': 0.05,
    'EM': 0.05,
    'EN': 0.05,
    'F': 0.1,
    'G': 0.05,
    'HH': 0.05,
    'JH': 0.05,
    'K': 0.05,
    'L': 0.05,
    'M': 0.15,
    'N': 0.15,
    'NG': 0.15,
    'NX': 0.05,
    'P': 0.05,
    'Q': 0.075,
    'R': 0.05,
    'S': 0.1,
    'SH': 0.05,
    'T': 0.075,
    'TH': 0.1,
    'V': 0.05,
    'Y': 0.05,
    'W': 0.05,
    'WH': 0.05,
    'Z': 0.05,
    'ZH': 0.05
}


def add_space_between_events(events, connect=False):
    # import pdb; pdb.set_trace()
    new_events = []
    for i in range(1, len(events)):
        token_a, freq_a, start_time_a, end_time_a = events[i-1][-1]
        token_b, freq_b, start_time_b, end_time_b = events[i][0]

        if token_a in (' ', '') and len(events[i-1]) == 1:
            new_events.append(events[i-1])
        elif token_a not in (' ', '') and token_b not in (' ', ''):
            new_events.append(events[i-1])
            if connect:
                new_events.append([[' ', 0, end_time_a, start_time_b]])
            else:
                new_events.append([[' ', 0, end_time_a, end_time_a]])
        else:
            new_events.append(events[i-1])

    if new_events[-1][0][0] != ' ':
        new_events.append([[' ', 0, end_time_a, end_time_a]])
    new_events.append(events[-1])

    return new_events


def adjust_words(events):
    new_events = []

    for event in events:
        if len(event) == 1 and event[0][0] == ' ':
            new_events.append(event)
        else:
            for e in event:
                if e[0][0].isupper() or (len(new_events) == 0):
                    new_events.append([e])
                else:
                    new_events[-1].extend([e])

    return new_events


def adjust_extensions(events, phoneme_durations):
    if len(events) == 1:
        return events

    idx_last_vowel = None
    n_consonants_after_last_vowel = 0
    target_ids = np.arange(len(events))
    for i in range(len(events)):
        token = re.sub('[0-9{}]', '', events[i][0])
        if idx_last_vowel is None and token not in phoneme_durations:
            idx_last_vowel = i
            n_consonants_after_last_vowel = 0
        else:
            if token == '_' and not n_consonants_after_last_vowel:
                events[i][0] = events[idx_last_vowel][0]
            elif token == '_' and n_consonants_after_last_vowel:
                events[i][0] = events[idx_last_vowel][0]
                start = idx_last_vowel + 1
                target_ids[start:start+n_consonants_after_last_vowel] += 1
                target_ids[i] -= n_consonants_after_last_vowel
            elif token in phoneme_durations:
                n_consonants_after_last_vowel += 1
            else:
                n_consonants_after_last_vowel = 0
                idx_last_vowel = i

    new_events = [0] * len(events)
    for i in range(len(events)):
        new_events[target_ids[i]] = events[i]

    # adjust time of consonants that were repositioned
    for i in range(1, len(new_events)):
        if new_events[i][2] < new_events[i-1][2]:
            new_events[i][2] = new_events[i-1][2]
            new_events[i][3] = new_events[i-1][3]

    return new_events


def adjust_consonant_lengths(events, phoneme_durations):
    t_init = events[0][2]

    idx_last_vowel = None
    for i in range(len(events)):
        task = re.sub('[0-9{}]', '', events[i][0])
        if task in phoneme_durations:
            duration = phoneme_durations[task]
            if idx_last_vowel is None:  # consonant comes before any vowel
                events[i][2] = t_init
                events[i][3] = t_init + duration
            else:  # consonant comes after a vowel, must offset
                events[idx_last_vowel][3] -= duration
                for k in range(idx_last_vowel+1, i):
                    events[k][2] -= duration
                    events[k][3] -= duration
                events[i][2] = events[i-1][3]
                events[i][3] = events[i-1][3] + duration
        else:
            events[i][2] = t_init
            events[i][3] = events[i][3]
            t_init = events[i][3]
            idx_last_vowel = i
        t_init = events[i][3]

    return events


def adjust_consonants(events, phoneme_durations):
    if len(events) == 1:
        return events

    start = 0
    split_ids = []
    t_init = events[0][2]

    # get each substring group
    for i in range(1, len(events)):
        if events[i][2] != t_init:
            split_ids.append((start, i))
            start = i
            t_init = events[i][2]
    split_ids.append((start, len(events)))

    for (start, end) in split_ids:
        events[start:end] = adjust_consonant_lengths(
            events[start:end], phoneme_durations)

    return events


def adjust_event(event, hop_length=256, sampling_rate=22050):
    tokens, freq, start_time, end_time = event

    if tokens == ' ':
        return [event] if freq == 0 else [['_', freq, start_time, end_time]]

    return [[token, freq, start_time, end_time] for token in tokens]


def musicxml2score(filepath, bpm=60):
    track = {}
    beat_length_seconds = 60/bpm

    if isinstance(filepath, str):
        data = m21.converter.parse(filepath)
    else:
        data = filepath

    for i in range(len(data.parts)):
        part = data.parts[i].flat
        events = []
        for k in range(len(part.getElementsByClass(['Note','Rest']))):
            event = part.getElementsByClass(['Note','Rest'])[k]
            if isinstance(event, m21.note.Note):
                freq = event.pitch.frequency
                token = event.lyrics[0].text if len(event.lyrics) > 0 else ' '
                start_time = event.offset * beat_length_seconds
                end_time = start_time + event.duration.quarterLength * beat_length_seconds
                event = [token, freq, start_time, end_time]
            elif isinstance(event, m21.note.Rest):
                freq = 0
                token = ' '
                start_time = event.offset * beat_length_seconds
                end_time = start_time + event.duration.quarterLength * beat_length_seconds
                event = [token, freq, start_time, end_time]

            if token == '_':
                raise Exception("Unexpected token {}".format(token))

            if len(events) == 0:
                events.append(event)
            else:
                if token == ' ':
                    if freq == 0:
                        if events[-1][1] == 0:
                            events[-1][3] = end_time
                        else:
                            events.append(event)
                    elif freq == events[-1][1]:  # is event duration extension ?
                        events[-1][-1] = end_time
                    else:  # must be different note on same syllable
                        events.append(event)
                else:
                    events.append(event)
        track[part.partName] = events
    return track


def track2events(track):
    events = []
    for e in track:
        events.extend(adjust_event(e))
    group_ids = [i for i in range(len(events))
                 if events[i][0] in [' '] or events[i][0].isupper()]

    events_grouped = []
    for i in range(1, len(group_ids)):
        start, end = group_ids[i-1], group_ids[i]
        events_grouped.append(events[start:end])

    if events[-1][0] != ' ':
        events_grouped.append(events[group_ids[-1]:])

    return events_grouped

def event2alignment(events, hop_length=256, sampling_rate=22050):
    frame_length = float(hop_length) / float(sampling_rate)

    n_frames = int(events[-1][-1][-1] / frame_length)
    n_tokens = np.sum([len(e) for e in events])
    alignment = np.zeros((n_tokens, n_frames))

    cur_event = -1
    for event in events:
        for i in range(len(event)):
            if len(event) == 1 or cur_event == -1 or event[i][0] != event[i-1][0]:
                cur_event += 1
            token, freq, start_time, end_time = event[i]
            alignment[cur_event, int(start_time/frame_length):int(end_time/frame_length)] = 1

    return alignment[:cur_event+1]


def event2f0(events, hop_length=256, sampling_rate=22050):
    frame_length = float(hop_length) / float(sampling_rate)
    n_frames = int(events[-1][-1][-1] / frame_length)
    f0s = np.zeros((1, n_frames))

    for event in events:
        for i in range(len(event)):
            token, freq, start_time, end_time = event[i]
            f0s[0, int(start_time/frame_length):int(end_time/frame_length)] = freq

    return f0s


def event2text(events, convert_stress, cmudict=None):
    text_clean = ''
    for event in events:
        for i in range(len(event)):
            if i > 0 and event[i][0] == event[i-1][0]:
                continue
            if event[i][0] == ' ' and len(event) > 1:
                if text_clean[-1] != "}":
                    text_clean = text_clean[:-1] + '} {'
                else:
                    text_clean += ' {'
            else:
                if event[i][0][-1] in ('}', ' '):
                    text_clean += event[i][0]
                else:
                    text_clean += event[i][0] + ' '

    if convert_stress:
        text_clean = re.sub('[0-9]', '1', text_clean)

    text_encoded = text_to_sequence(text_clean, [], cmudict)
    return text_encoded, text_clean


def remove_excess_frames(alignment, f0s):
    excess_frames = np.sum(alignment.sum(0) == 0)
    alignment = alignment[:, :-excess_frames] if excess_frames > 0 else alignment
    f0s = f0s[:, :-excess_frames] if excess_frames > 0 else f0s
    return alignment, f0s

In [30]:
import ipdb

In [72]:
def get_data_from_musicxml(filepath, bpm, phoneme_durations=None,
                           convert_stress=False):
    if phoneme_durations is None:
        phoneme_durations = PHONEMEDURATION

    score = musicxml2score(filepath, bpm)
    data = {}
    for k, v in score.items():
        # ignore empty tracks
        if len(v) == 1 and v[0][0] == ' ':
            continue
        # import pdb; pdb.set_trace()

        events = track2events(v)
        events = adjust_words(events)

        events_arpabet = [events2eventsarpabet(e) for e in events]
        # make adjustments
        events_arpabet = [adjust_extensions(e, phoneme_durations)
                          for e in events_arpabet]
        events_arpabet = [adjust_consonants(e, phoneme_durations)
                          for e in events_arpabet]
        events_arpabet = add_space_between_events(events_arpabet)

        # convert data to alignment, f0 and text encoded
        alignment = event2alignment(events_arpabet)
        f0s = event2f0(events_arpabet)
        alignment, f0s = remove_excess_frames(alignment, f0s)
        text_encoded, text_clean = event2text(events_arpabet, convert_stress)

        # convert data to torch
        alignment = torch.from_numpy(alignment).permute(1, 0)[:, None].float()
        f0s = torch.from_numpy(f0s)[None].float()
        text_encoded = torch.LongTensor(text_encoded)[None]
        data[k] = {'rhythm': alignment,
                   'pitch_contour': f0s,
                   'text_encoded': text_encoded}

    return data

def events2eventsarpabet(event):
    print('event: ' + str(event))
    if event[0][0] == ' ':
        return event

    # get word and word arpabet
    word = ''.join([e[0] for e in event if e[0] not in('_', ' ')])
    word_arpabet = get_arpabet(word, CMUDICT)
    if word_arpabet[0] != '{':
        return event

    word_arpabet = word_arpabet.split()
    # import pdb; pdb.set_trace()
    # align tokens to arpabet
    i, k = 0, 0
    new_events = []
#     ipdb.set_trace(context=5)
    while i < len(event) and k < len(word_arpabet):
        # single token
        token_a, freq_a, start_time_a, end_time_a = event[i]

        if token_a == ' ':
            new_events.append([token_a, freq_a, start_time_a, end_time_a])
            i += 1
            continue

        if token_a == '_':
            new_events.append([token_a, freq_a, start_time_a, end_time_a])
            i += 1
            continue

        # two tokens
        if i < len(event) - 1:
            j = i + 1
            token_b, freq_b, start_time_b, end_time_b = event[j]
            between_events = []
            while j < len(event) and event[j][0] == '_':
                between_events.append([token_b, freq_b, start_time_b, end_time_b])
                j += 1
                if j < len(event):
                    token_b, freq_b, start_time_b, end_time_b = event[j]

            token_compound_2 = (token_a + token_b).lower()

        # single arpabet
        arpabet = re.sub('[0-9{}]', '', word_arpabet[k])

        if k < len(word_arpabet) - 1:
            arpabet_compound_2 = ''.join(word_arpabet[k:k+2])
            arpabet_compound_2 = re.sub('[0-9{}]', '', arpabet_compound_2)

        if i < len(event) - 1 and token_compound_2 in PHONEME2GRAPHEME[arpabet]:
            new_events.append([word_arpabet[k], freq_a, start_time_a, end_time_a])
            if len(between_events):
                new_events.extend(between_events)
            if start_time_a != start_time_b:
                new_events.append([word_arpabet[k], freq_b, start_time_b, end_time_b])
            i += 2 + len(between_events)
            k += 1
        elif token_a.lower() in PHONEME2GRAPHEME[arpabet]:
            new_events.append([word_arpabet[k], freq_a, start_time_a, end_time_a])
            i += 1
            k += 1
        elif arpabet_compound_2 in PHONEME2GRAPHEME and token_a.lower() in PHONEME2GRAPHEME[arpabet_compound_2]:
            new_events.append([word_arpabet[k], freq_a, start_time_a, end_time_a])
            new_events.append([word_arpabet[k+1], freq_a, start_time_a, end_time_a])
            i += 1
            k += 2
        else:
            k += 1

    # add extensions and pauses at end of words
    while i < len(event):
        token_a, freq_a, start_time_a, end_time_a = event[i]

        if token_a in (' ', '_'):
            new_events.append([token_a, freq_a, start_time_a, end_time_a])
        i += 1

    print('new_events: ' + str(new_events))
    print(len(new_events))
    return new_events

In [73]:
data = get_data_from_musicxml(score_error, 60, convert_stress=True)

event: [['T', 293.66476791740746, 0.0, 0.5], ['h', 293.66476791740746, 0.0, 0.5], ['i', 293.66476791740746, 0.0, 0.5], ['n', 293.66476791740746, 0.0, 0.5], ['k', 293.66476791740746, 0.0, 0.5]]
new_events: [['{TH', 293.66476791740746, 0.0, 0.5], ['IH1', 293.66476791740746, 0.0, 0.5], ['NG', 293.66476791740746, 0.0, 0.5], ['K}', 293.66476791740746, 0.0, 0.5]]
4
event: [['G', 391.99543598174927, 0.5, 1.0], ['r', 391.99543598174927, 0.5, 1.0], ['a', 391.99543598174927, 0.5, 1.0], ['n', 391.99543598174927, 0.5, 1.0], ['d', 391.99543598174927, 0.5, 1.0], ['f', 369.99442271163434, 1.0, 1.25], ['a', 369.99442271163434, 1.0, 1.25], ['t', 391.99543598174927, 1.25, 1.5], ['h', 391.99543598174927, 1.25, 1.5], ['e', 391.99543598174927, 1.25, 1.5], ['r', 391.99543598174927, 1.25, 1.5]]
new_events: [['{G', 391.99543598174927, 0.5, 1.0], ['R', 391.99543598174927, 0.5, 1.0], ['AE1', 391.99543598174927, 0.5, 1.0], ['N', 391.99543598174927, 0.5, 1.0], ['D', 391.99543598174927, 0.5, 1.0], ['F', 369.994422

In [None]:
data = get_data_from_musicxml(score2, 60, convert_stress=True)

In [39]:
score2 = create_score_test('Have', 'Grand-', '-fa-', '-ther-', 'Clock')
score_error = create_score_test('Think', 'Grand-', '-fa-', '-ther-', 'Clock')


In [71]:
get_arpabet('Think', CMUDICT).split()

['{TH', 'IH1', 'NG', 'K}']

In [49]:
PHONEME2GRAPHEME = {
    'AA': ['a', 'o', 'ah'],
    'AE': ['a', 'e'],
    'AH': ['u', 'e', 'a', 'h', 'o'],
    'AO': ['o', 'u', 'au'],
    'AW': ['ou', 'ow'],
    'AX': ['a'],
    'AXR': ['er'],
    'AY': ['i', 'y'],
    'EH': ['e', 'ae'],
    'EY': ['a', 'ai', 'ei', 'e', 'y'],
    'IH': ['i', 'e', 'y'],
    'IX': ['e', 'i'],
    'IY': ['ea', 'ey', 'y', 'i'],
    'OW': ['oa', 'o'],
    'OY': ['oy'],
    'UH': ['oo'],
    'UW': ['oo', 'u', 'o'],
    'UX': ['u'],
    'B': ['b'],
    'CH': ['ch', 'tch'],
    'D': ['d', 'e', 'de'],
    'DH': ['th'],
    'DX': ['tt'],
    'EL': ['le'],
    'EM': ['m'],
    'EN': ['on'],
    'ER': ['i', 'er'],
    'F': ['f'],
    'G': ['g'],
    'HH': ['h'],
    'JH': ['j'],
    'K': ['k', 'c', 'ch'],
    'KS': ['x'],
    'L': ['ll', 'l'],
    'M': ['m'],
    'N': ['n', 'gn'],
    'NG': ['ng', 'n'],
    'NX': ['nn'],
    'P': ['p'],
    'Q': ['-'],
    'R': ['wr', 'r'],
    'S': ['s', 'ce'],
    'SH': ['sh'],
    'T': ['t'],
    'TH': ['th'],
    'V': ['v', 'f', 'e'],
    'W': ['w'],
    'WH': ['wh'],
    'Y': ['y', 'j'],
    'Z': ['z', 's'],
    'ZH': ['s']
}

In [70]:

n_speakers_per_part = 4
frequency_scaling = 0.5
n_seconds = 2
pan = 0
audio_stereo = np.zeros((hparams.sampling_rate*n_seconds, 2), dtype=np.float32)
for i, (part, v) in enumerate(data.items()):
    rhythm = data[part]['rhythm'].cuda()
    pitch_contour = data[part]['pitch_contour'].cuda()
    text_encoded = data[part]['text_encoded'].cuda()
    
    speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
    print("{} MellotronID {}".format(part, speaker_id.item()))

    with torch.no_grad(): 
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour*frequency_scaling, rhythm))

        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
        audio = audio.cpu().numpy()
        audio = panner(audio, pan)
        audio_stereo[:audio.shape[0]] += audio            

audio_stereo = audio_stereo / np.max(np.abs(audio_stereo))
ipd.Audio([audio_stereo[:,0], audio_stereo[:,1]], rate=hparams.sampling_rate)

None MellotronID 96
