In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
sys.path.append('waveglow/')

from itertools import cycle
import numpy as np
import scipy as sp
from scipy.io.wavfile import write
import pandas as pd
import librosa
import torch

from hparams import create_hparams
from model import Tacotron2, load_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

In [2]:
# Enable the autoreload extension in mode 2 so that edits to imported project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import os
# Expose only the GPU with index 4 to CUDA. This value is machine-specific;
# adjust it for the host the notebook runs on.
os.environ["CUDA_VISIBLE_DEVICES"]="4"

In [3]:
def panner(signal, angle):
    """Place a mono signal in a stereo field.

    `angle` is given in degrees. With `a` the angle in radians, the left
    channel is scaled by ``sqrt(2)/2 * (cos(a) - sin(a))`` and the right
    channel by ``sqrt(2)/2 * (cos(a) + sin(a))``: 0 gives equal gains,
    -45 sends everything to the left and +45 everything to the right.

    Returns the first slice of ``np.dstack((left, right))``, which for a
    1-D `signal` of length N is an (N, 2) array.
    """
    theta = np.radians(angle)
    scale = np.sqrt(2) / 2.0
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    left_channel = scale * (cos_t - sin_t) * signal
    right_channel = scale * (cos_t + sin_t) * signal
    stereo = np.dstack((left_channel, right_channel))
    return stereo[0]

In [4]:
def plot_mel_f0_alignment(mel_source, mel_outputs_postnet, f0s, alignments, figsize=(16, 16)):
    """Draw four stacked panels: source mel, predicted mel, pitch and alignment.

    Args:
        mel_source: 2-D array-like shown as an image in the first panel.
        mel_outputs_postnet: 2-D array-like shown in the second panel.
        f0s: sequence of pitch values, scattered against their index.
        alignments: 2-D array-like shown in the last panel.
        figsize: size of the figure passed to ``plt.subplots``.

    Returns:
        None. The figure is created on the current matplotlib backend.
    """
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    axes = axes.flatten()
    # 'lower' puts row 0 at the bottom of the image. The previous value
    # 'bottom' is not accepted by current matplotlib, which only allows
    # 'upper' or 'lower'.
    image_kwargs = dict(aspect='auto', origin='lower', interpolation='none')
    axes[0].imshow(mel_source, **image_kwargs)
    axes[1].imshow(mel_outputs_postnet, **image_kwargs)
    axes[2].scatter(range(len(f0s)), f0s, alpha=0.5, color='red', marker='.', s=1)
    axes[2].set_xlim(0, len(f0s))
    axes[3].imshow(alignments, **image_kwargs)
    axes[0].set_title("Source Mel")
    axes[1].set_title("Predicted Mel")
    axes[2].set_title("Source pitch contour")
    axes[3].set_title("Source rhythm")
    plt.tight_layout()

In [5]:
def load_mel(path):
    """Load an audio file and return its mel spectrogram as a CUDA tensor.

    The file is read with librosa at ``hparams.sampling_rate``, scaled by
    ``hparams.max_wav_value``, given a leading batch dimension and passed to
    the notebook-level ``stft.mel_spectrogram``.

    Args:
        path: path of the audio file to load.

    Returns:
        The mel spectrogram produced by ``stft.mel_spectrogram``, moved to the GPU.

    Raises:
        ValueError: if the sampling rate reported by librosa differs from
            ``hparams.sampling_rate``.
    """
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    if sampling_rate != hparams.sampling_rate:
        # Report the value that was actually compared (previously the message
        # printed stft.sampling_rate instead).
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, hparams.sampling_rate))
    audio = torch.from_numpy(audio)
    # NOTE(review): librosa presumably already returns floating-point samples,
    # so this extra division by max_wav_value may scale the signal down twice.
    # Left unchanged; confirm against how the model was trained.
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    # No torch.autograd.Variable wrapper: it is deprecated, and a tensor built
    # from a NumPy array already does not require gradients.
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cuda()
    return melspec

In [6]:
hparams = create_hparams()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [7]:
# Build the STFT / mel front end from the hyper-parameters. `load_mel` calls
# `stft.mel_spectrogram` on this object to turn audio into mel spectrograms.
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)

## Load Models

In [8]:
# Build the Mellotron model on the GPU in eval mode, then restore the weights
# stored under the checkpoint's 'state_dict' key.
checkpoint_path = "models/mellotron_libritts.pt"
mellotron = load_model(hparams).cuda().eval()
mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

<All keys matched successfully>

In [9]:
# The WaveGlow checkpoint holds the whole model object under its 'model' key
# (not just a state dict), so it is used directly after loading.
# NOTE(review): torch.load unpickles the file; only load checkpoints obtained
# from a trusted source.
waveglow_path = 'models/waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
# The denoiser is built from the vocoder and applied to WaveGlow's output in
# the synthesis cells.
denoiser = Denoiser(waveglow).cuda().eval()

## Setup dataloaders

In [10]:
# Pronunciation dictionary handed to text_to_sequence when encoding text.
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
# File list of example utterances; each entry of
# `dataloader.audiopaths_and_text` unpacks to (audio path, text, speaker id).
audio_paths = 'data/examples_filelist.txt'
dataloader = TextMelLoader(audio_paths, hparams)
# NOTE(review): the argument 1 is presumably the number of frames per decoder
# step — confirm in data_utils.
datacollate = TextMelCollate(1)

## Load data

In [11]:
# Index of the example utterance (within the example file list) to load.
file_idx = 0
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
# `[None, :]` / `[None]` add a leading batch dimension of size 1.
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()    
# Element 3 of a dataloader item is used as the pitch contour.
pitch_contour = dataloader[file_idx][3][None].cuda()
# `mel` is reused by the synthesis cells further down as the reference mel
# passed to mellotron.inference_noattention.
mel = load_mel(audio_path)
print(audio_path, text)

# load source data to obtain rhythm using tacotron 2 as a forced aligner
# NOTE: dataloader[file_idx] is evaluated a second time here.
x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

data/example1.wav exploring the expanses of space to keep our planet safe


In [13]:
ipd.Audio(audio_path, rate=hparams.sampling_rate)

## Define Speakers Set

In [12]:
# Lookup from LibriTTS speaker ID to the speaker index used by Mellotron,
# taken from the loader built on the training file list.
speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", hparams).speaker_ids
# The separator is a regular expression ("|" with optional surrounding
# spaces). It is written as a raw string because `\|` is not a valid Python
# escape sequence and triggers a warning in a normal string literal.
speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python', header=None, comment=';', sep=r' *\| *',
                       names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
# Speakers that are not in the lookup get -1 and are filtered out below.
speakers['MELLOTRON_ID'] = speakers['ID'].apply(lambda x: speaker_ids[x] if x in speaker_ids else -1)
# Endless, shuffled iterators over known speakers with more than 20 minutes of
# audio. NOTE: `.sample(frac=1)` is unseeded, so the order changes on every run.
female_speakers = cycle(
    speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
male_speakers = cycle(
    speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())

# Singing Voice from Music Score

In [16]:
# Convert the MusicXML score into per-part model inputs; the synthesis cell
# reads 'rhythm', 'pitch_contour' and 'text_encoded' for each part.
# NOTE(review): the second argument (90) is presumably the tempo in BPM — confirm
# in mellotron_utils.
data = get_data_from_musicxml('data/haendel_hallelujah.musicxml', 90, convert_stress=True)
# Per-part [low, high) range, in degrees, from which each voice's pan angle is
# drawn with np.random.randint before being passed to `panner`.
panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}

In [None]:
# Render every part of the score with several speakers and mix the voices
# into one stereo buffer.
n_speakers_per_part = 4
frequency_scaling = 0.4
n_seconds = 90
# Fixed-length stereo mix buffer: (samples, 2).
audio_stereo = np.zeros((hparams.sampling_rate*n_seconds, 2), dtype=np.float32)
for part, part_data in data.items():
    rhythm = part_data['rhythm'].cuda()
    pitch_contour = part_data['pitch_contour'].cuda()
    text_encoded = part_data['text_encoded'].cuda()

    for _ in range(n_speakers_per_part):
        # Random pan angle (degrees) inside the range configured for this part.
        pan = np.random.randint(panning[part][0], panning[part][1])
        if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
            speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
        else:
            speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
        print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour*frequency_scaling, rhythm))

            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()

        audio = panner(audio, pan)
        # Clip to the buffer length: a voice longer than n_seconds would
        # otherwise raise a shape-mismatch error on the in-place add.
        n_samples = min(audio.shape[0], audio_stereo.shape[0])
        audio_stereo[:n_samples] += audio[:n_samples]
#         write("{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)

# Peak-normalise the mix; skip the division when the buffer is all zeros so
# the output is not filled with NaNs.
peak = np.max(np.abs(audio_stereo))
if peak > 0:
    audio_stereo = audio_stereo / peak
ipd.Audio([audio_stereo[:,0], audio_stereo[:,1]], rate=hparams.sampling_rate)

In [108]:
#from mellotron_utils import *

In [20]:
import music21

In [18]:
s = music21.converter.parse('data/haendel_hallelujah.musicxml')

In [19]:
# Build `score1`: a new Score with a single Part that receives every Note
# element found in the first part of the parsed score `s`.
score1 = music21.stream.Score()

part = music21.stream.Part()
# The list() copy means the source stream is not being iterated while its
# notes are inserted into the new part.
for n in list(s.parts[0].flat.getElementsByClass('Note')):
    part.insert(n)
# Hand-written alternative (a four-note "Hallelujah" motif), kept disabled:
# note = music21.note.Note(50); note.quarterLength = 0.5; note.offset = 0.5; note.lyric = 'Hal-'; part.insert(note)
# note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-le-'; part.append(note)
# note = music21.note.Note(47); note.quarterLength = 0.5; note.lyric = '-lu-'; part.append(note)
# note = music21.note.Note(45); note.quarterLength = 0.5; note.lyric = '-jah'; part.append(note)

score1.insert(part)

In [20]:
# Build `score2`: one part repeating the four-note "Hal-le-lu-jah" motif four
# times, every note lasting half a quarter note.
score2 = music21.stream.Score()

part = music21.stream.Part()
# (MIDI pitch, lyric syllable) for each note of the motif.
motif = [(50, 'Hal-'), (45, '-le-'), (47, '-lu-'), (45, '-jah')]
for repetition in range(4):
    for position, (midi_pitch, syllable) in enumerate(motif):
        note = music21.note.Note(midi_pitch)
        note.quarterLength = 0.5
        if position == 0:
            # The first note of each repetition carries an explicit offset.
            note.offset = 0.5
        note.lyric = syllable
        if repetition == 0 and position == 0:
            # Only the very first note is inserted; all others are appended.
            part.insert(note)
        else:
            part.append(note)
score2.insert(part)

In [18]:
def create_score_test(one, two, three, four, five):
    """Build a one-part test score of four sung notes followed by a rest.

    The notes are D4 (0.5), G4 (0.5), F#4 (0.25) and G4 (0.25) quarter
    lengths, carrying the lyrics `one` to `four`; a 0.25 rest closes the
    phrase. `five` is accepted but currently unused, because the fifth (A4)
    note is disabled.

    Returns:
        A ``music21.stream.Score`` containing the single part.
    """
    melody = music21.stream.Part()

    # The opening note is inserted at offset 0; everything else is appended.
    opening = music21.note.Note('D4')
    opening.quarterLength = 0.5
    opening.offset = 0
    opening.lyric = one
    melody.insert(opening)

    following = (('G4', 0.5, two), ('F#4', 0.25, three), ('G4', 0.25, four))
    for pitch_name, length, syllable in following:
        sung = music21.note.Note(pitch_name)
        sung.quarterLength = length
        sung.lyric = syllable
        melody.append(sung)

    closing_rest = music21.note.Rest()
    closing_rest.quarterLength = 0.25
    melody.append(closing_rest)

    score = music21.stream.Score()
    score.insert(melody)
    return score

In [71]:
score2 = create_score_test('Have', 'Grand-', '-fa-', '-ther', 'Clock')


In [72]:
data = get_data_from_musicxml(score2, 60, convert_stress=True)

In [80]:
score2 = create_score_test('Large', 'Grand-', '-fa-', '-ther', 'Clock')
data = get_data_from_musicxml(score2, 60, convert_stress=True)

In [190]:
from common.object import G2PSingleton
G2PSingleton.to_phonemes('Hummingbird')

['HH', 'AH1', 'M', 'IH0', 'NG', 'B', 'ER2', 'D']

In [152]:
import enchant
from hyphen import Hyphenator
from essential_generators import DocumentGenerator
import re
import music21
import pyphen


# The return value of this call is discarded, so the line has no effect on
# the objects created below.
pyphen.language_fallback('nl_NL_variant1')

# NOTE(review): this hyphenator uses Dutch ('nl_NL') patterns, while the other
# tools in this cell are configured for en_US — confirm that is intended.
phen = pyphen.Pyphen(lang='nl_NL')

# English hyphenator, random sentence generator and English spell-check
# dictionary. `gen`, `phen` and `d` are later passed to create_score_test_2.
h_en = Hyphenator('en_US')
gen = DocumentGenerator()
d = enchant.Dict("en_US")

# Quick check of the dictionary on a non-word.
d.check("K'm")

False

In [182]:
def create_score_test_2(sentence_generator, syll_splitter, dictionary_check):
    """Build a one-part score from a random sentence, one note per syllable.

    A sentence is drawn from `sentence_generator`, title-cased and reduced to
    letters and spaces. Each word is split into syllables with
    `syll_splitter`, and every syllable becomes a half-quarter-length note
    with a random pitch. Hyphens are added to the lyric on the sides where the
    syllable joins a neighbouring syllable of the same word.

    Args:
        sentence_generator: object with a ``sentence()`` method returning a string.
        syll_splitter: object with an ``inserted(word)`` method returning the
            word with ``-`` between syllables.
        dictionary_check: accepted for interface compatibility; currently unused.

    Returns:
        A ``music21.stream.Score`` containing the generated part.
    """
    score2 = music21.stream.Score()
    part = music21.stream.Part()

    # Use the generator that was passed in (the notebook-level `gen` was
    # previously read here, silently ignoring the argument).
    string = sentence_generator.sentence().title()
    string = re.sub('[^A-Za-z]+', ' ', string)
    print("Generated String: {}".format(string))
    for word in string.split():
        sylls = syll_splitter.inserted(word).split('-')
        last_idx = len(sylls) - 1
        for idx, syl in enumerate(sylls):
            # Random pitch number in [40, 64).
            pitch = np.random.randint(40, 40 + 24)
            note = music21.note.Note(pitch)
            note.quarterLength = 0.5
            leading = '-' if idx != 0 else ''
            trailing = '-' if idx != last_idx else ''
            note.lyric = leading + syl + trailing
            part.append(note)

    score2.insert(part)
    return score2

In [202]:
score3 = create_score_test_2(gen, phen, d)

Generated String: Genes The Intersections With A Different Bias Over What Distance 


In [203]:
score3.parts[0].lyrics()

{1: [<music21.note.Lyric number=1 syllabic=begin text="Ge">,
  <music21.note.Lyric number=1 syllabic=end text="nes">,
  <music21.note.Lyric number=1 syllabic=single text="The">,
  <music21.note.Lyric number=1 syllabic=begin text="In">,
  <music21.note.Lyric number=1 syllabic=middle text="ter">,
  <music21.note.Lyric number=1 syllabic=middle text="sec">,
  <music21.note.Lyric number=1 syllabic=middle text="ti">,
  <music21.note.Lyric number=1 syllabic=end text="ons">,
  <music21.note.Lyric number=1 syllabic=single text="With">,
  <music21.note.Lyric number=1 syllabic=single text="A">,
  <music21.note.Lyric number=1 syllabic=begin text="Dif">,
  <music21.note.Lyric number=1 syllabic=middle text="fe">,
  <music21.note.Lyric number=1 syllabic=end text="rent">,
  <music21.note.Lyric number=1 syllabic=begin text="Bi">,
  <music21.note.Lyric number=1 syllabic=end text="as">,
  <music21.note.Lyric number=1 syllabic=single text="Over">,
  <music21.note.Lyric number=1 syllabic=single text="What"

In [258]:
['JH', 'EH1'] in G2PSingleton.to_phonemes('Gen')

False

In [247]:
data = get_data_from_musicxml(score3, 90, convert_stress=True)

['JH EH1 N']
Missing phoneme - token: G, token_compound_2: ge, arpabet: {JH, word: Genes, word_arpabet: ['{JH', 'IY1', 'N', 'Z}']
Missing phoneme - token: e, token_compound_2: en, arpabet: IY1, word: Genes, word_arpabet: ['{JH', 'IY1', 'N', 'Z}']
Missing phoneme - token: e, token_compound_2: es, arpabet: Z}, word: Genes, word_arpabet: ['{JH', 'IY1', 'N', 'Z}']
['JH EH1 N']
['JH EH1 N']
Missing phoneme - token: t, token_compound_2: ti, arpabet: SH, word: Intersections, word_arpabet: ['{IH2', 'N', 'T', 'ER0', 'S', 'EH1', 'K', 'SH', 'AH0', 'N', 'Z}']
Missing phoneme - token: i, token_compound_2: io, arpabet: AH0, word: Intersections, word_arpabet: ['{IH2', 'N', 'T', 'ER0', 'S', 'EH1', 'K', 'SH', 'AH0', 'N', 'Z}']
Missing phoneme - token: o, token_compound_2: on, arpabet: N, word: Intersections, word_arpabet: ['{IH2', 'N', 'T', 'ER0', 'S', 'EH1', 'K', 'SH', 'AH0', 'N', 'Z}']
Missing phoneme - token: n, token_compound_2: ns, arpabet: Z}, word: Intersections, word_arpabet: ['{IH2', 'N', 'T',

In [13]:
data = get_data_from_musicxml('data/song_en_short.musicxml', 90, convert_stress=True)

In [200]:
# Render every part of `data` once with a male speaker, centred in the stereo
# field, and mix the result into one buffer.
n_speakers_per_part = 4
frequency_scaling = 0.5
n_seconds = 90
pan = 0
# Fixed-length stereo mix buffer: (samples, 2).
audio_stereo = np.zeros((hparams.sampling_rate*n_seconds, 2), dtype=np.float32)
for part, part_data in data.items():
    rhythm = part_data['rhythm'].cuda()
    pitch_contour = part_data['pitch_contour'].cuda()
    text_encoded = part_data['text_encoded'].cuda()

    speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
    print("{} MellotronID {}".format(part, speaker_id.item()))

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour*frequency_scaling, rhythm))

        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
        audio = audio.cpu().numpy()

    audio = panner(audio, pan)
    # Clip to the buffer length: a voice longer than n_seconds would otherwise
    # raise a shape-mismatch error on the in-place add.
    n_samples = min(audio.shape[0], audio_stereo.shape[0])
    audio_stereo[:n_samples] += audio[:n_samples]

# Peak-normalise the mix; skip the division when the buffer is all zeros so
# the output is not filled with NaNs.
peak = np.max(np.abs(audio_stereo))
if peak > 0:
    audio_stereo = audio_stereo / peak
ipd.Audio([audio_stereo[:,0], audio_stereo[:,1]], rate=hparams.sampling_rate)

None MellotronID 99


RuntimeError: invalid argument 6: wrong matrix size at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/generic/THCTensorMathBlas.cu:558

In [96]:
good_female_idx = '70'