In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import IPython.display as ipd

import sys
sys.path.append('waveglow/')

from itertools import cycle
import numpy as np
import scipy as sp
from scipy.io.wavfile import write
import pandas as pd
import librosa
import torch

from hparams import create_hparams
from model import Tacotron2, load_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

In [3]:
import os
# Restrict this process to the GPU with index 5.
# NOTE(review): hard-coded device index — adjust for the machine the notebook runs on.
os.environ["CUDA_VISIBLE_DEVICES"]="5"

In [4]:
def panner(signal, angle):
    """Pan a signal into a (left, right) channel pair.

    `angle` is given in degrees. Each channel is the input scaled by
    sqrt(2)/2 * (cos(angle) -/+ sin(angle)), so an angle of 0 applies the
    same gain to both channels.

    Returns the first slice of the depth-stacked (left, right) pair; for a
    1-D input of length N this is an array of shape (N, 2).
    """
    theta = np.radians(angle)
    cos_theta, sin_theta = np.cos(theta), np.sin(theta)
    half_sqrt2 = np.sqrt(2) / 2.0

    left_channel = half_sqrt2 * (cos_theta - sin_theta) * signal
    right_channel = half_sqrt2 * (cos_theta + sin_theta) * signal

    stacked = np.dstack((left_channel, right_channel))
    return stacked[0]

In [28]:
def load_mel(path):
    """Load an audio file and return its mel spectrogram as a CUDA tensor.

    The file is read with librosa at `hparams.sampling_rate`, scaled by
    `hparams.max_wav_value`, given a leading batch dimension and passed
    through a `TacotronSTFT` built from the same hyperparameters.

    Args:
        path: path of the audio file to load.

    Returns:
        The output of `stft.mel_spectrogram`, moved to the GPU.

    Raises:
        ValueError: if the loaded sampling rate differs from
            `hparams.sampling_rate`.
    """
    hparams = create_hparams()

    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    if sampling_rate != hparams.sampling_rate:
        # Report the same target value that the check above compares against.
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, hparams.sampling_rate))

    audio = torch.from_numpy(audio)
    # NOTE(review): librosa returns floating-point samples; dividing them again by
    # max_wav_value may scale the signal far below what the STFT expects. Confirm
    # against how the training data was normalized before changing this.
    audio_norm = (audio / hparams.max_wav_value).unsqueeze(0)

    # No torch.autograd.Variable wrapper: it is deprecated, and a tensor created
    # by from_numpy does not require gradients.
    melspec = stft.mel_spectrogram(audio_norm)
    return melspec.cuda()

## Load Models

In [7]:
# Notebook-level hyperparameters. The cells below (model loading, dataloaders,
# audio playback) all read the global `hparams`, so it has to be created here
# for the notebook to run on a fresh kernel.
hparams = create_hparams()

# Build the model on the GPU in eval mode, then restore the trained weights
# stored under the checkpoint's 'state_dict' key.
checkpoint_path = "models/mellotron_libritts.pt"
mellotron = load_model(hparams).cuda().eval()
mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

<All keys matched successfully>

In [4]:
# Load the vocoder object stored under the checkpoint's 'model' key, move it to
# the GPU and switch it to eval mode.
# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# that come from a trusted source.
waveglow_path = 'models/waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
# Denoiser built from the vocoder; the synthesis cell applies it to the
# vocoder's output.
denoiser = Denoiser(waveglow).cuda().eval()

In [None]:
# Scratch note (not runnable code): 'n_group', 'n_early_every', 'n_early_size', and 'WN_config'

In [17]:
# Debug: dump the attributes of the first entry of the vocoder's `WN` list.
vars(waveglow.WN[0])

{'_backend': None,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('in_layers',
               ModuleList(
                 (0): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
                 (1): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
                 (2): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
                 (3): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
                 (4): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
                 (5): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
                 (6): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(64,

In [6]:
# Export only the vocoder weights (state_dict). The path is relative to the
# working directory, like every other checkpoint path in this notebook,
# instead of pointing into one user's home directory.
torch.save(waveglow.state_dict(), 'models/waveglow.pt')

## Setup dataloaders

In [30]:
# Pronunciation dictionary passed to text_to_sequence when text is encoded below.
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
# File list of the example utterances.
audio_paths = 'data/examples_filelist.txt'
# NOTE(review): relies on a notebook-level `hparams` already existing in the kernel.
dataloader = TextMelLoader(audio_paths, hparams)
# Collate callable used below to turn a list of dataset items into a batch.
# NOTE(review): the meaning of the argument 1 is not visible here — confirm in data_utils.
datacollate = TextMelCollate(1)

## Load data

In [31]:
# Index of the example in the file list to use as the reference utterance.
file_idx = 0
# Each file-list entry unpacks to (audio path, text, speaker id).
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
# Get the encoded text, the pitch contour and the mel for GST, each with a
# leading batch dimension and on the GPU.
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()    
# Element 3 of a dataset item is used as the pitch contour.
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)
print(audio_path, text)

# Load the source data to obtain rhythm, using Tacotron 2 as a forced aligner.
x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

data/example1.wav exploring the expanses of space to keep our planet safe


In [None]:
# Play back the reference recording selected above.
ipd.Audio(audio_path, rate=hparams.sampling_rate)

## Define Speakers Set

In [9]:
# Lookup keyed by the speaker 'ID' values of the metadata table below.
speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", hparams).speaker_ids

# Speaker metadata table: '|'-separated with optional surrounding spaces; lines
# starting with ';' are comments. The separator is a raw string so that the
# regex `\|` is not read as an (invalid) string escape sequence.
speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python', header=None, comment=';',
                       sep=r' *\| *',
                       names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])

# -1 marks speakers that have no entry in `speaker_ids`.
speakers['MELLOTRON_ID'] = speakers['ID'].apply(lambda x: speaker_ids[x] if x in speaker_ids else -1)

# Endless iterators over the shuffled IDs of known speakers with more than
# 20 minutes of audio, split by sex.
# NOTE(review): sample(frac=1) is unseeded, so the order differs between runs.
female_speakers = cycle(
    speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
male_speakers = cycle(
    speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())

# Singing Voice from Music Score

In [108]:
#from mellotron_utils import *

In [15]:
import music21

In [14]:
import enchant
from hyphen import Hyphenator
from essential_generators import DocumentGenerator

import pyphen
from mellotron_utils import *

# NOTE(review): the return value of this call is discarded.
pyphen.language_fallback('nl_NL_variant1')

# NOTE(review): this hyphenator is built for 'nl_NL' while the other helpers
# below use 'en_US' — confirm this is intended for English lyrics.
phen = pyphen.Pyphen(lang='nl_NL')

# Second hyphenator, sentence generator and spell-check dictionary used by the
# score-generation helpers.
h_en = Hyphenator('en_US')
gen = DocumentGenerator()
d = enchant.Dict("en_US")

# Debug probe of the spell-check dictionary.
d.check("K'm")

False

In [16]:
def create_score_test_2(sentence_generator, dictionary_check, string=''):
    """Build a one-part music21 score with one random-pitch note per syllable.

    Args:
        sentence_generator: object with a `sentence()` method, used to produce
            text when `string` is empty. If it has no such method, the
            notebook-level `gen` is used instead (the previous behaviour).
        dictionary_check: currently unused; kept for interface compatibility.
        string: text to set to music. When empty, a sentence is generated,
            title-cased and stripped of everything but ASCII letters.

    Returns:
        A `music21.stream.Score` containing a single part. Every syllable
        becomes a note of quarterLength 0.5 with a random integer pitch in
        [52, 76); lyrics carry '-' markers where a word continues.
    """
    # Local import: the notebook only imports `re` in a later cell.
    import re

    score2 = music21.stream.Score()
    part = music21.stream.Part()

    if string == '':
        generator = sentence_generator if hasattr(sentence_generator, 'sentence') else gen
        string = generator.sentence().title()
        string = re.sub('[^A-Za-z]+', ' ', string)

    print("Generated String: {}".format(string))
    for word in string.split():
        pyphen_sylls = phen.inserted(word).split('-')
        hyphen_sylls = h_en.syllables(word)

        # Words missing from the pronunciation dictionary have no reference
        # syllable count; fall back to the pyphen split instead of crashing.
        try:
            num_sylls = nsyl(word)[0]
        except (KeyError, IndexError):
            num_sylls = None
        print(word + ' num syll: ' + str(num_sylls))

        # Prefer the pyphen split; switch to the other hyphenator only when
        # pyphen disagrees with the reference count and the other one matches.
        use_hyphen = (num_sylls is not None
                      and num_sylls != len(pyphen_sylls)
                      and num_sylls == len(hyphen_sylls))
        sylls = hyphen_sylls if use_hyphen else pyphen_sylls

        if not sylls:
            sylls = [word]

        last_idx = len(sylls) - 1
        for idx, syl in enumerate(sylls):
            pitch = np.random.randint(40+12, 40 + 24 +12)
            note = music21.note.Note(pitch)
            note.quarterLength = 0.5
            note.lyric = ('-' if idx != 0 else '') + syl + ('-' if idx != last_idx else '')
            part.append(note)

    score2.insert(part)
    return score2

In [17]:
# Imported under an alias so the `cmudict` module imported from `text` at the
# top of the notebook (used to build `arpabet_dict`) is not shadowed.
from nltk.corpus import cmudict as nltk_cmudict

# Maps a lowercase word to its list of pronunciations.
# NOTE(review): this rebinds the notebook-level name `d`, which an earlier cell
# assigns to an enchant dictionary.
d = nltk_cmudict.dict()


def nsyl(word):
    """Return the syllable count of each pronunciation of `word`.

    A syllable is counted for every phoneme whose last character is a digit.
    Raises KeyError when the lowercased word is not in the dictionary.
    """
    return [sum(1 for phoneme in pron if phoneme[-1].isdigit())
            for pron in d[word.lower()]]

In [26]:
def word_to_syllable_phonemes(word, add_bracket = False):
    """Return the per-syllable phoneme strings of `word`.

    The word (with trailing whitespace removed) is passed to
    `syllable3.generate`; only the first generated result is used. When
    `add_bracket` is true, '{' is prepended to the first syllable and '}' is
    appended to the last one.
    """
    first_result = list(syllable3.generate(word.rstrip()))[0]

    phoneme_groups = []
    for syllable in first_result:
        phoneme_groups.append(syllable.phonemes)

    if not add_bracket:
        return phoneme_groups

    phoneme_groups[0] = '{' + phoneme_groups[0]
    phoneme_groups[-1] = phoneme_groups[-1] + '}'
    return phoneme_groups

In [24]:
def create_score_test_3(string=''):
    """Build a one-part music21 score from the phoneme syllables of `string`.

    Each whitespace-separated word is split with
    `word_to_syllable_phonemes(word, True)`. Every syllable becomes a note of
    quarterLength 0.5 with a random integer pitch in [52, 76); its lyric gets
    a leading '-' unless it is the word's first syllable and a trailing '-'
    unless it is the last.

    Returns the `music21.stream.Score` holding the single part.
    """
    score2 = music21.stream.Score()
    part = music21.stream.Part()

    print("Generated String: {}".format(string))

    words = string.split()
    for word in words:
        syllables = word_to_syllable_phonemes(word, True)
        final_index = len(syllables) - 1

        for position, syllable in enumerate(syllables):
            random_pitch = np.random.randint(40+12, 40 + 24 +12)
            new_note = music21.note.Note(random_pitch)
            new_note.quarterLength = 0.5

            leading = '' if position == 0 else '-'
            trailing = '' if position == final_index else '-'
            new_note.lyric = leading + syllable + trailing

            part.append(new_note)

    score2.insert(part)
    return score2

In [21]:
# Earlier candidate lyric, kept for reference (it was immediately overwritten):
# lyr = 'Saw you there Magic madness heaven sins'
lyr = 'Saw you there and I thought'

In [15]:
from PyLyrics import *
from syllabify import syllable3
import re
import music21

In [16]:
# Index of the lyric line (after splitting the lyrics on newlines) to sing.
index11 = 8

In [17]:
# Fetch the song lyrics, take line `index11`, and replace every non-word
# character with a space; the trailing `; lyr` displays the result.
# NOTE(review): presumably a network request — the result can change or fail between runs.
lyr = re.sub(r'[^\w]', ' ',PyLyrics.getLyrics('Taylor Swift','Blank Space').split('\n')[index11]); lyr



  s = BeautifulSoup(r.text)


''

In [42]:
# Advance to the next lyric line.
# NOTE(review): not idempotent — every re-run of this cell moves the index further.
index11 += 1

Exception ignored in: <function Wave_write.__del__ at 0x7fafa5603200>
Traceback (most recent call last):
  File "/home/hao/anaconda3/envs/torch/lib/python3.7/wave.py", line 327, in __del__
    self.close()
  File "/home/hao/anaconda3/envs/torch/lib/python3.7/wave.py", line 445, in close
    self._ensure_header_written(0)
  File "/home/hao/anaconda3/envs/torch/lib/python3.7/wave.py", line 468, in _ensure_header_written
    self._write_header(datasize)
  File "/home/hao/anaconda3/envs/torch/lib/python3.7/wave.py", line 485, in _write_header
    self._sampwidth * 8, b'data'))
struct.error: ushort format requires 0 <= number <= (0x7fff * 2 + 1)


In [43]:
# Fetch the song lyrics, take line `index11`, and replace each run of characters
# other than ASCII letters, apostrophes and spaces with a single space; the
# trailing `; lyr` displays the result.
# NOTE(review): presumably a network request — the result can change or fail between runs.
lyr = re.sub("[^a-zA-Z' ]+", ' ',PyLyrics.getLyrics('Taylor Swift','Blank Space').split('\n')[index11]); lyr

'I can read you like a magazine'

In [36]:
# Debug: shape of the first 10000 rows of `audio_stereo`.
audio_stereo[:10000].shape

(10000, 2)

In [33]:
# Allocate a zero-filled float32 buffer with sampling_rate * n_seconds rows and 2 columns.
# NOTE(review): `hparams` and `n_seconds` must already exist in the kernel;
# `n_seconds` is only assigned in the synthesis cell further down.
audio_stereo = np.zeros((hparams.sampling_rate*n_seconds, 2), dtype=np.float32)


In [39]:
# Debug: shape of the current `audio` array.
audio.shape

(44032, 2)

In [46]:
# Debug: sampling rate used for synthesis and playback.
hparams.sampling_rate

22050

In [45]:
# Turn the lyric into a random-pitch score and extract, per part, the rhythm,
# pitch contour and encoded text that the model consumes.
score3 = create_score_test_3(lyr)
data = get_data_from_musicxml(score3, 90, convert_stress=True, method1=False)

n_speakers_per_part = 4  # not used in this cell
frequency_scaling = 0.4  # factor applied to the pitch contour before inference
n_seconds = 10           # not used in this cell; read by the buffer-allocation cell

# The same speaker is used for every part, so build the tensor once.
speaker_id = torch.LongTensor([70]).cuda()

for part, part_data in data.items():
    rhythm = part_data['rhythm'].cuda()
    pitch_contour = part_data['pitch_contour'].cuda()
    text_encoded = part_data['text_encoded'].cuda()

    print("{} MellotronID {}".format(part, speaker_id.item()))

    with torch.no_grad():
        # `mel` is the reference mel spectrogram loaded in the "Load data" cell.
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour*frequency_scaling, rhythm))

        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
        audio = audio.cpu().numpy()
        # Holds the waveform of the most recent part only; earlier parts are overwritten.
        audio_stereo = audio

# Peak-normalize, skipping the division for an all-zero signal so the result
# does not turn into NaNs.
peak = np.max(np.abs(audio_stereo))
if peak > 0:
    audio_stereo = audio_stereo / peak
ipd.Audio(audio_stereo, rate=hparams.sampling_rate)

Generated String: I can read you like a magazine
None MellotronID 70


In [88]:
# NOTE(review): stored as a string here, whereas the synthesis cell passes the
# integer 70 to torch.LongTensor — confirm which form downstream code expects.
good_female_idx = '70'