In [1]:
import os
import gdown
import librosa
import argparse
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
from matplotlib.colors import Normalize

import torch
import torch.nn as nn
import torch.nn.functional as F
import random


os.sys.path.append("../")
from synthesizer import Synthesizer
from datasets.text import Language
from melgan.generator import Generator

In [2]:
# Hyper-parameter files: global settings first, then the voice-conversion
# settings. OmegaConf.merge gives precedence to the later config (hp_vc).
hp_path = ['../config/global/default.yaml', '../config/vc/default.yaml']

hp_global = OmegaConf.load(hp_path[0])
hp_vc = OmegaConf.load(hp_path[1])
hp = OmegaConf.merge(hp_global, hp_vc)

# Synthesizer is constructed from an argparse-style namespace whose `config`
# attribute holds the list of YAML paths. Build that namespace directly:
# feeding a list through ArgumentParser.parse_args() only worked by accident,
# because parse_args() is specified for a list of *strings*.
hparams = argparse.Namespace(config=hp_path)

# Load on CPU so the notebook also runs on machines without a GPU.
checkpoint = torch.load('../chkpt/vc/cotatron_trained_vc/3ba23aeepoch=8.ckpt', map_location='cpu')

model = Synthesizer(hparams)#.cuda()
model.load_state_dict(checkpoint['state_dict'])
# Inference only: switch to eval mode and freeze the model.
model.eval()
model.freeze()

# Text front-end used below to turn a transcript into a symbol-id sequence.
lang = Language(hp.data.lang, hp.data.text_cleaners)

In [3]:
# Each metadata row has the form `<relative wav path>|<transcript>|<speaker id>`
# (see the pinned example below).
# Read explicitly as UTF-8: the transcripts contain non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8.
# NOTE(review): assumes the metadata file is stored as UTF-8 — confirm.
with open('../datasets/metadata/estonian_metadata.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

# Source utterance for the demo, pinned to one fixed row so the result does not
# change between runs. Use `random.choice(sentences)` instead to try a random row.
random_input = 'preprocessed_v2/UT-uudised-Mari/Mari/wavs/0001_art_901_lause_18-22k.wav|Kurb öelda, aga Eestis on haiglaid, kus ei tööta psühhiaatreid.|UT-uudised-Mari\n'

In [4]:
# Voice to convert into, drawn at random from the configured speakers.
# To pin a speaker instead, assign its id directly, e.g.
# target_id = 'UT-uudised-Mari'.
target_id = random.choice(hp.data.speakers)

# Reference recording of the target speaker: the first metadata row whose
# speaker column matches. The scan is lazy (stops at the first hit) and skips
# rows that do not have the three `path|text|speaker` fields.
target_relpath = next(
    (
        fields[0]
        for fields in (row.split('|') for row in sentences)
        if len(fields) >= 3 and fields[2].strip() == target_id
    ),
    None,
)
if target_relpath is None:
    raise ValueError(f"no metadata entry found for speaker {target_id!r}")
target_audio_sample = '../data/' + target_relpath
target_id

'e8b0aeb666620913a9cfd484b3bd2ef5d579f4ce53e26afb37b7fffd5b7a9a4c2620c93e789b6fc92c304481e2a1b5f3b4c5e0108a7c1256053fb183883b9510'

In [5]:
def _load_peak_normalized(path, peak=0.99):
    """Load `path` as mono audio at its native rate and scale it to `peak`.

    Returns a `(samples, sampling_rate)` tuple. An all-zero signal is returned
    unscaled instead of being divided by zero (which would yield NaNs).
    """
    samples, rate = librosa.load(path, sr=None, mono=True)
    max_abs = np.max(np.abs(samples))
    if max_abs > 0:
        samples *= (peak / max_abs)
    return samples, rate


# Metadata row layout: `<relative wav path>|<transcript>|<speaker id>`.
text = random_input.split('|')[1]
source_wavpath = '../data/' + random_input.split('|')[0]

# Transcript -> symbol ids, with a leading batch dimension of 1.
text_norm = torch.LongTensor(lang.text_to_sequence(text, hp.data.text_cleaners))
text_norm = text_norm.unsqueeze(0)#.cuda()

wav_source_original, sr = _load_peak_normalized(source_wavpath)
# The source audio is fed to the model, so its rate must match the config.
assert sr == hp.audio.sampling_rate, \
    f"source audio is {sr} Hz, expected {hp.audio.sampling_rate} Hz"

# The target sample is only used for listening; its native rate is kept in
# `sr_sample`.
wav_target_sample, sr_sample = _load_peak_normalized(target_audio_sample)

# Reshape the waveform to (1, 1, num_samples) before computing the mel.
wav_source = torch.from_numpy(wav_source_original).view(1, 1, -1)#.cuda()
mel_source = model.cotatron.audio2mel(wav_source)

# The model takes the speaker as its index in the configured speaker list.
target_speaker = torch.LongTensor([hp.data.speakers.index(target_id)])#.cuda()

with torch.no_grad():
    mel_s_t, alignment, residual = model.inference(text_norm, mel_source, target_speaker)

# Vocoder that turns the converted mel-spectrogram back into a waveform.
melgan = Generator(80)#.cuda()
melgan_ckpt = torch.load('melgan_libritts_g_only.ckpt', map_location='cpu')
melgan.load_state_dict(melgan_ckpt['model_g'])
melgan.eval()

with torch.no_grad():
    # No .detach() needed: nothing produced under no_grad tracks gradients.
    audio_s_t = melgan(mel_s_t).squeeze().cpu().numpy()

In [6]:
# Play the peak-normalised source utterance together with its transcript.
print("====== Source =======")
print(text)
# Use the rate the file was actually loaded at instead of a literal 22050.
ipd.Audio(wav_source_original, rate=sr)

Kurb öelda, aga Eestis on haiglaid, kus ei tööta psühhiaatreid.


In [7]:
# Play the converted audio: the vocoder output for the mel returned by
# model.inference() for the chosen target speaker.
print("====== Target =======")
# Use the configured rate instead of a literal 22050.
# NOTE(review): assumes the vocoder output is at hp.audio.sampling_rate — confirm.
ipd.Audio(audio_s_t, rate=hp.audio.sampling_rate)



In [8]:
# Play a real recording of the target speaker for comparison.
print("====== Target Voice =======")
# This file was loaded at its native rate (sr=None) and never checked against
# the config, so play it at the rate librosa reported rather than assuming 22050.
ipd.Audio(wav_target_sample, rate=sr_sample)

