# Generating audio sampels from model

In [7]:
import os
import gdown
import librosa
import argparse
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
from matplotlib.colors import Normalize
from collections import defaultdict 

import torch
import torch.nn as nn
import torch.nn.functional as F
import random


os.sys.path.append("../")
from synthesizer import Synthesizer
from datasets.text import Language
from melgan.generator import Generator

In [8]:
sample_path = "../audioSampels/"

hp_path = ['../config/global/default.yaml', '../config/vc/default.yaml']

hp_global = OmegaConf.load(hp_path[0])
hp_vc = OmegaConf.load(hp_path[1])
hp = OmegaConf.merge(hp_global, hp_vc)

parser = argparse.ArgumentParser()
parser.add_argument('--config')
hparams = parser.parse_args(['--config', hp_path])

checkpoint = torch.load('../chkpt/vc/cotatron_trained_vc/05e2f42epoch=95.ckpt', map_location='cpu')

model = Synthesizer(hparams)#.cuda()
model.load_state_dict(checkpoint['state_dict'])
model.eval()
model.freeze()

lang = Language(hp.data.lang, hp.data.text_cleaners)

In [9]:
with open('../datasets/metadata/estonian_metadata.txt', 'r') as f:
    metadata = f.readlines()
    
speaker_counts = defaultdict(lambda: 0)
for i in metadata:
    speaker_counts[i.strip().split("|")[2]] += 1
    
speaker_counts = dict(sorted(speaker_counts.items(), key=lambda item: item[1], reverse=True))

In [6]:
speaker_pairs_over_5000 = [('UT-uudised-Mari', 'EKI-ilukirjandus-Kylli'), ('UT-uudised-Albert', 'EKI-ilukirjandus-Meelis'), ('UT-uudised-Kalev', 'UT-uudised-Albert'), ('EKI-ilukirjandus-Meelis', 'UT-uudised-Mari')]
speaker_pairs_over_1000 = [('UT-uudised-Vesta', 'EKI-yksiklaused-Liivika'), ('EKI-yksiklaused-Kersti', 'ERR-uudised-Birgit_Itse'), ('ERR-uudised-Meelis_Kompus', 'ERR-uudised-Tarmo_Maiberg'), ('EKI-yksiklaused-Kylli', 'ERR-uudised-Vallo_Kelmsaar'), ('EKI-yksiklaused-Meelis', 'UT-uudised-Vesta')]
speaker_pairs_over_100 = [('44c80063991b3d8465541af15c082b89b8e1a2e3b37f0d296e25e790c49b30c768708bb1f58f810652a3d4d9cf3c04edb4b44971324a20c18c150aa1535a6032', '2546b52782d46a01500bff6bebb2527ce2a1ef0026bcfa399548cfac326f665f463d4ed6f3f10bef7131567212b9df4f41f24fc86c725b9b1425dd16ea7b6532'),('29a3279b66344d333c6ce542c44280d36128d716416c9396a0ed7cb24bcbdc7a6c23ea731443eb7d43cf2e969e162bf3db39abd6ee46851be5f923a5fc685a09', '21ef89b80bcdfd83dabc0c508af6c100e584112f2ba6fec9d4df12e9d6a793d7d11c63d936b74debb4a18206d5bb4dbf0f54a0edb696ebf3e69c7040ff24e406'), ('417b61ca0a5145ee60db908bd1d75a8499ed06d33e292e2aff6f380496ff7377e2a4630794a18c6de620aa65ddedf3ca2af40545176d831f89affae0bfca890b', 'fa7f67d93b2f3a6e685275897b5b67653df98a2880d1a8e4550274bb2420a4965e5561a92daf604000bedf67bd9958be1c8c8c1bd54322c265024890d56e51da')]
speaker_pairs_over_50 = [('a7c34b9164e1d6f5635846fd9e6fb261e978376e3494148c1ef8e1975a91dfbe58de1f2fecc96537475d1ded29574a21880628a15283d2a4283c3e9c15dd82d7','2e85b3674c07ef9a9d31e675f0f67b8e039a075fc6e424d8698716f14e16e649fdc5490e368729717440e649b7f74f860c82a6c3fb260fee4a89c803ff214f93'),('679437bfb82910388c0a490f96ee50d037744be0cc923793533ad62e25bb939a3230e5a889e607eea54cad0833efd2a1c9b075b7e247ed0074eb62d2dd6ea84f','2d9756ce06cc5f71e1e00febfb368d395e360f0fe2b4ae8196db8865bb2708b15e7a1d13d6636d911a22700be3a5861c9807ca5b1bd3dab74bab3f8a7c98b35f'),('682bb93d9bdb03118bfd77c33e34f320038dd83f1dad720287418320aa7c0255bf6f2781d9de539b15cf2cbe027ac3d01aea1791e62375880df8f9d34dc00524','6f8d70af77ed76d1c32cad4e55a58ae518bc04ae4c067023d2ef9a2a8ddaedc3f427ef8fb1d91a48bf068d4d9a9defe15157e8fe90c65189929d69e33e639a96')]
speaker_pairs_over_10 = [('aaf6d3b7f66fc4bdae0b79d5d085300049ff9daf8fa28ead6a94aad690f501b0fb685adfd1cb7b83b5c6f4c20d6b279101d274aee5e3697214e1f3b2acf832e4','f206871b24414909bbf9d48a5f8f05464ec2e253efe62fcff368fa7c9b4df02bf75453d2e1f4ca76a8894cd154d919b1757745b1ea611d773be27f4105cd7304'),('6e9719926a63b9ab51cbccdd82d4aef48c414b99d869a2260e7364bf1c037394d9e8ea836093de24b76687dc29ec77d3427907859c0065357b43430639651f47','429b98f8a1b7664c811f2b6a0c9a1892aee9e0e75b162822c4c91c733783704d3f4ea4d6a737d529fcc7cbd5d8e550fb50803e7bcd93502a4110cc56ddb65211'),('a8c589373b556b84b051459250704cd6a9f944ec056e40d7c0c3bdff9c037db7051e34f595127fa463a51e187bb31dfe1d3c70d87092dad68dee4ba17c63569f','d3ccdf976a828573eb84fb243a3b0d530c9032f980ee8a7c3c02cf8ca54d50894d573e04bc4523dcf3db0ee57a0fea315aa55562c04578207d01f8c6a33c96b3')]
speaker_pairs_under_10 = [('15092f32524ee8ee9ca3e356ebbfa2c6715b1ba76694e0a956d01d4633a8071715e91ac5441873409f3fd7230d91db6970eb835c3499afa2a04a6db88334e1b3','1a4845efd89d2badd7be6588c01c5d3a3b4a7011b112ea7fa39d6aca30450507585c9b77e478bf8d734ec147b2732333c4d4794e30d0b8bd2a4fd614de6c1cf0'),('3bb05133e24b5327b56e1c3e5abfa244270acce3d55246c182c3400976478a0e27b53573905efa2991a43e635254cf898c98fec140a256df0122d6ebd97573ae','420fc9fb82176ceaccb8ca54dd584195d16ef040758e8764092ca4c7e967a8b49271f307533fe9d1764dee341763bc0355678972a6555bc1280195d909978f5d'),('b51b1f340446794e314ccf7c8d7c02f42b8c302e891a49d598aa7c3b50c28a042643e77c35d6ceb2e7b1f2bcf873605bd9a2b3d4d93f14333a03beb200e4b288','c4f49a0dbe6ce71a85a94c1c1564e365cd0ba93420259b897064b4a5905815060db3b596b9c0916f598d098035d0880b682eb7fffb7c1254f7016f7e1eec62f1')]

In [21]:
source_id = speaker_pairs_over_5000[0][0]
target_id = speaker_pairs_over_5000[0][1]

source_input = [i for i in metadata if i.split('|')[2].strip() == target_id][0]
target_audio_sample ='../data/' + [i.split("|")[0] for i in metadata if i.split('|')[2].strip() == source_id][0]

In [22]:
text = source_input.split('|')[1]
source_wavpath = '../data/' + source_input.split('|')[0]

text_norm = torch.LongTensor(lang.text_to_sequence(text, hp.data.text_cleaners))
text_norm = text_norm.unsqueeze(0)#.cuda()

wav_source_original, sr = librosa.load(source_wavpath, sr=None, mono=True)
wav_source_original *= (0.99 / np.max(np.abs(wav_source_original)))

wav_target_sample, sr_sample = librosa.load(target_audio_sample, sr=None, mono=True)
wav_target_sample *= (0.99 / np.max(np.abs(wav_target_sample)))

assert sr == hp.audio.sampling_rate
wav_source = torch.from_numpy(wav_source_original).view(1, 1, -1)#.cuda()
mel_source = model.cotatron.audio2mel(wav_source)

target_speaker = torch.LongTensor([hp.data.speakers.index(target_id)])#.cuda()

with torch.no_grad():
    mel_s_t, alignment, residual = model.inference(text_norm, mel_source, target_speaker)
    
melgan = Generator(80)#.cuda()
melgan_ckpt = torch.load('melgan_libritts_g_only.ckpt', map_location='cpu')
melgan.load_state_dict(melgan_ckpt['model_g'])
melgan.eval()

with torch.no_grad():
    audio_s_t = melgan(mel_s_t).squeeze().cpu().detach().numpy()

In [23]:
print("====== Source =======")
print(text)
ipd.Audio(wav_source_original, rate=22050)

Matildas tärkas sümpaatia selle kummalise väikese mehe vastu.


In [24]:
print("====== Target =======")
ipd.Audio(audio_s_t, rate=22050)



In [25]:
print("====== Target Voice =======")
ipd.Audio(wav_target_sample, rate=22050)



In [28]:
from scipy.io.wavfile import write

write('first_sine_wave.wav', 22050, audio_s_t)