In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Turn raw text into the tensor of symbol ids fed to the synthesizer.

    Args:
        text: The sentence to synthesize.
        hps: Hyperparameter object; ``hps.data.text_cleaners`` is forwarded to
            ``text_to_sequence`` and ``hps.data.add_blank`` controls whether
            the id ``0`` is interspersed through the sequence.

    Returns:
        A ``torch.LongTensor`` holding the (optionally interspersed) ids.
    """
    token_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Insert the id 0 throughout the sequence via the project helper.
        token_ids = commons.intersperse(token_ids, 0)
    return torch.LongTensor(token_ids)

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


## VCTK

In [3]:
# Hyperparameters for the VCTK run, read from its JSON config.
hps = utils.get_hparams_from_file(
    "./logs/vctk_16k/config.json"
)

In [6]:
# Instantiate the synthesizer from the loaded hyperparameters, move it to the
# selected device, and put it in eval mode for inference.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
_ = net_g.eval()

# Restore the trained generator weights into the model.
_ = utils.load_checkpoint("logs/vctk_16k/G_500000.pth", net_g, None)

In [7]:
speaker_id = 5

stn_tst = get_text("There is no place like home.", hps)

# Pure inference: gradients are not needed.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading batch axis of size 1
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    sid = torch.LongTensor([speaker_id]).to(device)
    # [0] selects the first value returned by infer; [0, 0] then picks a single
    # waveform out of it (presumably batch 0, channel 0 - confirm in the model).
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [24]:
# write("./configs/a.wav", 16000, audio)

### AfroTTS

In [8]:
# Hyperparameters for the AfroTTS fine-tuned run, read from its JSON config.
hps = utils.get_hparams_from_file(
    "logs/afrotts_ft_upsamp/config.json"
)

In [9]:
# Rebuild the synthesizer for the AfroTTS hyperparameters; this replaces the
# model bound to `net_g` by the earlier VCTK cell.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
_ = net_g.eval()

# Restore the fine-tuned generator weights into the model.
_ = utils.load_checkpoint("logs/afrotts_ft_upsamp/G_105000.pth", net_g, None)

In [29]:
import pandas as pd

# Training-split metadata table; used below to look up speaker ids by accent.
# NOTE(review): absolute, cluster-specific path - will not resolve elsewhere.
train = pd.read_csv(
    "/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-train-clean.csv"
)

In [30]:
# Preview the first five rows of the metadata table.
train.head(n=5)

Unnamed: 0,idx,user_ids,accent,age_group,gender,country,transcript,nchars,audio_ids,audio_paths,duration,neg_percent,origin,domain,split,expand_puncts,is_alnum,user_ids_num
0,35869,cb196fe7954f96bc50ee468c9ee46881,Ebira,26-40,Male,NG,"engineer soni, honorable ukamaka close bracket...",155,7d3da6ebc635b13017a5965f9d1f6b3b,/AfriSpeech-TTS/train/d8c20506-0c0e-4a72-860e-...,16.711,1.0,nigerian,general,train,True,True,591
1,20705,3854724c6ef671549182904ae344339d,Ijaw,56yrs>,Male,NG,"paramount theater, iseyin, kumo featuring colo...",109,df7afca1d273cfff7ccf3c1ceea574b0,/AfriSpeech-TTS/train/d4554d18-9bfe-48a3-9633-...,12.683,0.0,nigerian,general,train,True,False,318
2,8235,73f971bbef38880a86ac97680ab5a7f8,Zulu,26-40,Female,ZA,"when now-adult umahi and elly discovered this,...",129,33f8e782c1f484688f96f4a58c92dd6a,/AfriSpeech-TTS/train/c91aec7d-c3f8-46d4-8222-...,16.802,0.15,african,general,train,False,False,71
3,7766,a1a211447e2b4d3dde24fa77de916ef0,Akan,26-40,Female,GH,"mutabazi, as a warrior wife of misess francis,...",139,d45dd822ca90c886c3569db26359373d,/AfriSpeech-TTS/train/438dd081-38f6-416c-bd0c-...,8.847,0.17,african,general,train,False,False,89
4,4710,73f971bbef38880a86ac97680ab5a7f8,Zulu,26-40,Female,ZA,"in sir nabunya chaeism, kebbeh is considered o...",119,0af9bf87a64faef331504bc209869465,/AfriSpeech-TTS/train/1e4cfb22-8c36-4f9e-9a18-...,15.713,0.15,african,general,train,False,False,71


In [18]:
# Distinct `user_ids_num` values among rows whose accent is 'Siswati'.
train.loc[train["accent"] == "Siswati", "user_ids_num"].unique()

array([ 26, 116, 119, 153, 610, 106])

In [34]:
speaker_id = 89

stn_tst = get_text("How can I help you today my friend?", hps)

# Pure inference: gradients are not needed.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading batch axis of size 1
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    sid = torch.LongTensor([speaker_id]).to(device)
    # [0] selects the first value returned by infer; [0, 0] then picks a single
    # waveform out of it (presumably batch 0, channel 0 - confirm in the model).
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [49]:
# from scipy.io.wavfile import write
# dst_path = "/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/samples/sampl_male3.wav"
# write(dst_path, 16000, audio)

### VITS with external speaker embedding

In [3]:
# from data_utils_ext_ms import TextAudioLoader as , TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models_ext_ms import SynthesizerTrn as SynthesizerTrnExt

In [4]:
# Hyperparameters for the run that uses external speaker embeddings.
hps = utils.get_hparams_from_file(
    "logs/afrotts_ft_ext/config.json"
)

In [2]:
import json
import pickle as pkl
def load_speaker_emb(path):
    """Load a speaker-embedding dictionary from a pickle or JSON file.

    The format is chosen from the file extension, compared case-insensitively:
    ``.pkl`` is read with ``pickle`` and ``.json`` with ``json``.

    Args:
        path: Location of the embeddings file, as a ``str`` or ``os.PathLike``
            (for example a ``pathlib.Path``).

    Returns:
        The object deserialized from the file.

    Raises:
        TypeError: If the extension is neither ``.pkl`` nor ``.json``.
    """
    print(f"Loading external speaker embeddings from {path}")
    # os.fspath accepts both str and PathLike; splitext only inspects the final
    # path component, so dots in directory names cannot be mistaken for an
    # extension.
    extension = os.path.splitext(os.fspath(path))[1].lower()
    if extension == ".pkl":
        # SECURITY: unpickling can execute arbitrary code - only load files
        # from a trusted source.
        with open(path, "rb") as read_file:
            return pkl.load(read_file)
    if extension == ".json":
        # JSON text is UTF-8 by specification; do not depend on the locale.
        with open(path, "r", encoding="utf-8") as read_file:
            return json.load(read_file)
    raise TypeError("Speaker embedding type unrecognized")

In [3]:
# Load one embeddings file by explicit path.
# NOTE(review): the following cell rebinds `speaker_emb_dict` from
# hps.data.speaker_emb_path, so this result is overwritten on a top-to-bottom run.
speaker_emb_dict = load_speaker_emb(
    "/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/embeddings/afritts_emb_intsp.pkl"
)

Loading external speaker embeddings from /srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/embeddings/afritts_emb_intsp.pkl


In [10]:
# Load the embeddings file named by the run's own config.
speaker_emb_dict = load_speaker_emb(
    hps.data.speaker_emb_path
)

Loading external speaker embeddings from /srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/embeddings/afritts_emb.pkl


In [20]:
# NOTE(review): this rebinds the notebook-wide `device` to the CPU, so every
# later cell that uses `device` also runs on the CPU - confirm this is intended.
device = "cpu"

# Build the external-speaker-embedding variant of the synthesizer.
net_g = SynthesizerTrnExt(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
_ = net_g.eval()

# Restore the trained generator weights into the model.
_ = utils.load_checkpoint("logs/afrotts_ft_ext/G_67000.pth", net_g, None)

In [24]:
# Index directly instead of using .get(): an unknown key then raises KeyError
# here, rather than yielding None and failing later with an unrelated error.
speaker_emb = speaker_emb_dict["6c74fccba13870402ac15ea14f2ad70c_Rm9hVVH6"]

In [25]:
# Inspect the dimensionality of the selected speaker embedding.
speaker_emb.shape

(256,)

In [26]:
stn_tst = get_text("Alhaji. Danjuma was here.", hps)

# Pure inference: gradients are not needed.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading batch axis of size 1
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    # Here `sid` carries the embedding vector itself (reshaped to one row),
    # not an integer speaker id.
    sid = torch.tensor(speaker_emb).reshape(1, -1).to(device)
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

### Speaker Interpolation

In [67]:
# Hyperparameters for the AfroTTS + VCTK fine-tuned run.
hps = utils.get_hparams_from_file(
    "logs/afrotts_vctk_ft/config.json"
)

In [68]:
# NOTE(review): the saved output of this cell is an AssertionError, so it did
# not complete when the notebook was last run - check the checkpoint path and
# the config before relying on the cells that follow.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
_ = net_g.eval()

# Restore the trained generator weights into the model.
_ = utils.load_checkpoint("logs/afrotts_vctk_ft/G_112000.pth", net_g, None)

AssertionError: 

In [28]:
speaker_ids = [159, 162]  # speaker ids to interpolate

stn_tst = get_text("Alh. hassan was here to greet him.", hps)

# Pure inference: gradients are not needed.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)  # add a leading batch axis of size 1
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    # Wrapping the list once more gives `sid` shape (1, 2): one row, two ids.
    # NOTE(review): whether infer() accepts two ids per item depends on the
    # model implementation - confirm.
    sid = torch.LongTensor([speaker_ids]).to(device)
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

### Voice Conversion

In [None]:
# Build a batch-size-1, unshuffled loader over the validation files and
# materialize every batch into a list so batches can be picked by index.
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    num_workers=8,
    shuffle=False,
    batch_size=1,
    pin_memory=True,
    drop_last=True,
    collate_fn=collate_fn,
)
data_list = list(loader)

In [None]:
# Convert the first validation utterance to three target speakers and play the
# original next to each conversion.
with torch.no_grad():
    # Move tensors to the notebook's `device` (not a hard-coded .cuda()) so
    # they live where the model does, including on CPU-only machines.
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [t.to(device) for t in data_list[0]]
    sid_tgt1 = torch.LongTensor([1]).to(device)
    sid_tgt2 = torch.LongTensor([2]).to(device)
    sid_tgt3 = torch.LongTensor([4]).to(device)
    audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy()
    audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy()
    audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy()
print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
print("Converted SID: %d" % sid_tgt1.item())
ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False))
print("Converted SID: %d" % sid_tgt2.item())
ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False))
print("Converted SID: %d" % sid_tgt3.item())
ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False))