In [1]:
%load_ext autoreload
%autoreload 2

This repo currently covers data generation for VITS models.

In [None]:
## WARNING: Run these commands in a terminal, not in this notebook.
# You may need to adjust directory paths to match your setup. If you find bugs, please help fix them.

# a) Clone the AfriSpeech-TTS repo
git clone https://github.com/intron-innovation/AfriSpeech-TTS.git

# b) Switch to the vits branch
cd AfriSpeech-TTS/src/vits/
git switch vits

# See AfriSpeech-TTS/src/vits/README.md for the full installation instructions.
# c) Optionally create a new environment
# NOTE(review): the new environment presumably has to be activated (`conda activate vits`) before step d) - confirm.
conda create -n vits python==3.10
# d) Install requirements
pip install -r requirements.txt #(cd to AfriSpeech-TTS/src/vits/)

# e) Install espeak / other packages
apt-get install espeak -y
# or sudo apt-get install espeak -y

# f) Build Monotonic Alignment Search and run preprocessing if you use your own datasets.
# Cython version of Monotonic Alignment Search
cd monotonic_align
python setup.py build_ext --inplace

# If everything goes well you are all set; otherwise, read the VITS README.md again.

In [None]:
# Download the models and dataset used in this work.
# See https://github.com/intron-innovation/AfriSpeech-TTS for how to set up awscli and download the models and dataset.
# Stop at instruction 5. For instruction 6, change AfriSpeech-TTS to AfriSpeech-TTS-D
aws s3 cp s3://intron-open-source/AfriSpeech-TTS-D . --recursive

# The dataset can be found at s3://intron-open-source/AfriSpeech-TTS-D. Note that it is over 100G.
# The models can be found at s3://intron-open-source/AfriSpeech-Models
aws s3 cp s3://intron-open-source/AfriSpeech-Models . --recursive

In [None]:
# Run this notebook from the vits/ directory where the notebook is located

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
from pathlib import Path
import random

import numpy as np

import torch

import commons
import utils

from models import SynthesizerTrn
from models_ext_ms import SynthesizerTrn as SynthesizerTrnExt
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Turn a piece of text into the LongTensor fed to the synthesizer.

    Args:
        text: The text to convert.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` is forwarded to
            ``text_to_sequence`` and ``hps.data.add_blank`` toggles blank
            interleaving.

    Returns:
        A ``torch.LongTensor`` built from the sequence returned by
        ``text_to_sequence``.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    # When requested by the config, interleave the value 0 through the sequence.
    sequence = commons.intersperse(sequence, 0) if hps.data.add_blank else sequence
    return torch.LongTensor(sequence)

# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
# `device` is read as a notebook-level global by the cells below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

import json
import pickle as pkl
def load_speaker_emb(path):
    """Load external speaker embeddings from a pickle or JSON file.

    Args:
        path: ``str`` or ``os.PathLike`` pointing at a ``.pkl`` or ``.json``
            file. The extension is matched case-insensitively.

    Returns:
        The object stored in the file (the callers in this notebook use it as
        a dict keyed by audio-file stem).

    Raises:
        TypeError: If the file extension is neither ``.pkl`` nor ``.json``.
    """
    print(f"Loading external speaker embeddings from {path}")
    # Accept pathlib.Path as well as plain strings.
    path = os.fspath(path)
    extension = os.path.splitext(path)[1].lower()
    if extension == ".pkl":
        # SECURITY: unpickling can execute arbitrary code; only load embedding
        # files that come from a trusted source.
        with open(path, "rb") as read_file:
            return pkl.load(read_file)
    if extension == ".json":
        with open(path, "r") as read_file:
            return json.load(read_file)
    raise TypeError("Speaker embedding type unrecognized")


cuda


In [4]:
import pandas as pd

#!pip install pandas

In [None]:
# !pip install nemo_text_processing
from afritts_text_normalizer import tts_cleaner

In [5]:

def generate_data(models_dict, data, dst_folder, sentences, skip_existing=False):
    """Synthesize one wav file per row of ``data`` with every model in ``models_dict``.

    For each model, the checkpoint and config are loaded, then each row of
    ``data`` is paired with a sentence from ``sentences`` (cycled by row
    position) and the synthesized audio is written to
    ``<dst_folder>/<model_name>/<stem of row.audio_paths>.wav``.

    Speaker conditioning depends on the model:
      * model names containing ``"ext_spk"`` use the external embedding looked
        up by audio-file stem in the file at ``hps.data.speaker_emb_path``;
      * model paths containing ``"vctk"`` use a random speaker id in [0, 108];
      * every other model uses ``int(row.user_ids_num)``.

    Args:
        models_dict: Mapping ``model_name -> (checkpoint_path, config_path)``.
        data: DataFrame with ``audio_paths`` and ``user_ids_num`` columns.
        dst_folder: Root output directory.
        sentences: Sequence of sentences to synthesize; must hold more than one.
        skip_existing: When True, rows whose output file already exists are
            skipped, so an interrupted run can be resumed. Defaults to False,
            which regenerates and overwrites every file.

    Raises:
        AssertionError: If ``sentences`` has fewer than two entries.
        KeyError: If an ``ext_spk`` model has no embedding for a row.
    """
    assert len(sentences) > 1
    # Iterate the argument, not the notebook-level `models` global.
    for model_name, (model_path, config_path) in models_dict.items():
        hps = utils.get_hparams_from_file(config_path)
        uses_ext_spk = "ext_spk" in model_name

        speaker_emb_dict = None
        if uses_ext_spk:
            # using external speaker embedding
            speaker_emb_dict = load_speaker_emb(hps.data.speaker_emb_path)

        # Both synthesizer variants are constructed with the same arguments.
        synthesizer_cls = SynthesizerTrnExt if uses_ext_spk else SynthesizerTrn
        net_g = synthesizer_cls(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model).to(device)
        # Inference must run in eval mode for every model, including ext_spk.
        net_g.eval()
        utils.load_checkpoint(model_path, net_g, None)

        # Use the row position (not the index label) to cycle through the
        # sentences, so filtered or non-integer indexes also work.
        for position, (_, item) in enumerate(data.iterrows()):
            audio_stem = Path(item.audio_paths).stem
            dst_path = os.path.join(dst_folder, model_name, audio_stem + ".wav")
            if skip_existing and os.path.exists(dst_path):
                continue
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)

            transcript = tts_cleaner(sentences[position % len(sentences)])
            stn_tst = get_text(transcript, hps)

            with torch.no_grad():
                x_tst = stn_tst.to(device).unsqueeze(0)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
                if uses_ext_spk:
                    speaker_emb = speaker_emb_dict.get(audio_stem)
                    if speaker_emb is None:
                        raise KeyError(
                            f"No external speaker embedding found for '{audio_stem}'")
                    sid = torch.tensor(speaker_emb).reshape(1, -1).to(device)
                else:
                    if "vctk" in model_path:
                        # randomly sample speakers from 109 speaker vec
                        speaker_id = random.randint(0, 108)
                    else:
                        speaker_id = int(item.user_ids_num)
                    sid = torch.LongTensor([speaker_id]).to(device)
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid,
                                    noise_scale=.667, noise_scale_w=0.8,
                                    length_scale=1)[0][0, 0].data.cpu().float().numpy()

            # NOTE(review): the sample rate is hard-coded to 16 kHz; presumably
            # it should match hps.data.sampling_rate - confirm.
            write(dst_path, 16000, audio)

    print("Finished.")

In [7]:
import pandas as pd

main_dir = os.getcwd()
data_dir = os.path.join(main_dir, "../..", "data")

test_seen = pd.read_csv(os.path.join(data_dir, "afritts-test-seen-clean.csv"))

# Use general-domain text paired with the speakers in the afrispeech-200 dataset.
MAX_TRANSCRIPT_CHARS = 200  # transcripts longer than this are dropped
general_text = pd.read_csv(os.path.join(data_dir, "intron-test-public-6346-clean.csv"))
general_text = general_text[general_text.domain == "general"]

# Remove sentences that are too long. The mask is computed without adding a
# column to the filtered frame, which avoids chained-assignment on a slice.
is_short_enough = general_text.transcript.str.len() <= MAX_TRANSCRIPT_CHARS

# `text_data` is the plain list of sentences consumed by the cells below.
text_data = general_text.loc[is_short_enough, "transcript"].tolist()

# test_unseen = pd.read_csv(os.path.join(main_dir, "../..", "data/afritts-test-unseen-clean.csv"))

### Generate data for test seen

In [10]:
# NOTE: generate_data does not skip files that already exist by default, so
# re-running this cell after a failure regenerates (and overwrites) everything
# from the start.
models = {
    "vits_vctk": ("AfriSpeech-Models/vctk_16k/vctk_16k.pth", "AfriSpeech-Models/vctk_16k/config.json"),
    "vits_afrotts": ("AfriSpeech-Models/vits_afrotts/vits_afrotts.pth", "AfriSpeech-Models/vits_afrotts/config.json"),
    "vits_afrotts_ft": ("AfriSpeech-Models/vits_afrotts_ft/vits_afrotts_ft.pth", "AfriSpeech-Models/vits_afrotts_ft/config.json"),
    "vits_afrotts_ft_ext_spk": ("AfriSpeech-Models/vits_afrotts_ft_ext_spk/vits_afrotts_ft_ext_spk.pth","AfriSpeech-Models/vits_afrotts_ft_ext_spk/config.json")
    }

dst_folder = "afritts_test_seen"

# `text_data` is the list of general-domain sentences built in the data-loading
# cell above; the previously passed name `sentences` is not defined anywhere.
generate_data(models, test_seen, dst_folder, text_data)

Finished.


In [22]:
### Generate data for test unseen
# models = {
#     "vits_afrotts_ft_ext_spk": ("logs/afrotts_ft_ext/G_150000.pth" ,"logs/afrotts_ft_ext/config.json")
#     }
# dst_folder = "afritts_test_unseen"
# generate_data(models, test_unseen, dst_folder, text_data)

Loading external speaker embeddings from /srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/embeddings/afritts_emb.pkl
Finished.


### Speaker Interpolation

We can interpolate per gender per accent

In [8]:
import pandas as pd

main_dir = os.getcwd()
train = pd.read_csv(os.path.join(main_dir, "../..", "data/afritts-train-clean.csv"))

In [22]:
accents = train.accent.unique()
models = {
    "vits_afrotts": ("AfriSpeech-Models/vits_afrotts/vits_afrotts.pth", "AfriSpeech-Models/vits_afrotts/config.json"),
    "vits_afrotts_ft": ("AfriSpeech-Models/vits_afrotts_ft/vits_afrotts_ft.pth", "AfriSpeech-Models/vits_afrotts_ft/config.json"),
    "vits_afrotts_ft_ext_spk": ("AfriSpeech-Models/vits_afrotts_ft_ext_spk/vits_afrotts_ft_ext_spk.pth","AfriSpeech-Models/vits_afrotts_ft_ext_spk/config.json")
    }

speaker_emb_path_local = "AfriSpeech-Models/embeddings/afritts_emb_intsp.pkl"
dst_folder = "afritts_accent_interpolate"

# choose any text
texts = text_data[5]  # only one sentence is used; increase the number of sentences if needed
# Clean the sentence once, up front, instead of re-cleaning it on every iteration.
text = tts_cleaner(texts)

for model_name in models:
    model_path, config_path = models[model_name]
    hps = utils.get_hparams_from_file(config_path)
    uses_ext_spk = "ext_spk" in model_name

    if uses_ext_spk:
        # using external speaker embedding
        speaker_emb_dict = load_speaker_emb(speaker_emb_path_local)

    # Both synthesizer variants are constructed with the same arguments.
    synthesizer_cls = SynthesizerTrnExt if uses_ext_spk else SynthesizerTrn
    net_g = synthesizer_cls(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    # Inference must run in eval mode for every model, including ext_spk.
    net_g.eval()
    utils.load_checkpoint(model_path, net_g, None)

    # The text input depends only on the model config, so build it once per model.
    stn_tst = get_text(text, hps)
    x_tst = stn_tst.to(device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

    for accent in accents:
        for gender in ["Male", "Female"]:
            train_ac_gender = train[(train.accent == accent) & (train.gender == gender)]
            sids = list(train_ac_gender.user_ids_num.unique())

            # can only interpolate if more than 1 speaker satisfies the criteria
            if len(sids) <= 1:
                continue

            audio_path = f"{accent}_{gender}.wav"
            dst_path = os.path.join(dst_folder, model_name, accent, gender, audio_path)
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)

            if uses_ext_spk:
                # Average the embeddings of every utterance in the group, then
                # rescale the mean vector to unit L2 norm.
                speaker_emb = np.concatenate(
                    [speaker_emb_dict.get(Path(x).stem).reshape(-1, 1)
                     for x in train_ac_gender.audio_paths.values],
                    axis=1)
                speaker_emb = np.mean(speaker_emb, axis=1)
                speaker_emb = speaker_emb / np.linalg.norm(speaker_emb, 2)
                sid = torch.tensor(speaker_emb).reshape(1, -1).to(device)
            else:
                # NOTE(review): every speaker id of the group is passed at once
                # as a (1, n_speakers) tensor - confirm that the model's infer()
                # interpolates over multiple ids.
                sid = torch.LongTensor([sids]).to(device)

            with torch.no_grad():
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid,
                                    noise_scale=.667, noise_scale_w=0.8,
                                    length_scale=1)[0][0, 0].data.cpu().float().numpy()

            write(dst_path, 16000, audio)

# Printed once, after all models have been processed.
print("Finished.")

Loading external speaker embeddings from /srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/embeddings/afritts_emb.pkl
Finished.


We can also interpolate just per accent (not considering gender)

In [12]:
accents = train.accent.unique()
models = {
    "vits_afrotts": ("AfriSpeech-Models/vits_afrotts/vits_afrotts.pth", "AfriSpeech-Models/vits_afrotts/config.json"),
    "vits_afrotts_ft": ("AfriSpeech-Models/vits_afrotts_ft/vits_afrotts_ft.pth", "AfriSpeech-Models/vits_afrotts_ft/config.json"),
    "vits_afrotts_ft_ext_spk": ("AfriSpeech-Models/vits_afrotts_ft_ext_spk/vits_afrotts_ft_ext_spk.pth","AfriSpeech-Models/vits_afrotts_ft_ext_spk/config.json")
    }

dst_folder = "afritts_accent_interpolate_no_gender"
texts = text_data[5]  # only one sentence is used; increase the number of sentences if needed
# Clean the sentence once, up front, instead of re-cleaning it on every iteration.
text = tts_cleaner(texts)

for model_name in models:
    model_path, config_path = models[model_name]
    hps = utils.get_hparams_from_file(config_path)
    uses_ext_spk = "ext_spk" in model_name

    if uses_ext_spk:
        # using external speaker embedding
        # (`speaker_emb_path_local` is defined in the per-gender interpolation cell above)
        speaker_emb_dict = load_speaker_emb(speaker_emb_path_local)

    # Both synthesizer variants are constructed with the same arguments.
    synthesizer_cls = SynthesizerTrnExt if uses_ext_spk else SynthesizerTrn
    net_g = synthesizer_cls(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    # Inference must run in eval mode for every model, including ext_spk.
    net_g.eval()
    utils.load_checkpoint(model_path, net_g, None)

    # The text input depends only on the model config, so build it once per model.
    stn_tst = get_text(text, hps)
    x_tst = stn_tst.to(device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

    for accent in accents:
        train_ac = train[train.accent == accent]
        sids = list(train_ac.user_ids_num.unique())

        # can only interpolate if more than 1 speaker satisfies the criteria
        if len(sids) <= 1:
            continue

        audio_path = f"{accent}.wav"
        dst_path = os.path.join(dst_folder, model_name, accent, audio_path)
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)

        if uses_ext_spk:
            # Average the embeddings of every utterance with this accent, then
            # rescale the mean vector to unit L2 norm.
            speaker_emb = np.concatenate(
                [speaker_emb_dict.get(Path(x).stem).reshape(-1, 1)
                 for x in train_ac.audio_paths.values],
                axis=1)
            speaker_emb = np.mean(speaker_emb, axis=1)
            speaker_emb = speaker_emb / np.linalg.norm(speaker_emb, 2)
            sid = torch.tensor(speaker_emb).reshape(1, -1).to(device)
        else:
            # NOTE(review): every speaker id of the group is passed at once as a
            # (1, n_speakers) tensor - confirm that the model's infer()
            # interpolates over multiple ids.
            sid = torch.LongTensor([sids]).to(device)

        with torch.no_grad():
            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid,
                                noise_scale=.667, noise_scale_w=0.8,
                                length_scale=1)[0][0, 0].data.cpu().float().numpy()

        write(dst_path, 16000, audio)

# Printed once, after all models have been processed.
print("Finished.")

Loading external speaker embeddings from /srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/embeddings/afritts_emb.pkl
Finished.


We can also interpolate using the median instead of the mean, or with other methods such as PCA. Feel free to implement these if you would like to explore further.

### Voice Conversion

In [None]:
# dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
# collate_fn = TextAudioSpeakerCollate()
# loader = DataLoader(dataset, num_workers=8, shuffle=False,
#     batch_size=1, pin_memory=True,
#     drop_last=True, collate_fn=collate_fn)
# data_list = list(loader)

In [None]:
# with torch.no_grad():
#     x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda() for x in data_list[0]]
#     sid_tgt1 = torch.LongTensor([1]).cuda()
#     sid_tgt2 = torch.LongTensor([2]).cuda()
#     sid_tgt3 = torch.LongTensor([4]).cuda()
#     audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy()
#     audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy()
#     audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy()
# print("Original SID: %d" % sid_src.item())
# ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
# print("Converted SID: %d" % sid_tgt1.item())
# ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False))
# print("Converted SID: %d" % sid_tgt2.item())
# ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False))
# print("Converted SID: %d" % sid_tgt3.item())
# ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False))