In [11]:
%matplotlib inline
from datetime import datetime
from pathlib import Path

from scipy.io.wavfile import write

from hparams import get_hparams_from_file
from src.audio.concat import concat_2d_array_audios_with_silence
from src.file import read_book
from src.model.checkpoint import load_checkpoint_for_inference
from src.model.synthesizer import SynthesizerTrn
from src.srt import generate_and_save_audiobook_srt
from src.synthesize import synthesize_book
from src.text.split import split_book_lines_to_sentences
from src.text.symbols import get_vocabulary


In [12]:
# --- Model selection ---------------------------------------------------------
checkpoint_step = '150000'
speaker = 'milda_no_noise_44'
# Alternative configuration (kept for reference):
# checkpoint_step = '1190000'
# speaker = 'giedrius_altoriu_sesely'

# CUDA device index; used both to place the model and for synthesis.
device = 0

# --- Pauses inserted when the audiobook is stitched together (seconds) --------
# TODO: [optional? would complicate the generation of srts] pick silence
#       duration from a distribution (e.g. mean - 1.0s, lower bound - 0.5s,
#       upper - 1.5s)
silence_between_sentences = 0.5
silence_between_paragraphs = 1.0

# --- Paths -------------------------------------------------------------------
# Single root for every location below, so moving to another machine means
# editing one line only.
inovoice_root = "/home/aai-labs/inovoice"

book_name = "Kur_vasara_amžina_stressed"
book_path = f"{inovoice_root}/data/text/audiobooks/{book_name}.txt"

output_dir = f"{inovoice_root}/repos/vits/files/audio/audiobooks/{speaker}/{checkpoint_step}"
Path(output_dir).mkdir(parents=True, exist_ok=True)
# Timestamp is year-month-day so that output files sort chronologically.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_filename = f'{book_name}_{speaker}_{checkpoint_step}_{run_timestamp}'

# Checkpoint location on another machine (kept for reference):
# checkpoint_filepath = f"/media/arnas/SSD Disk/inovoice/models/text-to-speech/vits/{speaker}/logs/G_{checkpoint_step}.pth"
checkpoint_filepath = f"{inovoice_root}/models/{speaker}/G_{checkpoint_step}.pth"

# --- Hyper-parameters and vocabulary -----------------------------------------
hps = get_hparams_from_file(f"{inovoice_root}/repos/vits/files/configs/44khz.json")

# The concatenation step reads hps.data.sample_rate while SRT generation and
# the WAV writer read `sample_rate`; deriving one from the other guarantees
# they can never disagree (the 44khz config is expected to yield 44100).
sample_rate = hps.data.sample_rate

# Only the first element of the returned triple is needed here.
symbols, _, _ = get_vocabulary(hps.data.language)

In [13]:
# Positional constructor arguments, computed up front for readability.
n_symbols = len(symbols)
# NOTE(review): looks like the one-sided spectrum size for an FFT of
# `filter_length` points — confirm against SynthesizerTrn's signature.
n_freq_bins = hps.data.filter_length // 2 + 1
# Training segment length expressed in hops rather than in samples.
segment_frames = hps.train.segment_size // hps.data.hop_length

# Build the generator, move it to the selected CUDA device and switch it to
# evaluation mode before loading the checkpoint weights into it.
net_g = SynthesizerTrn(n_symbols, n_freq_bins, segment_frames, **hps.model)
net_g = net_g.cuda(device)
_ = net_g.eval()
_ = load_checkpoint_for_inference(checkpoint_filepath, net_g)

[2022-09-25 22:53:23,972] INFO:PID-8014:MainThread:src.model.checkpoint: Loading checkpoint at /home/aai-labs/inovoice/models/milda_no_noise_44/G_150000.pth
[2022-09-25 22:53:24,355] INFO:PID-8014:MainThread:src.model.checkpoint: Loaded checkpoint '/home/aai-labs/inovoice/models/milda_no_noise_44/G_150000.pth'


In [14]:
# Load the source text from `book_path`; the structure of the returned value
# is defined by src.file.read_book.
book = read_book(book_path)

In [15]:
# Break the loaded book into sentences.
# NOTE(review): the concat step treats the synthesized result as a 2-D
# structure, so this presumably keeps a paragraph/line grouping — confirm
# against split_book_lines_to_sentences.
book_sentences = split_book_lines_to_sentences(book)

In [16]:
# Limit the run to the first `max_sentences` entries for a quick trial;
# set to None to process everything (a `[:None]` slice keeps the whole list).
max_sentences = 25
book_sentences = book_sentences[:max_sentences]

In [17]:
# Synthesize audio for every entry of `book_sentences` with the loaded
# generator on the configured device.
audios = synthesize_book(book_sentences, net_g, hps, device=device)

100%|██████████| 25/25 [00:02<00:00, 11.20it/s]


In [18]:
# Join the synthesized clips into one waveform, inserting the configured
# pauses along the two dimensions of `audios` (sentence-level for dim1,
# paragraph-level for dim2, going by the variable names).
audiobook = concat_2d_array_audios_with_silence(
    audios,
    silence_between_dim1=silence_between_sentences,
    silence_between_dim2=silence_between_paragraphs,
    sr=hps.data.sample_rate,
)

In [19]:
# Write the subtitle file next to the audio, using the same pause lengths that
# were used for concatenation.
srt_path = Path(f"{output_dir}/{output_filename}.srt")
entries = generate_and_save_audiobook_srt(
    audios,
    book_sentences,
    sample_rate,
    silence_between_sentences,
    silence_between_paragraphs,
    srt_path,
)

In [20]:
# Save the stitched audiobook as a WAV file alongside the subtitles.
wav_path = f"{output_dir}/{output_filename}.wav"
write(wav_path, sample_rate, audiobook)
