In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Turn a text string into a 1-D ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input string to convert.
        hps: Hyperparameter object. ``hps.data.text_cleaners`` is forwarded to
            ``text_to_sequence`` and ``hps.data.add_blank`` decides whether the
            id sequence is passed through ``commons.intersperse``.

    Returns:
        ``torch.LongTensor`` built from the resulting id sequence.
    """
    token_ids = text_to_sequence(text, hps.data.text_cleaners)
    # With add_blank set, item 0 is interspersed into the sequence
    # (presumably a blank id between neighbouring symbols -- confirm in commons).
    token_ids = commons.intersperse(token_ids, 0) if hps.data.add_blank else token_ids
    return torch.LongTensor(token_ids)

## LJ Speech

In [2]:
# Hyperparameters for the LJ Speech model, read from its JSON config file.
ljs_config_path = "./configs/ljs_base.json"
hps = utils.get_hparams_from_file(ljs_config_path)

In [3]:
# Build the synthesizer from the loaded hyperparameters, move it to the GPU and
# put it in evaluation mode before restoring the pretrained weights.
# (Local names below are descriptive guesses -- confirm against models.SynthesizerTrn.)
spec_channels = hps.data.filter_length // 2 + 1
segment_frames = hps.train.segment_size // hps.data.hop_length
net_g = SynthesizerTrn(len(symbols), spec_channels, segment_frames, **hps.model)
net_g = net_g.cuda()
_ = net_g.eval()

# Optimizer argument is None: only the model is passed to the checkpoint loader.
_ = utils.load_checkpoint("/home/ucsf/gimlet/repos/bravo_vits/pretrained_ljs.pth", net_g, None)

INFO:root:Loaded checkpoint '/home/ucsf/gimlet/repos/bravo_vits/pretrained_ljs.pth' (iteration 0)


In [4]:
# Synthesize a test sentence with the currently loaded model and play it inline.
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    # The first item returned by infer() is the waveform tensor.
    waveform = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0]
    # Keep batch 0 / channel 0 and bring it to the CPU as a float numpy array.
    audio = waveform[0,0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

## VCTK

In [None]:
# Hyperparameters for the VCTK model, read from its JSON config file.
# Note: this rebinds the global `hps` used by every later cell.
vctk_config_path = "./configs/vctk_base.json"
hps = utils.get_hparams_from_file(vctk_config_path)

In [None]:
# Build the synthesizer with the speaker count from the config, move it to the
# GPU and put it in evaluation mode before restoring the pretrained weights.
# (Local names below are descriptive guesses -- confirm against models.SynthesizerTrn.)
# Note: this rebinds the global `net_g` used by every later cell.
spec_channels = hps.data.filter_length // 2 + 1
segment_frames = hps.train.segment_size // hps.data.hop_length
net_g = SynthesizerTrn(
    len(symbols),
    spec_channels,
    segment_frames,
    n_speakers=hps.data.n_speakers,
    **hps.model)
net_g = net_g.cuda()
_ = net_g.eval()

# Placeholder path: point this at the downloaded VCTK checkpoint before running.
_ = utils.load_checkpoint("/path/to/pretrained_vctk.pth", net_g, None)

In [None]:
# Synthesize a test sentence for speaker id 4 and play it inline.
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([4]).cuda()
    # The first item returned by infer() is the waveform tensor.
    waveform = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0]
    # Keep batch 0 / channel 0 and bring it to the CPU as a float numpy array.
    audio = waveform[0,0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

### BRAVO tests

In [7]:
import librosa
import time
import soundfile as sf
from transformers import VitsTokenizer, VitsModel, set_seed

# Leftover API inspection: prints the signature of net_g.infer below the cell.
help(net_g.infer)

# NOTE(review): infer() is called without `sid` here, so this cell presumably
# expects `hps` / `net_g` to be the LJ Speech ones; running the VCTK cells first
# rebinds both names -- confirm the intended execution order.

# Prepare the input
text_prompt = 'Great to see you again!'
stn_tst = get_text(text_prompt, hps)
x_tst = stn_tst.unsqueeze(0).cuda()
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()

# Set seed for reproducibility
set_seed(42)

# Perform inference
with torch.no_grad():
    # CUDA work is queued asynchronously: drain anything pending before starting
    # the clock, and wait for infer()'s kernels to finish before stopping it,
    # otherwise the measured time can exclude GPU work that is still running.
    torch.cuda.synchronize()
    start = time.time()
    outputs = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
    torch.cuda.synchronize()
    end = time.time()

# Extract the outputs
audio, attn, y_mask, (z, z_p, m_p, logs_p), w, w_ceil = outputs

# Print the shapes
print("input shape:", stn_tst.shape)
print("audio shape:", audio.shape)
print("attn shape:", attn.shape)
print("y_mask shape:", y_mask.shape)
print("z shape:", z.shape)
print("z_p shape:", z_p.shape)
print("m_p shape:", m_p.shape)
print("logs_p shape:", logs_p.shape)
print("w shape:", w.shape)
print("w_ceil shape:", w_ceil.shape)

# Processing for audio playback: batch 0 / channel 0 as a float numpy array,
# resampled from the model rate to 16 kHz. `audio` itself stays the raw tensor.
audio_processed = audio[0,0].data.cpu().float().numpy()
audio_processed = librosa.resample(audio_processed, orig_sr=hps.data.sampling_rate, target_sr=16000)

# Write to file and display
output_file = '../audio.wav'
sf.write(output_file, audio_processed, 16000)
ipd.display(ipd.Audio(output_file))

# Print the time taken for inference
print("Time taken for inference:", end - start)

Help on method infer in module models:

infer(x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1.0, max_len=None) method of models.SynthesizerTrn instance

input shape: torch.Size([51])
audio shape: torch.Size([1, 1, 33536])
attn shape: torch.Size([1, 1, 131, 51])
y_mask shape: torch.Size([1, 1, 131])
z shape: torch.Size([1, 192, 131])
z_p shape: torch.Size([1, 192, 131])
m_p shape: torch.Size([1, 192, 131])
logs_p shape: torch.Size([1, 192, 131])
w shape: torch.Size([1, 1, 51])
w_ceil shape: torch.Size([1, 1, 51])


Time taken for inference: 0.03692340850830078


In [8]:
import torch
import librosa
import soundfile as sf
import IPython.display as ipd

# Provided input preparation
# NOTE: `audio` and `w_ceil` are read from the kernel namespace; they must be the
# raw tensors returned by net_g.infer for this same text_prompt.
text_prompt = 'Great to see you again!'
stn_tst = get_text(text_prompt, hps)
full_length = stn_tst.size(0)

# Step 1: Automatically find the length of the last word
# The prompt is already tokenized above; reuse it instead of tokenizing twice.
tokens_full = stn_tst
last_space = text_prompt.rfind(' ')
text_without_last_word = text_prompt[:last_space] if last_space != -1 else ''
if text_without_last_word:
    tokens_without_last_word = get_text(text_without_last_word, hps)
else:
    # Single-word prompt: nothing precedes the last word, so the prefix has zero
    # tokens. Tokenizing '' instead may still yield tokens (e.g. a blank), which
    # would wrongly shorten the last word.
    tokens_without_last_word = stn_tst[:0]
length_without_last_word = len(tokens_without_last_word)

# Step 2: Calculate the length of the phonemes for the last word
length_last_word = full_length - length_without_last_word

# Step 3: Normalize the phoneme durations for the last word
# One device-to-host copy of all durations; the last word's are the tail of it.
all_durations = w_ceil[0, 0, :].cpu().numpy()
phoneme_durations = all_durations[-length_last_word:]
normalized_durations = phoneme_durations / all_durations.sum()

# Step 4: Extract the corresponding audio segment
# The last word is assumed to occupy the same fraction of the waveform as its
# share of the predicted durations, counted from the end of the audio.
fraction_of_audio = float(normalized_durations.sum())
start_frame = int(audio.shape[2] * (1 - fraction_of_audio))
end_frame = audio.shape[2]
audio_segment = audio[0, 0, start_frame:end_frame].data.cpu().float().numpy()

# Step 5: Resample and play back the audio
audio_segment_resampled = librosa.resample(audio_segment, orig_sr=hps.data.sampling_rate, target_sr=16000)

# Write to file and play
output_file_segment = '../audio_last_word.wav'
sf.write(output_file_segment, audio_segment_resampled, 16000)
ipd.display(ipd.Audio(output_file_segment))

In [11]:
import librosa
import time
import soundfile as sf
import torch
import IPython.display as ipd
import numpy as np
import string
import random
import scipy
import os
# Ask CUDA to launch kernels synchronously.
# NOTE(review): earlier cells have already used CUDA by the time this runs, so
# the setting may not take effect in this kernel -- confirm, or set it before
# the first CUDA call.
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1'})

def synthesize_text(text, net_g, hps):
    """Synthesize ``text`` in one pass and report how long it took.

    Args:
        text: Sentence to synthesize.
        net_g: Model exposing ``infer``; its inputs are moved to CUDA here.
        hps: Hyperparameters forwarded to ``get_text``.

    Returns:
        ``(audio, seconds)``: the waveform for batch 0 / channel 0 as a float
        numpy array on the CPU, and the wall-clock time from just after
        tokenization until that array is available.
    """
    token_ids = get_text(text, hps)
    with torch.no_grad():
        t0 = time.time()
        batch = token_ids.unsqueeze(0).cuda()
        batch_lengths = torch.LongTensor([token_ids.size(0)]).cuda()
        # Re-seed before every inference (presumably so repeated calls match).
        set_seed(42)
        waveform = net_g.infer(
            batch,
            batch_lengths,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )[0]
        audio = waveform[0,0].data.cpu().float().numpy()
        elapsed = time.time() - t0
    return audio, elapsed

# Baseline: synthesize the whole sentence in a single call, then resample the
# result to 16 kHz for saving and playback.
text_prompt = 'Great to see you again!'
output_file = '../audio.wav'

audio, latency_original = synthesize_text(text_prompt, net_g, hps)
audio = librosa.resample(
    audio,
    orig_sr=hps.data.sampling_rate,
    target_sr=16000,
)
sf.write(output_file, audio, 16000)
ipd.display(ipd.Audio(output_file))
print(f"Latency for original method: {latency_original} seconds")

# Word-by-word method: the whitespace-separated words of the same prompt.
words = text_prompt.split()
def synthesize_last_word(text, net_g, hps, apply_voice_conversion=False, target_sr=16000):
    """Synthesize ``text`` and return only the audio of its last word.

    The whole text is run through ``net_g.infer``. The share of the predicted
    durations (``w_ceil``) that belongs to the last word's tokens decides how
    much of the tail of the waveform is kept.

    NOTE(review): unlike ``synthesize_text`` this function does not call
    ``set_seed`` before inference, so results may differ between calls.

    Args:
        text: Text to synthesize; the last space-separated word is extracted.
        net_g: Model exposing ``infer``; its inputs are moved to CUDA here.
        hps: Hyperparameters (``hps.data.sampling_rate`` is the model rate).
        apply_voice_conversion: If true, write the word to a temporary wav file,
            run the external ``tts`` command on it and return the converted audio.
        target_sr: Sample rate of the returned audio (default 16000).

    Returns:
        ``(audio, latency)``: the last-word waveform as a numpy array at
        ``target_sr``, and the wall-clock seconds for the whole call.

    Raises:
        subprocess.CalledProcessError: If the ``tts`` command exits non-zero.
    """
    import subprocess  # local import: only the optional conversion step needs it

    # Tokenize the text
    start = time.time()
    stn_tst = get_text(text, hps)
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    print("Encoder latency: ", time.time()-start)

    # Perform inference
    with torch.no_grad():
        outputs = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
    audio, _, _, _, _, w_ceil = outputs

    # Count the tokens that precede the last word.
    last_space = text.rfind(' ')
    text_without_last_word = text[:last_space] if last_space != -1 else ''
    if text_without_last_word:
        length_without_last_word = len(get_text(text_without_last_word, hps))
    else:
        # Single word: nothing precedes it. Tokenizing '' instead may still yield
        # tokens (e.g. a blank), which would clip the start of the word.
        length_without_last_word = 0
    length_last_word = len(stn_tst) - length_without_last_word

    # The last word is assumed to occupy the same fraction of the waveform as its
    # share of the predicted durations, counted from the end of the audio.
    # (If length_last_word is 0 the slice below selects the whole utterance.)
    durations = w_ceil[0, 0, :].cpu().numpy()
    fraction_of_audio = float(durations[-length_last_word:].sum() / durations.sum())
    start_frame = int(audio.shape[2] * (1 - fraction_of_audio))
    last_word_audio = audio[0, 0, start_frame:].data.cpu().float().numpy()

    # Resample the last word audio
    last_word_audio_resampled = librosa.resample(
        last_word_audio, orig_sr=hps.data.sampling_rate, target_sr=target_sr)

    # Check if voice conversion should be applied
    if apply_voice_conversion:

        # Define temporary file paths
        temp_input_wav = '../audio_word_level.wav'
        temp_output_wav = '../audio_word_level_converted.wav'
        speaker_wav = "../b3_voice/b3-slow.wav"
        model_dir = "tts_models/multilingual/multi-dataset/your_tts"

        # Write the resampled last word audio to a temporary file
        scipy.io.wavfile.write(temp_input_wav, target_sr, last_word_audio_resampled)

        # Run the voice conversion command. An argument list avoids shell quoting,
        # and check=True stops here on failure instead of loading a stale or
        # missing output file afterwards.
        subprocess.run(
            [
                'tts',
                '--model_name', model_dir,
                '--language_idx=en',
                '--speaker_wav', speaker_wav,
                '--reference_wav', temp_input_wav,
                '--out_path', temp_output_wav,
                '--use_cuda', 'False',
            ],
            check=True,
        )

        # Load the converted audio directly at the target rate, so it is
        # resampled once rather than twice.
        converted_audio, _ = librosa.load(temp_output_wav, sr=target_sr)

        # Use the converted audio as the output
        last_word_audio_resampled = converted_audio

    # Calculate latency
    end = time.time()
    latency = end - start

    return last_word_audio_resampled, latency

# Incremental synthesis method: grow the text one word at a time, keep only the
# newly added word's audio from each pass, and stitch the pieces together.
cumulative_text = ''
# The leading empty float64 array keeps the concatenated result float64.
audio_chunks = [np.array([])]
latencies = []

for word in words:
    cumulative_text = f"{cumulative_text} {word}" if cumulative_text else word
    audio, latency = synthesize_last_word(cumulative_text, net_g, hps, apply_voice_conversion=False)
    audio_chunks.append(audio)
    latencies.append(latency)
    print(f"Latency for incremental synthesis up to word '{word}': {latency} seconds")

combined_audio_incremental = np.concatenate(audio_chunks)
total_latency_incremental = sum(latencies)

output_file_incremental = '../audio_incremental.wav'
sf.write(output_file_incremental, combined_audio_incremental, 16000)
ipd.display(ipd.Audio(output_file_incremental))
print(f"Total latency for incremental synthesis method: {total_latency_incremental} seconds")

Latency for original method: 0.4194920063018799 seconds
Encoder latency:  0.26077985763549805
Latency for incremental synthesis up to word 'Great': 0.7059881687164307 seconds
Encoder latency:  0.26068544387817383
Latency for incremental synthesis up to word 'to': 0.7416484355926514 seconds
Encoder latency:  0.2755880355834961
Latency for incremental synthesis up to word 'see': 0.7625894546508789 seconds
Encoder latency:  0.2590920925140381
Latency for incremental synthesis up to word 'you': 0.7805376052856445 seconds
Encoder latency:  0.2512085437774658
Latency for incremental synthesis up to word 'again!': 0.8388271331787109 seconds


Total latency for incremental synthesis method: 3.8295907974243164 seconds


### Voice Conversion

In [None]:
# Read the validation files one utterance per batch, in file order, and
# materialize every batch up front so individual items can be indexed later.
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)
data_list = [batch for batch in loader]

In [None]:
# Convert the first validation utterance to three target speaker ids (1, 2, 4)
# and play the original followed by each conversion.
with torch.no_grad():
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [t.cuda() for t in data_list[0]]
    sid_tgt1, sid_tgt2, sid_tgt3 = [torch.LongTensor([spk]).cuda() for spk in (1, 2, 4)]
    # First returned item is the waveform; keep batch 0 / channel 0 as float numpy.
    audio1, audio2, audio3 = [
        net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=tgt)[0][0,0].data.cpu().float().numpy()
        for tgt in (sid_tgt1, sid_tgt2, sid_tgt3)
    ]

print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
for tgt_sid, converted in zip((sid_tgt1, sid_tgt2, sid_tgt3), (audio1, audio2, audio3)):
    print("Converted SID: %d" % tgt_sid.item())
    ipd.display(ipd.Audio(converted, rate=hps.data.sampling_rate, normalize=False))