In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

## LJ Speech

In [2]:
hps = utils.get_hparams_from_file("./configs/ljs_base.json")

In [3]:
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).cuda()
_ = net_g.eval()

_ = utils.load_checkpoint("/home/ucsf/gimlet/repos/task_gimlet/gimlet3.1/results/bravo3/models/118-kl_cjc_rnnt_multimodal_1_16_24_tts/pretrained_ljs.pth", net_g, None)

INFO:root:Loaded checkpoint '/home/ucsf/gimlet/repos/task_gimlet/gimlet3.1/results/bravo3/models/118-kl_cjc_rnnt_multimodal_1_16_24_tts/pretrained_ljs.pth' (iteration 0)


In [4]:
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

## VCTK

In [None]:
hps = utils.get_hparams_from_file("./configs/vctk_base.json")

In [None]:
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
_ = net_g.eval()

_ = utils.load_checkpoint("/path/to/pretrained_vctk.pth", net_g, None)

In [None]:
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([4]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

### BRAVO tests

In [6]:
import librosa
import time
import soundfile as sf
from transformers import set_seed

help(net_g.infer)

# Prepare the input
text_prompt = 'Great to see you again!'
stn_tst = get_text(text_prompt, hps)
x_tst = stn_tst.unsqueeze(0).cuda()
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()

# Set seed for reproducibility
set_seed(42)

# Perform inference
with torch.no_grad():
    start = time.time()
    outputs = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
    end = time.time()

# Extract the outputs
audio, attn, y_mask, (z, z_p, m_p, logs_p), w, w_ceil = outputs

# Print the shapes
print("input shape:", stn_tst.shape)
print("audio shape:", audio.shape)
print("attn shape:", attn.shape)
print("y_mask shape:", y_mask.shape)
print("z shape:", z.shape)
print("z_p shape:", z_p.shape)
print("m_p shape:", m_p.shape)
print("logs_p shape:", logs_p.shape)
print("w shape:", w.shape)
print("w_ceil shape:", w_ceil.shape)

# Processing for audio playback
audio_processed = audio[0,0].data.cpu().float().numpy()
audio_processed = librosa.resample(audio_processed, orig_sr=hps.data.sampling_rate, target_sr=16000)

# Write to file and display
output_file = '../audio.wav'
sf.write(output_file, audio_processed, 16000)
ipd.display(ipd.Audio(output_file))

# Print the time taken for inference
print("Time taken for inference:", end - start)

Help on method infer in module models:

infer(x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1.0, max_len=None) method of models.SynthesizerTrn instance

input shape: torch.Size([51])
audio shape: torch.Size([1, 1, 33536])
attn shape: torch.Size([1, 1, 131, 51])
y_mask shape: torch.Size([1, 1, 131])
z shape: torch.Size([1, 192, 131])
z_p shape: torch.Size([1, 192, 131])
m_p shape: torch.Size([1, 192, 131])
logs_p shape: torch.Size([1, 192, 131])
w shape: torch.Size([1, 1, 51])
w_ceil shape: torch.Size([1, 1, 51])
DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=86)
           2	LOAD_GLOBAL(arg=0, lineno=86)
           4	LOAD_FAST(arg=0, lineno=86)
           6	LOAD_FAST(arg=1, lineno=86)
           8	LOAD_FAST(arg=2, lineno=86)
          10	LOAD_FAST(arg=3, lineno=86)
          12	LOAD_FAST(arg=4, lineno=86)
          14	LOAD_FAST(arg=5, lineno=86)
          16	LOAD_FAST(arg=6, lineno=86)
          18	CALL_FUNCTION(arg=7, lineno=86)
     

DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
DEBUG:numba.core.byteflow:stack: []
DEBUG:numba.core.byteflow:dispatch pc=0, inst=NOP(arg=None, lineno=9)
DEBUG:numba.core.byteflow:stack []
DEBUG:numba.core.byteflow:dispatch pc=2, inst=LOAD_GLOBAL(arg=0, lineno=9)
DEBUG:numba.core.byteflow:stack []
DEBUG:numba.core.byteflow:dispatch pc=4, inst=LOAD_FAST(arg=5, lineno=9)
DEBUG:numba.core.byteflow:stack ['$2load_global.0']
DEBUG:numba.core.byteflow:dispatch pc=6, inst=LOAD_FAST(arg=4, lineno=9)
DEBUG:numba.core.byteflow:stack ['$2load_global.0', '$scale4.1']
DEBUG:numba.core.byteflow:dispatch pc=8, inst=BINARY_MULTIPLY(arg=None, lineno=9)
DEBUG:numba.core.byteflow:stack ['$2load_global.0', '$scale4.1', '$num_table6.2']
DEBUG:numba.core.byteflow:dispatch pc=10, inst=CALL_FUNCTION(arg=1, lineno=9)
DEBUG:numba.core.byteflow:stack ['$2load_global.0', '$8binary_multiply.3']
DEBUG:numba.core.byteflow:dispatch pc=12, inst=STORE_FAST(arg=7, lineno=9)
DEBUG:numba.c

DEBUG:numba.core.byteflow:dispatch pc=112, inst=STORE_FAST(arg=10, lineno=30)
DEBUG:numba.core.byteflow:stack ['$phi84.0', '$110binary_multiply.12']
DEBUG:numba.core.byteflow:dispatch pc=114, inst=LOAD_FAST(arg=10, lineno=33)
DEBUG:numba.core.byteflow:stack ['$phi84.0']
DEBUG:numba.core.byteflow:dispatch pc=116, inst=LOAD_FAST(arg=4, lineno=33)
DEBUG:numba.core.byteflow:stack ['$phi84.0', '$frac114.13']
DEBUG:numba.core.byteflow:dispatch pc=118, inst=BINARY_MULTIPLY(arg=None, lineno=33)
DEBUG:numba.core.byteflow:stack ['$phi84.0', '$frac114.13', '$num_table116.14']
DEBUG:numba.core.byteflow:dispatch pc=120, inst=STORE_FAST(arg=11, lineno=33)
DEBUG:numba.core.byteflow:stack ['$phi84.0', '$118binary_multiply.15']
DEBUG:numba.core.byteflow:dispatch pc=122, inst=LOAD_GLOBAL(arg=0, lineno=34)
DEBUG:numba.core.byteflow:stack ['$phi84.0']
DEBUG:numba.core.byteflow:dispatch pc=124, inst=LOAD_FAST(arg=11, lineno=34)
DEBUG:numba.core.byteflow:stack ['$phi84.0', '$122load_global.16']
DEBUG:numba.

DEBUG:numba.core.byteflow:dispatch pc=280, inst=BINARY_SUBTRACT(arg=None, lineno=60)
DEBUG:numba.core.byteflow:stack ['$phi238.0', '$270load_global.13', '$276binary_subtract.16', '$const278.17']
DEBUG:numba.core.byteflow:dispatch pc=282, inst=LOAD_FAST(arg=15, lineno=60)
DEBUG:numba.core.byteflow:stack ['$phi238.0', '$270load_global.13', '$280binary_subtract.18']
DEBUG:numba.core.byteflow:dispatch pc=284, inst=LOAD_FAST(arg=12, lineno=60)
DEBUG:numba.core.byteflow:stack ['$phi238.0', '$270load_global.13', '$280binary_subtract.18', '$nwin282.19']
DEBUG:numba.core.byteflow:dispatch pc=286, inst=BINARY_SUBTRACT(arg=None, lineno=60)
DEBUG:numba.core.byteflow:stack ['$phi238.0', '$270load_global.13', '$280binary_subtract.18', '$nwin282.19', '$offset284.20']
DEBUG:numba.core.byteflow:dispatch pc=288, inst=LOAD_FAST(arg=7, lineno=60)
DEBUG:numba.core.byteflow:stack ['$phi238.0', '$270load_global.13', '$280binary_subtract.18', '$286binary_subtract.21']
DEBUG:numba.core.byteflow:dispatch pc=290

DEBUG:numba.core.byteflow:dispatch pc=230, inst=INPLACE_ADD(arg=None, lineno=47)
DEBUG:numba.core.byteflow:stack ['$phi170.0', '$phi170.1', '$y208.20', '$t210.21', '$214binary_subscr.24', '$228binary_multiply.31']
DEBUG:numba.core.byteflow:dispatch pc=232, inst=ROT_THREE(arg=None, lineno=47)
DEBUG:numba.core.byteflow:stack ['$phi170.0', '$phi170.1', '$y208.20', '$t210.21', '$230inplace_add.32']
DEBUG:numba.core.byteflow:dispatch pc=234, inst=STORE_SUBSCR(arg=None, lineno=47)
DEBUG:numba.core.byteflow:stack ['$phi170.0', '$phi170.1', '$230inplace_add.32', '$y208.20', '$t210.21']
DEBUG:numba.core.byteflow:dispatch pc=236, inst=JUMP_ABSOLUTE(arg=168, lineno=47)
DEBUG:numba.core.byteflow:stack ['$phi170.0', '$phi170.1']
DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=168, stack=('$phi170.0', '$phi170.1'), blockstack=(), npush=0)]
DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=304 nstack_initial=2), State(pc_initial=168 nstack_initial=2)])
DEBUG:numba.core.byteflow:stack: ['

DEBUG:numba.core.byteflow:stack ['$phi306.0', '$phi306.1', '$y344.20', '$t346.21', '$350binary_subscr.24', '$weight352.25', '$366binary_subscr.32']
DEBUG:numba.core.byteflow:dispatch pc=370, inst=INPLACE_ADD(arg=None, lineno=66)
DEBUG:numba.core.byteflow:stack ['$phi306.0', '$phi306.1', '$y344.20', '$t346.21', '$350binary_subscr.24', '$368binary_multiply.33']
DEBUG:numba.core.byteflow:dispatch pc=372, inst=ROT_THREE(arg=None, lineno=66)
DEBUG:numba.core.byteflow:stack ['$phi306.0', '$phi306.1', '$y344.20', '$t346.21', '$370inplace_add.34']
DEBUG:numba.core.byteflow:dispatch pc=374, inst=STORE_SUBSCR(arg=None, lineno=66)
DEBUG:numba.core.byteflow:stack ['$phi306.0', '$phi306.1', '$370inplace_add.34', '$y344.20', '$t346.21']
DEBUG:numba.core.byteflow:dispatch pc=376, inst=JUMP_ABSOLUTE(arg=304, lineno=66)
DEBUG:numba.core.byteflow:stack ['$phi306.0', '$phi306.1']
DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=304, stack=('$phi306.0', '$phi306.1'), blockstack=(), npush=0)]
DEBUG:numb

DEBUG:numba.core.byteflow:changing phismap: defaultdict(<class 'set'>,
            {'$phi168.0': {('$78get_iter.27',
                            State(pc_initial=0 nstack_initial=0))},
             '$phi168.1': {('$166get_iter.35',
                            State(pc_initial=84 nstack_initial=2))},
             '$phi170.0': {('$78get_iter.27',
                            State(pc_initial=0 nstack_initial=0))},
             '$phi170.1': {('$166get_iter.35',
                            State(pc_initial=84 nstack_initial=2))},
             '$phi170.2': {('$168for_iter.3',
                            State(pc_initial=168 nstack_initial=2))},
             '$phi238.0': {('$78get_iter.27',
                            State(pc_initial=0 nstack_initial=0))},
             '$phi304.0': {('$78get_iter.27',
                            State(pc_initial=0 nstack_initial=0))},
             '$phi304.1': {('$302get_iter.28',
                            State(pc_initial=238 nstack_initial=1))},
        

DEBUG:numba.core.byteflow:block_infos State(pc_initial=238 nstack_initial=1):
AdaptBlockInfo(insts=((238, {'res': '$scale238.1'}), (240, {'res': '$frac240.2'}), (242, {'lhs': '$scale238.1', 'rhs': '$frac240.2', 'res': '$242binary_subtract.3'}), (244, {'value': '$242binary_subtract.3'}), (246, {'res': '$frac246.4'}), (248, {'res': '$num_table248.5'}), (250, {'lhs': '$frac246.4', 'rhs': '$num_table248.5', 'res': '$250binary_multiply.6'}), (252, {'value': '$250binary_multiply.6'}), (254, {'res': '$254load_global.7'}), (256, {'res': '$index_frac256.8'}), (258, {'func': '$254load_global.7', 'args': ['$index_frac256.8'], 'res': '$258call_function.9'}), (260, {'value': '$258call_function.9'}), (262, {'res': '$index_frac262.10'}), (264, {'res': '$offset264.11'}), (266, {'lhs': '$index_frac262.10', 'rhs': '$offset264.11', 'res': '$266binary_subtract.12'}), (268, {'value': '$266binary_subtract.12'}), (270, {'res': '$270load_global.13'}), (272, {'res': '$n_orig272.14'}), (274, {'res': '$n274.15'}

DEBUG:numba.core.ssa:==== SSA block analysis pass on 0
DEBUG:numba.core.ssa:Running <numba.core.ssa._GatherDefsHandler object at 0x7f3c0c13bd00>
DEBUG:numba.core.ssa:on stmt: x = arg(0, name=x)
DEBUG:numba.core.ssa:on stmt: t_out = arg(1, name=t_out)
DEBUG:numba.core.ssa:on stmt: interp_win = arg(2, name=interp_win)
DEBUG:numba.core.ssa:on stmt: interp_delta = arg(3, name=interp_delta)
DEBUG:numba.core.ssa:on stmt: num_table = arg(4, name=num_table)
DEBUG:numba.core.ssa:on stmt: scale = arg(5, name=scale)
DEBUG:numba.core.ssa:on stmt: y = arg(6, name=y)
DEBUG:numba.core.ssa:on stmt: $2load_global.0 = global(int: <class 'int'>)
DEBUG:numba.core.ssa:on stmt: $8binary_multiply.3 = scale * num_table
DEBUG:numba.core.ssa:on stmt: index_step = call $2load_global.0($8binary_multiply.3, func=$2load_global.0, args=[Var($8binary_multiply.3, interpn.py:9)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: time_register = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt

DEBUG:numba.core.ssa:on stmt: $const278.17 = const(int, 1)
DEBUG:numba.core.ssa:on stmt: $280binary_subtract.18 = $276binary_subtract.16 - $const278.17
DEBUG:numba.core.ssa:on stmt: $286binary_subtract.21 = nwin - offset
DEBUG:numba.core.ssa:on stmt: $290binary_floor_divide.23 = $286binary_subtract.21 // index_step
DEBUG:numba.core.ssa:on stmt: k_max = call $270load_global.13($280binary_subtract.18, $290binary_floor_divide.23, func=$270load_global.13, args=[Var($280binary_subtract.18, interpn.py:60), Var($290binary_floor_divide.23, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $296load_global.25 = global(range: <class 'range'>)
DEBUG:numba.core.ssa:on stmt: $300call_function.27 = call $296load_global.25(k_max, func=$296load_global.25, args=[Var(k_max, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $302get_iter.28 = getiter(value=$300call_function.27)
DEBUG:numba.core.ssa:on stmt: $phi304.

DEBUG:numba.core.ssa:SSA violators {'n', 'eta', 'frac', 'index_frac', 'offset', 'weight', 'time_register'}
DEBUG:numba.core.ssa:Fix SSA violator on var n
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 0
DEBUG:numba.core.ssa:Running <numba.core.ssa._FreshVarHandler object at 0x7f3d9c1ef430>
DEBUG:numba.core.ssa:on stmt: x = arg(0, name=x)
DEBUG:numba.core.ssa:on stmt: t_out = arg(1, name=t_out)
DEBUG:numba.core.ssa:on stmt: interp_win = arg(2, name=interp_win)
DEBUG:numba.core.ssa:on stmt: interp_delta = arg(3, name=interp_delta)
DEBUG:numba.core.ssa:on stmt: num_table = arg(4, name=num_table)
DEBUG:numba.core.ssa:on stmt: scale = arg(5, name=scale)
DEBUG:numba.core.ssa:on stmt: y = arg(6, name=y)
DEBUG:numba.core.ssa:on stmt: $2load_global.0 = global(int: <class 'int'>)
DEBUG:numba.core.ssa:on stmt: $8binary_multiply.3 = scale * num_table
DEBUG:numba.core.ssa:on stmt: index_step = call $2load_global.0($8binary_multiply.3, func=$2load_global.0, args=[Var($8binary_multiply.3, interp

DEBUG:numba.core.ssa:on stmt: offset = call $254load_global.7(index_frac, func=$254load_global.7, args=[Var(index_frac, interpn.py:14)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: eta = index_frac - offset
DEBUG:numba.core.ssa:on stmt: $270load_global.13 = global(min: <built-in function min>)
DEBUG:numba.core.ssa:on stmt: $276binary_subtract.16 = n_orig - n
DEBUG:numba.core.ssa:on stmt: $const278.17 = const(int, 1)
DEBUG:numba.core.ssa:on stmt: $280binary_subtract.18 = $276binary_subtract.16 - $const278.17
DEBUG:numba.core.ssa:on stmt: $286binary_subtract.21 = nwin - offset
DEBUG:numba.core.ssa:on stmt: $290binary_floor_divide.23 = $286binary_subtract.21 // index_step
DEBUG:numba.core.ssa:on stmt: k_max = call $270load_global.13($280binary_subtract.18, $290binary_floor_divide.23, func=$270load_global.13, args=[Var($280binary_subtract.18, interpn.py:60), Var($290binary_floor_divide.23, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)


DEBUG:numba.core.ssa:replaced with: $108binary_subtract.11 = time_register - n.1
DEBUG:numba.core.ssa:on stmt: frac = scale * $108binary_subtract.11
DEBUG:numba.core.ssa:on stmt: index_frac = frac * num_table
DEBUG:numba.core.ssa:on stmt: $122load_global.16 = global(int: <class 'int'>)
DEBUG:numba.core.ssa:on stmt: offset = call $122load_global.16(index_frac, func=$122load_global.16, args=[Var(index_frac, interpn.py:14)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: eta = index_frac - offset
DEBUG:numba.core.ssa:on stmt: $138load_global.22 = global(min: <built-in function min>)
DEBUG:numba.core.ssa:on stmt: $const142.24 = const(int, 1)
DEBUG:numba.core.ssa:on stmt: $144binary_add.25 = n + $const142.24
DEBUG:numba.core.ssa:find_def var='n' stmt=$144binary_add.25 = n + $const142.24
DEBUG:numba.core.ssa:replaced with: $144binary_add.25 = n.1 + $const142.24
DEBUG:numba.core.ssa:on stmt: $150binary_subtract.28 = nwin - offset
DEBUG:numba.core.ssa:on stmt: $

DEBUG:numba.core.ssa:find_def var='n' stmt=$360binary_add.29 = n + k
DEBUG:numba.core.ssa:find_def_from_top label 306
DEBUG:numba.core.ssa:idom 304 from label 306
DEBUG:numba.core.ssa:find_def_from_bottom label 304
DEBUG:numba.core.ssa:find_def_from_top label 304
DEBUG:numba.core.ssa:idom 238 from label 304
DEBUG:numba.core.ssa:find_def_from_bottom label 238
DEBUG:numba.core.ssa:find_def_from_top label 238
DEBUG:numba.core.ssa:idom 168 from label 238
DEBUG:numba.core.ssa:find_def_from_bottom label 168
DEBUG:numba.core.ssa:find_def_from_top label 168
DEBUG:numba.core.ssa:idom 84 from label 168
DEBUG:numba.core.ssa:find_def_from_bottom label 84
DEBUG:numba.core.ssa:replaced with: $360binary_add.29 = n.1 + k
DEBUG:numba.core.ssa:on stmt: $const362.30 = const(int, 1)
DEBUG:numba.core.ssa:on stmt: $364binary_add.31 = $360binary_add.29 + $const362.30
DEBUG:numba.core.ssa:on stmt: $366binary_subscr.32 = getitem(value=x, index=$364binary_add.31, fn=<built-in function getitem>)
DEBUG:numba.core

DEBUG:numba.core.ssa:on stmt: $184binary_subscr.9 = getitem(value=interp_win, index=$182binary_add.8, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $196binary_multiply.15 = i * index_step
DEBUG:numba.core.ssa:on stmt: $198binary_add.16 = offset + $196binary_multiply.15
DEBUG:numba.core.ssa:on stmt: $200binary_subscr.17 = getitem(value=interp_delta, index=$198binary_add.16, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $202binary_multiply.18 = eta * $200binary_subscr.17
DEBUG:numba.core.ssa:on stmt: weight = $184binary_subscr.9 + $202binary_multiply.18
DEBUG:numba.core.ssa:on stmt: $214binary_subscr.24 = getitem(value=y, index=t, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $224binary_subtract.29 = n.1 - i
DEBUG:numba.core.ssa:on stmt: $226binary_subscr.30 = getitem(value=x, index=$224binary_subtract.29, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $228binary_multiply.31 = weight * $226binary_subscr.30
DEBUG:numba.core.ss

DEBUG:numba.core.ssa:on stmt: $const66.22 = const(int, 0)
DEBUG:numba.core.ssa:on stmt: n_out = static_getitem(value=$64load_attr.21, index=0, index_var=$const66.22, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $72load_global.24 = global(prange: <class 'numba.misc.special.prange'>)
DEBUG:numba.core.ssa:on stmt: $76call_function.26 = call $72load_global.24(n_out, func=$72load_global.24, args=[Var(n_out, interpn.py:21)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $78get_iter.27 = getiter(value=$76call_function.26)
DEBUG:numba.core.ssa:on stmt: $phi80.0 = $78get_iter.27
DEBUG:numba.core.ssa:on stmt: jump 80
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 80
DEBUG:numba.core.ssa:Running <numba.core.ssa._FixSSAVars object at 0x7f3c0c136d30>
DEBUG:numba.core.ssa:on stmt: $80for_iter.1 = iternext(value=$phi80.0)
DEBUG:numba.core.ssa:on stmt: $80for_iter.2 = pair_first(value=$80for_iter.1)
DEBUG:numba.core.ssa:on stmt: $80for_iter.3 = pa

DEBUG:numba.core.ssa:on stmt: branch $304for_iter.4, 306, 380
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 306
DEBUG:numba.core.ssa:Running <numba.core.ssa._FixSSAVars object at 0x7f3c0c136d30>
DEBUG:numba.core.ssa:on stmt: k = $phi306.2
DEBUG:numba.core.ssa:on stmt: $316binary_multiply.7 = k * index_step
DEBUG:numba.core.ssa:on stmt: $318binary_add.8 = offset + $316binary_multiply.7
DEBUG:numba.core.ssa:on stmt: $320binary_subscr.9 = getitem(value=interp_win, index=$318binary_add.8, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $332binary_multiply.15 = k * index_step
DEBUG:numba.core.ssa:on stmt: $334binary_add.16 = offset + $332binary_multiply.15
DEBUG:numba.core.ssa:on stmt: $336binary_subscr.17 = getitem(value=interp_delta, index=$334binary_add.16, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $338binary_multiply.18 = eta * $336binary_subscr.17
DEBUG:numba.core.ssa:find_def var='eta' stmt=$338binary_multiply.18 = eta * $336binary_subscr.17
DEB

DEBUG:numba.core.ssa:on stmt: $phi168.1 = $166get_iter.35
DEBUG:numba.core.ssa:on stmt: jump 168
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 168
DEBUG:numba.core.ssa:Running <numba.core.ssa._FreshVarHandler object at 0x7f3c0c0c4d00>
DEBUG:numba.core.ssa:on stmt: $168for_iter.2 = iternext(value=$phi168.1)
DEBUG:numba.core.ssa:on stmt: $168for_iter.3 = pair_first(value=$168for_iter.2)
DEBUG:numba.core.ssa:on stmt: $168for_iter.4 = pair_second(value=$168for_iter.2)
DEBUG:numba.core.ssa:on stmt: $phi170.2 = $168for_iter.3
DEBUG:numba.core.ssa:on stmt: branch $168for_iter.4, 170, 238
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 170
DEBUG:numba.core.ssa:Running <numba.core.ssa._FreshVarHandler object at 0x7f3c0c0c4d00>
DEBUG:numba.core.ssa:on stmt: i = $phi170.2
DEBUG:numba.core.ssa:on stmt: $180binary_multiply.7 = i * index_step
DEBUG:numba.core.ssa:on stmt: $182binary_add.8 = offset + $180binary_multiply.7
DEBUG:numba.core.ssa:on stmt: $184binary_subscr.9 = getitem(value=int

DEBUG:numba.core.ssa:on stmt: n = const(int, 0)
DEBUG:numba.core.ssa:on stmt: frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: index_frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: offset = const(int, 0)
DEBUG:numba.core.ssa:on stmt: eta = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: weight = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: $44load_attr.13 = getattr(value=interp_win, attr=shape)
DEBUG:numba.core.ssa:on stmt: $const46.14 = const(int, 0)
DEBUG:numba.core.ssa:on stmt: nwin = static_getitem(value=$44load_attr.13, index=0, index_var=$const46.14, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $54load_attr.17 = getattr(value=x, attr=shape)
DEBUG:numba.core.ssa:on stmt: $const56.18 = const(int, 0)
DEBUG:numba.core.ssa:on stmt: n_orig = static_getitem(value=$54load_attr.17, index=0, index_var=$const56.18, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $64load_attr.21 = getattr(value=t_out, attr=shape)
DEBUG:numba.core.ssa:on stmt: $c

DEBUG:numba.core.ssa:on stmt: $290binary_floor_divide.23 = $286binary_subtract.21 // index_step
DEBUG:numba.core.ssa:on stmt: k_max = call $270load_global.13($280binary_subtract.18, $290binary_floor_divide.23, func=$270load_global.13, args=[Var($280binary_subtract.18, interpn.py:60), Var($290binary_floor_divide.23, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $296load_global.25 = global(range: <class 'range'>)
DEBUG:numba.core.ssa:on stmt: $300call_function.27 = call $296load_global.25(k_max, func=$296load_global.25, args=[Var(k_max, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $302get_iter.28 = getiter(value=$300call_function.27)
DEBUG:numba.core.ssa:on stmt: $phi304.1 = $302get_iter.28
DEBUG:numba.core.ssa:on stmt: jump 304
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 304
DEBUG:numba.core.ssa:Running <numba.core.ssa._FixSSAVars object at 0x7f3c0c0c6310>
DEBUG:numba.core.ssa:on

DEBUG:numba.core.ssa:on stmt: $150binary_subtract.28 = nwin - offset
DEBUG:numba.core.ssa:on stmt: $154binary_floor_divide.30 = $150binary_subtract.28 // index_step
DEBUG:numba.core.ssa:on stmt: i_max = call $138load_global.22($144binary_add.25, $154binary_floor_divide.30, func=$138load_global.22, args=[Var($144binary_add.25, interpn.py:40), Var($154binary_floor_divide.30, interpn.py:40)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $160load_global.32 = global(range: <class 'range'>)
DEBUG:numba.core.ssa:on stmt: $164call_function.34 = call $160load_global.32(i_max, func=$160load_global.32, args=[Var(i_max, interpn.py:40)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $166get_iter.35 = getiter(value=$164call_function.34)
DEBUG:numba.core.ssa:on stmt: $phi168.1 = $166get_iter.35
DEBUG:numba.core.ssa:on stmt: jump 168
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 168
DEBUG:numba.core.ssa:Running <numba.core.ssa._F

DEBUG:numba.core.ssa:on stmt: t_out = arg(1, name=t_out)
DEBUG:numba.core.ssa:on stmt: interp_win = arg(2, name=interp_win)
DEBUG:numba.core.ssa:on stmt: interp_delta = arg(3, name=interp_delta)
DEBUG:numba.core.ssa:on stmt: num_table = arg(4, name=num_table)
DEBUG:numba.core.ssa:on stmt: scale = arg(5, name=scale)
DEBUG:numba.core.ssa:on stmt: y = arg(6, name=y)
DEBUG:numba.core.ssa:on stmt: $2load_global.0 = global(int: <class 'int'>)
DEBUG:numba.core.ssa:on stmt: $8binary_multiply.3 = scale * num_table
DEBUG:numba.core.ssa:on stmt: index_step = call $2load_global.0($8binary_multiply.3, func=$2load_global.0, args=[Var($8binary_multiply.3, interpn.py:9)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: time_register = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: n = const(int, 0)
DEBUG:numba.core.ssa:on stmt: frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: index_frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: offset = const(int, 0)
DEBUG

DEBUG:numba.core.ssa:find_def var='index_frac' stmt=offset = call $254load_global.7(index_frac, func=$254load_global.7, args=[Var(index_frac, interpn.py:14)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:replaced with: offset = call $254load_global.7(index_frac.2, func=$254load_global.7, args=[Var(index_frac.2, interpn.py:53)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: eta.2 = index_frac - offset
DEBUG:numba.core.ssa:find_def var='index_frac' stmt=eta.2 = index_frac - offset
DEBUG:numba.core.ssa:replaced with: eta.2 = index_frac.2 - offset
DEBUG:numba.core.ssa:on stmt: $270load_global.13 = global(min: <built-in function min>)
DEBUG:numba.core.ssa:on stmt: $276binary_subtract.16 = n_orig - n.1
DEBUG:numba.core.ssa:on stmt: $const278.17 = const(int, 1)
DEBUG:numba.core.ssa:on stmt: $280binary_subtract.18 = $276binary_subtract.16 - $const278.17
DEBUG:numba.core.ssa:on stmt: $286binary_subtract.21 = nwin - offset
DEBUG:numba.cor

DEBUG:numba.core.ssa:on stmt: n.1 = call $94load_global.5(time_register, func=$94load_global.5, args=[Var(time_register, interpn.py:10)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $108binary_subtract.11 = time_register - n.1
DEBUG:numba.core.ssa:on stmt: frac.1 = scale * $108binary_subtract.11
DEBUG:numba.core.ssa:on stmt: index_frac.1 = frac.1 * num_table
DEBUG:numba.core.ssa:on stmt: $122load_global.16 = global(int: <class 'int'>)
DEBUG:numba.core.ssa:on stmt: offset = call $122load_global.16(index_frac.1, func=$122load_global.16, args=[Var(index_frac.1, interpn.py:33)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:replaced with: offset.1 = call $122load_global.16(index_frac.1, func=$122load_global.16, args=[Var(index_frac.1, interpn.py:33)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: eta.1 = index_frac.1 - offset
DEBUG:numba.core.ssa:on stmt: $138load_global.22 = global(min: <built-in f

DEBUG:numba.core.ssa:on stmt: $370inplace_add.34 = inplace_binop(fn=<built-in function iadd>, immutable_fn=<built-in function add>, lhs=$350binary_subscr.24, rhs=$368binary_multiply.33, static_lhs=Undefined, static_rhs=Undefined)
DEBUG:numba.core.ssa:on stmt: y[t] = $370inplace_add.34
DEBUG:numba.core.ssa:on stmt: jump 304
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 380
DEBUG:numba.core.ssa:Running <numba.core.ssa._FreshVarHandler object at 0x7f3c0c0c6c40>
DEBUG:numba.core.ssa:on stmt: jump 80
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 382
DEBUG:numba.core.ssa:Running <numba.core.ssa._FreshVarHandler object at 0x7f3c0c0c6c40>
DEBUG:numba.core.ssa:on stmt: $const382.0 = const(NoneType, None)
DEBUG:numba.core.ssa:on stmt: $384return_value.1 = cast(value=$const382.0)
DEBUG:numba.core.ssa:on stmt: return $384return_value.1
DEBUG:numba.core.ssa:Replaced assignments: defaultdict(<class 'list'>,
            {0: [<numba.core.ir.Assign object at 0x7f3c0c1dec40>],
             8

DEBUG:numba.core.ssa:on stmt: $198binary_add.16 = offset + $196binary_multiply.15
DEBUG:numba.core.ssa:find_def var='offset' stmt=$198binary_add.16 = offset + $196binary_multiply.15
DEBUG:numba.core.ssa:find_def_from_top label 170
DEBUG:numba.core.ssa:idom 168 from label 170
DEBUG:numba.core.ssa:find_def_from_bottom label 168
DEBUG:numba.core.ssa:find_def_from_top label 168
DEBUG:numba.core.ssa:idom 84 from label 168
DEBUG:numba.core.ssa:find_def_from_bottom label 84
DEBUG:numba.core.ssa:replaced with: $198binary_add.16 = offset.1 + $196binary_multiply.15
DEBUG:numba.core.ssa:on stmt: $200binary_subscr.17 = getitem(value=interp_delta, index=$198binary_add.16, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:on stmt: $202binary_multiply.18 = eta.1 * $200binary_subscr.17
DEBUG:numba.core.ssa:on stmt: weight = $184binary_subscr.9 + $202binary_multiply.18
DEBUG:numba.core.ssa:on stmt: $214binary_subscr.24 = getitem(value=y, index=t, fn=<built-in function getitem>)
DEBUG:numba.core.ssa:

DEBUG:numba.core.ssa:on stmt: index_step = call $2load_global.0($8binary_multiply.3, func=$2load_global.0, args=[Var($8binary_multiply.3, interpn.py:9)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: time_register = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: n = const(int, 0)
DEBUG:numba.core.ssa:on stmt: frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: index_frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: offset = const(int, 0)
DEBUG:numba.core.ssa:on stmt: eta = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: weight = const(float, 0.0)
DEBUG:numba.core.ssa:first assign: weight
DEBUG:numba.core.ssa:replaced with: weight = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: $44load_attr.13 = getattr(value=interp_win, attr=shape)
DEBUG:numba.core.ssa:on stmt: $const46.14 = const(int, 0)
DEBUG:numba.core.ssa:on stmt: nwin = static_getitem(value=$44load_attr.13, index=0, index_var=$const46.14, fn=<built-in function getitem>)
DEBUG:numba.core.

DEBUG:numba.core.ssa:on stmt: $296load_global.25 = global(range: <class 'range'>)
DEBUG:numba.core.ssa:on stmt: $300call_function.27 = call $296load_global.25(k_max, func=$296load_global.25, args=[Var(k_max, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $302get_iter.28 = getiter(value=$300call_function.27)
DEBUG:numba.core.ssa:on stmt: $phi304.1 = $302get_iter.28
DEBUG:numba.core.ssa:on stmt: jump 304
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 304
DEBUG:numba.core.ssa:Running <numba.core.ssa._FreshVarHandler object at 0x7f3c0c1d1a60>
DEBUG:numba.core.ssa:on stmt: $304for_iter.2 = iternext(value=$phi304.1)
DEBUG:numba.core.ssa:on stmt: $304for_iter.3 = pair_first(value=$304for_iter.2)
DEBUG:numba.core.ssa:on stmt: $304for_iter.4 = pair_second(value=$304for_iter.2)
DEBUG:numba.core.ssa:on stmt: $phi306.2 = $304for_iter.3
DEBUG:numba.core.ssa:on stmt: branch $304for_iter.4, 306, 380
DEBUG:numba.core.ssa:==== SSA block rewrite pass

DEBUG:numba.core.ssa:on stmt: $160load_global.32 = global(range: <class 'range'>)
DEBUG:numba.core.ssa:on stmt: $164call_function.34 = call $160load_global.32(i_max, func=$160load_global.32, args=[Var(i_max, interpn.py:40)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $166get_iter.35 = getiter(value=$164call_function.34)
DEBUG:numba.core.ssa:on stmt: $phi168.1 = $166get_iter.35
DEBUG:numba.core.ssa:on stmt: jump 168
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 168
DEBUG:numba.core.ssa:Running <numba.core.ssa._FixSSAVars object at 0x7f3c0c0c6760>
DEBUG:numba.core.ssa:on stmt: $168for_iter.2 = iternext(value=$phi168.1)
DEBUG:numba.core.ssa:on stmt: $168for_iter.3 = pair_first(value=$168for_iter.2)
DEBUG:numba.core.ssa:on stmt: $168for_iter.4 = pair_second(value=$168for_iter.2)
DEBUG:numba.core.ssa:on stmt: $phi170.2 = $168for_iter.3
DEBUG:numba.core.ssa:on stmt: branch $168for_iter.4, 170, 238
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 1

DEBUG:numba.core.ssa:on stmt: y = arg(6, name=y)
DEBUG:numba.core.ssa:on stmt: $2load_global.0 = global(int: <class 'int'>)
DEBUG:numba.core.ssa:on stmt: $8binary_multiply.3 = scale * num_table
DEBUG:numba.core.ssa:on stmt: index_step = call $2load_global.0($8binary_multiply.3, func=$2load_global.0, args=[Var($8binary_multiply.3, interpn.py:9)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: time_register = const(float, 0.0)
DEBUG:numba.core.ssa:first assign: time_register
DEBUG:numba.core.ssa:replaced with: time_register = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: n = const(int, 0)
DEBUG:numba.core.ssa:on stmt: frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: index_frac = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: offset = const(int, 0)
DEBUG:numba.core.ssa:on stmt: eta = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: weight = const(float, 0.0)
DEBUG:numba.core.ssa:on stmt: $44load_attr.13 = getattr(value=interp_win, attr=shape)
DEBUG:

DEBUG:numba.core.ssa:on stmt: $290binary_floor_divide.23 = $286binary_subtract.21 // index_step
DEBUG:numba.core.ssa:on stmt: k_max = call $270load_global.13($280binary_subtract.18, $290binary_floor_divide.23, func=$270load_global.13, args=[Var($280binary_subtract.18, interpn.py:60), Var($290binary_floor_divide.23, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $296load_global.25 = global(range: <class 'range'>)
DEBUG:numba.core.ssa:on stmt: $300call_function.27 = call $296load_global.25(k_max, func=$296load_global.25, args=[Var(k_max, interpn.py:60)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: $302get_iter.28 = getiter(value=$300call_function.27)
DEBUG:numba.core.ssa:on stmt: $phi304.1 = $302get_iter.28
DEBUG:numba.core.ssa:on stmt: jump 304
DEBUG:numba.core.ssa:==== SSA block rewrite pass on 304
DEBUG:numba.core.ssa:Running <numba.core.ssa._FreshVarHandler object at 0x7f3c0c0b8a00>
DEBUG:numba.core.s

DEBUG:numba.core.ssa:on stmt: index_frac.1 = frac.1 * num_table
DEBUG:numba.core.ssa:on stmt: $122load_global.16 = global(int: <class 'int'>)
DEBUG:numba.core.ssa:on stmt: offset.1 = call $122load_global.16(index_frac.1, func=$122load_global.16, args=[Var(index_frac.1, interpn.py:33)], kws=(), vararg=None, varkwarg=None, target=None)
DEBUG:numba.core.ssa:on stmt: eta.1 = index_frac.1 - offset.1
DEBUG:numba.core.ssa:on stmt: $138load_global.22 = global(min: <built-in function min>)
DEBUG:numba.core.ssa:on stmt: $const142.24 = const(int, 1)
DEBUG:numba.core.ssa:on stmt: $144binary_add.25 = n.1 + $const142.24
DEBUG:numba.core.ssa:on stmt: $150binary_subtract.28 = nwin - offset.1
DEBUG:numba.core.ssa:on stmt: $154binary_floor_divide.30 = $150binary_subtract.28 // index_step
DEBUG:numba.core.ssa:on stmt: i_max = call $138load_global.22($144binary_add.25, $154binary_floor_divide.30, func=$138load_global.22, args=[Var($144binary_add.25, interpn.py:40), Var($154binary_floor_divide.30, interpn.

DEBUG:numba.core.ssa:on stmt: return $384return_value.1


Time taken for inference: 0.06787633895874023


In [7]:
import torch
import librosa
import soundfile as sf
import IPython.display as ipd

# Provided input preparation
text_prompt = 'Great to see you again!'
stn_tst = get_text(text_prompt, hps)
full_length = stn_tst.size(0)

# Step 1: Automatically find the length of the last word
# Tokenize the full sentence
tokens_full = get_text(text_prompt, hps)
last_space = text_prompt.rfind(' ')
if last_space != -1:
    text_without_last_word = text_prompt[:last_space]
else:
    text_without_last_word = ''  # In case there's only one word in the text
tokens_without_last_word = get_text(text_without_last_word, hps)
length_without_last_word = len(tokens_without_last_word)

# Step 2: Calculate the length of the phonemes for the last word
length_last_word = full_length - length_without_last_word

# Step 3: Normalize the phoneme durations for the last word
phoneme_durations = w_ceil[0, 0, -length_last_word:].cpu().numpy()
normalized_durations = phoneme_durations / sum(w_ceil[0, 0, :].cpu().numpy())

# Step 4: Extract the corresponding audio segment
# Calculate the start and end frame for the last word in the audio
fraction_of_audio = sum(normalized_durations)
start_frame = int(audio.shape[2] * (1 - fraction_of_audio))
end_frame = audio.shape[2]
audio_segment = audio[0, 0, start_frame:end_frame].data.cpu().float().numpy()

# Step 5: Resample and play back the audio
audio_segment_resampled = librosa.resample(audio_segment, orig_sr=hps.data.sampling_rate, target_sr=16000)

# Write to file and play
output_file_segment = '../audio_last_word.wav'
sf.write(output_file_segment, audio_segment_resampled, 16000)
ipd.display(ipd.Audio(output_file_segment))

In [8]:
import librosa
import time
import soundfile as sf
import torch
import IPython.display as ipd
import numpy as np
import string
import random
import scipy
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

def synthesize_text(text, net_g, hps):
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        start = time.time()
        x_tst = stn_tst.unsqueeze(0).cuda()
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
        set_seed(42)
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
        end = time.time()
    return audio, end - start

# Original method
text_prompt = 'Great to see you again!'
audio, latency_original = synthesize_text(text_prompt, net_g, hps)
audio = librosa.resample(audio, orig_sr=hps.data.sampling_rate, target_sr=16000)
output_file = '../audio.wav'
sf.write(output_file, audio, 16000)
ipd.display(ipd.Audio(output_file))
print(f"Latency for original method: {latency_original} seconds")

# New method - word by word
words = text_prompt.split()
def synthesize_last_word(text, net_g, hps, apply_voice_conversion=False):
    
    # Tokenize the text
    start = time.time()
    stn_tst = get_text(text, hps)
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    print("Encoder latency: ", time.time()-start)

    # Perform inference
    with torch.no_grad():
        outputs = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
    audio, _, _, _, _, w_ceil = outputs

    # Extract the last word audio
    last_space = text.rfind(' ')
    text_without_last_word = text[:last_space] if last_space != -1 else ''
    tokens_without_last_word = get_text(text_without_last_word, hps)
    length_last_word = len(stn_tst) - len(tokens_without_last_word)
    phoneme_durations = w_ceil[0, 0, -length_last_word:].cpu().numpy()
    normalized_durations = phoneme_durations / sum(w_ceil[0, 0, :].cpu().numpy())
    fraction_of_audio = sum(normalized_durations)
    start_frame = int(audio.shape[2] * (1 - fraction_of_audio))
    end_frame = audio.shape[2]
    last_word_audio = audio[0, 0, start_frame:end_frame].data.cpu().float().numpy()

    # Resample the last word audio
    last_word_audio_resampled = librosa.resample(last_word_audio, orig_sr=hps.data.sampling_rate, target_sr=16000)

    # Check if voice conversion should be applied
    if apply_voice_conversion:
        
        # Define temporary file paths
        temp_input_wav = '../audio_word_level.wav'
        temp_output_wav = '../audio_word_level_converted.wav'
        speaker_wav = "../b3_voice/b3-slow.wav"
        model_dir = "tts_models/multilingual/multi-dataset/your_tts"

        # Write the resampled last word audio to a temporary file
        scipy.io.wavfile.write(temp_input_wav, 16000, last_word_audio_resampled)

        # Run the voice conversion command
        os.system('tts --model_name "{}" --language_idx="en" --speaker_wav "{}" '
                  '--reference_wav "{}" --out_path "{}" --use_cuda False'.format(model_dir, speaker_wav, temp_input_wav, temp_output_wav))
    
        # Load the converted audio
        converted_audio, sr = librosa.load(temp_output_wav)
                  
        # Resample to the desired sample rate if needed
        if sr != 16000:
            converted_audio = librosa.resample(converted_audio, orig_sr=sr, target_sr=16000)
    
        # Use the converted audio as the output
        last_word_audio_resampled = converted_audio

    # Calculate latency
    end = time.time()
    latency = end - start

    return last_word_audio_resampled, latency

# Incremental synthesis method
cumulative_text = ''
combined_audio_incremental = np.array([])
total_latency_incremental = 0

for word in words:
    cumulative_text += (' ' + word) if cumulative_text else word
    audio, latency = synthesize_last_word(cumulative_text, net_g, hps, apply_voice_conversion=False)
    combined_audio_incremental = np.concatenate((combined_audio_incremental, audio))
    total_latency_incremental += latency
    print(f"Latency for incremental synthesis up to word '{word}': {latency} seconds")

output_file_incremental = '../audio_incremental.wav'
sf.write(output_file_incremental, combined_audio_incremental, 16000)
ipd.display(ipd.Audio(output_file_incremental))
print(f"Total latency for incremental synthesis method: {total_latency_incremental} seconds")

Latency for original method: 0.0680687427520752 seconds
Encoder latency:  0.37519168853759766
Latency for incremental synthesis up to word 'Great': 1.2312917709350586 seconds
Encoder latency:  0.3796677589416504
Latency for incremental synthesis up to word 'to': 0.8583920001983643 seconds
Encoder latency:  0.36397385597229004
Latency for incremental synthesis up to word 'see': 0.8561248779296875 seconds
Encoder latency:  0.41966915130615234
Latency for incremental synthesis up to word 'you': 0.8917090892791748 seconds
Encoder latency:  0.40080761909484863
Latency for incremental synthesis up to word 'again!': 0.886730432510376 seconds


Total latency for incremental synthesis method: 4.724248170852661 seconds


### Voice Conversion

In [None]:
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(dataset, num_workers=8, shuffle=False,
    batch_size=1, pin_memory=True,
    drop_last=True, collate_fn=collate_fn)
data_list = list(loader)

In [None]:
with torch.no_grad():
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda() for x in data_list[0]]
    sid_tgt1 = torch.LongTensor([1]).cuda()
    sid_tgt2 = torch.LongTensor([2]).cuda()
    sid_tgt3 = torch.LongTensor([4]).cuda()
    audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy()
    audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy()
    audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy()
print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
print("Converted SID: %d" % sid_tgt1.item())
ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False))
print("Converted SID: %d" % sid_tgt2.item())
ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False))
print("Converted SID: %d" % sid_tgt3.item())
ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False))