<span style="font-size: 200%; color: red;">First! check the changing "your config.yaml" </span>

## [seq] import

In [1]:
import os
import json
import yaml
import sys
import time
import copy
import IPython.display as ipd
import pprint
from pathlib import Path
from tqdm import tqdm


import numpy as np
import torch
import torchaudio
from librosa.filters import mel as librosa_mel_fn
#import matplotlib
#matplotlib.use("Agg")
import matplotlib.pyplot as plt
from scipy.io.wavfile import write


import toybox

In [2]:
def plot_audio(audio, samplerate, title='time-domain waveform'):
    """
    usage:
        # audio is [channel, time(num_frames)] ex.torch.Size([1, 68608])
        # audio[0,:]: list of 1ch audio data
        # audio.shape[1]: int value of 1ch audio data length
        audio, sample_rate = torchaudio.load(str(iwav_path))
        %matplotlib inline
        plot_audio(audio, sample_rate)
    """
    # transform to mono
    channel = 0
    audio = audio[channel,:].view(1,-1)
    # to numpy
    audio = audio.to('cpu').detach().numpy().copy()
    time = np.linspace(0., audio.shape[1]/samplerate, audio.shape[1])
    
    fig, ax = plt.subplots(figsize=(12,9))
    
    ax.plot(time, audio[0, :])
    ax.set_title(title, fontsize=20, y=-0.12)
    ax.tick_params(direction='in')
    #ax.set_xlim(0, 3)
    ax.set_xlabel('Time')
    ax.set_ylabel('Amp')
    #ax.legend()
    plt.tight_layout()
    fig.canvas.draw()
    plt.show()
    #fig.savefig('figure.png')
    plt.close(fig)
    return fig

def plot_mel(tensors:list, titles:list[str]):
    """
    usage:
        mel = mel_process(...)
        fig_mel = plot_mel([mel_groundtruth[0], mel_prediction[0]],
                            ['groundtruth', 'inferenced(model)'])

    """
    xlim = max([t.shape[1] for t in tensors])
    fig, axs = plt.subplots(nrows=len(tensors),
                            ncols=1,
                            figsize=(12, 9),
                            constrained_layout=True)

    if len(tensors) == 1:
        axs = [axs]
    
    for i in range(len(tensors)):
        im = axs[i].imshow(tensors[i],
                           aspect="auto",
                           origin="lower",
                           interpolation='none')
        #plt.colorbar(im, ax=axs[i])
        fig.colorbar(im, ax=axs[i])
        axs[i].set_title(titles[i])
        axs[i].set_xlim([0, xlim])
    fig.canvas.draw()
    #plt.show()
    #plt.close()
    plt.close(fig)  # fig.close() 
    return fig

def convert_phn_to_id(phonemes, phn2id):
    """
    phonemes: phonemes separated by ' '
    phn2id: phn2id dict
    """
    return [phn2id[x] for x in ['<bos>'] + phonemes.split(' ') + ['<eos>']]


def text2phnid(text, phn2id, language='en', add_blank=True):
    if language == 'en':
        from text import G2pEn
        word2phn = G2pEn()
        phonemes = word2phn(text)
        if add_blank:
            phonemes = ' <blank> '.join(phonemes)
        return phonemes, convert_phn_to_id(phonemes, phn2id)
    else:
        raise ValueError(
            'Language should be en (for English)!')

In [3]:
#ckpt_file_dir: logs4model/<model_name>/<runtime_name>/ckpt/
#run_tfkfulmask_k3,run_tfkfulplus_k3 by gradtfkfultts
model_name = 'gradtfk5tts' #  gradseptts, gradtfktts, gradtfk5tts, gradtimektts, gradfreqktts, gradtfkful_mask, gradtfkful_plus
model_dir = 'run_tfk_k5'
runtime_name = 'infer4colb'
hifigan_versions = 'LJ_V1'
config_path4model = 'configs/config_exp_mid.yaml'
wav_dir_name = f'wav_{hifigan_versions}'
test_ds_path = Path('configs/test_dataset.json') #Path(config['test_datalist_path'])
#~/aoi/logs4model/gradtfk5tts/run_tfk_k5/ckpt/gradtfk5tts_500_397001.pt
ckpt_filename= 'gradtfk5tts_500_397001.pt'
# for runtime to load model
print(f'runtime_name: {runtime_name}')
ckpt_dir = f'logs4model/{model_name}/{model_dir}/ckpt'
ckpt_path = Path(ckpt_dir + "/" + ckpt_filename)


if ckpt_path.exists()==True:
    print(f"ckpt_path: {ckpt_path}")
else:
    print(f"Not find")

config = toybox.load_yaml_and_expand_var('configs/config_exp_mid.yaml')
print(model_name)
print(runtime_name)
n_mels: int = config['n_mels'] # 80
n_fft: int = config['n_fft'] # 1024
sample_rate: int = config['sample_rate'] # 22050
hop_size: int = config['hop_size'] # 256
win_size: int = config['win_size'] # 1024
f_min: int = config['f_min'] # 0
f_max: int = config['f_max'] # 8000
random_seed: int = 1234#config['random_seed'] # 42 or 1234
print(n_mels, n_fft, sample_rate, hop_size, win_size, f_min, f_max, random_seed)

print(f"phn2id_path: {config['phn2id_path']}")
with open(config['phn2id_path']) as f:
    phn2id = json.load(f)

vocab_size = len(phn2id) + 1

toybox.set_seed(random_seed)

runtime_name: infer4colb
ckpt_path: logs4model/gradtfk5tts/run_tfk_k5/ckpt/gradtfk5tts_500_397001.pt
gradtfk5tts
infer4colb
80 1024 22050 256 1024 0 8000 1234
phn2id_path: ./configs/phn2id.json


In [4]:
#devicesetting
import os

print(f"all cpu at using device: {os.cpu_count()}")
print(f"Number of available CPU: {len(os.sched_getaffinity(0))}") # Number of available CPUs can also be obtained. ,use systemcall at linux.
#print(f"GPU_name: {torch.cuda.get_device_name()}\nGPU avail: {torch.cuda.is_available()}\n")

DEVICE = 'cpu' #'cuda' or 'cpu'
#DEVICE_HiFiGAN = 'cuda'
#if torch.cuda.is_available():
#    print('use cuda')
#else:
#    os._exit(os.EX_OK)
#    print('use cpu')

#device = torch.device(DEVICE_HiFiGAN)
device = torch.device(DEVICE)
print(f'device: {device}')

all cpu at using device: 52
Number of available CPU: 4
device: cpu


In [5]:
RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}/e500_n50')
RESULT_MEL_DIR_PATH = RESULT_DIR_PATH / 'mel_LJ_V1'
RESULT_WAV_DIR_PATH = RESULT_DIR_PATH / wav_dir_name
#RESULT_JSON_PATH = RESULT_DIR_PATH / f'eval4mid_{hifigan_versions}.json'
#eval_jsonl_path = RESULT_DIR_PATH / 'eval4mid.json'
RESULT_JSON_PATH = RESULT_DIR_PATH / f'eval4mid_{hifigan_versions}.json'

print(RESULT_DIR_PATH)
print(RESULT_MEL_DIR_PATH)
print(RESULT_WAV_DIR_PATH)

result4eval/infer4colb/gradtfk5tts/cpu/e500_n50
result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/mel_LJ_V1
result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/wav_LJ_V1


In [6]:
# check path

# for text2mel
print('test_ds_path-----------------------------------------')
if test_ds_path.exists():
    print(f'Exists {str(test_ds_path)}')
    with open(config['test_datalist_path']) as j:
        test_ds_list = json.load(j)
    print(f'loaded {test_ds_path}')
else:
    print(f'No exist {test_ds_path}')

print('RESULT_DIR_PATH-------------------------------------------')
if RESULT_DIR_PATH.exists():
    print(f'Exists {RESULT_DIR_PATH}')
else:
    RESULT_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_DIR_PATH}')

print('RESULT_MEL_DIR_PATH-------------------------------------------')
if RESULT_MEL_DIR_PATH.exists():
    print(f'Exists {RESULT_MEL_DIR_PATH}')
else:
    RESULT_MEL_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_MEL_DIR_PATH}')

print('RESULT_WAV_DIR_PATH-------------------------------------------')
if RESULT_WAV_DIR_PATH.exists():
    print(f'Exists {RESULT_WAV_DIR_PATH}')
else:
    RESULT_WAV_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_WAV_DIR_PATH}')

print('RESULT_JSON_PATH-------------------------------------------')
if RESULT_JSON_PATH.exists():
    print(f'Exists {RESULT_JSON_PATH}')
else:
    #RESULT_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_JSON_PATH}')

test_ds_path-----------------------------------------
Exists configs/test_dataset.json
loaded configs/test_dataset.json
RESULT_DIR_PATH-------------------------------------------
Exists result4eval/infer4colb/gradtfk5tts/cpu/e500_n50
RESULT_MEL_DIR_PATH-------------------------------------------
No exist result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/mel_LJ_V1
RESULT_WAV_DIR_PATH-------------------------------------------
No exist result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/wav_LJ_V1
RESULT_JSON_PATH-------------------------------------------
Exists result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/eval4mid_LJ_V1.json


## [seq] load model 

In [7]:
# import models
from gradtts import GradTTS
from gradseptts import GradSepTTS
from gradtfktts import GradTFKTTS
from gradtfk5tts import GradTFKTTS as GradTFK5TTS
from gradtimektts import GradTimeKTTS
from gradfreqktts import GradFreqKTTS
from gradtfkfultts import GradTFKFULTTS

print(model_name)
print("[seq] loading Model")

print("loading diffusion-TTS ===================================")
N_STEP = 50
TEMP = 1.5

print('loading ', ckpt_path)
_, _, state_dict = torch.load(ckpt_path,
                            map_location=device)

#with open(config_path4model) as f:
#    config = yaml.load(f, yaml.SafeLoader)
config4model = toybox.load_yaml_and_expand_var(config_path4model)

print("[seq] Initializing diffusion-TTS...")
if model_name == "gradtts":
    model = GradTTS.build_model(config4model, vocab_size)
elif model_name == "gradseptts":
    model = GradSepTTS.build_model(config4model, vocab_size)
elif model_name == "gradtfktts":
    model = GradTFKTTS.build_model(config4model, vocab_size)
elif model_name == "gradtfk5tts":
    model = GradTFK5TTS.build_model(config4model, vocab_size)
elif model_name == "gradtfkfultts":
    model = GradTFKFULTTS.build_model(config4model, vocab_size)
elif model_name == "gradtimektts":
    model = GradTimeKTTS.build_model(config4model, vocab_size)
elif model_name == "gradfreqktts":
    model = GradFreqKTTS.build_model(config4model, vocab_size)
else:
    raise ValueError(f"Error: '{model_name}' is not supported")

model = model.to(device)
model.load_state_dict(state_dict)
print(f'Number of encoder + duration predictor parameters: {model.encoder.nparams/1e6}m')
print(f'Number of decoder parameters: {model.decoder.nparams/1e6}m')
print(f'Total parameters: {model.nparams/1e6}m')

print("loading HiFi-GAN ===================================")
#setting file paths
# from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS/hifi-gan
# https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing
if hifigan_versions == 'LJ_V1':
    HiFiGAN_CONFIG = f'./hifigan/official_pretrained/LJ_V1/config.json'
    HiFiGAN_ckpt = './hifigan/official_pretrained/LJ_V1/generator_v1'
elif hifigan_versions == 'LJ_V2':
    HiFiGAN_CONFIG = './hifigan/official_pretrained/LJ_V2/config.json'
    HiFiGAN_ckpt = './hifigan/official_pretrained/LJ_V2/generator_v2'
else:
    print('Dont supported.')


from hifigan import models, env

with open(HiFiGAN_CONFIG) as f:
    hifigan_hparams = env.AttrDict(json.load(f))

hifigan_randomseed = hifigan_hparams.seed
print(f'hifigan_randomseed: {hifigan_randomseed}')

# generator ===================
print("[seq] loading HiFiGAN")
vocoder = models.Generator(hifigan_hparams)

vocoder.load_state_dict(torch.load(
    HiFiGAN_ckpt, map_location=device)['generator'])
vocoder = vocoder.eval().to(device)
vocoder.remove_weight_norm()

print("loading UTMOS ===================================")
predictor_utmos = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)

gradtfk5tts
[seq] loading Model
loading  logs4model/gradtfk5tts/run_tfk_k5/ckpt/gradtfk5tts_500_397001.pt
[seq] Initializing diffusion-TTS...
Number of encoder + duration predictor parameters: 3.549137m
Number of decoder parameters: 2.044639m
Total parameters: 5.593776m
hifigan_randomseed: 1234
[seq] loading HiFiGAN




Removing weight norm...


Using cache found in /work/sora-sa/.cache/torch/hub/tarepan_SpeechMOS_v1.2.0


In [8]:
infer_data_num: int = 101 #len(test_ds_list) is 200
print(f'infer_data_num: {infer_data_num}')
print(f'RESULT_DIR_PATH: {RESULT_DIR_PATH}')
print(f'RESULT_MEL_DIR_PATH: {RESULT_DIR_PATH}')
print(f'RESULT_WAV_DIR_PATH: {RESULT_WAV_DIR_PATH}')
print(f'RESULT_JSON_PATH: {RESULT_JSON_PATH}')

infer_data_num: 101
RESULT_DIR_PATH: result4eval/infer4colb/gradtfk5tts/cpu/e500_n50
RESULT_MEL_DIR_PATH: result4eval/infer4colb/gradtfk5tts/cpu/e500_n50
RESULT_WAV_DIR_PATH: result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/wav_LJ_V1
RESULT_JSON_PATH: result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/eval4mid_LJ_V1.json


## [seq] infer

In [9]:
eval_list = []


for i in tqdm(range(infer_data_num)):
    test_ds_filename = test_ds_list[i]['name']
    mel_npy_path = RESULT_MEL_DIR_PATH / f"{test_ds_filename}.npy"
    synth_wav_path = RESULT_WAV_DIR_PATH / f"{test_ds_filename}.wav"
    print(f'test_ds_index_{i}: {test_ds_filename}')
    # [seq]text2mel =========================================================
    # load txt
    print('[seq]text2mel')
    text = test_ds_list[i]['text']
    phonemes, phnid = text2phnid(text, phn2id, 'en')
    phonemes_len_int = len(phonemes)
    phnid_len_int = len(phnid)
    print(f'phonemes_len: {phonemes_len_int}')
    print(f'phnid_len: {phnid_len_int}')
    phnid_len = torch.tensor(len(phnid), dtype=torch.long).unsqueeze(0).to(device)
    phnid = torch.tensor(phnid).unsqueeze(0).to(device)

    # [seq] synth speech
    # process text to mel
    # mel is [n_mels, n_frame]
    start_time = time.perf_counter()
    _, mel_prediction, _ = model.forward(phnid,
                                        phnid_len,
                                        n_timesteps=N_STEP,
                                        temperature=TEMP,
                                        solver='original')
    end_time = time.perf_counter()

    dt = end_time - start_time
    dt4mel = dt * 22050 / ( mel_prediction.shape[-1] * 256)
    print(f'{model_name} dt: {dt}')
    print(f'{model_name} RTF: {dt4mel}')
    
    # for save mel
    mel4save = mel_prediction.unsqueeze(0) # [batch, channel(freq), n_frame(time)] ex.[1, 80, 619]
    # save
    #mel_npy_path =  RESULT_MEL_DIR_PATH / f"{test_ds_filename}.npy"
    #print(f'test_ds_index_{i}: {mel_npy_path}')
    np.save(mel_npy_path, mel4save.cpu().detach().numpy().copy())

    # [seq]mel2wav =========================================================
    print('[seq]mel2wav')
    x = np.load(mel_npy_path) # [1, n_mel, n_frame]
    x2audio = torch.FloatTensor(x).to(device)
    x2audio = x2audio.squeeze().unsqueeze(0)
    # x2audio is [1, n_mels, n_frames]
    assert x2audio.shape[0] == 1
    with torch.no_grad():
        # vocoder.forward(x).cpu() is torch.Size([1, 1, 167168])
        audio = (vocoder.forward(x2audio).cpu().squeeze().clamp(-1,1).numpy() * 32768).astype(np.int16)
    write(
        synth_wav_path,
        hifigan_hparams.sampling_rate,
        audio)

    # [seq]wav2utmos =========================================================
    print('[seq]wav2utmos')
    #iwav_path = RESULT_WAV_DIR_PATH / f"{filename}.wav"
    #wav, samplerate = torchaudio.load(iwav_path)
    wav, samplerate = torchaudio.load(synth_wav_path)
    score_utmos = predictor_utmos(wav, samplerate)
    score_utmos_float = score_utmos.item()
    print(f'utmos: {score_utmos_float}')
    #eval_dict = {'name': filename, 'path': str(iwav_path), 'utmos': score_float}
    #score_utmos_list.append(eval_dict)
    
    # path, テキスト文、phonimes, phonimes数, dt, RTF, utmos
    eval_dict = {
        'name': test_ds_filename,
        'wav_path': synth_wav_path
        'phonemes_len': phonemes_len_int,
        'phnid_len': phnid_len_int,
        'dt': dt,
        'RTF4mel': dt4mel,
        'utmos': score_utmos_float
    }
    eval_list.append(eval_dict)
    

  0%|                                                                                                                                                          | 0/101 [00:00<?, ?it/s]

test_ds_index_0: LJ045-0049
[seq]text2mel
phonemes_len: 1127
phnid_len: 215
gradtfk5tts dt: 18.204049011692405
gradtfk5tts RTF: 2.165698812520597
[seq]mel2wav
[seq]wav2utmos


  1%|█▍                                                                                                                                                | 1/101 [00:26<44:32, 26.72s/it]

utmos: 4.020130634307861
test_ds_index_1: LJ017-0027
[seq]text2mel
phonemes_len: 411
phnid_len: 79
gradtfk5tts dt: 6.0498573537915945
gradtfk5tts RTF: 1.981335471885466
[seq]mel2wav
[seq]wav2utmos


  2%|██▉                                                                                                                                               | 2/101 [00:35<26:38, 16.15s/it]

utmos: 4.3763427734375
test_ds_index_2: LJ023-0031
[seq]text2mel
phonemes_len: 223
phnid_len: 45
gradtfk5tts dt: 3.9734194856137037
gradtfk5tts RTF: 1.966906870909262
[seq]mel2wav


  3%|████▎                                                                                                                                             | 3/101 [00:41<18:40, 11.43s/it]

[seq]wav2utmos
utmos: 4.2750444412231445
test_ds_index_3: LJ032-0046
[seq]text2mel
phonemes_len: 650
phnid_len: 125
gradtfk5tts dt: 8.503336627967656
gradtfk5tts RTF: 1.6312167024523838
[seq]mel2wav
[seq]wav2utmos


  4%|█████▊                                                                                                                                            | 4/101 [00:53<19:03, 11.79s/it]

utmos: 4.087663173675537
test_ds_index_4: LJ030-0026
[seq]text2mel
phonemes_len: 1025
phnid_len: 195
gradtfk5tts dt: 12.444862617179751
gradtfk5tts RTF: 1.8076071136489087
[seq]mel2wav
[seq]wav2utmos


  5%|███████▏                                                                                                                                          | 5/101 [01:10<21:54, 13.70s/it]

utmos: 4.032395362854004
test_ds_index_5: LJ020-0041
[seq]text2mel
phonemes_len: 1154
phnid_len: 219
gradtfk5tts dt: 15.640930163674057
gradtfk5tts RTF: 1.751882061265711
[seq]mel2wav
[seq]wav2utmos


  6%|████████▋                                                                                                                                         | 6/101 [01:32<25:56, 16.39s/it]

utmos: 4.092081546783447
test_ds_index_6: LJ001-0070
[seq]text2mel
phonemes_len: 1137
phnid_len: 215
gradtfk5tts dt: 15.769096032716334
gradtfk5tts RTF: 1.803767054290106
[seq]mel2wav
[seq]wav2utmos


  7%|██████████                                                                                                                                        | 7/101 [01:54<28:27, 18.17s/it]

utmos: 4.273282527923584
test_ds_index_7: LJ019-0334
[seq]text2mel
phonemes_len: 611
phnid_len: 117
gradtfk5tts dt: 7.116166369058192
gradtfk5tts RTF: 1.67012376998609
[seq]mel2wav
[seq]wav2utmos


  8%|███████████▌                                                                                                                                      | 8/101 [02:04<24:15, 15.66s/it]

utmos: 4.419826507568359
test_ds_index_8: LJ022-0152
[seq]text2mel
phonemes_len: 387
phnid_len: 75
gradtfk5tts dt: 5.522836602292955
gradtfk5tts RTF: 1.7553411421898013
[seq]mel2wav


  9%|█████████████                                                                                                                                     | 9/101 [02:12<20:19, 13.26s/it]

[seq]wav2utmos
utmos: 4.140111923217773
test_ds_index_9: LJ050-0154
[seq]text2mel
phonemes_len: 896
phnid_len: 171
gradtfk5tts dt: 10.798972580581903
gradtfk5tts RTF: 1.6232912399230406
[seq]mel2wav
[seq]wav2utmos


 10%|██████████████▎                                                                                                                                  | 10/101 [02:27<21:07, 13.93s/it]

utmos: 4.285102844238281
test_ds_index_10: LJ016-0045
[seq]text2mel
phonemes_len: 1237
phnid_len: 235
gradtfk5tts dt: 16.45781345013529
gradtfk5tts RTF: 1.815054750397543
[seq]mel2wav
[seq]wav2utmos


 11%|███████████████▊                                                                                                                                 | 11/101 [02:50<24:50, 16.56s/it]

utmos: 4.417806148529053
test_ds_index_11: LJ036-0100
[seq]text2mel
phonemes_len: 802
phnid_len: 153
gradtfk5tts dt: 9.29722505621612
gradtfk5tts RTF: 1.61776998492397
[seq]mel2wav
[seq]wav2utmos


 12%|█████████████████▏                                                                                                                               | 12/101 [03:03<23:03, 15.55s/it]

utmos: 4.399158954620361
test_ds_index_12: LJ046-0016
[seq]text2mel
phonemes_len: 884
phnid_len: 167
gradtfk5tts dt: 9.811902531422675
gradtfk5tts RTF: 1.6315188436434451
[seq]mel2wav
[seq]wav2utmos


 13%|██████████████████▋                                                                                                                              | 13/101 [03:17<22:10, 15.12s/it]

utmos: 4.0698723793029785
test_ds_index_13: LJ048-0085
[seq]text2mel
phonemes_len: 1181
phnid_len: 223
gradtfk5tts dt: 14.66832981724292
gradtfk5tts RTF: 1.7307184956667723
[seq]mel2wav
[seq]wav2utmos


 14%|████████████████████                                                                                                                             | 14/101 [03:38<24:16, 16.74s/it]

utmos: 4.379518508911133
test_ds_index_14: LJ050-0197
[seq]text2mel
phonemes_len: 282
phnid_len: 55
gradtfk5tts dt: 4.353356514126062
gradtfk5tts RTF: 2.005170269395047
[seq]mel2wav


 15%|█████████████████████▌                                                                                                                           | 15/101 [03:44<19:30, 13.61s/it]

[seq]wav2utmos
utmos: 4.386788368225098
test_ds_index_15: LJ050-0178
[seq]text2mel
phonemes_len: 709
phnid_len: 135
gradtfk5tts dt: 8.449960780330002
gradtfk5tts RTF: 1.6579017939055074
[seq]mel2wav
[seq]wav2utmos


 16%|██████████████████████▉                                                                                                                          | 16/101 [03:56<18:43, 13.21s/it]

utmos: 4.41018533706665
test_ds_index_16: LJ043-0079
[seq]text2mel
phonemes_len: 1445
phnid_len: 273
gradtfk5tts dt: 22.128804153762758
gradtfk5tts RTF: 2.0190848930352425
[seq]mel2wav
[seq]wav2utmos


 17%|████████████████████████▍                                                                                                                        | 17/101 [04:26<25:18, 18.08s/it]

utmos: 4.002967834472656
test_ds_index_17: LJ050-0207
[seq]text2mel
phonemes_len: 737
phnid_len: 141
gradtfk5tts dt: 9.355369661934674
gradtfk5tts RTF: 1.6752688169638412
[seq]mel2wav
[seq]wav2utmos


 18%|█████████████████████████▊                                                                                                                       | 18/101 [04:39<23:03, 16.67s/it]

utmos: 4.476344585418701
test_ds_index_18: LJ034-0005
[seq]text2mel
phonemes_len: 1442
phnid_len: 273
gradtfk5tts dt: 17.611228240653872
gradtfk5tts RTF: 1.7178987768368572
[seq]mel2wav
[seq]wav2utmos


 19%|███████████████████████████▎                                                                                                                     | 19/101 [05:04<26:00, 19.03s/it]

utmos: 3.590054512023926
test_ds_index_19: LJ031-0151
[seq]text2mel
phonemes_len: 1203
phnid_len: 227
gradtfk5tts dt: 13.614811620675027
gradtfk5tts RTF: 1.6020246127683377
[seq]mel2wav
[seq]wav2utmos


 20%|████████████████████████████▋                                                                                                                    | 20/101 [05:23<25:49, 19.14s/it]

utmos: 3.932727813720703
test_ds_index_20: LJ023-0021
[seq]text2mel
phonemes_len: 564
phnid_len: 109
gradtfk5tts dt: 7.08059654943645
gradtfk5tts RTF: 1.77805158886518
[seq]mel2wav
[seq]wav2utmos


 21%|██████████████████████████████▏                                                                                                                  | 21/101 [05:33<21:54, 16.43s/it]

utmos: 4.366967678070068
test_ds_index_21: LJ015-0301
[seq]text2mel
phonemes_len: 780
phnid_len: 149
gradtfk5tts dt: 10.005746187642217
gradtfk5tts RTF: 1.632240644512835
[seq]mel2wav
[seq]wav2utmos


 22%|███████████████████████████████▌                                                                                                                 | 22/101 [05:47<20:44, 15.76s/it]

utmos: 4.260054111480713
test_ds_index_22: LJ021-0153
[seq]text2mel
phonemes_len: 320
phnid_len: 63
gradtfk5tts dt: 4.836178437806666
gradtfk5tts RTF: 1.928489123148817
[seq]mel2wav


 23%|█████████████████████████████████                                                                                                                | 23/101 [05:55<17:07, 13.17s/it]

[seq]wav2utmos
utmos: 4.082495212554932
test_ds_index_23: LJ014-0037
[seq]text2mel
phonemes_len: 365
phnid_len: 71
gradtfk5tts dt: 5.641663175076246
gradtfk5tts RTF: 1.8268132197255527
[seq]mel2wav


 24%|██████████████████████████████████▍                                                                                                              | 24/101 [06:03<14:56, 11.65s/it]

[seq]wav2utmos
utmos: 3.585376262664795
test_ds_index_24: LJ004-0200
[seq]text2mel
phonemes_len: 1097
phnid_len: 207
gradtfk5tts dt: 14.528892766684294
gradtfk5tts RTF: 1.7142662965827733
[seq]mel2wav
[seq]wav2utmos


 25%|███████████████████████████████████▉                                                                                                             | 25/101 [06:20<16:52, 13.32s/it]

utmos: 4.356453895568848
test_ds_index_25: LJ049-0010
[seq]text2mel
phonemes_len: 471
phnid_len: 91
gradtfk5tts dt: 6.215475244447589
gradtfk5tts RTF: 1.7727031914847544
[seq]mel2wav
[seq]wav2utmos


 26%|█████████████████████████████████████▎                                                                                                           | 26/101 [06:29<15:03, 12.05s/it]

utmos: 4.102656841278076
test_ds_index_26: LJ008-0291
[seq]text2mel
phonemes_len: 365
phnid_len: 71
gradtfk5tts dt: 5.010367488488555
gradtfk5tts RTF: 1.852176152111934
[seq]mel2wav


 27%|██████████████████████████████████████▊                                                                                                          | 27/101 [06:36<13:09, 10.67s/it]

[seq]wav2utmos
utmos: 4.043349266052246
test_ds_index_27: LJ048-0221
[seq]text2mel
phonemes_len: 602
phnid_len: 115
gradtfk5tts dt: 7.768719890154898
gradtfk5tts RTF: 1.6854954500345904
[seq]mel2wav
[seq]wav2utmos


 28%|████████████████████████████████████████▏                                                                                                        | 28/101 [06:47<13:08, 10.79s/it]

utmos: 3.9701976776123047
test_ds_index_28: LJ004-0157
[seq]text2mel
phonemes_len: 637
phnid_len: 121
gradtfk5tts dt: 8.265502794645727
gradtfk5tts RTF: 1.6870402901171715
[seq]mel2wav
[seq]wav2utmos


 29%|█████████████████████████████████████████▋                                                                                                       | 29/101 [07:00<13:24, 11.18s/it]

utmos: 4.383975982666016
test_ds_index_29: LJ013-0175
[seq]text2mel
phonemes_len: 681
phnid_len: 131
gradtfk5tts dt: 8.405464829877019
gradtfk5tts RTF: 1.6454234685616855
[seq]mel2wav
[seq]wav2utmos


 30%|███████████████████████████████████████████                                                                                                      | 30/101 [07:12<13:33, 11.46s/it]

utmos: 4.3374714851379395
test_ds_index_30: LJ021-0100
[seq]text2mel
phonemes_len: 752
phnid_len: 143
gradtfk5tts dt: 9.716659414581954
gradtfk5tts RTF: 1.6474866210286363
[seq]mel2wav
[seq]wav2utmos


 31%|████████████████████████████████████████████▌                                                                                                    | 31/101 [07:25<14:11, 12.16s/it]

utmos: 3.1179280281066895
test_ds_index_31: LJ018-0132
[seq]text2mel
phonemes_len: 687
phnid_len: 131
gradtfk5tts dt: 9.184533631429076
gradtfk5tts RTF: 1.6515442863795924
[seq]mel2wav
[seq]wav2utmos


 32%|█████████████████████████████████████████████▉                                                                                                   | 32/101 [07:39<14:22, 12.50s/it]

utmos: 3.749347686767578
test_ds_index_32: LJ023-0059
[seq]text2mel
phonemes_len: 1184
phnid_len: 225
gradtfk5tts dt: 14.10395510867238
gradtfk5tts RTF: 1.6284360869754628
[seq]mel2wav
[seq]wav2utmos


 33%|███████████████████████████████████████████████▍                                                                                                 | 33/101 [07:59<16:38, 14.69s/it]

utmos: 3.689878463745117
test_ds_index_33: LJ003-0027
[seq]text2mel
phonemes_len: 964
phnid_len: 183
gradtfk5tts dt: 11.247079495340586
gradtfk5tts RTF: 1.6503280908769427
[seq]mel2wav
[seq]wav2utmos


 34%|████████████████████████████████████████████████▊                                                                                                | 34/101 [08:15<16:51, 15.09s/it]

utmos: 3.8000941276550293
test_ds_index_34: LJ018-0133
[seq]text2mel
phonemes_len: 674
phnid_len: 129
gradtfk5tts dt: 8.596480740234256
gradtfk5tts RTF: 1.6752015017159692
[seq]mel2wav
[seq]wav2utmos


 35%|██████████████████████████████████████████████████▏                                                                                              | 35/101 [08:27<15:39, 14.24s/it]

utmos: 4.390878677368164
test_ds_index_35: LJ033-0060
[seq]text2mel
phonemes_len: 596
phnid_len: 115
gradtfk5tts dt: 7.788042035885155
gradtfk5tts RTF: 1.7025532091853155
[seq]mel2wav
[seq]wav2utmos


 36%|███████████████████████████████████████████████████▋                                                                                             | 36/101 [08:38<14:28, 13.36s/it]

utmos: 4.04732084274292
test_ds_index_36: LJ003-0299
[seq]text2mel
phonemes_len: 881
phnid_len: 167
gradtfk5tts dt: 11.625033477321267
gradtfk5tts RTF: 1.641470211161206
[seq]mel2wav
[seq]wav2utmos


 37%|█████████████████████████████████████████████████████                                                                                            | 37/101 [08:55<15:13, 14.28s/it]

utmos: 4.181947708129883
test_ds_index_37: LJ011-0060
[seq]text2mel
phonemes_len: 1083
phnid_len: 205
gradtfk5tts dt: 12.895217238925397
gradtfk5tts RTF: 1.6262098515184904
[seq]mel2wav
[seq]wav2utmos


 38%|██████████████████████████████████████████████████████▌                                                                                          | 38/101 [09:13<16:16, 15.50s/it]

utmos: 4.257242679595947
test_ds_index_38: LJ013-0240
[seq]text2mel
phonemes_len: 768
phnid_len: 147
gradtfk5tts dt: 9.12598229572177
gradtfk5tts RTF: 1.6724394084164314
[seq]mel2wav
[seq]wav2utmos


 39%|███████████████████████████████████████████████████████▉                                                                                         | 39/101 [09:26<15:16, 14.78s/it]

utmos: 4.26240348815918
test_ds_index_39: LJ047-0076
[seq]text2mel
phonemes_len: 1287
phnid_len: 245
gradtfk5tts dt: 17.662135793827474
gradtfk5tts RTF: 1.7689411984642796
[seq]mel2wav
[seq]wav2utmos


 40%|█████████████████████████████████████████████████████████▍                                                                                       | 40/101 [09:50<17:57, 17.66s/it]

utmos: 4.3642578125
test_ds_index_40: LJ041-0133
[seq]text2mel
phonemes_len: 1018
phnid_len: 193
gradtfk5tts dt: 12.548585112206638
gradtfk5tts RTF: 1.6180313302544698
[seq]mel2wav
[seq]wav2utmos


 41%|██████████████████████████████████████████████████████████▊                                                                                      | 41/101 [10:08<17:44, 17.75s/it]

utmos: 4.12418794631958
test_ds_index_41: LJ038-0264
[seq]text2mel
phonemes_len: 1116
phnid_len: 211
gradtfk5tts dt: 13.648945421911776
gradtfk5tts RTF: 1.6215476646182903
[seq]mel2wav
[seq]wav2utmos


 42%|████████████████████████████████████████████████████████████▎                                                                                    | 42/101 [10:27<17:52, 18.18s/it]

utmos: 3.9864602088928223
test_ds_index_42: LJ011-0016
[seq]text2mel
phonemes_len: 609
phnid_len: 117
gradtfk5tts dt: 7.287036772817373
gradtfk5tts RTF: 1.705578728352402
[seq]mel2wav
[seq]wav2utmos


 43%|█████████████████████████████████████████████████████████████▋                                                                                   | 43/101 [10:38<15:22, 15.91s/it]

utmos: 4.4351301193237305
test_ds_index_43: LJ003-0185
[seq]text2mel
phonemes_len: 1105
phnid_len: 209
gradtfk5tts dt: 12.84432527422905
gradtfk5tts RTF: 1.6080201461252643
[seq]mel2wav
[seq]wav2utmos


 44%|███████████████████████████████████████████████████████████████▏                                                                                 | 44/101 [10:56<15:44, 16.57s/it]

utmos: 4.428409576416016
test_ds_index_44: LJ014-0063
[seq]text2mel
phonemes_len: 793
phnid_len: 151
gradtfk5tts dt: 9.54791250359267
gradtfk5tts RTF: 1.6349673110106422
[seq]mel2wav
[seq]wav2utmos


 45%|████████████████████████████████████████████████████████████████▌                                                                                | 45/101 [11:10<14:43, 15.79s/it]

utmos: 4.418869972229004
test_ds_index_45: LJ005-0185
[seq]text2mel
phonemes_len: 588
phnid_len: 113
gradtfk5tts dt: 7.816385887563229
gradtfk5tts RTF: 1.6705888339482124
[seq]mel2wav
[seq]wav2utmos


 46%|██████████████████████████████████████████████████████████████████                                                                               | 46/101 [11:21<13:13, 14.43s/it]

utmos: 4.436697006225586
test_ds_index_46: LJ014-0135
[seq]text2mel
phonemes_len: 826
phnid_len: 157
gradtfk5tts dt: 9.54873242136091
gradtfk5tts RTF: 1.641635088346807
[seq]mel2wav
[seq]wav2utmos


 47%|███████████████████████████████████████████████████████████████████▍                                                                             | 47/101 [11:35<12:48, 14.23s/it]

utmos: 4.115898132324219
test_ds_index_47: LJ009-0046
[seq]text2mel
phonemes_len: 1011
phnid_len: 193
gradtfk5tts dt: 12.49320343695581
gradtfk5tts RTF: 1.6133054710040036
[seq]mel2wav
[seq]wav2utmos


 48%|████████████████████████████████████████████████████████████████████▉                                                                            | 48/101 [11:53<13:30, 15.29s/it]

utmos: 4.342774391174316
test_ds_index_48: LJ037-0024
[seq]text2mel
phonemes_len: 1505
phnid_len: 285
gradtfk5tts dt: 18.65577405691147
gradtfk5tts RTF: 1.7504077220983876
[seq]mel2wav
[seq]wav2utmos


 49%|██████████████████████████████████████████████████████████████████████▎                                                                          | 49/101 [12:19<15:58, 18.44s/it]

utmos: 4.388545036315918
test_ds_index_49: LJ002-0217
[seq]text2mel
phonemes_len: 1093
phnid_len: 207
gradtfk5tts dt: 12.274767408147454
gradtfk5tts RTF: 1.6116771945839568
[seq]mel2wav
[seq]wav2utmos


 50%|███████████████████████████████████████████████████████████████████████▊                                                                         | 50/101 [12:36<15:27, 18.19s/it]

utmos: 3.9998700618743896
test_ds_index_50: LJ044-0017
[seq]text2mel
phonemes_len: 782
phnid_len: 149
gradtfk5tts dt: 10.219252285547554
gradtfk5tts RTF: 1.6300239648171555
[seq]mel2wav
[seq]wav2utmos


 50%|█████████████████████████████████████████████████████████████████████████▏                                                                       | 51/101 [12:51<14:14, 17.08s/it]

utmos: 4.19450044631958
test_ds_index_51: LJ017-0074
[seq]text2mel
phonemes_len: 481
phnid_len: 93
gradtfk5tts dt: 6.595776392146945
gradtfk5tts RTF: 1.7215538523536948
[seq]mel2wav
[seq]wav2utmos


 51%|██████████████████████████████████████████████████████████████████████████▋                                                                      | 52/101 [13:00<12:04, 14.79s/it]

utmos: 4.400047302246094
test_ds_index_52: LJ033-0153
[seq]text2mel
phonemes_len: 714
phnid_len: 137
gradtfk5tts dt: 9.357323511503637
gradtfk5tts RTF: 1.6549745205712205
[seq]mel2wav
[seq]wav2utmos


 52%|████████████████████████████████████████████████████████████████████████████                                                                     | 53/101 [13:14<11:31, 14.41s/it]

utmos: 4.19612455368042
test_ds_index_53: LJ032-0124
[seq]text2mel
phonemes_len: 1056
phnid_len: 199
gradtfk5tts dt: 13.317642216570675
gradtfk5tts RTF: 1.6133417439971398
[seq]mel2wav
[seq]wav2utmos


 53%|█████████████████████████████████████████████████████████████████████████████▌                                                                   | 54/101 [13:33<12:19, 15.72s/it]

utmos: 4.343605995178223
test_ds_index_54: LJ018-0287
[seq]text2mel
phonemes_len: 567
phnid_len: 109
gradtfk5tts dt: 7.12151360604912
gradtfk5tts RTF: 1.6668369460489885
[seq]mel2wav
[seq]wav2utmos


 54%|██████████████████████████████████████████████████████████████████████████████▉                                                                  | 55/101 [13:41<10:28, 13.67s/it]

utmos: 4.381563186645508
test_ds_index_55: LJ020-0038
[seq]text2mel
phonemes_len: 620
phnid_len: 119
gradtfk5tts dt: 7.6453019650653005
gradtfk5tts RTF: 1.65871879260164
[seq]mel2wav
[seq]wav2utmos


 55%|████████████████████████████████████████████████████████████████████████████████▍                                                                | 56/101 [13:51<09:20, 12.44s/it]

utmos: 3.9917092323303223
test_ds_index_56: LJ001-0007
[seq]text2mel
phonemes_len: 1016
phnid_len: 193
gradtfk5tts dt: 13.22627237252891
gradtfk5tts RTF: 1.6159092742368266
[seq]mel2wav
[seq]wav2utmos


 56%|█████████████████████████████████████████████████████████████████████████████████▊                                                               | 57/101 [14:10<10:29, 14.30s/it]

utmos: 3.3469934463500977
test_ds_index_57: LJ003-0313
[seq]text2mel
phonemes_len: 1175
phnid_len: 223
gradtfk5tts dt: 15.072748289443552
gradtfk5tts RTF: 1.7287059950390642
[seq]mel2wav
[seq]wav2utmos


 57%|███████████████████████████████████████████████████████████████████████████████████▎                                                             | 58/101 [14:31<11:40, 16.29s/it]

utmos: 4.3468708992004395
test_ds_index_58: LJ019-0265
[seq]text2mel
phonemes_len: 590
phnid_len: 113
gradtfk5tts dt: 8.775131031870842
gradtfk5tts RTF: 1.657514727699699
[seq]mel2wav
[seq]wav2utmos


 58%|████████████████████████████████████████████████████████████████████████████████████▋                                                            | 59/101 [14:43<10:39, 15.23s/it]

utmos: 3.8264949321746826
test_ds_index_59: LJ038-0281
[seq]text2mel
phonemes_len: 1193
phnid_len: 225
gradtfk5tts dt: 14.592796127311885
gradtfk5tts RTF: 1.7312927998408825
[seq]mel2wav
[seq]wav2utmos


 59%|██████████████████████████████████████████████████████████████████████████████████████▏                                                          | 60/101 [15:04<11:27, 16.77s/it]

utmos: 4.375202655792236
test_ds_index_60: LJ045-0235
[seq]text2mel
phonemes_len: 1185
phnid_len: 223
gradtfk5tts dt: 14.725278238765895
gradtfk5tts RTF: 1.72796952254762
[seq]mel2wav
[seq]wav2utmos


 60%|███████████████████████████████████████████████████████████████████████████████████████▌                                                         | 61/101 [15:24<11:57, 17.94s/it]

utmos: 4.4139790534973145
test_ds_index_61: LJ038-0255
[seq]text2mel
phonemes_len: 240
phnid_len: 47
gradtfk5tts dt: 4.061228713952005
gradtfk5tts RTF: 2.021994516407192
[seq]mel2wav


 61%|█████████████████████████████████████████████████████████████████████████████████████████                                                        | 62/101 [15:30<09:19, 14.35s/it]

[seq]wav2utmos
utmos: 3.5223584175109863
test_ds_index_62: LJ028-0205
[seq]text2mel
phonemes_len: 899
phnid_len: 171
gradtfk5tts dt: 11.08000769186765
gradtfk5tts RTF: 1.6202924024145908
[seq]mel2wav
[seq]wav2utmos


 62%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 63/101 [15:46<09:23, 14.82s/it]

utmos: 4.084481239318848
test_ds_index_63: LJ014-0260
[seq]text2mel
phonemes_len: 1068
phnid_len: 203
gradtfk5tts dt: 14.956518476828933
gradtfk5tts RTF: 1.832499291063289
[seq]mel2wav
[seq]wav2utmos


 63%|███████████████████████████████████████████████████████████████████████████████████████████▉                                                     | 64/101 [16:07<10:09, 16.48s/it]

utmos: 4.37827730178833
test_ds_index_64: LJ033-0166
[seq]text2mel
phonemes_len: 967
phnid_len: 183
gradtfk5tts dt: 11.777347585186362
gradtfk5tts RTF: 1.6178884709763712
[seq]mel2wav
[seq]wav2utmos


 64%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 65/101 [16:23<09:56, 16.57s/it]

utmos: 3.9016008377075195
test_ds_index_65: LJ037-0125
[seq]text2mel
phonemes_len: 1085
phnid_len: 205
gradtfk5tts dt: 12.755455991253257
gradtfk5tts RTF: 1.601549998901776
[seq]mel2wav
[seq]wav2utmos


 65%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 66/101 [16:42<09:55, 17.01s/it]

utmos: 4.20513391494751
test_ds_index_66: LJ013-0142
[seq]text2mel
phonemes_len: 740
phnid_len: 143
gradtfk5tts dt: 9.411455696448684
gradtfk5tts RTF: 1.6376467653621645
[seq]mel2wav
[seq]wav2utmos


 66%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 67/101 [16:53<08:42, 15.37s/it]

utmos: 4.254004001617432
test_ds_index_67: LJ031-0199
[seq]text2mel
phonemes_len: 1439
phnid_len: 273
gradtfk5tts dt: 16.904389871284366
gradtfk5tts RTF: 1.7190349978869368
[seq]mel2wav
[seq]wav2utmos


 67%|█████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 68/101 [17:17<09:48, 17.83s/it]

utmos: 4.376668930053711
test_ds_index_68: LJ004-0017
[seq]text2mel
phonemes_len: 619
phnid_len: 119
gradtfk5tts dt: 8.501616106368601
gradtfk5tts RTF: 1.668036688010995
[seq]mel2wav
[seq]wav2utmos


 68%|███████████████████████████████████████████████████████████████████████████████████████████████████                                              | 69/101 [17:27<08:19, 15.62s/it]

utmos: 4.0664849281311035
test_ds_index_69: LJ024-0115
[seq]text2mel
phonemes_len: 1011
phnid_len: 191
gradtfk5tts dt: 12.476086680777371
gradtfk5tts RTF: 1.608683285642432
[seq]mel2wav
[seq]wav2utmos


 69%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                            | 70/101 [17:42<07:57, 15.41s/it]

utmos: 3.9488306045532227
test_ds_index_70: LJ017-0171
[seq]text2mel
phonemes_len: 620
phnid_len: 119
gradtfk5tts dt: 7.371526814065874
gradtfk5tts RTF: 1.7068019809533823
[seq]mel2wav
[seq]wav2utmos


 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 71/101 [17:53<06:59, 13.97s/it]

utmos: 3.8708739280700684
test_ds_index_71: LJ017-0040
[seq]text2mel
phonemes_len: 412
phnid_len: 79
gradtfk5tts dt: 6.578960847109556
gradtfk5tts RTF: 1.7489642008917547
[seq]mel2wav
[seq]wav2utmos


 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 72/101 [18:02<06:06, 12.65s/it]

utmos: 4.33411979675293
test_ds_index_72: LJ005-0044
[seq]text2mel
phonemes_len: 796
phnid_len: 151
gradtfk5tts dt: 9.571519335731864
gradtfk5tts RTF: 1.6859343155106692
[seq]mel2wav
[seq]wav2utmos


 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 73/101 [18:16<06:01, 12.91s/it]

utmos: 4.385761260986328
test_ds_index_73: LJ007-0169
[seq]text2mel
phonemes_len: 399
phnid_len: 77
gradtfk5tts dt: 5.3296765042468905
gradtfk5tts RTF: 1.8585426195382708
[seq]mel2wav


 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 74/101 [18:24<05:08, 11.41s/it]

[seq]wav2utmos
utmos: 4.4238972663879395
test_ds_index_74: LJ015-0153
[seq]text2mel
phonemes_len: 1156
phnid_len: 219
gradtfk5tts dt: 13.73683741223067
gradtfk5tts RTF: 1.6163831163533464
[seq]mel2wav
[seq]wav2utmos


 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                     | 75/101 [18:40<05:33, 12.84s/it]

utmos: 4.400643348693848
test_ds_index_75: LJ045-0043
[seq]text2mel
phonemes_len: 1166
phnid_len: 221
gradtfk5tts dt: 13.498578226193786
gradtfk5tts RTF: 1.6398737762670408
[seq]mel2wav
[seq]wav2utmos


 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 76/101 [18:59<06:08, 14.73s/it]

utmos: 3.956836223602295
test_ds_index_76: LJ050-0010
[seq]text2mel
phonemes_len: 1046
phnid_len: 197
gradtfk5tts dt: 11.867003886960447
gradtfk5tts RTF: 1.6459555889409587
[seq]mel2wav
[seq]wav2utmos


 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 77/101 [19:16<06:07, 15.33s/it]

utmos: 3.3796465396881104
test_ds_index_77: LJ006-0126
[seq]text2mel
phonemes_len: 1061
phnid_len: 201
gradtfk5tts dt: 12.220774352550507
gradtfk5tts RTF: 1.611959672148609
[seq]mel2wav
[seq]wav2utmos


 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 78/101 [19:33<06:06, 15.92s/it]

utmos: 4.364470481872559
test_ds_index_78: LJ018-0356
[seq]text2mel
phonemes_len: 979
phnid_len: 185
gradtfk5tts dt: 12.408914651721716
gradtfk5tts RTF: 1.6169662920200445
[seq]mel2wav
[seq]wav2utmos


 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 79/101 [19:51<06:02, 16.48s/it]

utmos: 3.4449803829193115
test_ds_index_79: LJ040-0223
[seq]text2mel
phonemes_len: 1070
phnid_len: 201
gradtfk5tts dt: 14.088872083462775
gradtfk5tts RTF: 1.736071784694397
[seq]mel2wav
[seq]wav2utmos


 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 80/101 [20:10<06:05, 17.42s/it]

utmos: 4.455361843109131
test_ds_index_80: LJ008-0281
[seq]text2mel
phonemes_len: 651
phnid_len: 125
gradtfk5tts dt: 8.761208430863917
gradtfk5tts RTF: 1.665844421741768
[seq]mel2wav
[seq]wav2utmos


 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 81/101 [20:23<05:19, 15.99s/it]

utmos: 3.934906244277954
test_ds_index_81: LJ008-0222
[seq]text2mel
phonemes_len: 515
phnid_len: 99
gradtfk5tts dt: 7.051969449967146
gradtfk5tts RTF: 1.7255851204254213
[seq]mel2wav
[seq]wav2utmos


 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 82/101 [20:33<04:30, 14.23s/it]

utmos: 4.414393424987793
test_ds_index_82: LJ046-0123
[seq]text2mel
phonemes_len: 1130
phnid_len: 213
gradtfk5tts dt: 14.987715037539601
gradtfk5tts RTF: 1.8207814515258518
[seq]mel2wav
[seq]wav2utmos


 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 83/101 [20:51<04:36, 15.34s/it]

utmos: 3.7065839767456055
test_ds_index_83: LJ030-0044
[seq]text2mel
phonemes_len: 965
phnid_len: 185
gradtfk5tts dt: 13.535769511945546
gradtfk5tts RTF: 1.774541700784813
[seq]mel2wav
[seq]wav2utmos


 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                        | 84/101 [21:10<04:37, 16.32s/it]

utmos: 4.429839611053467
test_ds_index_84: LJ018-0051
[seq]text2mel
phonemes_len: 820
phnid_len: 157
gradtfk5tts dt: 10.55325423553586
gradtfk5tts RTF: 1.6496941349078786
[seq]mel2wav
[seq]wav2utmos


 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 85/101 [21:25<04:16, 16.01s/it]

utmos: 3.6578121185302734
test_ds_index_85: LJ042-0231
[seq]text2mel
phonemes_len: 574
phnid_len: 111
gradtfk5tts dt: 7.682793917134404
gradtfk5tts RTF: 1.7277823706022928
[seq]mel2wav
[seq]wav2utmos


 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 86/101 [21:36<03:37, 14.51s/it]

utmos: 4.389186382293701
test_ds_index_86: LJ011-0121
[seq]text2mel
phonemes_len: 1123
phnid_len: 213
gradtfk5tts dt: 14.476134680211544
gradtfk5tts RTF: 1.7174520580377526
[seq]mel2wav
[seq]wav2utmos


 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 87/101 [21:53<03:33, 15.27s/it]

utmos: 4.165977478027344
test_ds_index_87: LJ016-0186
[seq]text2mel
phonemes_len: 848
phnid_len: 161
gradtfk5tts dt: 10.828251303173602
gradtfk5tts RTF: 1.6566034444034325
[seq]mel2wav
[seq]wav2utmos


 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 88/101 [22:08<03:19, 15.34s/it]

utmos: 4.430983066558838
test_ds_index_88: LJ011-0164
[seq]text2mel
phonemes_len: 1007
phnid_len: 191
gradtfk5tts dt: 12.792317127808928
gradtfk5tts RTF: 1.610874638318867
[seq]mel2wav
[seq]wav2utmos


 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 89/101 [22:27<03:14, 16.19s/it]

utmos: 3.8106303215026855
test_ds_index_89: LJ026-0039
[seq]text2mel
phonemes_len: 1150
phnid_len: 219
gradtfk5tts dt: 13.764843406155705
gradtfk5tts RTF: 1.6353167947507183
[seq]mel2wav
[seq]wav2utmos


 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 90/101 [22:43<02:58, 16.23s/it]

utmos: 3.62725830078125
test_ds_index_90: LJ003-0105
[seq]text2mel
phonemes_len: 971
phnid_len: 185
gradtfk5tts dt: 12.456740371882915
gradtfk5tts RTF: 1.6085968258059542
[seq]mel2wav
[seq]wav2utmos


 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 91/101 [22:58<02:38, 15.87s/it]

utmos: 4.40883731842041
test_ds_index_91: LJ039-0104
[seq]text2mel
phonemes_len: 383
phnid_len: 75
gradtfk5tts dt: 6.525944961234927
gradtfk5tts RTF: 1.7402414666605812
[seq]mel2wav
[seq]wav2utmos


 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 92/101 [23:07<02:05, 13.93s/it]

utmos: 3.6669578552246094
test_ds_index_92: LJ002-0038
[seq]text2mel
phonemes_len: 642
phnid_len: 123
gradtfk5tts dt: 9.152987749315798
gradtfk5tts RTF: 1.6632332861320984
[seq]mel2wav
[seq]wav2utmos


 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 93/101 [23:20<01:49, 13.63s/it]

utmos: 4.228143692016602
test_ds_index_93: LJ046-0194
[seq]text2mel
phonemes_len: 1259
phnid_len: 239
gradtfk5tts dt: 15.94194093439728
gradtfk5tts RTF: 1.746977365634244
[seq]mel2wav
[seq]wav2utmos


 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 94/101 [23:42<01:53, 16.17s/it]

utmos: 4.181455612182617
test_ds_index_94: LJ008-0115
[seq]text2mel
phonemes_len: 845
phnid_len: 161
gradtfk5tts dt: 9.890202133916318
gradtfk5tts RTF: 1.650912647263012
[seq]mel2wav
[seq]wav2utmos


 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 95/101 [23:57<01:33, 15.54s/it]

utmos: 4.366379737854004
test_ds_index_95: LJ016-0104
[seq]text2mel
phonemes_len: 521
phnid_len: 101
gradtfk5tts dt: 6.670721271075308
gradtfk5tts RTF: 1.7202634265906922
[seq]mel2wav
[seq]wav2utmos


 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 96/101 [24:06<01:09, 13.82s/it]

utmos: 4.37525749206543
test_ds_index_96: LJ019-0301
[seq]text2mel
phonemes_len: 1122
phnid_len: 213
gradtfk5tts dt: 12.69476520176977
gradtfk5tts RTF: 1.6127372136512688
[seq]mel2wav
[seq]wav2utmos


 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 97/101 [24:24<01:00, 15.05s/it]

utmos: 3.924506425857544
test_ds_index_97: LJ028-0012
[seq]text2mel
phonemes_len: 1165
phnid_len: 219
gradtfk5tts dt: 14.043797022663057
gradtfk5tts RTF: 1.614995641843919
[seq]mel2wav
[seq]wav2utmos


 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 98/101 [24:44<00:49, 16.50s/it]

utmos: 4.210549354553223
test_ds_index_98: LJ018-0059
[seq]text2mel
phonemes_len: 1004
phnid_len: 191
gradtfk5tts dt: 14.011151043698192
gradtfk5tts RTF: 1.7540986130174936
[seq]mel2wav
[seq]wav2utmos


 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 99/101 [25:01<00:33, 16.56s/it]

utmos: 4.085937976837158
test_ds_index_99: LJ029-0081
[seq]text2mel
phonemes_len: 1427
phnid_len: 269
gradtfk5tts dt: 21.1776631642133
gradtfk5tts RTF: 1.9040623074231116
[seq]mel2wav
[seq]wav2utmos


 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 100/101 [25:30<00:20, 20.24s/it]

utmos: 4.281447410583496
test_ds_index_100: LJ017-0230
[seq]text2mel
phonemes_len: 1085
phnid_len: 207
gradtfk5tts dt: 13.309624452143908
gradtfk5tts RTF: 1.6377076962598949
[seq]mel2wav
[seq]wav2utmos


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [25:48<00:00, 15.34s/it]

utmos: 3.5967650413513184





In [10]:
#RESULT_JSON_PATH = RESULT_DIR_PATH / 'eval4mid.json'
if RESULT_JSON_PATH.exists() == False:
    with open(RESULT_JSON_PATH, 'w') as f:
        for entry in eval_list:
            f.write(json.dumps(entry) + '\n')
    print(f'Make {RESULT_JSON_PATH}')
else:
    print(f'Already Exists {RESULT_JSON_PATH}')

Already Exists result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/eval4mid_LJ_V1.json


In [None]:
# recheck

In [11]:
a = toybox.load_json('result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/eval4mid_LJ_V1.json')

Exist result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/eval4mid_LJ_V1.json


In [12]:
dt_list = [a[n]['dt'] for n in range(len(a))]
RTF4mel_list = [a[n]['RTF4mel'] for n in range(len(a))]
utmos_list = [a[n]['utmos'] for n in range(len(a))]

dt_nparr = np.array(dt_list[1:101])
RTF4mel_nparr = np.array(RTF4mel_list[1:101])
utmos_nparr = np.array(utmos_list[1:101])
print(len(dt_nparr))

significant_digits = 8

# for culc difference time to infer text2mel
dt_mean = toybox.round_significant_digits(np.mean(dt_nparr), significant_digits=significant_digits)
dt_var = toybox.round_significant_digits(np.var(dt_nparr), significant_digits=significant_digits)
dt_std = toybox.round_significant_digits(np.std(dt_nparr), significant_digits=significant_digits)
print(f'dt ---------------------------')
print(f'dt mean: {dt_mean}')
print(f'dt var: {dt_var}')
print(f'dt std: {dt_std}')

# for culc RTF4mel to infer text2mel
RTF4mel_mean = toybox.round_significant_digits(np.mean(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_var = toybox.round_significant_digits(np.var(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_std = toybox.round_significant_digits(np.std(RTF4mel_nparr), significant_digits=significant_digits)
print(f'RTF ---------------------------')
print(f'RTF mean: {RTF4mel_mean}')
print(f'RTF var: {RTF4mel_var}')
print(f'RTF std: {RTF4mel_std}')

# for culc utmos to infer 
print(f'utmos ---------------------------')
utmos_mean = toybox.round_significant_digits(np.mean(utmos_nparr), significant_digits=significant_digits)
utmos_var = toybox.round_significant_digits(np.var(utmos_nparr), significant_digits=significant_digits)
utmos_std = toybox.round_significant_digits(np.std(utmos_nparr), significant_digits=significant_digits)
print(f'utmos mean: {utmos_mean}')
print(f'utmos var: {utmos_var}')
print(f'utmos std: {utmos_std}')

100
dt ---------------------------
dt mean: 10.763684
dt var: 13.17348
dt std: 3.629529
RTF ---------------------------
RTF mean: 1.6718893000000001
RTF var: 0.0087506937
RTF std: 0.093545142
utmos ---------------------------
utmos mean: 4.1353975
utmos var: 0.089310245
utmos std: 0.2988482
