In [1]:
%%capture
%pip install git+https://github.com/jimregan/Matcha-TTS@more-data


In [17]:
%%capture
!apt install espeak-ng

In [18]:
!git lfs install

Git LFS initialized.


In [19]:
!git clone https://huggingface.co/jimregan/matcha-hu-anna

fatal: destination path 'matcha-hu-anna' already exists and is not an empty directory.


In [20]:
!matcha-tts --vocoder hifigan_univ_v1 --model matcha_vctk --text "test to start" --output_folder /content/output

[-] GPU not available or forced CPU run! Using CPU
[!] Configurations: 
	- Model: matcha_vctk
	- Vocoder: hifigan_univ_v1
	- Temperature: 0.667
	- Speaking rate: 0.85
	- Number of ODE steps: 10
	- Speaker: 0
[+] Model already present at /root/.local/share/matcha_tts/matcha_vctk.ckpt!
[+] Model already present at /root/.local/share/matcha_tts/hifigan_univ_v1!
[!] Loading matcha_vctk!
  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)
[+] matcha_vctk loaded!
[!] Loading hifigan_univ_v1!
  WeightNorm.apply(module, name, dim)
  hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)["generator"])
Removing weight norm...
[+] hifigan_univ_v1 loaded!
[1] - Input text: test to start
[1] - Phonetised text: tˈɛst tə stˈɑːɹt
[🍵] Whisking Matcha-T(ea)TS for: 1
[🍵-1] Matcha-TTS RTF: 0.3778
[🍵-1] Matcha-TTS + VOCODER RTF: 1.4682
[+] Waveform saved: /content/output/utterance_001_speaker_000.wav
[🍵] Average Matcha-TTS RTF: 0.3778 ± 0.0
[🍵] Average Matcha-TTS + VOCODER R

In [21]:
!ls -al /root/.local/share/matcha_tts/hifigan_univ_v1

-rw-r--r-- 1 root root 55788858 Oct 22 08:59 /root/.local/share/matcha_tts/hifigan_univ_v1


In [22]:
import datetime as dt
from pathlib import Path

import IPython.display as ipd
import numpy as np
import soundfile as sf
import torch
from tqdm.auto import tqdm

# Hifigan imports
from matcha.hifigan.config import v1
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator as HiFiGAN
# Matcha imports
from matcha.models.matcha_tts import MatchaTTS
from matcha.text import sequence_to_text, text_to_sequence
from matcha.utils.model import denormalize
from matcha.utils.utils import get_user_data_dir, intersperse

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [24]:
#MATCHA_CHECKPOINT = "/content/matcha-pl-gosia/checkpoints/last.ckpt"
MATCHA_CHECKPOINT = "/content/matcha-hu-anna/checkpoints/last.ckpt"
HIFIGAN_CHECKPOINT = "/root/.local/share/matcha_tts/hifigan_univ_v1"
OUTPUT_FOLDER = "synth_output"

In [25]:
def load_model(checkpoint_path):
    model = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device)
    model.eval()
    return model
count_params = lambda x: f"{sum(p.numel() for p in x.parameters()):,}"


model = load_model(MATCHA_CHECKPOINT)
print(f"Model loaded! Parameter count: {count_params(model)}")

Model loaded! Parameter count: 18,204,193


In [26]:
def load_vocoder(checkpoint_path):
    h = AttrDict(v1)
    hifigan = HiFiGAN(h).to(device)
    hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])
    _ = hifigan.eval()
    hifigan.remove_weight_norm()
    return hifigan

vocoder = load_vocoder(HIFIGAN_CHECKPOINT)
denoiser = Denoiser(vocoder, mode='zeros')

  hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])


Removing weight norm...


In [27]:
@torch.inference_mode()
def process_text(text: str):
    x = torch.tensor(intersperse(text_to_sequence(text, ['hungarian_cleaners'])[0], 0),dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]],dtype=torch.long, device=device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    return {
        'x_orig': text,
        'x': x,
        'x_lengths': x_lengths,
        'x_phones': x_phones
    }


@torch.inference_mode()
def synthesise(text, spks=None):
    text_processed = process_text(text)
    start_t = dt.datetime.now()
    output = model.synthesise(
        text_processed['x'],
        text_processed['x_lengths'],
        n_timesteps=n_timesteps,
        temperature=temperature,
        spks=spks,
        length_scale=length_scale
    )
    # merge everything to one dict
    output.update({'start_t': start_t, **text_processed})
    return output

@torch.inference_mode()
def to_waveform(mel, vocoder):
    audio = vocoder(mel).clamp(-1, 1)
    audio = denoiser(audio.squeeze(0), strength=0.00025).cpu().squeeze()
    return audio.cpu().squeeze()

def save_to_folder(filename: str, output: dict, folder: str):
    folder = Path(folder)
    folder.mkdir(exist_ok=True, parents=True)
    np.save(folder / f'{filename}', output['mel'].cpu().numpy())
    sf.write(folder / f'{filename}.wav', output['waveform'], 22050, 'PCM_24')

In [28]:
## Number of ODE Solver steps
n_timesteps = 10

## Changes to the speaking rate
length_scale=1.0

## Sampling temperature
temperature = 0.667

In [29]:
texts = [
    #"sam inglisz łit a weri strong połlisz aksent"
    #"bóbr! kurwa! ja pierdolę! jakie bydlę!"
    #"dej er inte bra"
    "kurva anyád",
    "az óvónő a vízen sétal!"
]

In [30]:
outputs, rtfs = [], []
rtfs_w = []
for i, text in enumerate(tqdm(texts)):
    output = synthesise(text) #, torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))
    output['waveform'] = to_waveform(output['mel'], vocoder)

    # Compute Real Time Factor (RTF) with HiFi-GAN
    t = (dt.datetime.now() - output['start_t']).total_seconds()
    rtf_w = t * 22050 / (output['waveform'].shape[-1])

    ## Pretty print
    print(f"{'*' * 53}")
    print(f"Input text - {i}")
    print(f"{'-' * 53}")
    print(output['x_orig'])
    print(f"{'*' * 53}")
    print(f"Phonetised text - {i}")
    print(f"{'-' * 53}")
    print(output['x_phones'])
    print(f"{'*' * 53}")
    print(f"RTF:\t\t{output['rtf']:.6f}")
    print(f"RTF Waveform:\t{rtf_w:.6f}")
    rtfs.append(output['rtf'])
    rtfs_w.append(rtf_w)

    ## Display the synthesised waveform
    ipd.display(ipd.Audio(output['waveform'], rate=22050))

    ## Save the generated waveform
    save_to_folder(i, output, OUTPUT_FOLDER)

print(f"Number of ODE steps: {n_timesteps}")
print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
print(f"Mean RTF Waveform (incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}")

  0%|          | 0/2 [00:00<?, ?it/s]

*****************************************************
Input text - 0
-----------------------------------------------------
kurva anyád
*****************************************************
Phonetised text - 0
-----------------------------------------------------
_k_ˈ_u_r_v_ɑ_ _ˈ_ɑ_ɲ_a_ː_d_
*****************************************************
RTF:		0.426252
RTF Waveform:	1.467788


*****************************************************
Input text - 1
-----------------------------------------------------
az óvónő a vízen sétal!
*****************************************************
Phonetised text - 1
-----------------------------------------------------
_ˌ_ɑ_z_ _ˈ_o_ː_v_o_ː_n_ø_ː_ _ˌ_ɑ_ _v_ˈ_i_z_ɛ_n_ _ʃ_ˈ_e_ː_t_ɑ_l_!_
*****************************************************
RTF:		0.292793
RTF Waveform:	1.309756


Number of ODE steps: 10
Mean RTF:				0.359523 ± 0.066730
Mean RTF Waveform (incl. vocoder):	1.388772 ± 0.079016
