[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ilya16/isp-tts/blob/main/notebooks/inference.ipynb)

# Setup

Run the two cells below once to download the pre-trained models and install the required packages.

In [5]:
#@title Download models and code

import os
import IPython.display as ipd

INDEX_ID = "1ejkNbKM5k11ALiXoS_qYbb01O1s2JVpq"
INDEX_FILE = "/content/index.txt"

!gdown {INDEX_ID} -O {INDEX_FILE}

FILE_INDEX = {}
with open(INDEX_FILE, 'r') as f:
    for line in f:
        gid, name = line.strip().split('\t')
        FILE_INDEX[name] = gid

for name, gid in FILE_INDEX.items():
    path = os.path.join("/content", name)
    if not os.path.exists(path) and "dataset" not in name:
        !gdown {gid} -O {path}

!git clone https://github.com/ilya16/isp-tts

ipd.clear_output()
print('Successfully downloaded data and code')

Successfully downloaded data and code


In [6]:
#@title Install dependencies
!pip install -r /content/isp-tts/requirements.txt
!apt-get install espeak-ng

ipd.clear_output()
print('Installed required Python libraries')

Installed required Python libraries


# Initialization

Start from here once the data has been downloaded and all packages have been installed.

In [1]:
#@title Imports
from __future__ import annotations
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd

# Base directory for the downloaded files and the cloned `isp-tts` repository.
CONTENT_DIR = "/content" # @param {type:"string"}

# Work from CONTENT_DIR and put the cloned repository on the import path;
# presumably this is what makes the `tts` package importable in the cells below.
os.chdir(CONTENT_DIR)
sys.path.append(os.path.join(CONTENT_DIR, 'isp-tts'))

# Restrict OpenMP to a single thread; set before `torch` is imported below.
os.environ['OMP_NUM_THREADS'] = '1'

import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ipd.clear_output()

In [2]:
#@title Build Acoustic Model
from omegaconf import OmegaConf
from tts.data.providers import TextProvider
from tts.models import AcousticModel
from tts.utils import count_parameters

# Sample rate (Hz) used as the default playback rate in `synthesize`.
SAMPLE_RATE = 22050

ACOUSTIC_MODEL = "acoustic_ru_en" #@param ["acoustic_en_ipa", "acoustic_ru_en"] {allow-input: false}
# Resolve the checkpoint under the configurable CONTENT_DIR instead of a hard-coded
# "/content", so the path follows the directory chosen in the Imports cell.
ACOUSTIC_MODEL_PATH = os.path.join(CONTENT_DIR, f'{ACOUSTIC_MODEL}.pt')

# Load the pre-trained checkpoint, switch to evaluation mode and move it to `device`.
model = AcousticModel.from_pretrained(ACOUSTIC_MODEL_PATH)
model.eval()
model = model.to(device)

print(f'Built AcousticModel `{ACOUSTIC_MODEL}` with {count_parameters(model)} parameters')



Built AcousticModel `acoustic_ru_en` with 23638635 parameters


In [3]:
#@title Build Vocoder Model

VOCODER = "vocos_ms_fp16" #@param ["vocos_ms_fp16"] {allow-input: false}
# Path derived from the form selection above, resolved under the configurable CONTENT_DIR.
VOCODER_PATH = os.path.join(CONTENT_DIR, f'{VOCODER}.pts')

# Load the TorchScript module from VOCODER_PATH (previously a duplicated hard-coded
# path was loaded and the form selection was ignored). It is deserialized on the CPU
# first, then switched to evaluation mode and moved to `device`.
vocoder = torch.jit.load(VOCODER_PATH, map_location='cpu')
vocoder = vocoder.eval().to(device)

print(f'Built Vocoder `{VOCODER}` with {count_parameters(vocoder)} parameters')

Built Vocoder `vocos_ms_fp16` with 13454850 parameters


In [4]:
#@title Build Text Provider

# Choose the symbol groups from the loaded model's encoding map:
# the presence of "ё" is used as a simple marker for Russian symbols.
charset = (
    ['<pad>', '</s>', '#punct', '#marks', '#ru', '#en']
    if "ё" in model.encoding_map
    else ['<pad>', '</s>', '#punct', '#en', '#ipa_ph']
)

# The phonemizer is enabled only when the charset includes the IPA phoneme group.
text_provider = TextProvider(charset=charset, phonemizer='#ipa_ph' in charset, mask_phonemes=False)
print(f'Built TextProvider with symbols: `{text_provider.coding_table}`')


Built TextProvider with symbols: `<pad> </s> . ! ? ( ) : ; , — -   ' " + * \ а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я a b c d e f g h i j k l m n o p q r s t u v w x y z`


# Inspect AcousticModel

### Text Embedding and Transformer Text Encoder

In [5]:
# Print the parameter count and the structure of `model.text_embedding`.
print(
    "Text Embedding",
    f"Parameters: {count_parameters(model.text_embedding)}\n",
    model.text_embedding,
    sep="\n",
)

Text Embedding
Parameters: 29568

Embedding(77, 384, padding_idx=0)


In [6]:
# Print the parameter count and the structure of `model.encoder`.
print(
    "Text Encoder",
    f"Parameters: {count_parameters(model.encoder)}\n",
    model.encoder,
    sep="\n",
)

Text Encoder
Parameters: 9152292

Transformer(
  (layers): ModuleList(
    (0-5): 6 x TransformerLayer(
      (attention_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (attention): Attention(
        (to_q): Linear(in_features=384, out_features=384, bias=False)
        (to_kv): Linear(in_features=384, out_features=128, bias=False)
        (rel_pos): LearnedALiBiPositionalBias()
        (attend): Attend(
          (attn_dropout): Dropout(p=0.1, inplace=False)
        )
        (to_out): Linear(in_features=384, out_features=384, bias=False)
      )
      (feed_forward_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (feed_forward): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=False)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=1536, out_features=384, bias=False)
        )
      )
    )
  )
  (project_emb): Identity()
  (

### Aligner

In [7]:
# Print the parameter count and the structure of `model.aligner`.
print(
    "Aligner",
    f"Parameters: {count_parameters(model.aligner)}\n",
    model.aligner,
    sep="\n",
)

Aligner
Parameters: 1713120

Aligner(
  (attention): ConvAttention(
    (key_proj): ModuleList(
      (0): ConvBlock1D(
        (conv): Conv1d(384, 768, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
        (act): GELU(approximate='none')
        (norm): MaskedInstanceNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): ConvBlock1D(
        (conv): Conv1d(768, 128, kernel_size=(1,), stride=(1,), bias=False)
        (act): Identity()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (query_proj): ModuleList(
      (0): ConvBlock1D(
        (conv): Conv1d(80, 160, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
        (act): GELU(approximate='none')
        (norm): MaskedInstanceNorm1d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): ConvBlock1D(
        (conv): Conv1d(160

### Flow-based Transformer Temporal Adaptor

In [8]:
# Print the parameter count and the structure of `model.temporal_adaptor`.
print(
    "Temporal Adaptor",
    f"Parameters: {count_parameters(model.temporal_adaptor)}\n",
    model.temporal_adaptor,
    sep="\n",
)

Temporal Adaptor
Parameters: 3058675

FlowTemporalAdaptor(
  (length_regulator): LengthRegulator()
  (averager): TemporalAverager()
  (predictor): FlowTransformerTemporalModule(
    (time_embedding): TimePositionalEmbedding(
      (freq_emb): SinusoidalEmbedding(dim=64, theta=1000.000, freq_scale=1000.000, with_positions=True)
      (mlp): Sequential(
        (0): Linear(in_features=65, out_features=32, bias=True)
        (1): SiLU()
        (2): Linear(in_features=32, out_features=32, bias=True)
      )
    )
    (transformer): Transformer(
      (layers): ModuleList(
        (0-2): 3 x TransformerLayer(
          (attention_norm): AdaptiveLayerNorm(
            bias=True
            (weight): Linear(in_features=32, out_features=256, bias=True)
            (bias): Linear(in_features=32, out_features=256, bias=True)
          )
          (attention): Attention(
            (to_q): Linear(in_features=256, out_features=256, bias=False)
            (to_kv): Linear(in_features=256, out_fea

### Speaker Embeddings

In [9]:
# Print the parameter count and the structure of `model.speaker_embedding`.
print(
    "Speaker Embeddings",
    f"Parameters: {count_parameters(model.speaker_embedding)}\n",
    model.speaker_embedding,
    sep="\n",
)

Speaker Embeddings
Parameters: 501888

Embedding(1307, 384)


### Transformer Mel Decoder

In [10]:
# Print the parameter count and the structure of `model.decoder`.
print(
    "Decoder",
    f"Parameters: {count_parameters(model.decoder)}\n",
    model.decoder,
    sep="\n",
)

# Blank separator line, then the same report for `model.to_mel`.
print()
print(
    "Mel Projection",
    f"Parameters: {count_parameters(model.to_mel)}\n",
    model.to_mel,
    sep="\n",
)

Decoder
Parameters: 9152292

Transformer(
  (layers): ModuleList(
    (0-5): 6 x TransformerLayer(
      (attention_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (attention): Attention(
        (to_q): Linear(in_features=384, out_features=384, bias=False)
        (to_kv): Linear(in_features=384, out_features=128, bias=False)
        (rel_pos): LearnedALiBiPositionalBias()
        (attend): Attend(
          (attn_dropout): Dropout(p=0.1, inplace=False)
        )
        (to_out): Linear(in_features=384, out_features=384, bias=False)
      )
      (feed_forward_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (feed_forward): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=False)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=1536, out_features=384, bias=False)
        )
      )
    )
  )
  (project_emb): Identity()
  (norm)

# Inference

Define acoustic and vocoder inference functions:

In [11]:
#@title Processing functions

def text2vector(text_provider, text: str):
    """Encode `text` with `text_provider` and return its vector with a new leading axis.

    `text_provider` is called with the raw string. The `.vector` attribute of
    its result is indexed with `None`, which prepends a size-1 dimension
    (presumably the batch axis expected by the acoustic model).
    """
    return text_provider(text).vector[None]


def textvector2mel(
    model: AcousticModel,
    text_vector: torch.Tensor,
    duration_target: torch.Tensor | None = None,
    duration_factor: float = 1.0,
    pitch_target: torch.Tensor | None = None,
    pitch_factor: float = 1.0,
    pitch_delta: float = 0.,
    pitch_normalize: bool = True,
    energy_target: torch.Tensor | None = None,
    steps: int = 4,
    speaker: int | None = None
):
    """Run acoustic-model inference on an encoded text.

    Inference runs under autocast and `torch.inference_mode()` on the global
    `device`. All tensor inputs are moved to `device` before the call.

    Args:
        model: Acoustic model exposing `infer(...)`.
        text_vector: Encoded text, passed to `model.infer` as `input_sequence`.
        duration_target, pitch_target, energy_target: Optional tensors forwarded
            to `model.infer`; moved to `device` when given.
        duration_factor, pitch_factor, pitch_delta, pitch_normalize, steps:
            Forwarded unchanged to `model.infer`.
        speaker: Optional integer speaker index; wrapped into a one-element
            tensor on `device` when given.

    Returns:
        The `(mel, adaptor_output)` pair returned by `model.infer`.
    """
    def _on_device(tensor):
        # Targets built by the caller may live on the CPU; put them on the same
        # device as the text and speaker inputs to avoid a device mismatch.
        return tensor.to(device) if tensor is not None else None

    with torch.amp.autocast(enabled=True, device_type=device.type):
        with torch.inference_mode():
            mel, adaptor_output = model.infer(
                input_sequence=text_vector.to(device),
                duration_target=_on_device(duration_target),
                duration_factor=duration_factor,
                pitch_target=_on_device(pitch_target),
                pitch_factor=pitch_factor,
                pitch_delta=pitch_delta,
                pitch_normalize=pitch_normalize,
                energy_target=_on_device(energy_target),
                steps=steps,
                speaker=torch.tensor([speaker]).to(device) if speaker is not None else None
            )

    return mel, adaptor_output


def mel2audio(vocoder, mel):
    """Turn a mel tensor into a NumPy waveform using `vocoder`.

    The mel is cast to half precision and passed to `vocoder.infer`; the result
    is squeezed, moved to the CPU and converted to a NumPy array. The call runs
    under autocast and `torch.inference_mode()` for the global `device` type.
    """
    with torch.amp.autocast(enabled=True, device_type=device.type), torch.inference_mode():
        waveform = vocoder.infer(mel.half())
        return waveform.squeeze().cpu().numpy()


In [12]:
#@title Combined synthesis function

def synthesize(
    text,
    speaker: int | None = None,
    duration_target: torch.Tensor | None = None,
    duration_factor: float = 1.0,
    pitch_target: torch.Tensor | None = None,
    pitch_factor: float = 1.0,
    pitch_delta: float = 0.,
    pitch_normalize: bool = True,
    energy_target: torch.Tensor | None = None,
    steps: int = 4,
    sample_rate: int = SAMPLE_RATE,
    display: bool = True
):
    """Synthesize speech for `text` with the global acoustic model and vocoder.

    Pipeline: `text2vector` -> `textvector2mel` -> `mel2audio`, using the
    notebook-level `text_provider`, `model` and `vocoder`.

    Args:
        text: Input text passed to the text provider.
        speaker, duration_target, duration_factor, pitch_target, pitch_factor,
        pitch_delta, pitch_normalize, energy_target, steps: Forwarded to
            `textvector2mel`.
        sample_rate: Playback rate given to the audio widget.
        display: When true, print the stage timings (in seconds) and show an
            audio player for the result.

    Returns:
        `(audio, mel, adaptor_output)`: the waveform clipped to [-1, 1] and the
        two values returned by `textvector2mel`.
    """
    import time

    text_vector = text2vector(text_provider, text)

    acoustic_start = time.perf_counter()
    mel, adaptor_output = textvector2mel(
        model,
        text_vector=text_vector,
        speaker=speaker,
        duration_target=duration_target,
        duration_factor=duration_factor,
        pitch_target=pitch_target,
        pitch_factor=pitch_factor,
        pitch_delta=pitch_delta,
        pitch_normalize=pitch_normalize,
        energy_target=energy_target,
        steps=steps
    )
    if display:
        if device.type == 'cuda':
            # CUDA kernels are launched asynchronously: wait for them to finish,
            # otherwise the pending acoustic work is billed to the vocoder timing.
            torch.cuda.synchronize()
        print('acoustic:', time.perf_counter() - acoustic_start)

    vocoder_start = time.perf_counter()
    # No explicit synchronization needed here: `mel2audio` copies its result to
    # the CPU, which already waits for the GPU work to complete.
    audio = mel2audio(vocoder, mel)
    if display:
        print('vocoder:', time.perf_counter() - vocoder_start)

    # Clip to [-1, 1] because the player below is used with normalize=False.
    audio = np.clip(audio, -1., 1.)
    if display:
        ipd.display(ipd.Audio(audio, rate=sample_rate, normalize=False))

    return audio, mel, adaptor_output

In [13]:
#@title Test Model
# Colab form: the values below are passed to `synthesize`, which forwards the
# controls through `textvector2mel` to `model.infer`.

#@markdown Inputs
text = "Speech Synthesis is awesome! Do you know?" # @param {"type":"string","placeholder":"Speech Synthesis is awesome!"}
# NOTE(review): presumably an index into `model.speaker_embedding` — confirm the valid range.
speaker = 100 # @param {"type":"integer","placeholder":"100"}

#@markdown Speaking rate control
duration_factor = 1. # @param {"type":"number","placeholder":"1."}

#@markdown Pitch control
pitch_factor = 1. # @param {"type":"number","placeholder":"1."}
pitch_delta = 0 # @param {"type":"number","placeholder":"0"}

#@markdown Flow inference steps
steps = 4 # @param {"type":"integer","placeholder":"4"}

# With the default display=True this prints the stage timings and shows an audio player.
# Returns the clipped waveform, the mel output and the adaptor output.
audio, mel, adaptor_output = synthesize(
    text,
    speaker=speaker,
    duration_factor=duration_factor,
    pitch_factor=pitch_factor,
    pitch_delta=pitch_delta,
    steps=steps
)


acoustic: 2.6973961469998358
vocoder: 2.945626270000048


In [None]:
# Waveform returned by `synthesize`: a NumPy array clipped to [-1, 1].
audio

In [None]:
# Mel output of the acoustic model, as returned by `synthesize`.
mel

In [None]:
# Second value returned by `model.infer`.
# NOTE(review): presumably holds the predicted durations and pitch mentioned in the tasks below — confirm.
adaptor_output

## Tasks

1. Plot the mel spectrogram, durations and pitch.
2. Try different texts, speakers and controls using the form above, and report your findings.
3. Compare the results for the same texts side by side.

