# Модель Coqui

In [1]:
from TTS.api import TTS
from pathlib import Path
import torch

In [None]:
# Coqui model identifier and the preferred compute device.
model_name = "tts_models/en/ljspeech/vits"
device = "cuda"

# Fall back to the CPU when CUDA was requested but is not available.
if device == "cuda":
    if not torch.cuda.is_available():
        print("CUDA недоступна, переключение на CPU.")
        device = "cpu"

# Build the model and move it to the selected device in one step.
model = TTS(model_name=model_name).to(device)

 > tts_models/en/ljspeech/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


TTS(
  (synthesizer): Synthesizer(
    (tts_model): Vits(
      (text_encoder): TextEncoder(
        (emb): Embedding(179, 192)
        (encoder): RelativePositionTransformer(
          (dropout): Dropout(p=0.1, inplace=False)
          (attn_layers): ModuleList(
            (0-5): 6 x RelativePositionMultiHeadAttention(
              (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (norm_layers_1): ModuleList(
            (0-5): 6 x LayerNorm2()
          )
          (ffn_layers): ModuleList(
            (0-5): 6 x FeedForwardNetwork(
              (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
              (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,)

In [8]:
import logging

# Module-level logger; INFO is enabled so the "saved" confirmations are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Ensure at least one handler exists (no-op when logging is already configured).
# No level is passed, so the root logger's level is left unchanged.
logging.basicConfig()


# Speech synthesis helper for Coqui TTS
def text_to_speech_coqui(text, model, file_to_save="output.wav", speed=1.0):
    """
    Synthesize speech from text and write it to a file.

    Args:
        text (str): Text to synthesize.
        model: Initialized TTS model; must provide ``tts_to_file``.
        file_to_save (str, Path): Destination path for the audio
            (default: "output.wav").
        speed (float): Speech speed forwarded to the model (default: 1.0).

    Returns:
        None. Failures are logged (with traceback) instead of being raised.
    """
    try:
        # Generate the speech and save it to the file
        model.tts_to_file(
            text=text,
            file_path=file_to_save,
            speed=speed,
        )
    except Exception as e:
        # Best-effort helper: report the failure with its traceback, do not crash
        logger.exception("Ошибка во время генерации TTS: %s", e)
    else:
        # Use the module logger (INFO enabled above); the root logger's default
        # WARNING level would silently drop this message.
        logger.info("TTS сохранён в %s", file_to_save)

In [6]:
# Text to synthesize
text = "Artificial intelligence is revolutionizing the way we interact with technology. It has the potential to enhance our daily lives by providing personalized experiences and improving efficiency in various industries. As we continue to innovate, it is essential to consider the ethical implications of AI and ensure that it benefits everyone. Together, we can shape a future where technology and humanity coexist harmoniously."
# NOTE(review): Coqui's tts_to_file appears to write WAV data regardless of the
# file name (confirm against the Coqui TTS docs), so a ".mp3" name would
# mislabel the format; ".wav" also matches the helper's own default.
file_to_save = "output_1.wav"

# Run the speech synthesis helper
text_to_speech_coqui(text, model, file_to_save=file_to_save)

 > Text splitted to sentences.
['Artificial intelligence is revolutionizing the way we interact with technology.', 'It has the potential to enhance our daily lives by providing personalized experiences and improving efficiency in various industries.', 'As we continue to innovate, it is essential to consider the ethical implications of AI and ensure that it benefits everyone.', 'Together, we can shape a future where technology and humanity coexist harmoniously.']
 > Processing time: 1.3853058815002441
 > Real-time factor: 0.04296346808764344


# Модель Silero

In [4]:
import torch
import torchaudio
from pathlib import Path

In [5]:
# Model parameters
language = "ru"
model_id = "v4_ru"
device = "cuda"

# Fall back to the CPU when CUDA was requested but is not available
if device == "cuda":
    if not torch.cuda.is_available():
        print("CUDA недоступна, переключение на CPU.")
        device = "cpu"

# Load the Silero TTS model (and its example text) through torch.hub
model, example_text = torch.hub.load(
    "snakers4/silero-models",
    "silero_tts",
    language=language,
    speaker=model_id,
)

model.to(device)

Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master


In [9]:

def text_to_speech_silero(text, model, file_to_save="output.mp3", sample_rate=24000, speaker='eugene', put_accent=True, put_yo=True):
    """
    Synthesize speech from text with a Silero TTS model and save it to a file.

    Args:
        text (str): Text to synthesize; treated as SSML when it contains
            "<speak>", otherwise as plain text.
        model: Initialized Silero TTS model; must provide ``apply_tts``.
        file_to_save (str): Destination path for the audio
            (default: "output.mp3").
        sample_rate (int): Audio sample rate (default: 24000).
        speaker (str): Speaker name (default: 'eugene').
        put_accent (bool): Forwarded to ``apply_tts`` as ``put_accent``
            (default: True).
        put_yo (bool): Forwarded to ``apply_tts`` as ``put_yo``
            (default: True).

    Returns:
        None. Failures are logged (with traceback) instead of being raised.
    """
    # Module logger instead of the root logger, whose default WARNING level
    # would drop the INFO confirmation below.
    log = logging.getLogger(__name__)
    try:
        # SSML input is passed as `ssml_text`, plain input as `text`
        text_kwarg = "ssml_text" if "<speak>" in text else "text"
        audio = model.apply_tts(
            speaker=speaker,
            sample_rate=sample_rate,
            put_accent=put_accent,
            put_yo=put_yo,
            **{text_kwarg: text},
        )

        # A 1-D waveform gets a leading dimension so a 2-D tensor is saved
        if audio.dim() == 1:
            audio = audio.unsqueeze(0)

        # Move the tensor to the CPU before saving (no-op when it is already
        # there); the model itself may live on a CUDA device.
        torchaudio.save(file_to_save, audio.detach().cpu(), sample_rate)
        log.info("TTS сохранён в %s", file_to_save)

    except Exception as e:
        # Best-effort helper: report the failure with its traceback, do not crash
        log.exception("Ошибка во время генерации TTS: %s", e)


# def convert_to_ssml(text):
#     """
#     Преобразует обычную строку в строку, отформатированную как SSML.

#     Args:
#         text (str): Обычная строка текста.

#     Returns:
#         str: Строка SSML.
#     """
#     ssml_text = f"<speak>{text}</speak>"
#     return ssml_text


In [10]:
# SSML input and the output path for the Silero demo
file_to_save = "output_silero_2.mp3"
text = "<speak>Привет, это пример текста, который будет преобразован в аудио.</speak>"

# Synthesize the SSML text and save it to the file above
text_to_speech_silero(text, model, file_to_save)