Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
Support for speech-to-text and text-to-speech with [SimpliSmart](https://simplismart.ai/).
SimpliSmart provides high-quality STT and TTS for Indian languages.
SimpliSmart hosts a range of STT and TTS models, including Whisper-based transcription
and TTS models such as Orpheus and Qwen 3 TTS.
For API access, visit https://simplismart.ai/
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Literal

TTSModels = Literal["canopylabs/orpheus-3b-0.1-ft", "maya-research/Veena"]
TTSModels = Literal["canopylabs/orpheus-3b-0.1-ft", "maya-research/Veena", "qwen-tts"]

STTModels = Literal[
"openai/whisper-large-v2",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,81 +1,143 @@
from __future__ import annotations

import asyncio
import os
import traceback
from dataclasses import dataclass, replace
from typing import cast

import aiohttp

# from .log import logger
from pydantic import BaseModel

from livekit.agents import (
DEFAULT_API_CONNECT_OPTIONS,
APIConnectionError,
APIConnectOptions,
APIStatusError,
APITimeoutError,
create_api_error_from_http,
tts,
utils,
)

from .log import logger
from .models import TTSModels

SIMPLISMART_BASE_URL = "https://api.simplismart.live/tts"
QWEN_BASE_URL = "https://api.simplismart.live/v1/audio/speech"
DEFAULT_ORPHEUS_MODEL = "canopylabs/orpheus-3b-0.1-ft"
DEFAULT_QWEN_MODEL = "qwen-tts"
DEFAULT_ORPHEUS_VOICE = "tara"
DEFAULT_QWEN_VOICE = "Chelsie"


@dataclass
class _SimplismartTTSOptions:
    """Sampling options sent with requests to the non-Qwen (Orpheus-style) TTS endpoint.

    Each field is copied verbatim into the JSON payload under the same key.
    """

    # Controls output randomness.
    temperature: float
    # Nucleus sampling threshold.
    top_p: float
    # Penalty for repeated tokens.
    repetition_penalty: float
    # Maximum number of output tokens.
    max_tokens: int


@dataclass
class _QwenTTSOptions:
    """Options that apply only to Qwen TTS requests.

    Both fields are sent in the JSON payload under the same key names.
    """

    # Language for synthesis (e.g. "English").
    language: str
    # Whether to include leading silence in the generated audio.
    leading_silence: bool


class SimplismartTTSOptions(BaseModel):
    """Configuration options for SimpliSmart TTS (Text-to-Speech).

    Every field has a default, so the model can be built with no arguments.
    """

    # Controls the randomness of the model output; lower is more deterministic.
    temperature: float = 0.7
    # Nucleus sampling probability threshold.
    top_p: float = 0.9
    # Penalty applied to repeated text to reduce repetition.
    repetition_penalty: float = 1.5
    # Maximum number of output tokens allowed in the synthesized speech.
    max_tokens: int = 1000
@dataclass
class _TTSOptions:
    """Resolved per-instance TTS configuration.

    Holds the model and voice plus two optional, model-specific option groups.
    Both groups default to ``None``; the constructor of the TTS class fills in
    the one that matches the selected model.
    """

    # TTS model identifier sent to the server.
    model: str
    # Voice/speaker identifier sent to the server.
    voice: str
    # Sampling options for the non-Qwen endpoint; None when unused.
    simplismart_options: _SimplismartTTSOptions | None = None
    # Qwen-specific options; a non-None value selects the Qwen request format.
    qwen_options: _QwenTTSOptions | None = None


def _is_qwen_model(model: str) -> bool:
    """Return True when *model* names a Qwen TTS model.

    The check is a case-insensitive substring match on ``"qwen"``, so any
    identifier containing that text (in any letter case) is treated as Qwen.
    """
    return "qwen" in model.lower()


class TTS(tts.TTS):
def __init__(
self,
*,
base_url: str = SIMPLISMART_BASE_URL,
model: TTSModels | str = "canopylabs/orpheus-3b-0.1-ft",
voice: str = "tara",
base_url: str | None = None,
model: TTSModels | str = DEFAULT_ORPHEUS_MODEL,
voice: str | None = None,
api_key: str | None = None,
http_session: aiohttp.ClientSession | None = None,
# sample_rate is used by the audio framework for playback; not sent to the server
sample_rate: int = 24000,
Comment thread
u9g marked this conversation as resolved.
# Simplismart TTS options
temperature: float = 0.7,
Comment thread
u9g marked this conversation as resolved.
top_p: float = 0.9,
repetition_penalty: float = 1.5,
max_tokens: int = 1000,
# Qwen 3 TTS options
language: str = "English",
leading_silence: bool = True,
) -> None:
"""
Configuration options for SimpliSmart TTS (Text-to-Speech).

Attributes:
temperature (float): Controls the randomness in the model output. Lower values make output more deterministic.
top_p (float): Nucleus sampling probability threshold. Limits the sampling pool of predicted tokens.
repetition_penalty (float): Penalty applied to repeated text to reduce repetition.
max_tokens (int): Maximum number of output tokens allowed in the synthesized speech.
"""Initialize SimpliSmart TTS.

SimpliSmart hosts multiple TTS models. The model name determines which endpoint
and payload format to use. Defaults are set for the Orpheus model
(``"canopylabs/orpheus-3b-0.1-ft"``).

Args:
base_url: Base URL for the TTS endpoint.
model: TTS model identifier.
voice: Voice/speaker identifier.
api_key: API key for authentication (defaults to ``SIMPLISMART_API_KEY`` env var).
http_session: Optional aiohttp session for reuse.
sample_rate: Expected sample rate of the returned PCM audio (default: 24000).
Used by the framework for playback; not sent to the server.
temperature: Controls output randomness.
top_p: Nucleus sampling threshold.
repetition_penalty: Penalty for repeated tokens.
max_tokens: Maximum number of output tokens.
language: Qwen 3 TTS only — language for synthesis (default: ``"English"``).
leading_silence: Qwen 3 TTS only — whether to include leading silence (default: ``True``).
"""
super().__init__(
capabilities=tts.TTSCapabilities(streaming=False),
sample_rate=24000,
sample_rate=sample_rate,
num_channels=1,
)

self._base_url = base_url
self._model = model
self._voice = voice
is_qwen = _is_qwen_model(model)

self._base_url = (
base_url
if base_url is not None
else (QWEN_BASE_URL if is_qwen else SIMPLISMART_BASE_URL)
)
self._opts = _TTSOptions(
model=model,
voice=voice
if voice is not None
else (DEFAULT_QWEN_VOICE if is_qwen else DEFAULT_ORPHEUS_VOICE),
)

if is_qwen:
self._opts.qwen_options = _QwenTTSOptions(
language=language,
leading_silence=leading_silence,
)
else:
self._opts.simplismart_options = _SimplismartTTSOptions(
temperature=temperature,
top_p=top_p,
repetition_penalty=repetition_penalty,
max_tokens=max_tokens,
)

self._api_key = api_key or os.environ.get("SIMPLISMART_API_KEY")
if not self._api_key:
raise ValueError("SIMPLISMART_API_KEY is not set")

self._session = http_session

self._opts = SimplismartTTSOptions(
temperature=temperature,
top_p=top_p,
repetition_penalty=repetition_penalty,
max_tokens=max_tokens,
)

@property
def model(self) -> str:
return self._model
return self._opts.model

@property
def provider(self) -> str:
Expand All @@ -91,27 +153,48 @@ def synthesize(
text: str,
*,
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> "ChunkedStream":
) -> ChunkedStream:
return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)


class ChunkedStream(tts.ChunkedStream):
def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
self._tts: TTS = tts
self._opts = tts._opts
self._model = tts._model
self._opts = replace(tts._opts)

async def _run(self, output_emitter: tts.AudioEmitter) -> None:
payload = self._opts.model_dump()
payload["prompt"] = self._input_text
payload["voice"] = self._tts._voice
payload["model"] = self._tts._model

headers = {
"Authorization": f"Bearer {self._tts._api_key}",
"Content-Type": "application/json",
}
if self._opts.qwen_options is not None:
qwen_opts = self._opts.qwen_options
payload: dict = {
"model": self._opts.model,
"text": self._input_text,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"text": self._input_text,
"model": self._opts.model,
"text": self._input_text,

I got this error when trying Qwen:

livekit.agents._exceptions.APIStatusError: message='Simplismart TTS API Error: {"error":"Missing required field: model"}', status_code=400, 
retryable=False, body={"error":"Missing required field: model"}

"language": qwen_opts.language,
"voice": self._opts.voice,
"leading_silence": qwen_opts.leading_silence,
}
headers = {
"Authorization": f"Bearer {self._tts._api_key}",
"Content-Type": "application/json",
"Accept": "audio/L16",
}
else:
simplismart_opts = cast(_SimplismartTTSOptions, self._opts.simplismart_options)
payload = {
"prompt": self._input_text,
"voice": self._opts.voice,
"model": self._opts.model,
"temperature": simplismart_opts.temperature,
"top_p": simplismart_opts.top_p,
"repetition_penalty": simplismart_opts.repetition_penalty,
"max_tokens": simplismart_opts.max_tokens,
}
headers = {
"Authorization": f"Bearer {self._tts._api_key}",
"Content-Type": "application/json",
}

logger.debug("TTS request to %s (model: %s)", self._tts._base_url, self._opts.model)

try:
async with self._tts._ensure_session().post(
Expand All @@ -123,19 +206,37 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
sock_connect=self._conn_options.timeout,
),
) as resp:
resp.raise_for_status()
if resp.status != 200:
error_text = await resp.text()
logger.error("Simplismart TTS API error: %s - %s", resp.status, error_text)
raise APIStatusError(
message=f"Simplismart TTS API Error: {error_text}",
status_code=resp.status,
request_id=None,
body=error_text,
)

output_emitter.initialize(
request_id=utils.shortuuid(),
sample_rate=self._tts.sample_rate,
num_channels=self._tts.num_channels,
mime_type="audio/pcm",
)

async for audio_data, _ in resp.content.iter_chunks():
output_emitter.push(audio_data)
if audio_data:
output_emitter.push(audio_data)

output_emitter.flush()
except asyncio.TimeoutError:
raise APITimeoutError() from None
except aiohttp.ClientResponseError as e:
raise create_api_error_from_http(e.message, status=e.status) from None

except asyncio.TimeoutError as e:
logger.error("Simplismart TTS API timeout: %s", e)
raise APITimeoutError("Simplismart TTS API request timed out") from e
except aiohttp.ClientError as e:
logger.error("Simplismart TTS API client error: %s", e)
raise APIConnectionError(f"Simplismart TTS API connection error: {e}") from e
except APIStatusError:
raise
except Exception as e:
raise APIConnectionError() from e
logger.error("Error during Simplismart TTS processing: %s", traceback.format_exc())
raise APIConnectionError(f"Unexpected error in Simplismart TTS: {e}") from e