Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "jambonz-python-sdk"
version = "0.3.0"
version = "0.3.2"
description = "Python SDK for jambonz CPaaS platform"
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion scripts/sync_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from pathlib import Path

# ── Pin the schema version here ──────────────────────────────────────
SCHEMA_VERSION = "v0.2.1"
SCHEMA_VERSION = "v0.3.8"
# ────────────────────────────────────────────────────────────────────

DEST = Path(__file__).resolve().parent.parent / "src" / "jambonz_sdk" / "schema"
Expand Down
7 changes: 6 additions & 1 deletion src/jambonz_sdk/schema/callbacks/call-status.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://jambonz.org/schema/callbacks/call-status",
"title": "Call Status Webhook Payload",
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.",
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.\n\n**Capturing B-leg call_sid:** When using the dial verb to bridge calls, status events are sent for both legs. The A-leg (original inbound call) has `direction: 'inbound'`. The B-leg (outbound dialed call) has `direction: 'outbound'`. To capture the B-leg's call_sid for later use (e.g., injecting commands to the B-leg), listen for status events where `direction === 'outbound'` and extract the `call_sid` field.",
"allOf": [
{ "$ref": "base" }
],
"type": "object",
"properties": {
"direction": {
"type": "string",
"enum": ["inbound", "outbound"],
"description": "Call direction. 'inbound' = A-leg (original incoming call to the application). 'outbound' = B-leg (call placed by the dial verb). Use this field to identify which leg generated the status event, especially when capturing the B-leg's call_sid for mid-call control."
},
"call_termination_by": {
"type": "string",
"enum": ["caller", "jambonz"],
Expand Down
2 changes: 1 addition & 1 deletion src/jambonz_sdk/schema/components/llm-base.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"vendor": {
"type": "string",
"description": "The LLM vendor to use.",
"examples": ["openai", "anthropic", "google", "groq", "deepseek", "deepgram", "ultravox", "custom"]
"examples": ["openai", "anthropic", "google", "groq", "deepseek", "baseten", "deepgram", "ultravox", "custom"]
},
"model": {
"type": "string",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@
},
"minEndOfTurnSilenceWhenConfident": {
"type": "number",
"description": "Minimum silence duration (seconds) to trigger end-of-turn when confidence is met."
"description": "Minimum silence duration (milliseconds) to trigger end-of-turn when confidence is met. Default: 400."
},
"maxTurnSilence": {
"type": "number",
"description": "Maximum silence duration (seconds) before forcing end-of-turn."
"description": "Maximum silence duration (milliseconds) before forcing end-of-turn. Default: 1280."
},
"minTurnSilence": {
"type": "number",
"description": "Minimum silence duration (seconds) before allowing end-of-turn."
"description": "Minimum silence duration (milliseconds) before allowing end-of-turn."
},
"keyterms": {
"type": "array",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@
"eagerEotThreshold": {
"type": "number",
"description": "Eager end-of-turn threshold for faster response."
},
"languageHints": {
"type": "array",
"items": { "type": "string" },
"description": "Language hints for Deepgram Flux Multilingual. BCP-47 codes (e.g. 'en', 'es', 'fr'). Biases transcription toward specified languages."
}
},
"additionalProperties": false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
"type": "string",
"description": "ID of a Google Speech recognizer resource (v2 only)."
},
"parentPath": {
"type": "string",
"description": "Parent resource path for the Google Speech recognizer (v2 only), e.g. 'projects/{project}/locations/{location}'."
},
"speechStartTimeoutMs": {
"type": "number",
"description": "Timeout in milliseconds to wait for speech to start."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@
"description": "Custom vocabulary terms."
},
"languageModel": { "type": "string", "description": "Language model to use." },
"audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." }
"audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." },
"eoqThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "End-of-query likelihood threshold (0.0-1.0) to trigger end of speech when segmentation is disabled. Default 0.8, set to 0 to disable." },
"vadStopThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "VAD probability threshold to trigger end of speech when segmentation is disabled. When VAD drops below this value after speech is detected, streaming stops. Default 0.05, set to 0 to disable." }
},
"additionalProperties": false
}
171 changes: 169 additions & 2 deletions src/jambonz_sdk/schema/verbs/agent.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,28 @@
"$ref": "../components/synthesizer",
"description": "Text-to-speech configuration for the agent."
},
"autoLockLanguage": {
"oneOf": [
{ "type": "boolean" },
{ "type": "string", "enum": ["always"] }
],
"description": "When using Deepgram Flux Multilingual, automatically adjust STT language hints and switch TTS voice based on detected language. Values: false (disabled), true (lock on first utterance), 'always' (continuously adapt on every turn). Default: false.",
"default": false
},
"languageConfig": {
"type": "object",
"description": "Per-language overrides for TTS. Keys are BCP-47 language codes. When autoLockLanguage detects a language switch, the agent uses the corresponding config.",
"additionalProperties": {
"type": "object",
"properties": {
"tts": {
"$ref": "../components/synthesizer",
"description": "TTS config override for this language. Merged with default tts."
}
},
"additionalProperties": false
}
},
"turnDetection": {
"oneOf": [
{
Expand Down Expand Up @@ -86,8 +108,93 @@
},
"llm": {
"type": "object",
"description": "LLM configuration for the agent. See the 'llm' verb schema for details.",
"additionalProperties": true
"description": "LLM configuration for the agent.",
"required": ["vendor", "model"],
"properties": {
"vendor": {
"type": "string",
"enum": [
"openai",
"anthropic",
"google",
"vertex-gemini",
"vertex-openai",
"bedrock",
"deepseek",
"baseten",
"azure-openai",
"groq",
"huggingface"
],
"description": "LLM vendor id. Must match a `@jambonz/llm` registered adapter."
},
"model": {
"type": "string",
"description": "Vendor-specific model id (e.g. 'gpt-4o', 'claude-sonnet-4-5-20250929')."
},
"label": {
"type": "string",
"description": "Optional label to disambiguate when the account has multiple credentials for the same vendor."
},
"auth": {
"type": "object",
"description": "Optional inline credentials. When omitted, feature-server looks up credentials by (vendor, label) from the database.",
"properties": {
"apiKey": { "type": "string" }
},
"additionalProperties": true
},
"connectOptions": {
"type": "object",
"description": "SDK-level client options.",
"properties": {
"timeout": { "type": "number", "minimum": 0 },
"maxRetries": { "type": "integer", "minimum": 0 },
"endpoint": { "type": "string" },
"baseURL": { "type": "string" }
},
"additionalProperties": false
},
"llmOptions": {
"type": "object",
"description": "Per-call LLM configuration.",
"properties": {
"systemPrompt": {
"type": "string",
"description": "System prompt for the model. Placed vendor-appropriately (top-level for Anthropic/Bedrock, config.systemInstruction for Gemini, role:'system' for OpenAI-compatibles)."
},
"messages": {
"type": "array",
"description": "Seed conversation history. A role:'system' entry is extracted into systemPrompt internally.",
"items": { "$ref": "#/$defs/llmMessage" }
},
"initialMessages": {
"type": "array",
"description": "Alias of 'messages' (historical).",
"items": { "$ref": "#/$defs/llmMessage" }
},
"maxTokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum tokens the model may generate per turn."
},
"temperature": {
"type": "number",
"minimum": 0,
"description": "Sampling temperature."
},
"tools": {
"type": "array",
"description": "Tool / function definitions available to the model. The MCP-flat shape `{name, description, parameters}` is canonical; the OpenAI-wrapped form `{type:'function', function:{...}}` is also accepted.",
"items": {
"type": "object"
}
}
},
"additionalProperties": false
}
},
"additionalProperties": false
},
"actionHook": {
"$ref": "../components/actionHook",
Expand Down Expand Up @@ -172,11 +279,71 @@
"required": ["url"]
},
"description": "External MCP servers that provide tools to the LLM. The agent connects at startup via SSE, discovers available tools, and makes them callable by the LLM."
},
"toolFiller": {
"oneOf": [
{ "type": "boolean", "const": false },
{ "$ref": "#/$defs/toolFillerConfig" }
],
"description": "Configuration for playing filler audio while tool calls are in progress. Prevents silence during long-running tool executions."
}
},
"required": [
"llm"
],
"$defs": {
"llmMessage": {
"type": "object",
"description": "A conversation-history message. The library normalizes content to a string; adapters may carry vendor-native shapes internally.",
"required": ["role", "content"],
"properties": {
"role": {
"type": "string",
"enum": ["system", "user", "assistant", "tool"]
},
"content": {}
},
"additionalProperties": true
},
"toolFillerConfig": {
"type": "object",
"required": ["type"],
"properties": {
"type": {
"type": "string",
"enum": ["audio", "backchannel"],
"description": "Filler mode. 'audio' plays a looping audio file (killed immediately when tool returns). 'backchannel' uses TTS to speak short phrases like 'one moment...' or 'checking that...' (allowed to complete before next action)."
},
"startDelaySecs": {
"type": "number",
"minimum": 0,
"description": "Seconds to wait after tool invocation before playing filler. Prevents filler on fast tools. Default: 2",
"default": 2
},
"url": {
"type": "string",
"format": "uri",
"description": "Audio file URL (for type='audio'). Should be loopable. Playback stops immediately when tool returns."
},
"style": {
"type": "string",
"description": "Free-form style hint for LLM-generated phrases (for type='backchannel'). Examples: 'casual', 'professional', 'friendly and patient'. The LLM interprets this when generating filler phrases at call start."
},
"escalationSecs": {
"type": "number",
"minimum": 0,
"description": "Seconds from tool invocation before generating a longer explanatory response (for type='backchannel'). Useful for tools that may take 10+ seconds. Default: 10",
"default": 10
}
},
"allOf": [
{
"if": { "properties": { "type": { "const": "audio" } } },
"then": { "required": ["url"] }
}
]
}
},
"examples": [
{
"verb": "agent",
Expand Down
2 changes: 1 addition & 1 deletion src/jambonz_sdk/schema/verbs/dub.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"$id": "https://jambonz.org/schema/verbs/dub",
"minVersion": "0.9.6",
"title": "Dub",
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.",
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.\n\n**Track Routing:** Tracks are heard by the party on whose call leg they are created. A dub verb in the main verb stack (A-leg) creates tracks heard by the caller. A dub verb nested in the dial verb's `dub` array creates tracks heard by the callee. When using injectCommand to play/say on a track from a different call leg, pass the target call's `call_sid` as the third argument to `session.injectCommand()` to route the command to the correct leg.",
"type": "object",
"properties": {
"verb": {
Expand Down
3 changes: 2 additions & 1 deletion src/jambonz_sdk/schema/verbs/transcribe.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
},
"channel": {
"type": "number",
"description": "Specific audio channel to transcribe."
"enum": [1, 2],
"description": "Specific audio channel to transcribe. Channel 1 = near-end (local party's audio, i.e. caller on A-leg or callee on B-leg). Channel 2 = far-end (remote party's audio). When transcribe is nested in the dial verb, omitting channel captures both legs mixed; specifying channel: 2 isolates the B-leg's inbound audio."
}
},
"examples": [
Expand Down
44 changes: 38 additions & 6 deletions src/jambonz_sdk/websocket/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,19 +229,30 @@ async def clear_tts_tokens(self) -> None:
async def tool_output(self, tool_call_id: str, result: Any) -> Session:
    """Return a tool call result to the agent LLM.

    Canonical wire shape (validated by ``@jambonz/schema``)::

        {"type": "command", "command": "llm:tool-output",
         "tool_call_id": "...", "data": {"result": ...}}

    The ``result`` argument becomes ``data.result`` when it is not a dict,
    matching the Node SDK's convenience wrapping. Passing a dict sends it
    as-is so callers can include richer structured output (feature-server
    JSON-stringifies the full ``data`` object on the way to the LLM).

    Args:
        tool_call_id: The tool_call_id from the llm:tool-call event.
        result: The tool result. A non-dict value is wrapped as
            ``{"result": result}``; a dict is sent as-is.

    Returns:
        self for chaining with .reply().
    """
    # Dicts pass through untouched; anything else is wrapped under "result".
    payload = result if isinstance(result, dict) else {"result": result}

    # Every key appears exactly once: tool_call_id sits at the top level of
    # the command envelope and "data" carries only the tool result payload.
    msg = {
        "type": "command",
        "command": "llm:tool-output",
        "tool_call_id": tool_call_id,
        "data": payload,
    }
    await self._ws.send(json.dumps(msg))
    return self
Expand All @@ -257,3 +268,24 @@ async def update_agent(self, data: dict[str, Any]) -> None:
"""
msg = {"type": "agent:update", "data": data}
await self._ws.send(json.dumps(msg))

async def inject_stt_reconfigure(
self,
language_hints: list[str] | None = None,
opts: dict[str, Any] | None = None
) -> None:
"""Reconfigure STT (speech-to-text) settings mid-call.

Currently supports updating language hints for Deepgram Flux Multilingual.

Args:
language_hints: List of BCP-47 language codes (e.g., ['en', 'es']).
Pass empty list [] to clear hints and enable auto-detection.
opts: Additional STT reconfiguration options.
"""
data: dict[str, Any] = {}
if language_hints is not None:
data["languageHints"] = language_hints
if opts:
data.update(opts)
await self.inject_command("stt:reconfigure", data)
Loading
Loading