jambonz · sammachin · May 19, 2026 · Apr 26, 2026 · Apr 26, 2026 · May 12, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "jambonz-python-sdk"
-version = "0.3.0"
+version = "0.3.2"
 description = "Python SDK for jambonz CPaaS platform"
 readme = "README.md"
 requires-python = ">=3.10"

diff --git a/scripts/sync_schema.py b/scripts/sync_schema.py
@@ -22,7 +22,7 @@
 from pathlib import Path
 
 # ── Pin the schema version here ──────────────────────────────────────
-SCHEMA_VERSION = "v0.2.1"
+SCHEMA_VERSION = "v0.3.8"
 # ────────────────────────────────────────────────────────────────────
 
 DEST = Path(__file__).resolve().parent.parent / "src" / "jambonz_sdk" / "schema"

diff --git a/src/jambonz_sdk/schema/callbacks/call-status.schema.json b/src/jambonz_sdk/schema/callbacks/call-status.schema.json
@@ -2,12 +2,17 @@
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://jambonz.org/schema/callbacks/call-status",
   "title": "Call Status Webhook Payload",
-  "description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.",
+  "description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.\n\n**Capturing B-leg call_sid:** When using the dial verb to bridge calls, status events are sent for both legs. The A-leg (original inbound call) has `direction: 'inbound'`. The B-leg (outbound dialed call) has `direction: 'outbound'`. To capture the B-leg's call_sid for later use (e.g., injecting commands to the B-leg), listen for status events where `direction === 'outbound'` and extract the `call_sid` field.",
   "allOf": [
     { "$ref": "base" }
   ],
   "type": "object",
   "properties": {
+    "direction": {
+      "type": "string",
+      "enum": ["inbound", "outbound"],
+      "description": "Call direction. 'inbound' = A-leg (original incoming call to the application). 'outbound' = B-leg (call placed by the dial verb). Use this field to identify which leg generated the status event, especially when capturing the B-leg's call_sid for mid-call control."
+    },
     "call_termination_by": {
       "type": "string",
       "enum": ["caller", "jambonz"],

diff --git a/src/jambonz_sdk/schema/components/llm-base.schema.json b/src/jambonz_sdk/schema/components/llm-base.schema.json
@@ -12,7 +12,7 @@
     "vendor": {
       "type": "string",
       "description": "The LLM vendor to use.",
-      "examples": ["openai", "anthropic", "google", "groq", "deepseek", "deepgram", "ultravox", "custom"]
+      "examples": ["openai", "anthropic", "google", "groq", "deepseek", "baseten", "deepgram", "ultravox", "custom"]
     },
     "model": {
       "type": "string",

diff --git a/src/jambonz_sdk/schema/components/recognizer-assemblyAiOptions.schema.json b/src/jambonz_sdk/schema/components/recognizer-assemblyAiOptions.schema.json
@@ -28,15 +28,15 @@
     },
     "minEndOfTurnSilenceWhenConfident": {
       "type": "number",
-      "description": "Minimum silence duration (seconds) to trigger end-of-turn when confidence is met."
+      "description": "Minimum silence duration (milliseconds) to trigger end-of-turn when confidence is met. Default: 400."
     },
     "maxTurnSilence": {
       "type": "number",
-      "description": "Maximum silence duration (seconds) before forcing end-of-turn."
+      "description": "Maximum silence duration (milliseconds) before forcing end-of-turn. Default: 1280."
     },
     "minTurnSilence": {
       "type": "number",
-      "description": "Minimum silence duration (seconds) before allowing end-of-turn."
+      "description": "Minimum silence duration (milliseconds) before allowing end-of-turn."
     },
     "keyterms": {
       "type": "array",

diff --git a/src/jambonz_sdk/schema/components/recognizer-deepgramOptions.schema.json b/src/jambonz_sdk/schema/components/recognizer-deepgramOptions.schema.json
@@ -141,6 +141,11 @@
     "eagerEotThreshold": {
       "type": "number",
       "description": "Eager end-of-turn threshold for faster response."
+    },
+    "languageHints": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "Language hints for Deepgram Flux Multilingual. BCP-47 codes (e.g. 'en', 'es', 'fr'). Biases transcription toward specified languages."
     }
   },
   "additionalProperties": false

diff --git a/src/jambonz_sdk/schema/components/recognizer-googleOptions.schema.json b/src/jambonz_sdk/schema/components/recognizer-googleOptions.schema.json
@@ -14,6 +14,10 @@
       "type": "string",
       "description": "ID of a Google Speech recognizer resource (v2 only)."
     },
+    "parentPath": {
+      "type": "string",
+      "description": "Parent resource path for the Google Speech recognizer (v2 only), e.g. 'projects/{project}/locations/{location}'."
+    },
     "speechStartTimeoutMs": {
       "type": "number",
       "description": "Timeout in milliseconds to wait for speech to start."

diff --git a/src/jambonz_sdk/schema/components/recognizer-houndifyOptions.schema.json b/src/jambonz_sdk/schema/components/recognizer-houndifyOptions.schema.json
@@ -47,7 +47,9 @@
       "description": "Custom vocabulary terms."
     },
     "languageModel": { "type": "string", "description": "Language model to use." },
-    "audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." }
+    "audioQueryAbsoluteTimeout": { "type": "number", "description": "Absolute timeout for audio queries." },
+    "eoqThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "End-of-query likelihood threshold (0.0-1.0) to trigger end of speech when segmentation is disabled. Default 0.8, set to 0 to disable." },
+    "vadStopThreshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "VAD probability threshold to trigger end of speech when segmentation is disabled. When VAD drops below this value after speech is detected, streaming stops. Default 0.05, set to 0 to disable." }
   },
   "additionalProperties": false
 }
diff --git a/src/jambonz_sdk/schema/verbs/agent.schema.json b/src/jambonz_sdk/schema/verbs/agent.schema.json
@@ -21,6 +21,28 @@
       "$ref": "../components/synthesizer",
       "description": "Text-to-speech configuration for the agent."
     },
+    "autoLockLanguage": {
+      "oneOf": [
+        { "type": "boolean" },
+        { "type": "string", "enum": ["always"] }
+      ],
+      "description": "When using Deepgram Flux Multilingual, automatically adjust STT language hints and switch TTS voice based on detected language. Values: false (disabled), true (lock on first utterance), 'always' (continuously adapt on every turn). Default: false.",
+      "default": false
+    },
+    "languageConfig": {
+      "type": "object",
+      "description": "Per-language overrides for TTS. Keys are BCP-47 language codes. When autoLockLanguage detects a language switch, the agent uses the corresponding config.",
+      "additionalProperties": {
+        "type": "object",
+        "properties": {
+          "tts": {
+            "$ref": "../components/synthesizer",
+            "description": "TTS config override for this language. Merged with default tts."
+          }
+        },
+        "additionalProperties": false
+      }
+    },
     "turnDetection": {
       "oneOf": [
         {
@@ -86,8 +108,93 @@
     },
     "llm": {
       "type": "object",
-      "description": "LLM configuration for the agent. See the 'llm' verb schema for details.",
-      "additionalProperties": true
+      "description": "LLM configuration for the agent.",
+      "required": ["vendor", "model"],
+      "properties": {
+        "vendor": {
+          "type": "string",
+          "enum": [
+            "openai",
+            "anthropic",
+            "google",
+            "vertex-gemini",
+            "vertex-openai",
+            "bedrock",
+            "deepseek",
+            "baseten",
+            "azure-openai",
+            "groq",
+            "huggingface"
+          ],
+          "description": "LLM vendor id. Must match a `@jambonz/llm` registered adapter."
+        },
+        "model": {
+          "type": "string",
+          "description": "Vendor-specific model id (e.g. 'gpt-4o', 'claude-sonnet-4-5-20250929')."
+        },
+        "label": {
+          "type": "string",
+          "description": "Optional label to disambiguate when the account has multiple credentials for the same vendor."
+        },
+        "auth": {
+          "type": "object",
+          "description": "Optional inline credentials. When omitted, feature-server looks up credentials by (vendor, label) from the database.",
+          "properties": {
+            "apiKey": { "type": "string" }
+          },
+          "additionalProperties": true
+        },
+        "connectOptions": {
+          "type": "object",
+          "description": "SDK-level client options.",
+          "properties": {
+            "timeout": { "type": "number", "minimum": 0 },
+            "maxRetries": { "type": "integer", "minimum": 0 },
+            "endpoint": { "type": "string" },
+            "baseURL": { "type": "string" }
+          },
+          "additionalProperties": false
+        },
+        "llmOptions": {
+          "type": "object",
+          "description": "Per-call LLM configuration.",
+          "properties": {
+            "systemPrompt": {
+              "type": "string",
+              "description": "System prompt for the model. Placed vendor-appropriately (top-level for Anthropic/Bedrock, config.systemInstruction for Gemini, role:'system' for OpenAI-compatibles)."
+            },
+            "messages": {
+              "type": "array",
+              "description": "Seed conversation history. A role:'system' entry is extracted into systemPrompt internally.",
+              "items": { "$ref": "#/$defs/llmMessage" }
+            },
+            "initialMessages": {
+              "type": "array",
+              "description": "Alias of 'messages' (historical).",
+              "items": { "$ref": "#/$defs/llmMessage" }
+            },
+            "maxTokens": {
+              "type": "integer",
+              "minimum": 1,
+              "description": "Maximum tokens the model may generate per turn."
+            },
+            "temperature": {
+              "type": "number",
+              "minimum": 0,
+              "description": "Sampling temperature."
+            },
+            "tools": {
+              "type": "array",
+              "description": "Tool / function definitions available to the model. The MCP-flat shape `{name, description, parameters}` is canonical; the OpenAI-wrapped form `{type:'function', function:{...}}` is also accepted.",
+              "items": {
+                "type": "object"
+              }
+            }
+          },
+          "additionalProperties": false
+        }
+      },
+      "additionalProperties": false
     },
     "actionHook": {
       "$ref": "../components/actionHook",
@@ -172,11 +279,71 @@
         "required": ["url"]
       },
       "description": "External MCP servers that provide tools to the LLM. The agent connects at startup via SSE, discovers available tools, and makes them callable by the LLM."
+    },
+    "toolFiller": {
+      "oneOf": [
+        { "type": "boolean", "const": false },
+        { "$ref": "#/$defs/toolFillerConfig" }
+      ],
+      "description": "Configuration for playing filler audio while tool calls are in progress. Prevents silence during long-running tool executions."
     }
   },
   "required": [
     "llm"
   ],
+  "$defs": {
+    "llmMessage": {
+      "type": "object",
+      "description": "A conversation-history message. The library normalizes content to a string; adapters may carry vendor-native shapes internally.",
+      "required": ["role", "content"],
+      "properties": {
+        "role": {
+          "type": "string",
+          "enum": ["system", "user", "assistant", "tool"]
+        },
+        "content": {}
+      },
+      "additionalProperties": true
+    },
+    "toolFillerConfig": {
+      "type": "object",
+      "required": ["type"],
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": ["audio", "backchannel"],
+          "description": "Filler mode. 'audio' plays a looping audio file (killed immediately when tool returns). 'backchannel' uses TTS to speak short phrases like 'one moment...' or 'checking that...' (allowed to complete before next action)."
+        },
+        "startDelaySecs": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Seconds to wait after tool invocation before playing filler. Prevents filler on fast tools. Default: 2",
+          "default": 2
+        },
+        "url": {
+          "type": "string",
+          "format": "uri",
+          "description": "Audio file URL (for type='audio'). Should be loopable. Playback stops immediately when tool returns."
+        },
+        "style": {
+          "type": "string",
+          "description": "Free-form style hint for LLM-generated phrases (for type='backchannel'). Examples: 'casual', 'professional', 'friendly and patient'. The LLM interprets this when generating filler phrases at call start."
+        },
+        "escalationSecs": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Seconds from tool invocation before generating a longer explanatory response (for type='backchannel'). Useful for tools that may take 10+ seconds. Default: 10",
+          "default": 10
+        }
+      },
+      "allOf": [
+        {
+          "if": { "properties": { "type": { "const": "audio" } } },
+          "then": { "required": ["url"] }
+        }
+      ]
+    }
+  },
   "examples": [
     {
       "verb": "agent",

diff --git a/src/jambonz_sdk/schema/verbs/dub.schema.json b/src/jambonz_sdk/schema/verbs/dub.schema.json
@@ -3,7 +3,7 @@
   "$id": "https://jambonz.org/schema/verbs/dub",
   "minVersion": "0.9.6",
   "title": "Dub",
-  "description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.",
+  "description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.\n\n**Track Routing:** Tracks are heard by the party on whose call leg they are created. A dub verb in the main verb stack (A-leg) creates tracks heard by the caller. A dub verb nested in the dial verb's `dub` array creates tracks heard by the callee. When using injectCommand to play/say on a track from a different call leg, pass the target call's `call_sid` as the third argument to `session.injectCommand()` to route the command to the correct leg.",
   "type": "object",
   "properties": {
     "verb": {

diff --git a/src/jambonz_sdk/schema/verbs/transcribe.schema.json b/src/jambonz_sdk/schema/verbs/transcribe.schema.json
@@ -37,7 +37,8 @@
     },
     "channel": {
       "type": "number",
-      "description": "Specific audio channel to transcribe."
+      "enum": [1, 2],
+      "description": "Specific audio channel to transcribe. Channel 1 = near-end (local party's audio, i.e. caller on A-leg or callee on B-leg). Channel 2 = far-end (remote party's audio). When transcribe is nested in the dial verb, omitting channel captures both legs mixed; specifying channel: 2 isolates the B-leg's inbound audio."
     }
   },
   "examples": [

diff --git a/src/jambonz_sdk/websocket/session.py b/src/jambonz_sdk/websocket/session.py
@@ -229,19 +229,30 @@ async def clear_tts_tokens(self) -> None:
     async def tool_output(self, tool_call_id: str, result: Any) -> Session:
         """Return a tool call result to the agent LLM.
 
+        Canonical wire shape (validated by ``@jambonz/schema``)::
+
+            {"type": "command", "command": "llm:tool-output",
+             "tool_call_id": "...", "data": {"result": ...}}
+
+        The ``result`` argument becomes ``data.result`` when it is not a dict,
+        matching the Node SDK's convenience wrapping. Passing a dict sends it
+        as-is so callers can include richer structured output (feature-server
+        JSON-stringifies the full ``data`` object on the way to the LLM).
+
         Args:
             tool_call_id: The tool_call_id from the llm:tool-call event.
-            result: The tool result (will be JSON-serialized).
+            result: The tool result. A non-dict value is wrapped as
+                ``{"result": result}``; a dict is sent as-is.
 
         Returns:
             self for chaining with .reply().
         """
+        payload = result if isinstance(result, dict) else {"result": result}
         msg = {
-            "type": "llm:tool-output",
-            "data": {
-                "tool_call_id": tool_call_id,
-                "output": result,
-            },
+            "type": "command",
+            "command": "llm:tool-output",
+            "tool_call_id": tool_call_id,
+            "data": payload,
         }
         await self._ws.send(json.dumps(msg))
         return self
@@ -257,3 +268,24 @@ async def update_agent(self, data: dict[str, Any]) -> None:
         """
         msg = {"type": "agent:update", "data": data}
         await self._ws.send(json.dumps(msg))
+
+    async def inject_stt_reconfigure(
+        self,
+        language_hints: list[str] | None = None,
+        opts: dict[str, Any] | None = None
+    ) -> None:
+        """Reconfigure STT (speech-to-text) settings mid-call.
+
+        Currently supports updating language hints for Deepgram Flux Multilingual.
+
+        Args:
+            language_hints: BCP-47 language codes (e.g., ['en', 'es']). Pass []
+                to clear hints and enable auto-detection; None omits the key.
+            opts: Additional options, merged last (a ``languageHints`` key here wins).
+        """
+        data: dict[str, Any] = {}
+        if language_hints is not None:
+            data["languageHints"] = language_hints
+        if opts:
+            data.update(opts)
+        await self.inject_command("stt:reconfigure", data)