From dcff99e4c0e1debd968ca22f4de457efd35a28f3 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Wed, 26 Nov 2025 12:08:51 +0100 Subject: [PATCH 1/2] fix: Handle gradio Audio object when calling MCP tools --- src/lib/server/textGeneration/mcp/fileRefs.ts | 25 ++++++++++++++++- .../textGeneration/mcp/routerResolution.ts | 4 ++- .../server/textGeneration/mcp/runMcpFlow.ts | 27 +++++++++++++++++-- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/lib/server/textGeneration/mcp/fileRefs.ts b/src/lib/server/textGeneration/mcp/fileRefs.ts index 0ee04201dbb..9b42c244369 100644 --- a/src/lib/server/textGeneration/mcp/fileRefs.ts +++ b/src/lib/server/textGeneration/mcp/fileRefs.ts @@ -21,7 +21,15 @@ const IMAGE_REF_KIND: RefKind = { toDataUrl: (payload) => `data:${payload.mime};base64,${payload.base64}`, }; -const DEFAULT_REF_KINDS: RefKind[] = [IMAGE_REF_KIND]; +const AUDIO_REF_KIND: RefKind = { + prefix: "audio", + matches: (mime) => + typeof mime === "string" && + (mime.startsWith("audio/") || mime === "mp3" || mime === "wav" || mime === "x-wav"), + toDataUrl: (payload) => `data:${payload.mime};base64,${payload.base64}`, +}; + +const DEFAULT_REF_KINDS: RefKind[] = [IMAGE_REF_KIND, AUDIO_REF_KIND]; /** * Build a resolver that maps short ref strings (e.g. "image_1", "image_2") to the @@ -76,6 +84,10 @@ export function buildImageRefResolver(messages: EndpointMessage[]): FileRefResol return buildFileRefResolver(messages, [IMAGE_REF_KIND]); } +export function buildAudioRefResolver(messages: EndpointMessage[]): FileRefResolver | undefined { + return buildFileRefResolver(messages, [AUDIO_REF_KIND]); +} + type FieldRule = { keys: string[]; action: "attachPayload" | "replaceWithDataUrl"; @@ -95,6 +107,17 @@ const DEFAULT_FIELD_RULES: FieldRule[] = [ action: "replaceWithDataUrl", allowedPrefixes: ["image"], }, + { + keys: ["audio_ref"], + action: "attachPayload", + attachKey: "audio", + allowedPrefixes: ["audio"], + }, + { + keys: ["input_audio", "audio", "audio_url"], + action: "replaceWithDataUrl", + allowedPrefixes: ["audio"], + }, ]; /** diff --git a/src/lib/server/textGeneration/mcp/routerResolution.ts b/src/lib/server/textGeneration/mcp/routerResolution.ts index 2d762f98e59..fed225e5ad5 100644 --- a/src/lib/server/textGeneration/mcp/routerResolution.ts +++ b/src/lib/server/textGeneration/mcp/routerResolution.ts @@ -18,6 +18,7 @@ export interface RouterResolutionInput { messages: EndpointMessage[]; conversationId: string; hasImageInput: boolean; + hasAudioInput?: boolean; locals: App.Locals | undefined; } @@ -33,6 +34,7 @@ export async function resolveRouterTarget({ messages, conversationId, hasImageInput, + hasAudioInput, locals, }: RouterResolutionInput): Promise { let targetModel = model; @@ -48,7 +50,7 @@ export async function resolveRouterTarget({ const mod = await import("../../models"); const allModels = mod.models as ProcessedModel[]; - if (hasImageInput) { + if (hasImageInput || hasAudioInput) { const multimodalCandidate = findConfiguredMultimodalModel(allModels); if (!multimodalCandidate) { runMcp = false; diff --git a/src/lib/server/textGeneration/mcp/runMcpFlow.ts b/src/lib/server/textGeneration/mcp/runMcpFlow.ts index 33ed86e5d8b..1623e365921 100644 --- a/src/lib/server/textGeneration/mcp/runMcpFlow.ts +++ b/src/lib/server/textGeneration/mcp/runMcpFlow.ts @@ -18,7 +18,7 @@ import { executeToolCalls, type NormalizedToolCall } from "./toolInvocation"; import { drainPool } from "$lib/server/mcp/clientPool"; import type { TextGenerationContext } from "../types"; import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf"; -import { buildImageRefResolver } from "./fileRefs"; +import { buildImageRefResolver, buildAudioRefResolver } from "./fileRefs"; import { prepareMessagesWithFiles } from "$lib/server/textGeneration/utils/prepareFiles"; import { makeImageProcessor } from "$lib/server/endpoints/images"; @@ -202,7 +202,11 @@ export async function* runMcpFlow({ // If anything goes wrong reading the flag, proceed (previous behavior) } - const resolveFileRef = buildImageRefResolver(messages); + const resolveFileRef = (ref: string) => { + const imageResolver = buildImageRefResolver(messages); + const audioResolver = buildAudioRefResolver(messages); + return imageResolver?.(ref) ?? audioResolver?.(ref); + }; const imageProcessor = makeImageProcessor({ supportedMimeTypes: ["image/png", "image/jpeg"], preferredMimeType: "image/jpeg", @@ -217,11 +221,18 @@ export async function* runMcpFlow({ ) ); + const hasAudioInput = messages.some((msg) => + (msg.files ?? []).some( + (file) => typeof file?.mime === "string" && file.mime.startsWith("audio/") + ) + ); + const { runMcp, targetModel, candidateModelId, resolvedRoute } = await resolveRouterTarget({ model, messages, conversationId: conv._id.toString(), hasImageInput, + hasAudioInput, locals, }); @@ -353,6 +364,18 @@ export async function* runMcpFlow({ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") { return value; } + + // Handle Gradio Audio objects: {path, url, orig_name, mime_type, size, is_stream, meta} + if ( + typeof value === "object" && + value !== null && + !Array.isArray(value) && + "url" in value && + typeof (value as { url?: unknown }).url === "string" + ) { + return (value as { url: string }).url; + } + return undefined; }; From a611be26e0faa352394c4eb861b94f7688bf0425 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Wed, 26 Nov 2025 12:11:52 +0100 Subject: [PATCH 2/2] parameterize mcp request timeout --- src/lib/server/config.ts | 3 ++- src/lib/server/mcp/httpClient.ts | 3 ++- src/lib/server/textGeneration/mcp/toolInvocation.ts | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lib/server/config.ts b/src/lib/server/config.ts index 8a597dd1c98..19b47bc29c3 100644 --- a/src/lib/server/config.ts +++ b/src/lib/server/config.ts @@ -158,7 +158,8 @@ type ExtraConfigKeys = | "METRICS_ENABLED" | "METRICS_PORT" | "MCP_SERVERS" - | "MCP_FORWARD_HF_USER_TOKEN"; + | "MCP_FORWARD_HF_USER_TOKEN" + | "MCP_TOOL_TIMEOUT_MS"; type ConfigProxy = ConfigManager & { [K in ConfigKey | ExtraConfigKeys]: string }; diff --git a/src/lib/server/mcp/httpClient.ts b/src/lib/server/mcp/httpClient.ts index c1373478b18..93b99c677d7 100644 --- a/src/lib/server/mcp/httpClient.ts +++ b/src/lib/server/mcp/httpClient.ts @@ -1,5 +1,6 @@ import { Client } from "@modelcontextprotocol/sdk/client"; import { getClient } from "./clientPool"; +import { config } from "$lib/server/config"; export interface McpServerConfig { name: string; @@ -7,7 +8,7 @@ export interface McpServerConfig { headers?: Record; } -const DEFAULT_TIMEOUT_MS = 30_000; +const DEFAULT_TIMEOUT_MS = Number(config.MCP_TOOL_TIMEOUT_MS) || 30_000; export type McpToolTextResponse = { text: string; diff --git a/src/lib/server/textGeneration/mcp/toolInvocation.ts b/src/lib/server/textGeneration/mcp/toolInvocation.ts index 6fa57322d8b..4013bd4b097 100644 --- a/src/lib/server/textGeneration/mcp/toolInvocation.ts +++ b/src/lib/server/textGeneration/mcp/toolInvocation.ts @@ -1,5 +1,6 @@ import { randomUUID } from "crypto"; import { logger } from "../../logger"; +import { config } from "$lib/server/config"; import type { MessageUpdate } from "$lib/types/MessageUpdate"; import { MessageToolUpdateType, MessageUpdateType } from "$lib/types/MessageUpdate"; import { ToolResultStatus } from "$lib/types/Tool"; @@ -69,7 +70,7 @@ export async function* executeToolCalls({ toPrimitive, processToolOutput, abortSignal, - toolTimeoutMs = 30_000, + toolTimeoutMs = Number(config.MCP_TOOL_TIMEOUT_MS) || 30_000, }: ExecuteToolCallsParams): AsyncGenerator { const toolMessages: ChatCompletionMessageParam[] = []; const toolRuns: ToolRun[] = [];